Merge branch 'akpm' (patches from Andrew)

+5

.gitignore

··· 53 53 /debian/ 54 54 55 55 # 56 + # tar directory (make tar*-pkg) 57 + # 58 + /tar-install/ 59 + 60 + # 56 61 # git files that we don't want to ignore even if they are dot-files 57 62 # 58 63 !.gitignore

+37

Documentation/filesystems/proc.txt

··· 145 145 stack Report full stack trace, enable via CONFIG_STACKTRACE 146 146 smaps an extension based on maps, showing the memory consumption of 147 147 each mapping and flags associated with it 148 + numa_maps an extension based on maps, showing the memory locality and 149 + binding policy as well as mem usage (in pages) of each mapping. 148 150 .............................................................................. 149 151 150 152 For example, to get the status information of a process, all you have to do is ··· 491 489 To clear the soft-dirty bit 492 490 > echo 4 > /proc/PID/clear_refs 493 491 492 + To reset the peak resident set size ("high water mark") to the process's 493 + current value: 494 + > echo 5 > /proc/PID/clear_refs 495 + 494 496 Any other value written to /proc/PID/clear_refs will have no effect. 495 497 496 498 The /proc/pid/pagemap gives the PFN, which can be used to find the pageflags 497 499 using /proc/kpageflags and number of times a page is mapped using 498 500 /proc/kpagecount. For detailed explanation, see Documentation/vm/pagemap.txt. 501 + 502 + The /proc/pid/numa_maps is an extension based on maps, showing the memory 503 + locality and binding policy, as well as the memory usage (in pages) of 504 + each mapping. 
The output follows a general format where mapping details are 505 + summarized, separated by blank spaces, one mapping per file line: 506 + 507 + address policy mapping details 508 + 509 + 00400000 default file=/usr/local/bin/app mapped=1 active=0 N3=1 kernelpagesize_kB=4 510 + 00600000 default file=/usr/local/bin/app anon=1 dirty=1 N3=1 kernelpagesize_kB=4 511 + 3206000000 default file=/lib64/ld-2.12.so mapped=26 mapmax=6 N0=24 N3=2 kernelpagesize_kB=4 512 + 320621f000 default file=/lib64/ld-2.12.so anon=1 dirty=1 N3=1 kernelpagesize_kB=4 513 + 3206220000 default file=/lib64/ld-2.12.so anon=1 dirty=1 N3=1 kernelpagesize_kB=4 514 + 3206221000 default anon=1 dirty=1 N3=1 kernelpagesize_kB=4 515 + 3206800000 default file=/lib64/libc-2.12.so mapped=59 mapmax=21 active=55 N0=41 N3=18 kernelpagesize_kB=4 516 + 320698b000 default file=/lib64/libc-2.12.so 517 + 3206b8a000 default file=/lib64/libc-2.12.so anon=2 dirty=2 N3=2 kernelpagesize_kB=4 518 + 3206b8e000 default file=/lib64/libc-2.12.so anon=1 dirty=1 N3=1 kernelpagesize_kB=4 519 + 3206b8f000 default anon=3 dirty=3 active=1 N3=3 kernelpagesize_kB=4 520 + 7f4dc10a2000 default anon=3 dirty=3 N3=3 kernelpagesize_kB=4 521 + 7f4dc10b4000 default anon=2 dirty=2 active=1 N3=2 kernelpagesize_kB=4 522 + 7f4dc1200000 default file=/anon_hugepage\040(deleted) huge anon=1 dirty=1 N3=1 kernelpagesize_kB=2048 523 + 7fff335f0000 default stack anon=3 dirty=3 N3=3 kernelpagesize_kB=4 524 + 7fff3369d000 default mapped=1 mapmax=35 active=0 N3=1 kernelpagesize_kB=4 525 + 526 + Where: 527 + "address" is the starting address for the mapping; 528 + "policy" reports the NUMA memory policy set for the mapping (see vm/numa_memory_policy.txt); 529 + "mapping details" summarizes mapping data such as mapping type, page usage counters, 530 + node locality page counters (N0 == node0, N1 == node1, ...) and the kernel page 531 + size, in KB, that is backing the mapping up. 499 532 500 533 1.2 Kernel data 501 534 ---------------

-5

arch/alpha/include/asm/thread_info.h

··· 27 27 int bpt_nsaved; 28 28 unsigned long bpt_addr[2]; /* breakpoint handling */ 29 29 unsigned int bpt_insn[2]; 30 - 31 - struct restart_block restart_block; 32 30 }; 33 31 34 32 /* ··· 38 40 .exec_domain = &default_exec_domain, \ 39 41 .addr_limit = KERNEL_DS, \ 40 42 .preempt_count = INIT_PREEMPT_COUNT, \ 41 - .restart_block = { \ 42 - .fn = do_no_restart_syscall, \ 43 - }, \ 44 43 } 45 44 46 45 #define init_thread_info (init_thread_union.thread_info)

+1 -1

arch/alpha/kernel/signal.c

··· 150 150 struct switch_stack *sw = (struct switch_stack *)regs - 1; 151 151 long i, err = __get_user(regs->pc, &sc->sc_pc); 152 152 153 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 153 + current->restart_block.fn = do_no_restart_syscall; 154 154 155 155 sw->r26 = (unsigned long) ret_from_sys_call; 156 156

-4

arch/arc/include/asm/thread_info.h

··· 46 46 struct exec_domain *exec_domain;/* execution domain */ 47 47 __u32 cpu; /* current CPU */ 48 48 unsigned long thr_ptr; /* TLS ptr */ 49 - struct restart_block restart_block; 50 49 }; 51 50 52 51 /* ··· 61 62 .cpu = 0, \ 62 63 .preempt_count = INIT_PREEMPT_COUNT, \ 63 64 .addr_limit = KERNEL_DS, \ 64 - .restart_block = { \ 65 - .fn = do_no_restart_syscall, \ 66 - }, \ 67 65 } 68 66 69 67 #define init_thread_info (init_thread_union.thread_info)

+1 -1

arch/arc/kernel/signal.c

··· 104 104 struct pt_regs *regs = current_pt_regs(); 105 105 106 106 /* Always make any pending restarted system calls return -EINTR */ 107 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 107 + current->restart_block.fn = do_no_restart_syscall; 108 108 109 109 /* Since we stacked the signal on a word boundary, 110 110 * then 'sp' should be word aligned here. If it's

+4 -1

arch/arm/include/asm/pgtable-3level.h

··· 257 257 #define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) 258 258 259 259 /* represent a notpresent pmd by zero, this is used by pmdp_invalidate */ 260 - #define pmd_mknotpresent(pmd) (__pmd(0)) 260 + static inline pmd_t pmd_mknotpresent(pmd_t pmd) 261 + { 262 + return __pmd(0); 263 + } 261 264 262 265 static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) 263 266 {

-4

arch/arm/include/asm/thread_info.h

··· 68 68 #ifdef CONFIG_ARM_THUMBEE 69 69 unsigned long thumbee_state; /* ThumbEE Handler Base register */ 70 70 #endif 71 - struct restart_block restart_block; 72 71 }; 73 72 74 73 #define INIT_THREAD_INFO(tsk) \ ··· 80 81 .cpu_domain = domain_val(DOMAIN_USER, DOMAIN_MANAGER) | \ 81 82 domain_val(DOMAIN_KERNEL, DOMAIN_MANAGER) | \ 82 83 domain_val(DOMAIN_IO, DOMAIN_CLIENT), \ 83 - .restart_block = { \ 84 - .fn = do_no_restart_syscall, \ 85 - }, \ 86 84 } 87 85 88 86 #define init_thread_info (init_thread_union.thread_info)

+2 -2

arch/arm/kernel/signal.c

··· 191 191 struct sigframe __user *frame; 192 192 193 193 /* Always make any pending restarted system calls return -EINTR */ 194 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 194 + current->restart_block.fn = do_no_restart_syscall; 195 195 196 196 /* 197 197 * Since we stacked the signal on a 64-bit boundary, ··· 221 221 struct rt_sigframe __user *frame; 222 222 223 223 /* Always make any pending restarted system calls return -EINTR */ 224 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 224 + current->restart_block.fn = do_no_restart_syscall; 225 225 226 226 /* 227 227 * Since we stacked the signal on a 64-bit boundary,

-4

arch/arm64/include/asm/thread_info.h

··· 48 48 mm_segment_t addr_limit; /* address limit */ 49 49 struct task_struct *task; /* main task structure */ 50 50 struct exec_domain *exec_domain; /* execution domain */ 51 - struct restart_block restart_block; 52 51 int preempt_count; /* 0 => preemptable, <0 => bug */ 53 52 int cpu; /* cpu */ 54 53 }; ··· 59 60 .flags = 0, \ 60 61 .preempt_count = INIT_PREEMPT_COUNT, \ 61 62 .addr_limit = KERNEL_DS, \ 62 - .restart_block = { \ 63 - .fn = do_no_restart_syscall, \ 64 - }, \ 65 63 } 66 64 67 65 #define init_thread_info (init_thread_union.thread_info)

+1 -1

arch/arm64/kernel/signal.c

··· 131 131 struct rt_sigframe __user *frame; 132 132 133 133 /* Always make any pending restarted system calls return -EINTR */ 134 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 134 + current->restart_block.fn = do_no_restart_syscall; 135 135 136 136 /* 137 137 * Since we stacked the signal on a 128-bit boundary, then 'sp' should

+2 -2

arch/arm64/kernel/signal32.c

··· 347 347 struct compat_sigframe __user *frame; 348 348 349 349 /* Always make any pending restarted system calls return -EINTR */ 350 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 350 + current->restart_block.fn = do_no_restart_syscall; 351 351 352 352 /* 353 353 * Since we stacked the signal on a 64-bit boundary, ··· 381 381 struct compat_rt_sigframe __user *frame; 382 382 383 383 /* Always make any pending restarted system calls return -EINTR */ 384 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 384 + current->restart_block.fn = do_no_restart_syscall; 385 385 386 386 /* 387 387 * Since we stacked the signal on a 64-bit boundary,

-4

arch/avr32/include/asm/thread_info.h

··· 30 30 saved by debug handler 31 31 when setting up 32 32 trampoline */ 33 - struct restart_block restart_block; 34 33 __u8 supervisor_stack[0]; 35 34 }; 36 35 ··· 40 41 .flags = 0, \ 41 42 .cpu = 0, \ 42 43 .preempt_count = INIT_PREEMPT_COUNT, \ 43 - .restart_block = { \ 44 - .fn = do_no_restart_syscall \ 45 - } \ 46 44 } 47 45 48 46 #define init_thread_info (init_thread_union.thread_info)

-1

arch/avr32/kernel/asm-offsets.c

··· 18 18 OFFSET(TI_preempt_count, thread_info, preempt_count); 19 19 OFFSET(TI_rar_saved, thread_info, rar_saved); 20 20 OFFSET(TI_rsr_saved, thread_info, rsr_saved); 21 - OFFSET(TI_restart_block, thread_info, restart_block); 22 21 BLANK(); 23 22 OFFSET(TSK_active_mm, task_struct, active_mm); 24 23 BLANK();

+1 -1

arch/avr32/kernel/signal.c

··· 69 69 sigset_t set; 70 70 71 71 /* Always make any pending restarted system calls return -EINTR */ 72 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 72 + current->restart_block.fn = do_no_restart_syscall; 73 73 74 74 frame = (struct rt_sigframe __user *)regs->sp; 75 75 pr_debug("SIG return: frame = %p\n", frame);

-4

arch/blackfin/include/asm/thread_info.h

··· 42 42 int cpu; /* cpu we're on */ 43 43 int preempt_count; /* 0 => preemptable, <0 => BUG */ 44 44 mm_segment_t addr_limit; /* address limit */ 45 - struct restart_block restart_block; 46 45 #ifndef CONFIG_SMP 47 46 struct l1_scratch_task_info l1_task_info; 48 47 #endif ··· 57 58 .flags = 0, \ 58 59 .cpu = 0, \ 59 60 .preempt_count = INIT_PREEMPT_COUNT, \ 60 - .restart_block = { \ 61 - .fn = do_no_restart_syscall, \ 62 - }, \ 63 61 } 64 62 #define init_thread_info (init_thread_union.thread_info) 65 63 #define init_stack (init_thread_union.stack)

+1 -1

arch/blackfin/kernel/signal.c

··· 44 44 int err = 0; 45 45 46 46 /* Always make any pending restarted system calls return -EINTR */ 47 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 47 + current->restart_block.fn = do_no_restart_syscall; 48 48 49 49 #define RESTORE(x) err |= __get_user(regs->x, &sc->sc_##x) 50 50

-4

arch/c6x/include/asm/thread_info.h

··· 45 45 int cpu; /* cpu we're on */ 46 46 int preempt_count; /* 0 = preemptable, <0 = BUG */ 47 47 mm_segment_t addr_limit; /* thread address space */ 48 - struct restart_block restart_block; 49 48 }; 50 49 51 50 /* ··· 60 61 .cpu = 0, \ 61 62 .preempt_count = INIT_PREEMPT_COUNT, \ 62 63 .addr_limit = KERNEL_DS, \ 63 - .restart_block = { \ 64 - .fn = do_no_restart_syscall, \ 65 - }, \ 66 64 } 67 65 68 66 #define init_thread_info (init_thread_union.thread_info)

+1 -1

arch/c6x/kernel/signal.c

··· 68 68 sigset_t set; 69 69 70 70 /* Always make any pending restarted system calls return -EINTR */ 71 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 71 + current->restart_block.fn = do_no_restart_syscall; 72 72 73 73 /* 74 74 * Since we stacked the signal on a dword boundary,

+1 -1

arch/cris/arch-v10/kernel/signal.c

··· 67 67 unsigned long old_usp; 68 68 69 69 /* Always make any pending restarted system calls return -EINTR */ 70 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 70 + current->restart_block.fn = do_no_restart_syscall; 71 71 72 72 /* restore the regs from &sc->regs (same as sc, since regs is first) 73 73 * (sc is already checked for VERIFY_READ since the sigframe was

+1 -1

arch/cris/arch-v32/kernel/signal.c

··· 59 59 unsigned long old_usp; 60 60 61 61 /* Always make any pending restarted system calls return -EINTR */ 62 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 62 + current->restart_block.fn = do_no_restart_syscall; 63 63 64 64 /* 65 65 * Restore the registers from &sc->regs. sc is already checked

-4

arch/cris/include/asm/thread_info.h

··· 38 38 0-0xBFFFFFFF for user-thead 39 39 0-0xFFFFFFFF for kernel-thread 40 40 */ 41 - struct restart_block restart_block; 42 41 __u8 supervisor_stack[0]; 43 42 }; 44 43 ··· 55 56 .cpu = 0, \ 56 57 .preempt_count = INIT_PREEMPT_COUNT, \ 57 58 .addr_limit = KERNEL_DS, \ 58 - .restart_block = { \ 59 - .fn = do_no_restart_syscall, \ 60 - }, \ 61 59 } 62 60 63 61 #define init_thread_info (init_thread_union.thread_info)

-1

arch/frv/include/asm/string.h

··· 33 33 #define __HAVE_ARCH_STRNCAT 1 34 34 #define __HAVE_ARCH_STRCMP 1 35 35 #define __HAVE_ARCH_STRNCMP 1 36 - #define __HAVE_ARCH_STRNICMP 1 37 36 #define __HAVE_ARCH_STRCHR 1 38 37 #define __HAVE_ARCH_STRRCHR 1 39 38 #define __HAVE_ARCH_STRSTR 1

-4

arch/frv/include/asm/thread_info.h

··· 41 41 * 0-0xBFFFFFFF for user-thead 42 42 * 0-0xFFFFFFFF for kernel-thread 43 43 */ 44 - struct restart_block restart_block; 45 44 46 45 __u8 supervisor_stack[0]; 47 46 }; ··· 64 65 .cpu = 0, \ 65 66 .preempt_count = INIT_PREEMPT_COUNT, \ 66 67 .addr_limit = KERNEL_DS, \ 67 - .restart_block = { \ 68 - .fn = do_no_restart_syscall, \ 69 - }, \ 70 68 } 71 69 72 70 #define init_thread_info (init_thread_union.thread_info)

-1

arch/frv/kernel/asm-offsets.c

··· 40 40 OFFSET(TI_CPU, thread_info, cpu); 41 41 OFFSET(TI_PREEMPT_COUNT, thread_info, preempt_count); 42 42 OFFSET(TI_ADDR_LIMIT, thread_info, addr_limit); 43 - OFFSET(TI_RESTART_BLOCK, thread_info, restart_block); 44 43 BLANK(); 45 44 46 45 /* offsets into register file storage */

+1 -1

arch/frv/kernel/signal.c

··· 62 62 unsigned long tbr, psr; 63 63 64 64 /* Always make any pending restarted system calls return -EINTR */ 65 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 65 + current->restart_block.fn = do_no_restart_syscall; 66 66 67 67 tbr = user->i.tbr; 68 68 psr = user->i.psr;

-23

arch/frv/mm/extable.c

··· 10 10 extern const void __memcpy_end, __memcpy_user_error_lr, __memcpy_user_error_handler; 11 11 extern spinlock_t modlist_lock; 12 12 13 - /*****************************************************************************/ 14 - /* 15 - * 16 - */ 17 - static inline unsigned long search_one_table(const struct exception_table_entry *first, 18 - const struct exception_table_entry *last, 19 - unsigned long value) 20 - { 21 - while (first <= last) { 22 - const struct exception_table_entry __attribute__((aligned(8))) *mid; 23 - long diff; 24 - 25 - mid = (last - first) / 2 + first; 26 - diff = mid->insn - value; 27 - if (diff == 0) 28 - return mid->fixup; 29 - else if (diff < 0) 30 - first = mid + 1; 31 - else 32 - last = mid - 1; 33 - } 34 - return 0; 35 - } /* end search_one_table() */ 36 13 37 14 /*****************************************************************************/ 38 15 /*

-4

arch/hexagon/include/asm/thread_info.h

··· 56 56 * used for syscalls somehow; 57 57 * seems to have a function pointer and four arguments 58 58 */ 59 - struct restart_block restart_block; 60 59 /* Points to the current pt_regs frame */ 61 60 struct pt_regs *regs; 62 61 /* ··· 82 83 .cpu = 0, \ 83 84 .preempt_count = 1, \ 84 85 .addr_limit = KERNEL_DS, \ 85 - .restart_block = { \ 86 - .fn = do_no_restart_syscall, \ 87 - }, \ 88 86 .sp = 0, \ 89 87 .regs = NULL, \ 90 88 }

+1 -1

arch/hexagon/kernel/signal.c

··· 239 239 sigset_t blocked; 240 240 241 241 /* Always make any pending restarted system calls return -EINTR */ 242 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 242 + current->restart_block.fn = do_no_restart_syscall; 243 243 244 244 frame = (struct rt_sigframe __user *)pt_psp(regs); 245 245 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))

-4

arch/ia64/include/asm/thread_info.h

··· 27 27 __u32 status; /* Thread synchronous flags */ 28 28 mm_segment_t addr_limit; /* user-level address space limit */ 29 29 int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ 30 - struct restart_block restart_block; 31 30 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 32 31 __u64 ac_stamp; 33 32 __u64 ac_leave; ··· 45 46 .cpu = 0, \ 46 47 .addr_limit = KERNEL_DS, \ 47 48 .preempt_count = INIT_PREEMPT_COUNT, \ 48 - .restart_block = { \ 49 - .fn = do_no_restart_syscall, \ 50 - }, \ 51 49 } 52 50 53 51 #ifndef ASM_OFFSETS_C

+1 -1

arch/ia64/kernel/signal.c

··· 46 46 long err; 47 47 48 48 /* Always make any pending restarted system calls return -EINTR */ 49 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 49 + current->restart_block.fn = do_no_restart_syscall; 50 50 51 51 /* restore scratch that always needs gets updated during signal delivery: */ 52 52 err = __get_user(flags, &sc->sc_flags);

-5

arch/m32r/include/asm/thread_info.h

··· 34 34 0-0xBFFFFFFF for user-thread 35 35 0-0xFFFFFFFF for kernel-thread 36 36 */ 37 - struct restart_block restart_block; 38 37 39 38 __u8 supervisor_stack[0]; 40 39 }; ··· 48 49 #define TI_CPU 0x00000010 49 50 #define TI_PRE_COUNT 0x00000014 50 51 #define TI_ADDR_LIMIT 0x00000018 51 - #define TI_RESTART_BLOCK 0x000001C 52 52 53 53 #endif 54 54 ··· 66 68 .cpu = 0, \ 67 69 .preempt_count = INIT_PREEMPT_COUNT, \ 68 70 .addr_limit = KERNEL_DS, \ 69 - .restart_block = { \ 70 - .fn = do_no_restart_syscall, \ 71 - }, \ 72 71 } 73 72 74 73 #define init_thread_info (init_thread_union.thread_info)

+1 -1

arch/m32r/kernel/signal.c

··· 48 48 unsigned int err = 0; 49 49 50 50 /* Always make any pending restarted system calls return -EINTR */ 51 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 51 + current->restart_block.fn = do_no_restart_syscall; 52 52 53 53 #define COPY(x) err |= __get_user(regs->x, &sc->sc_##x) 54 54 COPY(r4);

-4

arch/m68k/include/asm/thread_info.h

··· 31 31 int preempt_count; /* 0 => preemptable, <0 => BUG */ 32 32 __u32 cpu; /* should always be 0 on m68k */ 33 33 unsigned long tp_value; /* thread pointer */ 34 - struct restart_block restart_block; 35 34 }; 36 35 #endif /* __ASSEMBLY__ */ 37 36 ··· 40 41 .exec_domain = &default_exec_domain, \ 41 42 .addr_limit = KERNEL_DS, \ 42 43 .preempt_count = INIT_PREEMPT_COUNT, \ 43 - .restart_block = { \ 44 - .fn = do_no_restart_syscall, \ 45 - }, \ 46 44 } 47 45 48 46 #define init_stack (init_thread_union.stack)

+2 -2

arch/m68k/kernel/signal.c

··· 655 655 int err = 0; 656 656 657 657 /* Always make any pending restarted system calls return -EINTR */ 658 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 658 + current->restart_block.fn = do_no_restart_syscall; 659 659 660 660 /* get previous context */ 661 661 if (copy_from_user(&context, usc, sizeof(context))) ··· 693 693 int err; 694 694 695 695 /* Always make any pending restarted system calls return -EINTR */ 696 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 696 + current->restart_block.fn = do_no_restart_syscall; 697 697 698 698 err = __get_user(temp, &uc->uc_mcontext.version); 699 699 if (temp != MCONTEXT_VERSION)

+1 -5

arch/metag/include/asm/thread_info.h

··· 35 35 int preempt_count; /* 0 => preemptable, <0 => BUG */ 36 36 37 37 mm_segment_t addr_limit; /* thread address space */ 38 - struct restart_block restart_block; 39 38 40 - u8 supervisor_stack[0]; 39 + u8 supervisor_stack[0] __aligned(8); 41 40 }; 42 41 43 42 #else /* !__ASSEMBLY__ */ ··· 73 74 .cpu = 0, \ 74 75 .preempt_count = INIT_PREEMPT_COUNT, \ 75 76 .addr_limit = KERNEL_DS, \ 76 - .restart_block = { \ 77 - .fn = do_no_restart_syscall, \ 78 - }, \ 79 77 } 80 78 81 79 #define init_thread_info (init_thread_union.thread_info)

+1 -1

arch/metag/kernel/signal.c

··· 48 48 int err; 49 49 50 50 /* Always make any pending restarted system calls return -EINTR */ 51 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 51 + current->restart_block.fn = do_no_restart_syscall; 52 52 53 53 err = metag_gp_regs_copyin(regs, 0, sizeof(struct user_gp_regs), NULL, 54 54 &sc->regs);

-4

arch/microblaze/include/asm/thread_info.h

··· 71 71 __u32 cpu; /* current CPU */ 72 72 __s32 preempt_count; /* 0 => preemptable,< 0 => BUG*/ 73 73 mm_segment_t addr_limit; /* thread address space */ 74 - struct restart_block restart_block; 75 74 76 75 struct cpu_context cpu_context; 77 76 }; ··· 86 87 .cpu = 0, \ 87 88 .preempt_count = INIT_PREEMPT_COUNT, \ 88 89 .addr_limit = KERNEL_DS, \ 89 - .restart_block = { \ 90 - .fn = do_no_restart_syscall, \ 91 - }, \ 92 90 } 93 91 94 92 #define init_thread_info (init_thread_union.thread_info)

+1 -1

arch/microblaze/kernel/signal.c

··· 89 89 int rval; 90 90 91 91 /* Always make any pending restarted system calls return -EINTR */ 92 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 92 + current->restart_block.fn = do_no_restart_syscall; 93 93 94 94 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 95 95 goto badframe;

-4

arch/mips/include/asm/thread_info.h

··· 34 34 * 0x7fffffff for user-thead 35 35 * 0xffffffff for kernel-thread 36 36 */ 37 - struct restart_block restart_block; 38 37 struct pt_regs *regs; 39 38 long syscall; /* syscall number */ 40 39 }; ··· 49 50 .cpu = 0, \ 50 51 .preempt_count = INIT_PREEMPT_COUNT, \ 51 52 .addr_limit = KERNEL_DS, \ 52 - .restart_block = { \ 53 - .fn = do_no_restart_syscall, \ 54 - }, \ 55 53 } 56 54 57 55 #define init_thread_info (init_thread_union.thread_info)

-1

arch/mips/kernel/asm-offsets.c

··· 98 98 OFFSET(TI_CPU, thread_info, cpu); 99 99 OFFSET(TI_PRE_COUNT, thread_info, preempt_count); 100 100 OFFSET(TI_ADDR_LIMIT, thread_info, addr_limit); 101 - OFFSET(TI_RESTART_BLOCK, thread_info, restart_block); 102 101 OFFSET(TI_REGS, thread_info, regs); 103 102 DEFINE(_THREAD_SIZE, THREAD_SIZE); 104 103 DEFINE(_THREAD_MASK, THREAD_MASK);

+1 -1

arch/mips/kernel/signal.c

··· 243 243 int i; 244 244 245 245 /* Always make any pending restarted system calls return -EINTR */ 246 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 246 + current->restart_block.fn = do_no_restart_syscall; 247 247 248 248 err |= __get_user(regs->cp0_epc, &sc->sc_pc); 249 249

+1 -1

arch/mips/kernel/signal32.c

··· 220 220 int i; 221 221 222 222 /* Always make any pending restarted system calls return -EINTR */ 223 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 223 + current->restart_block.fn = do_no_restart_syscall; 224 224 225 225 err |= __get_user(regs->cp0_epc, &sc->sc_pc); 226 226 err |= __get_user(regs->hi, &sc->sc_mdhi);

-4

arch/mn10300/include/asm/thread_info.h

··· 50 50 0-0xBFFFFFFF for user-thead 51 51 0-0xFFFFFFFF for kernel-thread 52 52 */ 53 - struct restart_block restart_block; 54 53 55 54 __u8 supervisor_stack[0]; 56 55 }; ··· 79 80 .cpu = 0, \ 80 81 .preempt_count = INIT_PREEMPT_COUNT, \ 81 82 .addr_limit = KERNEL_DS, \ 82 - .restart_block = { \ 83 - .fn = do_no_restart_syscall, \ 84 - }, \ 85 83 } 86 84 87 85 #define init_thread_info (init_thread_union.thread_info)

-1

arch/mn10300/kernel/asm-offsets.c

··· 28 28 OFFSET(TI_cpu, thread_info, cpu); 29 29 OFFSET(TI_preempt_count, thread_info, preempt_count); 30 30 OFFSET(TI_addr_limit, thread_info, addr_limit); 31 - OFFSET(TI_restart_block, thread_info, restart_block); 32 31 BLANK(); 33 32 34 33 OFFSET(REG_D0, pt_regs, d0);

+1 -1

arch/mn10300/kernel/signal.c

··· 40 40 unsigned int err = 0; 41 41 42 42 /* Always make any pending restarted system calls return -EINTR */ 43 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 43 + current->restart_block.fn = do_no_restart_syscall; 44 44 45 45 if (is_using_fpu(current)) 46 46 fpu_kill_state(current);

-4

arch/openrisc/include/asm/thread_info.h

··· 57 57 0-0x7FFFFFFF for user-thead 58 58 0-0xFFFFFFFF for kernel-thread 59 59 */ 60 - struct restart_block restart_block; 61 60 __u8 supervisor_stack[0]; 62 61 63 62 /* saved context data */ ··· 78 79 .cpu = 0, \ 79 80 .preempt_count = 1, \ 80 81 .addr_limit = KERNEL_DS, \ 81 - .restart_block = { \ 82 - .fn = do_no_restart_syscall, \ 83 - }, \ 84 82 .ksp = 0, \ 85 83 } 86 84

+1 -1

arch/openrisc/kernel/signal.c

··· 46 46 int err = 0; 47 47 48 48 /* Always make any pending restarted system calls return -EINTR */ 49 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 49 + current->restart_block.fn = do_no_restart_syscall; 50 50 51 51 /* 52 52 * Restore the regs from &sc->regs.

-4

arch/parisc/include/asm/thread_info.h

··· 14 14 mm_segment_t addr_limit; /* user-level address space limit */ 15 15 __u32 cpu; /* current CPU */ 16 16 int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ 17 - struct restart_block restart_block; 18 17 }; 19 18 20 19 #define INIT_THREAD_INFO(tsk) \ ··· 24 25 .cpu = 0, \ 25 26 .addr_limit = KERNEL_DS, \ 26 27 .preempt_count = INIT_PREEMPT_COUNT, \ 27 - .restart_block = { \ 28 - .fn = do_no_restart_syscall \ 29 - } \ 30 28 } 31 29 32 30 #define init_thread_info (init_thread_union.thread_info)

+1 -1

arch/parisc/kernel/signal.c

··· 99 99 sigframe_size = PARISC_RT_SIGFRAME_SIZE32; 100 100 #endif 101 101 102 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 102 + current->restart_block.fn = do_no_restart_syscall; 103 103 104 104 /* Unwind the user stack to get the rt_sigframe structure. */ 105 105 frame = (struct rt_sigframe __user *)

+9 -45

arch/powerpc/include/asm/pgtable.h

··· 40 40 static inline pgprot_t pte_pgprot(pte_t pte) { return __pgprot(pte_val(pte) & PAGE_PROT_BITS); } 41 41 42 42 #ifdef CONFIG_NUMA_BALANCING 43 - static inline int pte_present(pte_t pte) 44 - { 45 - return pte_val(pte) & _PAGE_NUMA_MASK; 46 - } 47 - 48 - #define pte_present_nonuma pte_present_nonuma 49 - static inline int pte_present_nonuma(pte_t pte) 50 - { 51 - return pte_val(pte) & (_PAGE_PRESENT); 52 - } 53 - 54 - #define ptep_set_numa ptep_set_numa 55 - static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, 56 - pte_t *ptep) 57 - { 58 - if ((pte_val(*ptep) & _PAGE_PRESENT) == 0) 59 - VM_BUG_ON(1); 60 - 61 - pte_update(mm, addr, ptep, _PAGE_PRESENT, _PAGE_NUMA, 0); 62 - return; 63 - } 64 - 65 - #define pmdp_set_numa pmdp_set_numa 66 - static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, 67 - pmd_t *pmdp) 68 - { 69 - if ((pmd_val(*pmdp) & _PAGE_PRESENT) == 0) 70 - VM_BUG_ON(1); 71 - 72 - pmd_hugepage_update(mm, addr, pmdp, _PAGE_PRESENT, _PAGE_NUMA); 73 - return; 74 - } 75 - 76 43 /* 77 - * Generic NUMA pte helpers expect pteval_t and pmdval_t types to exist 78 - * which was inherited from x86. For the purposes of powerpc pte_basic_t and 79 - * pmd_t are equivalent 44 + * These work without NUMA balancing but the kernel does not care. See the 45 + * comment in include/asm-generic/pgtable.h . On powerpc, this will only 46 + * work for user pages and always return true for kernel pages. 
80 47 */ 81 - #define pteval_t pte_basic_t 82 - #define pmdval_t pmd_t 83 - static inline pteval_t ptenuma_flags(pte_t pte) 48 + static inline int pte_protnone(pte_t pte) 84 49 { 85 - return pte_val(pte) & _PAGE_NUMA_MASK; 50 + return (pte_val(pte) & 51 + (_PAGE_PRESENT | _PAGE_USER)) == _PAGE_PRESENT; 86 52 } 87 53 88 - static inline pmdval_t pmdnuma_flags(pmd_t pmd) 54 + static inline int pmd_protnone(pmd_t pmd) 89 55 { 90 - return pmd_val(pmd) & _PAGE_NUMA_MASK; 56 + return pte_protnone(pmd_pte(pmd)); 91 57 } 92 - 93 - # else 58 + #endif /* CONFIG_NUMA_BALANCING */ 94 59 95 60 static inline int pte_present(pte_t pte) 96 61 { 97 62 return pte_val(pte) & _PAGE_PRESENT; 98 63 } 99 - #endif /* CONFIG_NUMA_BALANCING */ 100 64 101 65 /* Conversion functions: convert a page and protection to a page entry, 102 66 * and a page entry and page directory to the page they refer to.

-5

arch/powerpc/include/asm/pte-common.h

··· 104 104 _PAGE_USER | _PAGE_ACCESSED | _PAGE_RO | \ 105 105 _PAGE_RW | _PAGE_HWWRITE | _PAGE_DIRTY | _PAGE_EXEC) 106 106 107 - #ifdef CONFIG_NUMA_BALANCING 108 - /* Mask of bits that distinguish present and numa ptes */ 109 - #define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PRESENT) 110 - #endif 111 - 112 107 /* 113 108 * We define 2 sets of base prot bits, one for basic pages (ie, 114 109 * cacheable kernel and user pages) and one for non cacheable

-6

arch/powerpc/include/asm/pte-hash64.h

··· 27 27 #define _PAGE_RW 0x0200 /* software: user write access allowed */ 28 28 #define _PAGE_BUSY 0x0800 /* software: PTE & hash are busy */ 29 29 30 - /* 31 - * Used for tracking numa faults 32 - */ 33 - #define _PAGE_NUMA 0x00000010 /* Gather numa placement stats */ 34 - 35 - 36 30 /* No separate kernel read-only */ 37 31 #define _PAGE_KERNEL_RW (_PAGE_RW | _PAGE_DIRTY) /* user access blocked by key */ 38 32 #define _PAGE_KERNEL_RO _PAGE_KERNEL_RW

-4

arch/powerpc/include/asm/thread_info.h

··· 43 43 int cpu; /* cpu we're on */ 44 44 int preempt_count; /* 0 => preemptable, 45 45 <0 => BUG */ 46 - struct restart_block restart_block; 47 46 unsigned long local_flags; /* private flags for thread */ 48 47 49 48 /* low level flags - has atomic operations done on it */ ··· 58 59 .exec_domain = &default_exec_domain, \ 59 60 .cpu = 0, \ 60 61 .preempt_count = INIT_PREEMPT_COUNT, \ 61 - .restart_block = { \ 62 - .fn = do_no_restart_syscall, \ 63 - }, \ 64 62 .flags = 0, \ 65 63 } 66 64

+2 -2

arch/powerpc/kernel/signal_32.c

··· 1231 1231 int tm_restore = 0; 1232 1232 #endif 1233 1233 /* Always make any pending restarted system calls return -EINTR */ 1234 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 1234 + current->restart_block.fn = do_no_restart_syscall; 1235 1235 1236 1236 rt_sf = (struct rt_sigframe __user *) 1237 1237 (regs->gpr[1] + __SIGNAL_FRAMESIZE + 16); ··· 1504 1504 #endif 1505 1505 1506 1506 /* Always make any pending restarted system calls return -EINTR */ 1507 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 1507 + current->restart_block.fn = do_no_restart_syscall; 1508 1508 1509 1509 sf = (struct sigframe __user *)(regs->gpr[1] + __SIGNAL_FRAMESIZE); 1510 1510 sc = &sf->sctx;

+1 -1

arch/powerpc/kernel/signal_64.c

··· 666 666 #endif 667 667 668 668 /* Always make any pending restarted system calls return -EINTR */ 669 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 669 + current->restart_block.fn = do_no_restart_syscall; 670 670 671 671 if (!access_ok(VERIFY_READ, uc, sizeof(*uc))) 672 672 goto badframe;

+32

arch/powerpc/kernel/time.c

··· 621 621 return mulhdu(get_tb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift; 622 622 } 623 623 624 + 625 + #ifdef CONFIG_PPC_PSERIES 626 + 627 + /* 628 + * Running clock - attempts to give a view of time passing for virtualised 629 + * kernels. 630 + * Uses the VTB register if available otherwise a next best guess. 631 + */ 632 + unsigned long long running_clock(void) 633 + { 634 + /* 635 + * Don't read the VTB as a host since KVM does not switch in host 636 + * timebase into the VTB when it takes a guest off the CPU, reading the 637 + * VTB would result in reading 'last switched out' guest VTB. 638 + * 639 + * Host kernels are often compiled with CONFIG_PPC_PSERIES checked, it 640 + * would be unsafe to rely only on the #ifdef above. 641 + */ 642 + if (firmware_has_feature(FW_FEATURE_LPAR) && 643 + cpu_has_feature(CPU_FTR_ARCH_207S)) 644 + return mulhdu(get_vtb() - boot_tb, tb_to_ns_scale) << tb_to_ns_shift; 645 + 646 + /* 647 + * This is a next best approximation without a VTB. 648 + * On a host which is running bare metal there should never be any stolen 649 + * time and on a host which doesn't do any virtualisation TB *should* equal 650 + * VTB so it makes no difference anyway. 651 + */ 652 + return local_clock() - cputime_to_nsecs(kcpustat_this_cpu->cpustat[CPUTIME_STEAL]); 653 + } 654 + #endif 655 + 624 656 static int __init get_freq(char *name, int cells, unsigned long *val) 625 657 { 626 658 struct device_node *cpu;

+1 -1

arch/powerpc/kvm/book3s_hv_rm_mmu.c

··· 212 212 /* Look up the Linux PTE for the backing page */ 213 213 pte_size = psize; 214 214 pte = lookup_linux_pte_and_update(pgdir, hva, writing, &pte_size); 215 - if (pte_present(pte) && !pte_numa(pte)) { 215 + if (pte_present(pte) && !pte_protnone(pte)) { 216 216 if (writing && !pte_write(pte)) 217 217 /* make the actual HPTE be read-only */ 218 218 ptel = hpte_make_readonly(ptel);

+6 -2

arch/powerpc/mm/copro_fault.c

··· 64 64 if (!(vma->vm_flags & VM_WRITE)) 65 65 goto out_unlock; 66 66 } else { 67 - if (dsisr & DSISR_PROTFAULT) 68 - goto out_unlock; 69 67 if (!(vma->vm_flags & (VM_READ | VM_EXEC))) 70 68 goto out_unlock; 69 + /* 70 + * protfault should only happen due to us 71 + * mapping a region readonly temporarily. PROT_NONE 72 + * is also covered by the VMA check above. 73 + */ 74 + WARN_ON_ONCE(dsisr & DSISR_PROTFAULT); 71 75 } 72 76 73 77 ret = 0;

+9 -16

arch/powerpc/mm/fault.c

··· 389 389 #endif /* CONFIG_8xx */ 390 390 391 391 if (is_exec) { 392 - #ifdef CONFIG_PPC_STD_MMU 393 - /* Protection fault on exec go straight to failure on 394 - * Hash based MMUs as they either don't support per-page 395 - * execute permission, or if they do, it's handled already 396 - * at the hash level. This test would probably have to 397 - * be removed if we change the way this works to make hash 398 - * processors use the same I/D cache coherency mechanism 399 - * as embedded. 400 - */ 401 - if (error_code & DSISR_PROTFAULT) 402 - goto bad_area; 403 - #endif /* CONFIG_PPC_STD_MMU */ 404 - 405 392 /* 406 393 * Allow execution from readable areas if the MMU does not 407 394 * provide separate controls over reading and executing. ··· 403 416 (cpu_has_feature(CPU_FTR_NOEXECUTE) || 404 417 !(vma->vm_flags & (VM_READ | VM_WRITE)))) 405 418 goto bad_area; 419 + #ifdef CONFIG_PPC_STD_MMU 420 + /* 421 + * protfault should only happen due to us 422 + * mapping a region readonly temporarily. PROT_NONE 423 + * is also covered by the VMA check above. 424 + */ 425 + WARN_ON_ONCE(error_code & DSISR_PROTFAULT); 426 + #endif /* CONFIG_PPC_STD_MMU */ 406 427 /* a write */ 407 428 } else if (is_write) { 408 429 if (!(vma->vm_flags & VM_WRITE)) ··· 418 423 flags |= FAULT_FLAG_WRITE; 419 424 /* a read */ 420 425 } else { 421 - /* protection fault */ 422 - if (error_code & 0x08000000) 423 - goto bad_area; 424 426 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) 425 427 goto bad_area; 428 + WARN_ON_ONCE(error_code & DSISR_PROTFAULT); 426 429 } 427 430 428 431 /*

+8 -3

arch/powerpc/mm/pgtable.c

··· 172 172 void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, 173 173 pte_t pte) 174 174 { 175 - #ifdef CONFIG_DEBUG_VM 176 - WARN_ON(pte_val(*ptep) & _PAGE_PRESENT); 177 - #endif 175 + /* 176 + * When handling numa faults, we already have the pte marked 177 + * _PAGE_PRESENT, but we can be sure that it is not in hpte. 178 + * Hence we can use set_pte_at for them. 179 + */ 180 + VM_WARN_ON((pte_val(*ptep) & (_PAGE_PRESENT | _PAGE_USER)) == 181 + (_PAGE_PRESENT | _PAGE_USER)); 182 + 178 183 /* Note: mm->context.id might not yet have been assigned as 179 184 * this context might not have been activated yet when this 180 185 * is called.

+2 -1

arch/powerpc/mm/pgtable_64.c

··· 718 718 pmd_t *pmdp, pmd_t pmd) 719 719 { 720 720 #ifdef CONFIG_DEBUG_VM 721 - WARN_ON(pmd_val(*pmdp) & _PAGE_PRESENT); 721 + WARN_ON((pmd_val(*pmdp) & (_PAGE_PRESENT | _PAGE_USER)) == 722 + (_PAGE_PRESENT | _PAGE_USER)); 722 723 assert_spin_locked(&mm->page_table_lock); 723 724 WARN_ON(!pmd_trans_huge(pmd)); 724 725 #endif

-1

arch/s390/include/asm/string.h

··· 44 44 #undef __HAVE_ARCH_STRCHR 45 45 #undef __HAVE_ARCH_STRNCHR 46 46 #undef __HAVE_ARCH_STRNCMP 47 - #undef __HAVE_ARCH_STRNICMP 48 47 #undef __HAVE_ARCH_STRPBRK 49 48 #undef __HAVE_ARCH_STRSEP 50 49 #undef __HAVE_ARCH_STRSPN

-4

arch/s390/include/asm/thread_info.h

··· 39 39 unsigned long sys_call_table; /* System call table address */ 40 40 unsigned int cpu; /* current CPU */ 41 41 int preempt_count; /* 0 => preemptable, <0 => BUG */ 42 - struct restart_block restart_block; 43 42 unsigned int system_call; 44 43 __u64 user_timer; 45 44 __u64 system_timer; ··· 55 56 .flags = 0, \ 56 57 .cpu = 0, \ 57 58 .preempt_count = INIT_PREEMPT_COUNT, \ 58 - .restart_block = { \ 59 - .fn = do_no_restart_syscall, \ 60 - }, \ 61 59 } 62 60 63 61 #define init_thread_info (init_thread_union.thread_info)

+1 -1

arch/s390/kernel/compat_signal.c

··· 209 209 int i; 210 210 211 211 /* Alwys make any pending restarted system call return -EINTR */ 212 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 212 + current->restart_block.fn = do_no_restart_syscall; 213 213 214 214 if (__copy_from_user(&user_sregs, &sregs->regs, sizeof(user_sregs))) 215 215 return -EFAULT;

+1 -1

arch/s390/kernel/signal.c

··· 162 162 _sigregs user_sregs; 163 163 164 164 /* Alwys make any pending restarted system call return -EINTR */ 165 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 165 + current->restart_block.fn = do_no_restart_syscall; 166 166 167 167 if (__copy_from_user(&user_sregs, sregs, sizeof(user_sregs))) 168 168 return -EFAULT;

-4

arch/score/include/asm/thread_info.h

··· 42 42 * 0-0xFFFFFFFF for kernel-thread 43 43 */ 44 44 mm_segment_t addr_limit; 45 - struct restart_block restart_block; 46 45 struct pt_regs *regs; 47 46 }; 48 47 ··· 57 58 .cpu = 0, \ 58 59 .preempt_count = 1, \ 59 60 .addr_limit = KERNEL_DS, \ 60 - .restart_block = { \ 61 - .fn = do_no_restart_syscall, \ 62 - }, \ 63 61 } 64 62 65 63 #define init_thread_info (init_thread_union.thread_info)

-1

arch/score/kernel/asm-offsets.c

··· 106 106 OFFSET(TI_CPU, thread_info, cpu); 107 107 OFFSET(TI_PRE_COUNT, thread_info, preempt_count); 108 108 OFFSET(TI_ADDR_LIMIT, thread_info, addr_limit); 109 - OFFSET(TI_RESTART_BLOCK, thread_info, restart_block); 110 109 OFFSET(TI_REGS, thread_info, regs); 111 110 DEFINE(KERNEL_STACK_SIZE, THREAD_SIZE); 112 111 DEFINE(KERNEL_STACK_MASK, THREAD_MASK);

+1 -1

arch/score/kernel/signal.c

··· 141 141 int sig; 142 142 143 143 /* Always make any pending restarted system calls return -EINTR */ 144 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 144 + current->restart_block.fn = do_no_restart_syscall; 145 145 146 146 frame = (struct rt_sigframe __user *) regs->regs[0]; 147 147 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))

-4

arch/sh/include/asm/thread_info.h

··· 33 33 __u32 cpu; 34 34 int preempt_count; /* 0 => preemptable, <0 => BUG */ 35 35 mm_segment_t addr_limit; /* thread address space */ 36 - struct restart_block restart_block; 37 36 unsigned long previous_sp; /* sp of previous stack in case 38 37 of nested IRQ stacks */ 39 38 __u8 supervisor_stack[0]; ··· 62 63 .cpu = 0, \ 63 64 .preempt_count = INIT_PREEMPT_COUNT, \ 64 65 .addr_limit = KERNEL_DS, \ 65 - .restart_block = { \ 66 - .fn = do_no_restart_syscall, \ 67 - }, \ 68 66 } 69 67 70 68 #define init_thread_info (init_thread_union.thread_info)

-1

arch/sh/kernel/asm-offsets.c

··· 25 25 DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); 26 26 DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); 27 27 DEFINE(TI_PRE_COUNT, offsetof(struct thread_info, preempt_count)); 28 - DEFINE(TI_RESTART_BLOCK,offsetof(struct thread_info, restart_block)); 29 28 DEFINE(TI_SIZE, sizeof(struct thread_info)); 30 29 31 30 #ifdef CONFIG_HIBERNATION

+2 -2

arch/sh/kernel/signal_32.c

··· 156 156 int r0; 157 157 158 158 /* Always make any pending restarted system calls return -EINTR */ 159 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 159 + current->restart_block.fn = do_no_restart_syscall; 160 160 161 161 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 162 162 goto badframe; ··· 186 186 int r0; 187 187 188 188 /* Always make any pending restarted system calls return -EINTR */ 189 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 189 + current->restart_block.fn = do_no_restart_syscall; 190 190 191 191 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 192 192 goto badframe;

+2 -2

arch/sh/kernel/signal_64.c

··· 260 260 long long ret; 261 261 262 262 /* Always make any pending restarted system calls return -EINTR */ 263 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 263 + current->restart_block.fn = do_no_restart_syscall; 264 264 265 265 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 266 266 goto badframe; ··· 294 294 long long ret; 295 295 296 296 /* Always make any pending restarted system calls return -EINTR */ 297 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 297 + current->restart_block.fn = do_no_restart_syscall; 298 298 299 299 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 300 300 goto badframe;

-6

arch/sparc/include/asm/thread_info_32.h

··· 47 47 struct reg_window32 reg_window[NSWINS]; /* align for ldd! */ 48 48 unsigned long rwbuf_stkptrs[NSWINS]; 49 49 unsigned long w_saved; 50 - 51 - struct restart_block restart_block; 52 50 }; 53 51 54 52 /* ··· 60 62 .flags = 0, \ 61 63 .cpu = 0, \ 62 64 .preempt_count = INIT_PREEMPT_COUNT, \ 63 - .restart_block = { \ 64 - .fn = do_no_restart_syscall, \ 65 - }, \ 66 65 } 67 66 68 67 #define init_thread_info (init_thread_union.thread_info) ··· 98 103 #define TI_REG_WINDOW 0x30 99 104 #define TI_RWIN_SPTRS 0x230 100 105 #define TI_W_SAVED 0x250 101 - /* #define TI_RESTART_BLOCK 0x25n */ /* Nobody cares */ 102 106 103 107 /* 104 108 * thread information flag bit numbers

+3 -9

arch/sparc/include/asm/thread_info_64.h

··· 58 58 unsigned long gsr[7]; 59 59 unsigned long xfsr[7]; 60 60 61 - struct restart_block restart_block; 62 - 63 61 struct pt_regs *kern_una_regs; 64 62 unsigned int kern_una_insn; 65 63 ··· 90 92 #define TI_RWIN_SPTRS 0x000003c8 91 93 #define TI_GSR 0x00000400 92 94 #define TI_XFSR 0x00000438 93 - #define TI_RESTART_BLOCK 0x00000470 94 - #define TI_KUNA_REGS 0x000004a0 95 - #define TI_KUNA_INSN 0x000004a8 96 - #define TI_FPREGS 0x000004c0 95 + #define TI_KUNA_REGS 0x00000470 96 + #define TI_KUNA_INSN 0x00000478 97 + #define TI_FPREGS 0x00000480 97 98 98 99 /* We embed this in the uppermost byte of thread_info->flags */ 99 100 #define FAULT_CODE_WRITE 0x01 /* Write access, implies D-TLB */ ··· 121 124 .current_ds = ASI_P, \ 122 125 .exec_domain = &default_exec_domain, \ 123 126 .preempt_count = INIT_PREEMPT_COUNT, \ 124 - .restart_block = { \ 125 - .fn = do_no_restart_syscall, \ 126 - }, \ 127 127 } 128 128 129 129 #define init_thread_info (init_thread_union.thread_info)

+2 -2

arch/sparc/kernel/signal32.c

··· 150 150 int err, i; 151 151 152 152 /* Always make any pending restarted system calls return -EINTR */ 153 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 153 + current->restart_block.fn = do_no_restart_syscall; 154 154 155 155 synchronize_user_stack(); 156 156 ··· 235 235 int err, i; 236 236 237 237 /* Always make any pending restarted system calls return -EINTR */ 238 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 238 + current->restart_block.fn = do_no_restart_syscall; 239 239 240 240 synchronize_user_stack(); 241 241 regs->u_regs[UREG_FP] &= 0x00000000ffffffffUL;

+1 -1

arch/sparc/kernel/signal_32.c

··· 70 70 int err; 71 71 72 72 /* Always make any pending restarted system calls return -EINTR */ 73 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 73 + current->restart_block.fn = do_no_restart_syscall; 74 74 75 75 synchronize_user_stack(); 76 76

+1 -1

arch/sparc/kernel/signal_64.c

··· 254 254 int err; 255 255 256 256 /* Always make any pending restarted system calls return -EINTR */ 257 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 257 + current->restart_block.fn = do_no_restart_syscall; 258 258 259 259 synchronize_user_stack (); 260 260 sf = (struct rt_signal_frame __user *)

-2

arch/sparc/kernel/traps_64.c

··· 2730 2730 TI_NEW_CHILD != offsetof(struct thread_info, new_child) || 2731 2731 TI_CURRENT_DS != offsetof(struct thread_info, 2732 2732 current_ds) || 2733 - TI_RESTART_BLOCK != offsetof(struct thread_info, 2734 - restart_block) || 2735 2733 TI_KUNA_REGS != offsetof(struct thread_info, 2736 2734 kern_una_regs) || 2737 2735 TI_KUNA_INSN != offsetof(struct thread_info,

-4

arch/tile/include/asm/thread_info.h

··· 36 36 37 37 mm_segment_t addr_limit; /* thread address space 38 38 (KERNEL_DS or USER_DS) */ 39 - struct restart_block restart_block; 40 39 struct single_step_state *step_state; /* single step state 41 40 (if non-zero) */ 42 41 int align_ctl; /* controls unaligned access */ ··· 56 57 .cpu = 0, \ 57 58 .preempt_count = INIT_PREEMPT_COUNT, \ 58 59 .addr_limit = KERNEL_DS, \ 59 - .restart_block = { \ 60 - .fn = do_no_restart_syscall, \ 61 - }, \ 62 60 .step_state = NULL, \ 63 61 .align_ctl = 0, \ 64 62 }

+1 -1

arch/tile/kernel/signal.c

··· 48 48 int err; 49 49 50 50 /* Always make any pending restarted system calls return -EINTR */ 51 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 51 + current->restart_block.fn = do_no_restart_syscall; 52 52 53 53 /* 54 54 * Enforce that sigcontext is like pt_regs, and doesn't mess

-4

arch/um/include/asm/thread_info.h

··· 22 22 mm_segment_t addr_limit; /* thread address space: 23 23 0-0xBFFFFFFF for user 24 24 0-0xFFFFFFFF for kernel */ 25 - struct restart_block restart_block; 26 25 struct thread_info *real_thread; /* Points to non-IRQ stack */ 27 26 }; 28 27 ··· 33 34 .cpu = 0, \ 34 35 .preempt_count = INIT_PREEMPT_COUNT, \ 35 36 .addr_limit = KERNEL_DS, \ 36 - .restart_block = { \ 37 - .fn = do_no_restart_syscall, \ 38 - }, \ 39 37 .real_thread = NULL, \ 40 38 } 41 39

-4

arch/unicore32/include/asm/thread_info.h

··· 79 79 #ifdef CONFIG_UNICORE_FPU_F64 80 80 struct fp_state fpstate __attribute__((aligned(8))); 81 81 #endif 82 - struct restart_block restart_block; 83 82 }; 84 83 85 84 #define INIT_THREAD_INFO(tsk) \ ··· 88 89 .flags = 0, \ 89 90 .preempt_count = INIT_PREEMPT_COUNT, \ 90 91 .addr_limit = KERNEL_DS, \ 91 - .restart_block = { \ 92 - .fn = do_no_restart_syscall, \ 93 - }, \ 94 92 } 95 93 96 94 #define init_thread_info (init_thread_union.thread_info)

+1 -1

arch/unicore32/kernel/signal.c

··· 105 105 struct rt_sigframe __user *frame; 106 106 107 107 /* Always make any pending restarted system calls return -EINTR */ 108 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 108 + current->restart_block.fn = do_no_restart_syscall; 109 109 110 110 /* 111 111 * Since we stacked the signal on a 64-bit boundary,

+1 -1

arch/x86/ia32/ia32_signal.c

··· 169 169 u32 tmp; 170 170 171 171 /* Always make any pending restarted system calls return -EINTR */ 172 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 172 + current->restart_block.fn = do_no_restart_syscall; 173 173 174 174 get_user_try { 175 175 /*

+20 -26

arch/x86/include/asm/pgtable.h

··· 132 132 133 133 static inline int pte_special(pte_t pte) 134 134 { 135 - /* 136 - * See CONFIG_NUMA_BALANCING pte_numa in include/asm-generic/pgtable.h. 137 - * On x86 we have _PAGE_BIT_NUMA == _PAGE_BIT_GLOBAL+1 == 138 - * __PAGE_BIT_SOFTW1 == _PAGE_BIT_SPECIAL. 139 - */ 140 - return (pte_flags(pte) & _PAGE_SPECIAL) && 141 - (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_PROTNONE)); 135 + return pte_flags(pte) & _PAGE_SPECIAL; 142 136 } 143 137 144 138 static inline unsigned long pte_pfn(pte_t pte) ··· 294 300 295 301 static inline pmd_t pmd_mknotpresent(pmd_t pmd) 296 302 { 297 - return pmd_clear_flags(pmd, _PAGE_PRESENT); 303 + return pmd_clear_flags(pmd, _PAGE_PRESENT | _PAGE_PROTNONE); 298 304 } 299 305 300 306 #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY ··· 437 443 438 444 static inline int pte_present(pte_t a) 439 445 { 440 - return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE | 441 - _PAGE_NUMA); 442 - } 443 - 444 - #define pte_present_nonuma pte_present_nonuma 445 - static inline int pte_present_nonuma(pte_t a) 446 - { 447 446 return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); 448 447 } 449 448 ··· 446 459 if (pte_flags(a) & _PAGE_PRESENT) 447 460 return true; 448 461 449 - if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) && 462 + if ((pte_flags(a) & _PAGE_PROTNONE) && 450 463 mm_tlb_flush_pending(mm)) 451 464 return true; 452 465 ··· 466 479 * the _PAGE_PSE flag will remain set at all times while the 467 480 * _PAGE_PRESENT bit is clear). 468 481 */ 469 - return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE | 470 - _PAGE_NUMA); 482 + return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); 471 483 } 484 + 485 + #ifdef CONFIG_NUMA_BALANCING 486 + /* 487 + * These work without NUMA balancing but the kernel does not care. 
See the 488 + * comment in include/asm-generic/pgtable.h 489 + */ 490 + static inline int pte_protnone(pte_t pte) 491 + { 492 + return pte_flags(pte) & _PAGE_PROTNONE; 493 + } 494 + 495 + static inline int pmd_protnone(pmd_t pmd) 496 + { 497 + return pmd_flags(pmd) & _PAGE_PROTNONE; 498 + } 499 + #endif /* CONFIG_NUMA_BALANCING */ 472 500 473 501 static inline int pmd_none(pmd_t pmd) 474 502 { ··· 541 539 542 540 static inline int pmd_bad(pmd_t pmd) 543 541 { 544 - #ifdef CONFIG_NUMA_BALANCING 545 - /* pmd_numa check */ 546 - if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA) 547 - return 0; 548 - #endif 549 542 return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; 550 543 } 551 544 ··· 859 862 #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY 860 863 static inline pte_t pte_swp_mksoft_dirty(pte_t pte) 861 864 { 862 - VM_BUG_ON(pte_present_nonuma(pte)); 863 865 return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); 864 866 } 865 867 866 868 static inline int pte_swp_soft_dirty(pte_t pte) 867 869 { 868 - VM_BUG_ON(pte_present_nonuma(pte)); 869 870 return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; 870 871 } 871 872 872 873 static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) 873 874 { 874 - VM_BUG_ON(pte_present_nonuma(pte)); 875 875 return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); 876 876 } 877 877 #endif

-5

arch/x86/include/asm/pgtable_64.h

··· 142 142 143 143 /* Encode and de-code a swap entry */ 144 144 #define SWP_TYPE_BITS 5 145 - #ifdef CONFIG_NUMA_BALANCING 146 - /* Automatic NUMA balancing needs to be distinguishable from swap entries */ 147 - #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2) 148 - #else 149 145 #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) 150 - #endif 151 146 152 147 #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) 153 148

+2 -39

arch/x86/include/asm/pgtable_types.h

··· 27 27 #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ 28 28 #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ 29 29 30 - /* 31 - * Swap offsets on configurations that allow automatic NUMA balancing use the 32 - * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from 33 - * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the 34 - * maximum possible swap space from 16TB to 8TB. 35 - */ 36 - #define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1) 37 - 38 30 /* If _PAGE_BIT_PRESENT is clear, we use these: */ 39 31 /* - if the user mapped it with PROT_NONE; pte_present gives true */ 40 32 #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL ··· 68 76 #endif 69 77 70 78 /* 71 - * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page 72 - * that is not present. The hinting fault gathers numa placement statistics 73 - * (see pte_numa()). The bit is always zero when the PTE is not present. 74 - * 75 - * The bit picked must be always zero when the pmd is present and not 76 - * present, so that we don't lose information when we set it while 77 - * atomically clearing the present bit. 78 - */ 79 - #ifdef CONFIG_NUMA_BALANCING 80 - #define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA) 81 - #else 82 - #define _PAGE_NUMA (_AT(pteval_t, 0)) 83 - #endif 84 - 85 - /* 86 79 * Tracking soft dirty bit when a page goes to a swap is tricky. 87 80 * We need a bit which can be stored in pte _and_ not conflict 88 81 * with swap entry format. 
On x86 bits 6 and 7 are *not* involved ··· 99 122 /* Set of bits not changed in pte_modify */ 100 123 #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ 101 124 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ 102 - _PAGE_SOFT_DIRTY | _PAGE_NUMA) 103 - #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA) 125 + _PAGE_SOFT_DIRTY) 126 + #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) 104 127 105 128 /* 106 129 * The cache modes defined here are used to translate between pure SW usage ··· 300 323 { 301 324 return native_pte_val(pte) & PTE_FLAGS_MASK; 302 325 } 303 - 304 - #ifdef CONFIG_NUMA_BALANCING 305 - /* Set of bits that distinguishes present, prot_none and numa ptes */ 306 - #define _PAGE_NUMA_MASK (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT) 307 - static inline pteval_t ptenuma_flags(pte_t pte) 308 - { 309 - return pte_flags(pte) & _PAGE_NUMA_MASK; 310 - } 311 - 312 - static inline pmdval_t pmdnuma_flags(pmd_t pmd) 313 - { 314 - return pmd_flags(pmd) & _PAGE_NUMA_MASK; 315 - } 316 - #endif /* CONFIG_NUMA_BALANCING */ 317 326 318 327 #define pgprot_val(x) ((x).pgprot) 319 328 #define __pgprot(x) ((pgprot_t) { (x) } )

-4

arch/x86/include/asm/thread_info.h

··· 31 31 __u32 cpu; /* current CPU */ 32 32 int saved_preempt_count; 33 33 mm_segment_t addr_limit; 34 - struct restart_block restart_block; 35 34 void __user *sysenter_return; 36 35 unsigned int sig_on_uaccess_error:1; 37 36 unsigned int uaccess_err:1; /* uaccess failed */ ··· 44 45 .cpu = 0, \ 45 46 .saved_preempt_count = INIT_PREEMPT_COUNT, \ 46 47 .addr_limit = KERNEL_DS, \ 47 - .restart_block = { \ 48 - .fn = do_no_restart_syscall, \ 49 - }, \ 50 48 } 51 49 52 50 #define init_thread_info (init_thread_union.thread_info)

+1 -1

arch/x86/kernel/hpet.c

··· 168 168 #define hpet_print_config() \ 169 169 do { \ 170 170 if (hpet_verbose) \ 171 - _hpet_print_config(__FUNCTION__, __LINE__); \ 171 + _hpet_print_config(__func__, __LINE__); \ 172 172 } while (0) 173 173 174 174 /*

+2 -2

arch/x86/kernel/rtc.c

··· 49 49 retval = set_rtc_time(&tm); 50 50 if (retval) 51 51 printk(KERN_ERR "%s: RTC write failed with error %d\n", 52 - __FUNCTION__, retval); 52 + __func__, retval); 53 53 } else { 54 54 printk(KERN_ERR 55 55 "%s: Invalid RTC value: write of %lx to RTC failed\n", 56 - __FUNCTION__, nowtime); 56 + __func__, nowtime); 57 57 retval = -EINVAL; 58 58 } 59 59 return retval;

+1 -1

arch/x86/kernel/signal.c

··· 69 69 unsigned int err = 0; 70 70 71 71 /* Always make any pending restarted system calls return -EINTR */ 72 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 72 + current->restart_block.fn = do_no_restart_syscall; 73 73 74 74 get_user_try { 75 75

+2 -2

arch/x86/mm/gup.c

··· 84 84 struct page *page; 85 85 86 86 /* Similar to the PMD case, NUMA hinting must take slow path */ 87 - if (pte_numa(pte)) { 87 + if (pte_protnone(pte)) { 88 88 pte_unmap(ptep); 89 89 return 0; 90 90 } ··· 178 178 * slowpath for accounting purposes and so that they 179 179 * can be serialised against THP migration. 180 180 */ 181 - if (pmd_numa(pmd)) 181 + if (pmd_protnone(pmd)) 182 182 return 0; 183 183 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) 184 184 return 0;

+1 -1

arch/x86/platform/intel-mid/intel_mid_vrtc.c

··· 110 110 spin_unlock_irqrestore(&rtc_lock, flags); 111 111 } else { 112 112 pr_err("%s: Invalid vRTC value: write of %lx to vRTC failed\n", 113 - __FUNCTION__, now->tv_sec); 113 + __func__, now->tv_sec); 114 114 retval = -EINVAL; 115 115 } 116 116 return retval;

+1 -1

arch/x86/um/signal.c

··· 157 157 int err, pid; 158 158 159 159 /* Always make any pending restarted system calls return -EINTR */ 160 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 160 + current->restart_block.fn = do_no_restart_syscall; 161 161 162 162 err = copy_from_user(&sc, from, sizeof(sc)); 163 163 if (err)

-5

arch/xtensa/include/asm/thread_info.h

··· 51 51 __s32 preempt_count; /* 0 => preemptable,< 0 => BUG*/ 52 52 53 53 mm_segment_t addr_limit; /* thread address space */ 54 - struct restart_block restart_block; 55 54 56 55 unsigned long cpenable; 57 56 ··· 71 72 #define TI_CPU 0x00000010 72 73 #define TI_PRE_COUNT 0x00000014 73 74 #define TI_ADDR_LIMIT 0x00000018 74 - #define TI_RESTART_BLOCK 0x000001C 75 75 76 76 #endif 77 77 ··· 88 90 .cpu = 0, \ 89 91 .preempt_count = INIT_PREEMPT_COUNT, \ 90 92 .addr_limit = KERNEL_DS, \ 91 - .restart_block = { \ 92 - .fn = do_no_restart_syscall, \ 93 - }, \ 94 93 } 95 94 96 95 #define init_thread_info (init_thread_union.thread_info)

+1 -1

arch/xtensa/kernel/signal.c

··· 245 245 int ret; 246 246 247 247 /* Always make any pending restarted system calls return -EINTR */ 248 - current_thread_info()->restart_block.fn = do_no_restart_syscall; 248 + current->restart_block.fn = do_no_restart_syscall; 249 249 250 250 if (regs->depc > 64) 251 251 panic("rt_sigreturn in double exception!\n");

+2 -2

drivers/acpi/acpica/utdebug.c

··· 111 111 * RETURN: Updated pointer to the function name 112 112 * 113 113 * DESCRIPTION: Remove the "Acpi" prefix from the function name, if present. 114 - * This allows compiler macros such as __FUNCTION__ to be used 115 - * with no change to the debug output. 114 + * This allows compiler macros such as __func__ to be used with no 115 + * change to the debug output. 116 116 * 117 117 ******************************************************************************/ 118 118

+1 -1

drivers/block/xen-blkfront.c

··· 1391 1391 if (major != XENVBD_MAJOR) { 1392 1392 printk(KERN_INFO 1393 1393 "%s: HVM does not support vbd %d as xen block device\n", 1394 - __FUNCTION__, vdevice); 1394 + __func__, vdevice); 1395 1395 return -ENODEV; 1396 1396 } 1397 1397 }

+121 -106

drivers/block/zram/zram_drv.c

··· 53 53 } \ 54 54 static DEVICE_ATTR_RO(name); 55 55 56 - static inline int init_done(struct zram *zram) 56 + static inline bool init_done(struct zram *zram) 57 57 { 58 - return zram->meta != NULL; 58 + return zram->disksize; 59 59 } 60 60 61 61 static inline struct zram *dev_to_zram(struct device *dev) ··· 307 307 return 1; 308 308 } 309 309 310 - static void zram_meta_free(struct zram_meta *meta) 310 + static void zram_meta_free(struct zram_meta *meta, u64 disksize) 311 311 { 312 + size_t num_pages = disksize >> PAGE_SHIFT; 313 + size_t index; 314 + 315 + /* Free all pages that are still in this zram device */ 316 + for (index = 0; index < num_pages; index++) { 317 + unsigned long handle = meta->table[index].handle; 318 + 319 + if (!handle) 320 + continue; 321 + 322 + zs_free(meta->mem_pool, handle); 323 + } 324 + 312 325 zs_destroy_pool(meta->mem_pool); 313 326 vfree(meta->table); 314 327 kfree(meta); 315 328 } 316 329 317 - static struct zram_meta *zram_meta_alloc(u64 disksize) 330 + static struct zram_meta *zram_meta_alloc(int device_id, u64 disksize) 318 331 { 319 332 size_t num_pages; 333 + char pool_name[8]; 320 334 struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL); 335 + 321 336 if (!meta) 322 - goto out; 337 + return NULL; 323 338 324 339 num_pages = disksize >> PAGE_SHIFT; 325 340 meta->table = vzalloc(num_pages * sizeof(*meta->table)); 326 341 if (!meta->table) { 327 342 pr_err("Error allocating zram address table\n"); 328 - goto free_meta; 343 + goto out_error; 329 344 } 330 345 331 - meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM); 346 + snprintf(pool_name, sizeof(pool_name), "zram%d", device_id); 347 + meta->mem_pool = zs_create_pool(pool_name, GFP_NOIO | __GFP_HIGHMEM); 332 348 if (!meta->mem_pool) { 333 349 pr_err("Error creating memory pool\n"); 334 - goto free_table; 350 + goto out_error; 335 351 } 336 352 337 353 return meta; 338 354 339 - free_table: 355 + out_error: 340 356 vfree(meta->table); 341 - free_meta: 342 357 
kfree(meta); 343 - meta = NULL; 344 - out: 345 - return meta; 358 + return NULL; 359 + } 360 + 361 + static inline bool zram_meta_get(struct zram *zram) 362 + { 363 + if (atomic_inc_not_zero(&zram->refcount)) 364 + return true; 365 + return false; 366 + } 367 + 368 + static inline void zram_meta_put(struct zram *zram) 369 + { 370 + atomic_dec(&zram->refcount); 346 371 } 347 372 348 373 static void update_position(u32 *index, int *offset, struct bio_vec *bvec) ··· 729 704 } 730 705 } 731 706 732 - static void zram_reset_device(struct zram *zram, bool reset_capacity) 707 + static void zram_reset_device(struct zram *zram) 733 708 { 734 - size_t index; 735 709 struct zram_meta *meta; 710 + struct zcomp *comp; 711 + u64 disksize; 736 712 737 713 down_write(&zram->init_lock); 738 714 ··· 745 719 } 746 720 747 721 meta = zram->meta; 748 - /* Free all pages that are still in this zram device */ 749 - for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) { 750 - unsigned long handle = meta->table[index].handle; 751 - if (!handle) 752 - continue; 722 + comp = zram->comp; 723 + disksize = zram->disksize; 724 + /* 725 + * Refcount will go down to 0 eventually and r/w handler 726 + * cannot handle further I/O so it will bail out by 727 + * check zram_meta_get. 728 + */ 729 + zram_meta_put(zram); 730 + /* 731 + * We want to free zram_meta in process context to avoid 732 + * deadlock between reclaim path and any other locks. 
733 + */ 734 + wait_event(zram->io_done, atomic_read(&zram->refcount) == 0); 753 735 754 - zs_free(meta->mem_pool, handle); 755 - } 756 - 757 - zcomp_destroy(zram->comp); 758 - zram->max_comp_streams = 1; 759 - 760 - zram_meta_free(zram->meta); 761 - zram->meta = NULL; 762 736 /* Reset stats */ 763 737 memset(&zram->stats, 0, sizeof(zram->stats)); 764 - 765 738 zram->disksize = 0; 766 - if (reset_capacity) 767 - set_capacity(zram->disk, 0); 739 + zram->max_comp_streams = 1; 740 + set_capacity(zram->disk, 0); 768 741 769 742 up_write(&zram->init_lock); 770 - 771 - /* 772 - * Revalidate disk out of the init_lock to avoid lockdep splat. 773 - * It's okay because disk's capacity is protected by init_lock 774 - * so that revalidate_disk always sees up-to-date capacity. 775 - */ 776 - if (reset_capacity) 777 - revalidate_disk(zram->disk); 743 + /* I/O operation under all of CPU are done so let's free */ 744 + zram_meta_free(meta, disksize); 745 + zcomp_destroy(comp); 778 746 } 779 747 780 748 static ssize_t disksize_store(struct device *dev, ··· 785 765 return -EINVAL; 786 766 787 767 disksize = PAGE_ALIGN(disksize); 788 - meta = zram_meta_alloc(disksize); 768 + meta = zram_meta_alloc(zram->disk->first_minor, disksize); 789 769 if (!meta) 790 770 return -ENOMEM; 791 771 ··· 804 784 goto out_destroy_comp; 805 785 } 806 786 787 + init_waitqueue_head(&zram->io_done); 788 + atomic_set(&zram->refcount, 1); 807 789 zram->meta = meta; 808 790 zram->comp = comp; 809 791 zram->disksize = disksize; ··· 825 803 up_write(&zram->init_lock); 826 804 zcomp_destroy(comp); 827 805 out_free_meta: 828 - zram_meta_free(meta); 806 + zram_meta_free(meta, disksize); 829 807 return err; 830 808 } 831 809 ··· 843 821 if (!bdev) 844 822 return -ENOMEM; 845 823 824 + mutex_lock(&bdev->bd_mutex); 846 825 /* Do not reset an active device! 
*/ 847 - if (bdev->bd_holders) { 826 + if (bdev->bd_openers) { 848 827 ret = -EBUSY; 849 828 goto out; 850 829 } ··· 861 838 862 839 /* Make sure all pending I/O is finished */ 863 840 fsync_bdev(bdev); 841 + zram_reset_device(zram); 842 + 843 + mutex_unlock(&bdev->bd_mutex); 844 + revalidate_disk(zram->disk); 864 845 bdput(bdev); 865 846 866 - zram_reset_device(zram, true); 867 847 return len; 868 848 869 849 out: 850 + mutex_unlock(&bdev->bd_mutex); 870 851 bdput(bdev); 871 852 return ret; 872 853 } ··· 936 909 { 937 910 struct zram *zram = queue->queuedata; 938 911 939 - down_read(&zram->init_lock); 940 - if (unlikely(!init_done(zram))) 912 + if (unlikely(!zram_meta_get(zram))) 941 913 goto error; 942 914 943 915 if (!valid_io_request(zram, bio->bi_iter.bi_sector, 944 916 bio->bi_iter.bi_size)) { 945 917 atomic64_inc(&zram->stats.invalid_io); 946 - goto error; 918 + goto put_zram; 947 919 } 948 920 949 921 __zram_make_request(zram, bio); 950 - up_read(&zram->init_lock); 951 - 922 + zram_meta_put(zram); 952 923 return; 953 - 924 + put_zram: 925 + zram_meta_put(zram); 954 926 error: 955 - up_read(&zram->init_lock); 956 927 bio_io_error(bio); 957 928 } 958 929 ··· 972 947 static int zram_rw_page(struct block_device *bdev, sector_t sector, 973 948 struct page *page, int rw) 974 949 { 975 - int offset, err; 950 + int offset, err = -EIO; 976 951 u32 index; 977 952 struct zram *zram; 978 953 struct bio_vec bv; 979 954 980 955 zram = bdev->bd_disk->private_data; 956 + if (unlikely(!zram_meta_get(zram))) 957 + goto out; 958 + 981 959 if (!valid_io_request(zram, sector, PAGE_SIZE)) { 982 960 atomic64_inc(&zram->stats.invalid_io); 983 - return -EINVAL; 984 - } 985 - 986 - down_read(&zram->init_lock); 987 - if (unlikely(!init_done(zram))) { 988 - err = -EIO; 989 - goto out_unlock; 961 + err = -EINVAL; 962 + goto put_zram; 990 963 } 991 964 992 965 index = sector >> SECTORS_PER_PAGE_SHIFT; ··· 995 972 bv.bv_offset = 0; 996 973 997 974 err = zram_bvec_rw(zram, &bv, index, 
offset, rw); 998 - out_unlock: 999 - up_read(&zram->init_lock); 975 + put_zram: 976 + zram_meta_put(zram); 977 + out: 1000 978 /* 1001 979 * If I/O fails, just return error(ie, non-zero) without 1002 980 * calling page_endio. ··· 1063 1039 1064 1040 static int create_device(struct zram *zram, int device_id) 1065 1041 { 1042 + struct request_queue *queue; 1066 1043 int ret = -ENOMEM; 1067 1044 1068 1045 init_rwsem(&zram->init_lock); 1069 1046 1070 - zram->queue = blk_alloc_queue(GFP_KERNEL); 1071 - if (!zram->queue) { 1047 + queue = blk_alloc_queue(GFP_KERNEL); 1048 + if (!queue) { 1072 1049 pr_err("Error allocating disk queue for device %d\n", 1073 1050 device_id); 1074 1051 goto out; 1075 1052 } 1076 1053 1077 - blk_queue_make_request(zram->queue, zram_make_request); 1078 - zram->queue->queuedata = zram; 1054 + blk_queue_make_request(queue, zram_make_request); 1079 1055 1080 1056 /* gendisk structure */ 1081 1057 zram->disk = alloc_disk(1); ··· 1088 1064 zram->disk->major = zram_major; 1089 1065 zram->disk->first_minor = device_id; 1090 1066 zram->disk->fops = &zram_devops; 1091 - zram->disk->queue = zram->queue; 1067 + zram->disk->queue = queue; 1068 + zram->disk->queue->queuedata = zram; 1092 1069 zram->disk->private_data = zram; 1093 1070 snprintf(zram->disk->disk_name, 16, "zram%d", device_id); 1094 1071 ··· 1140 1115 del_gendisk(zram->disk); 1141 1116 put_disk(zram->disk); 1142 1117 out_free_queue: 1143 - blk_cleanup_queue(zram->queue); 1118 + blk_cleanup_queue(queue); 1144 1119 out: 1145 1120 return ret; 1146 1121 } 1147 1122 1148 - static void destroy_device(struct zram *zram) 1123 + static void destroy_devices(unsigned int nr) 1149 1124 { 1150 - sysfs_remove_group(&disk_to_dev(zram->disk)->kobj, 1151 - &zram_disk_attr_group); 1125 + struct zram *zram; 1126 + unsigned int i; 1152 1127 1153 - del_gendisk(zram->disk); 1154 - put_disk(zram->disk); 1128 + for (i = 0; i < nr; i++) { 1129 + zram = &zram_devices[i]; 1130 + /* 1131 + * Remove sysfs first, so no one 
will perform a disksize 1132 + * store while we destroy the devices 1133 + */ 1134 + sysfs_remove_group(&disk_to_dev(zram->disk)->kobj, 1135 + &zram_disk_attr_group); 1155 1136 1156 - blk_cleanup_queue(zram->queue); 1137 + zram_reset_device(zram); 1138 + 1139 + blk_cleanup_queue(zram->disk->queue); 1140 + del_gendisk(zram->disk); 1141 + put_disk(zram->disk); 1142 + } 1143 + 1144 + kfree(zram_devices); 1145 + unregister_blkdev(zram_major, "zram"); 1146 + pr_info("Destroyed %u device(s)\n", nr); 1157 1147 } 1158 1148 1159 1149 static int __init zram_init(void) ··· 1178 1138 if (num_devices > max_num_devices) { 1179 1139 pr_warn("Invalid value for num_devices: %u\n", 1180 1140 num_devices); 1181 - ret = -EINVAL; 1182 - goto out; 1141 + return -EINVAL; 1183 1142 } 1184 1143 1185 1144 zram_major = register_blkdev(0, "zram"); 1186 1145 if (zram_major <= 0) { 1187 1146 pr_warn("Unable to get major number\n"); 1188 - ret = -EBUSY; 1189 - goto out; 1147 + return -EBUSY; 1190 1148 } 1191 1149 1192 1150 /* Allocate the device array and initialize each one */ 1193 1151 zram_devices = kzalloc(num_devices * sizeof(struct zram), GFP_KERNEL); 1194 1152 if (!zram_devices) { 1195 - ret = -ENOMEM; 1196 - goto unregister; 1153 + unregister_blkdev(zram_major, "zram"); 1154 + return -ENOMEM; 1197 1155 } 1198 1156 1199 1157 for (dev_id = 0; dev_id < num_devices; dev_id++) { 1200 1158 ret = create_device(&zram_devices[dev_id], dev_id); 1201 1159 if (ret) 1202 - goto free_devices; 1160 + goto out_error; 1203 1161 } 1204 1162 1205 - pr_info("Created %u device(s) ...\n", num_devices); 1206 - 1163 + pr_info("Created %u device(s)\n", num_devices); 1207 1164 return 0; 1208 1165 1209 - free_devices: 1210 - while (dev_id) 1211 - destroy_device(&zram_devices[--dev_id]); 1212 - kfree(zram_devices); 1213 - unregister: 1214 - unregister_blkdev(zram_major, "zram"); 1215 - out: 1166 + out_error: 1167 + destroy_devices(dev_id); 1216 1168 return ret; 1217 1169 } 1218 1170 1219 1171 static void __exit 
zram_exit(void) 1220 1172 { 1221 - int i; 1222 - struct zram *zram; 1223 - 1224 - for (i = 0; i < num_devices; i++) { 1225 - zram = &zram_devices[i]; 1226 - 1227 - destroy_device(zram); 1228 - /* 1229 - * Shouldn't access zram->disk after destroy_device 1230 - * because destroy_device already released zram->disk. 1231 - */ 1232 - zram_reset_device(zram, false); 1233 - } 1234 - 1235 - unregister_blkdev(zram_major, "zram"); 1236 - 1237 - kfree(zram_devices); 1238 - pr_debug("Cleanup done!\n"); 1173 + destroy_devices(num_devices); 1239 1174 } 1240 1175 1241 1176 module_init(zram_init);

+12 -11

drivers/block/zram/zram_drv.h

··· 100 100 101 101 struct zram { 102 102 struct zram_meta *meta; 103 - struct request_queue *queue; 104 - struct gendisk *disk; 105 103 struct zcomp *comp; 106 - 107 - /* Prevent concurrent execution of device init, reset and R/W request */ 104 + struct gendisk *disk; 105 + /* Prevent concurrent execution of device init */ 108 106 struct rw_semaphore init_lock; 107 + /* 108 + * the number of pages zram can consume for storing compressed data 109 + */ 110 + unsigned long limit_pages; 111 + int max_comp_streams; 112 + 113 + struct zram_stats stats; 114 + atomic_t refcount; /* refcount for zram_meta */ 115 + /* wait all IO under all of cpu are done */ 116 + wait_queue_head_t io_done; 109 117 /* 110 118 * This is the limit on amount of *uncompressed* worth of data 111 119 * we can store in a disk. 112 120 */ 113 121 u64 disksize; /* bytes */ 114 - int max_comp_streams; 115 - struct zram_stats stats; 116 - /* 117 - * the number of pages zram can consume for storing compressed data 118 - */ 119 - unsigned long limit_pages; 120 - 121 122 char compressor[10]; 122 123 }; 123 124 #endif

+17 -18

fs/dcache.c

··· 400 400 * LRU lists entirely, while shrink_move moves it to the indicated 401 401 * private list. 402 402 */ 403 - static void d_lru_isolate(struct dentry *dentry) 403 + static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry) 404 404 { 405 405 D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); 406 406 dentry->d_flags &= ~DCACHE_LRU_LIST; 407 407 this_cpu_dec(nr_dentry_unused); 408 - list_del_init(&dentry->d_lru); 408 + list_lru_isolate(lru, &dentry->d_lru); 409 409 } 410 410 411 - static void d_lru_shrink_move(struct dentry *dentry, struct list_head *list) 411 + static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry, 412 + struct list_head *list) 412 413 { 413 414 D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); 414 415 dentry->d_flags |= DCACHE_SHRINK_LIST; 415 - list_move_tail(&dentry->d_lru, list); 416 + list_lru_isolate_move(lru, &dentry->d_lru, list); 416 417 } 417 418 418 419 /* ··· 870 869 } 871 870 } 872 871 873 - static enum lru_status 874 - dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) 872 + static enum lru_status dentry_lru_isolate(struct list_head *item, 873 + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) 875 874 { 876 875 struct list_head *freeable = arg; 877 876 struct dentry *dentry = container_of(item, struct dentry, d_lru); ··· 891 890 * another pass through the LRU. 
892 891 */ 893 892 if (dentry->d_lockref.count) { 894 - d_lru_isolate(dentry); 893 + d_lru_isolate(lru, dentry); 895 894 spin_unlock(&dentry->d_lock); 896 895 return LRU_REMOVED; 897 896 } ··· 922 921 return LRU_ROTATE; 923 922 } 924 923 925 - d_lru_shrink_move(dentry, freeable); 924 + d_lru_shrink_move(lru, dentry, freeable); 926 925 spin_unlock(&dentry->d_lock); 927 926 928 927 return LRU_REMOVED; ··· 931 930 /** 932 931 * prune_dcache_sb - shrink the dcache 933 932 * @sb: superblock 934 - * @nr_to_scan : number of entries to try to free 935 - * @nid: which node to scan for freeable entities 933 + * @sc: shrink control, passed to list_lru_shrink_walk() 936 934 * 937 - * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is 938 - * done when we need more memory an called from the superblock shrinker 935 + * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This 936 + * is done when we need more memory and called from the superblock shrinker 939 937 * function. 940 938 * 941 939 * This function may fail to free any resources if all the dentries are in 942 940 * use. 
943 941 */ 944 - long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, 945 - int nid) 942 + long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc) 946 943 { 947 944 LIST_HEAD(dispose); 948 945 long freed; 949 946 950 - freed = list_lru_walk_node(&sb->s_dentry_lru, nid, dentry_lru_isolate, 951 - &dispose, &nr_to_scan); 947 + freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc, 948 + dentry_lru_isolate, &dispose); 952 949 shrink_dentry_list(&dispose); 953 950 return freed; 954 951 } 955 952 956 953 static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, 957 - spinlock_t *lru_lock, void *arg) 954 + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) 958 955 { 959 956 struct list_head *freeable = arg; 960 957 struct dentry *dentry = container_of(item, struct dentry, d_lru); ··· 965 966 if (!spin_trylock(&dentry->d_lock)) 966 967 return LRU_SKIP; 967 968 968 - d_lru_shrink_move(dentry, freeable); 969 + d_lru_shrink_move(lru, dentry, freeable); 969 970 spin_unlock(&dentry->d_lock); 970 971 971 972 return LRU_REMOVED;

-14

fs/drop_caches.c

··· 37 37 iput(toput_inode); 38 38 } 39 39 40 - static void drop_slab(void) 41 - { 42 - int nr_objects; 43 - 44 - do { 45 - int nid; 46 - 47 - nr_objects = 0; 48 - for_each_online_node(nid) 49 - nr_objects += shrink_node_slabs(GFP_KERNEL, nid, 50 - 1000, 1000); 51 - } while (nr_objects > 10); 52 - } 53 - 54 40 int drop_caches_sysctl_handler(struct ctl_table *table, int write, 55 41 void __user *buffer, size_t *length, loff_t *ppos) 56 42 {

+6 -5

fs/gfs2/quota.c

··· 145 145 } 146 146 147 147 148 - static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock, void *arg) 148 + static enum lru_status gfs2_qd_isolate(struct list_head *item, 149 + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) 149 150 { 150 151 struct list_head *dispose = arg; 151 152 struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru); ··· 156 155 157 156 if (qd->qd_lockref.count == 0) { 158 157 lockref_mark_dead(&qd->qd_lockref); 159 - list_move(&qd->qd_lru, dispose); 158 + list_lru_isolate_move(lru, &qd->qd_lru, dispose); 160 159 } 161 160 162 161 spin_unlock(&qd->qd_lockref.lock); ··· 172 171 if (!(sc->gfp_mask & __GFP_FS)) 173 172 return SHRINK_STOP; 174 173 175 - freed = list_lru_walk_node(&gfs2_qd_lru, sc->nid, gfs2_qd_isolate, 176 - &dispose, &sc->nr_to_scan); 174 + freed = list_lru_shrink_walk(&gfs2_qd_lru, sc, 175 + gfs2_qd_isolate, &dispose); 177 176 178 177 gfs2_qd_dispose(&dispose); 179 178 ··· 183 182 static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, 184 183 struct shrink_control *sc) 185 184 { 186 - return vfs_pressure_ratio(list_lru_count_node(&gfs2_qd_lru, sc->nid)); 185 + return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc)); 187 186 } 188 187 189 188 struct shrinker gfs2_qd_shrinker = {

+7 -8

fs/inode.c

··· 672 672 * LRU does not have strict ordering. Hence we don't want to reclaim inodes 673 673 * with this flag set because they are the inodes that are out of order. 674 674 */ 675 - static enum lru_status 676 - inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) 675 + static enum lru_status inode_lru_isolate(struct list_head *item, 676 + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) 677 677 { 678 678 struct list_head *freeable = arg; 679 679 struct inode *inode = container_of(item, struct inode, i_lru); ··· 691 691 */ 692 692 if (atomic_read(&inode->i_count) || 693 693 (inode->i_state & ~I_REFERENCED)) { 694 - list_del_init(&inode->i_lru); 694 + list_lru_isolate(lru, &inode->i_lru); 695 695 spin_unlock(&inode->i_lock); 696 696 this_cpu_dec(nr_unused); 697 697 return LRU_REMOVED; ··· 725 725 726 726 WARN_ON(inode->i_state & I_NEW); 727 727 inode->i_state |= I_FREEING; 728 - list_move(&inode->i_lru, freeable); 728 + list_lru_isolate_move(lru, &inode->i_lru, freeable); 729 729 spin_unlock(&inode->i_lock); 730 730 731 731 this_cpu_dec(nr_unused); ··· 738 738 * to trim from the LRU. Inodes to be freed are moved to a temporary list and 739 739 * then are freed outside inode_lock by dispose_list(). 740 740 */ 741 - long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, 742 - int nid) 741 + long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) 743 742 { 744 743 LIST_HEAD(freeable); 745 744 long freed; 746 745 747 - freed = list_lru_walk_node(&sb->s_inode_lru, nid, inode_lru_isolate, 748 - &freeable, &nr_to_scan); 746 + freed = list_lru_shrink_walk(&sb->s_inode_lru, sc, 747 + inode_lru_isolate, &freeable); 749 748 dispose_list(&freeable); 750 749 return freed; 751 750 }

+3 -4

fs/internal.h

··· 14 14 struct linux_binprm; 15 15 struct path; 16 16 struct mount; 17 + struct shrink_control; 17 18 18 19 /* 19 20 * block_dev.c ··· 112 111 * inode.c 113 112 */ 114 113 extern spinlock_t inode_sb_list_lock; 115 - extern long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, 116 - int nid); 114 + extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); 117 115 extern void inode_add_lru(struct inode *inode); 118 116 119 117 /* ··· 129 129 */ 130 130 extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); 131 131 extern int d_set_mounted(struct dentry *dentry); 132 - extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, 133 - int nid); 132 + extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc); 134 133 135 134 /* 136 135 * read_write.c

+7 -27

fs/proc/array.c

··· 81 81 #include <linux/pid_namespace.h> 82 82 #include <linux/ptrace.h> 83 83 #include <linux/tracehook.h> 84 + #include <linux/string_helpers.h> 84 85 #include <linux/user_namespace.h> 85 86 86 87 #include <asm/pgtable.h> ··· 90 89 91 90 static inline void task_name(struct seq_file *m, struct task_struct *p) 92 91 { 93 - int i; 94 - char *buf, *end; 95 - char *name; 92 + char *buf; 96 93 char tcomm[sizeof(p->comm)]; 97 94 98 95 get_task_comm(tcomm, p); 99 96 100 97 seq_puts(m, "Name:\t"); 101 - end = m->buf + m->size; 102 98 buf = m->buf + m->count; 103 - name = tcomm; 104 - i = sizeof(tcomm); 105 - while (i && (buf < end)) { 106 - unsigned char c = *name; 107 - name++; 108 - i--; 109 - *buf = c; 110 - if (!c) 111 - break; 112 - if (c == '\\') { 113 - buf++; 114 - if (buf < end) 115 - *buf++ = c; 116 - continue; 117 - } 118 - if (c == '\n') { 119 - *buf++ = '\\'; 120 - if (buf < end) 121 - *buf++ = 'n'; 122 - continue; 123 - } 124 - buf++; 125 - } 99 + 100 + /* Ignore error for now */ 101 + string_escape_str(tcomm, &buf, m->size - m->count, 102 + ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); 103 + 126 104 m->count = buf - m->buf; 127 105 seq_putc(m, '\n'); 128 106 }

+1 -1

fs/proc/generic.c

··· 122 122 struct kstat *stat) 123 123 { 124 124 struct inode *inode = dentry->d_inode; 125 - struct proc_dir_entry *de = PROC_I(inode)->pde; 125 + struct proc_dir_entry *de = PDE(inode); 126 126 if (de && de->nlink) 127 127 set_nlink(inode, de->nlink); 128 128

+1 -1

fs/proc/inode.c

··· 40 40 put_pid(PROC_I(inode)->pid); 41 41 42 42 /* Let go of any associated proc directory entry */ 43 - de = PROC_I(inode)->pde; 43 + de = PDE(inode); 44 44 if (de) 45 45 pde_put(de); 46 46 head = PROC_I(inode)->sysctl;

+16

fs/proc/task_mmu.c

··· 732 732 CLEAR_REFS_ANON, 733 733 CLEAR_REFS_MAPPED, 734 734 CLEAR_REFS_SOFT_DIRTY, 735 + CLEAR_REFS_MM_HIWATER_RSS, 735 736 CLEAR_REFS_LAST, 736 737 }; 737 738 ··· 908 907 .mm = mm, 909 908 .private = &cp, 910 909 }; 910 + 911 + if (type == CLEAR_REFS_MM_HIWATER_RSS) { 912 + /* 913 + * Writing 5 to /proc/pid/clear_refs resets the peak 914 + * resident set size to this mm's current rss value. 915 + */ 916 + down_write(&mm->mmap_sem); 917 + reset_mm_hiwater_rss(mm); 918 + up_write(&mm->mmap_sem); 919 + goto out_mm; 920 + } 921 + 911 922 down_read(&mm->mmap_sem); 912 923 if (type == CLEAR_REFS_SOFT_DIRTY) { 913 924 for (vma = mm->mmap; vma; vma = vma->vm_next) { ··· 941 928 mmu_notifier_invalidate_range_end(mm, 0, -1); 942 929 flush_tlb_mm(mm); 943 930 up_read(&mm->mmap_sem); 931 + out_mm: 944 932 mmput(mm); 945 933 } 946 934 put_task_struct(task); ··· 1557 1543 for_each_node_state(nid, N_MEMORY) 1558 1544 if (md->node[nid]) 1559 1545 seq_printf(m, " N%d=%lu", nid, md->node[nid]); 1546 + 1547 + seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10); 1560 1548 out: 1561 1549 seq_putc(m, '\n'); 1562 1550 m_cache_vma(m, vma);

+1 -1

fs/select.c

··· 971 971 if (ret == -EINTR) { 972 972 struct restart_block *restart_block; 973 973 974 - restart_block = &current_thread_info()->restart_block; 974 + restart_block = &current->restart_block; 975 975 restart_block->fn = do_restart_poll; 976 976 restart_block->poll.ufds = ufds; 977 977 restart_block->poll.nfds = nfds;

+28 -19

fs/super.c

··· 75 75 return SHRINK_STOP; 76 76 77 77 if (sb->s_op->nr_cached_objects) 78 - fs_objects = sb->s_op->nr_cached_objects(sb, sc->nid); 78 + fs_objects = sb->s_op->nr_cached_objects(sb, sc); 79 79 80 - inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid); 81 - dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid); 80 + inodes = list_lru_shrink_count(&sb->s_inode_lru, sc); 81 + dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc); 82 82 total_objects = dentries + inodes + fs_objects + 1; 83 83 if (!total_objects) 84 84 total_objects = 1; ··· 86 86 /* proportion the scan between the caches */ 87 87 dentries = mult_frac(sc->nr_to_scan, dentries, total_objects); 88 88 inodes = mult_frac(sc->nr_to_scan, inodes, total_objects); 89 + fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects); 89 90 90 91 /* 91 92 * prune the dcache first as the icache is pinned by it, then 92 93 * prune the icache, followed by the filesystem specific caches 94 + * 95 + * Ensure that we always scan at least one object - memcg kmem 96 + * accounting uses this to fully empty the caches. 93 97 */ 94 - freed = prune_dcache_sb(sb, dentries, sc->nid); 95 - freed += prune_icache_sb(sb, inodes, sc->nid); 98 + sc->nr_to_scan = dentries + 1; 99 + freed = prune_dcache_sb(sb, sc); 100 + sc->nr_to_scan = inodes + 1; 101 + freed += prune_icache_sb(sb, sc); 96 102 97 103 if (fs_objects) { 98 - fs_objects = mult_frac(sc->nr_to_scan, fs_objects, 99 - total_objects); 100 - freed += sb->s_op->free_cached_objects(sb, fs_objects, 101 - sc->nid); 104 + sc->nr_to_scan = fs_objects + 1; 105 + freed += sb->s_op->free_cached_objects(sb, sc); 102 106 } 103 107 104 108 drop_super(sb); ··· 122 118 * scalability bottleneck. The counts could get updated 123 119 * between super_cache_count and super_cache_scan anyway. 
124 120 * Call to super_cache_count with shrinker_rwsem held 125 - * ensures the safety of call to list_lru_count_node() and 121 + * ensures the safety of call to list_lru_shrink_count() and 126 122 * s_op->nr_cached_objects(). 127 123 */ 128 124 if (sb->s_op && sb->s_op->nr_cached_objects) 129 - total_objects = sb->s_op->nr_cached_objects(sb, 130 - sc->nid); 125 + total_objects = sb->s_op->nr_cached_objects(sb, sc); 131 126 132 - total_objects += list_lru_count_node(&sb->s_dentry_lru, 133 - sc->nid); 134 - total_objects += list_lru_count_node(&sb->s_inode_lru, 135 - sc->nid); 127 + total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc); 128 + total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc); 136 129 137 130 total_objects = vfs_pressure_ratio(total_objects); 138 131 return total_objects; ··· 192 191 INIT_HLIST_BL_HEAD(&s->s_anon); 193 192 INIT_LIST_HEAD(&s->s_inodes); 194 193 195 - if (list_lru_init(&s->s_dentry_lru)) 194 + if (list_lru_init_memcg(&s->s_dentry_lru)) 196 195 goto fail; 197 - if (list_lru_init(&s->s_inode_lru)) 196 + if (list_lru_init_memcg(&s->s_inode_lru)) 198 197 goto fail; 199 198 200 199 init_rwsem(&s->s_umount); ··· 230 229 s->s_shrink.scan_objects = super_cache_scan; 231 230 s->s_shrink.count_objects = super_cache_count; 232 231 s->s_shrink.batch = 1024; 233 - s->s_shrink.flags = SHRINKER_NUMA_AWARE; 232 + s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE; 234 233 return s; 235 234 236 235 fail: ··· 284 283 cleancache_invalidate_fs(s); 285 284 unregister_shrinker(&s->s_shrink); 286 285 fs->kill_sb(s); 286 + 287 + /* 288 + * Since list_lru_destroy() may sleep, we cannot call it from 289 + * put_super(), where we hold the sb_lock. Therefore we destroy 290 + * the lru lists right now. 291 + */ 292 + list_lru_destroy(&s->s_dentry_lru); 293 + list_lru_destroy(&s->s_inode_lru); 287 294 288 295 put_filesystem(fs); 289 296 put_super(s);

+7 -6

fs/xfs/xfs_buf.c

··· 1488 1488 static enum lru_status 1489 1489 xfs_buftarg_wait_rele( 1490 1490 struct list_head *item, 1491 + struct list_lru_one *lru, 1491 1492 spinlock_t *lru_lock, 1492 1493 void *arg) 1493 1494 ··· 1510 1509 */ 1511 1510 atomic_set(&bp->b_lru_ref, 0); 1512 1511 bp->b_state |= XFS_BSTATE_DISPOSE; 1513 - list_move(item, dispose); 1512 + list_lru_isolate_move(lru, item, dispose); 1514 1513 spin_unlock(&bp->b_lock); 1515 1514 return LRU_REMOVED; 1516 1515 } ··· 1547 1546 static enum lru_status 1548 1547 xfs_buftarg_isolate( 1549 1548 struct list_head *item, 1549 + struct list_lru_one *lru, 1550 1550 spinlock_t *lru_lock, 1551 1551 void *arg) 1552 1552 { ··· 1571 1569 } 1572 1570 1573 1571 bp->b_state |= XFS_BSTATE_DISPOSE; 1574 - list_move(item, dispose); 1572 + list_lru_isolate_move(lru, item, dispose); 1575 1573 spin_unlock(&bp->b_lock); 1576 1574 return LRU_REMOVED; 1577 1575 } ··· 1585 1583 struct xfs_buftarg, bt_shrinker); 1586 1584 LIST_HEAD(dispose); 1587 1585 unsigned long freed; 1588 - unsigned long nr_to_scan = sc->nr_to_scan; 1589 1586 1590 - freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate, 1591 - &dispose, &nr_to_scan); 1587 + freed = list_lru_shrink_walk(&btp->bt_lru, sc, 1588 + xfs_buftarg_isolate, &dispose); 1592 1589 1593 1590 while (!list_empty(&dispose)) { 1594 1591 struct xfs_buf *bp; ··· 1606 1605 { 1607 1606 struct xfs_buftarg *btp = container_of(shrink, 1608 1607 struct xfs_buftarg, bt_shrinker); 1609 - return list_lru_count_node(&btp->bt_lru, sc->nid); 1608 + return list_lru_shrink_count(&btp->bt_lru, sc); 1610 1609 } 1611 1610 1612 1611 void

+6 -6

fs/xfs/xfs_qm.c

··· 430 430 static enum lru_status 431 431 xfs_qm_dquot_isolate( 432 432 struct list_head *item, 433 + struct list_lru_one *lru, 433 434 spinlock_t *lru_lock, 434 435 void *arg) 435 436 __releases(lru_lock) __acquires(lru_lock) ··· 451 450 XFS_STATS_INC(xs_qm_dqwants); 452 451 453 452 trace_xfs_dqreclaim_want(dqp); 454 - list_del_init(&dqp->q_lru); 453 + list_lru_isolate(lru, &dqp->q_lru); 455 454 XFS_STATS_DEC(xs_qm_dquot_unused); 456 455 return LRU_REMOVED; 457 456 } ··· 495 494 xfs_dqunlock(dqp); 496 495 497 496 ASSERT(dqp->q_nrefs == 0); 498 - list_move_tail(&dqp->q_lru, &isol->dispose); 497 + list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose); 499 498 XFS_STATS_DEC(xs_qm_dquot_unused); 500 499 trace_xfs_dqreclaim_done(dqp); 501 500 XFS_STATS_INC(xs_qm_dqreclaims); ··· 524 523 struct xfs_qm_isolate isol; 525 524 unsigned long freed; 526 525 int error; 527 - unsigned long nr_to_scan = sc->nr_to_scan; 528 526 529 527 if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) 530 528 return 0; ··· 531 531 INIT_LIST_HEAD(&isol.buffers); 532 532 INIT_LIST_HEAD(&isol.dispose); 533 533 534 - freed = list_lru_walk_node(&qi->qi_lru, sc->nid, xfs_qm_dquot_isolate, &isol, 535 - &nr_to_scan); 534 + freed = list_lru_shrink_walk(&qi->qi_lru, sc, 535 + xfs_qm_dquot_isolate, &isol); 536 536 537 537 error = xfs_buf_delwri_submit(&isol.buffers); 538 538 if (error) ··· 557 557 struct xfs_quotainfo *qi = container_of(shrink, 558 558 struct xfs_quotainfo, qi_shrinker); 559 559 560 - return list_lru_count_node(&qi->qi_lru, sc->nid); 560 + return list_lru_shrink_count(&qi->qi_lru, sc); 561 561 } 562 562 563 563 /*

+3 -4

fs/xfs/xfs_super.c

··· 1537 1537 static long 1538 1538 xfs_fs_nr_cached_objects( 1539 1539 struct super_block *sb, 1540 - int nid) 1540 + struct shrink_control *sc) 1541 1541 { 1542 1542 return xfs_reclaim_inodes_count(XFS_M(sb)); 1543 1543 } ··· 1545 1545 static long 1546 1546 xfs_fs_free_cached_objects( 1547 1547 struct super_block *sb, 1548 - long nr_to_scan, 1549 - int nid) 1548 + struct shrink_control *sc) 1550 1549 { 1551 - return xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan); 1550 + return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan); 1552 1551 } 1553 1552 1554 1553 static const struct super_operations xfs_super_operations = {

+3 -3

include/acpi/acoutput.h

··· 240 240 /* 241 241 * If ACPI_GET_FUNCTION_NAME was not defined in the compiler-dependent header, 242 242 * define it now. This is the case where there the compiler does not support 243 - * a __FUNCTION__ macro or equivalent. 243 + * a __func__ macro or equivalent. 244 244 */ 245 245 #ifndef ACPI_GET_FUNCTION_NAME 246 246 #define ACPI_GET_FUNCTION_NAME _acpi_function_name ··· 249 249 * The Name parameter should be the procedure name as a quoted string. 250 250 * The function name is also used by the function exit macros below. 251 251 * Note: (const char) is used to be compatible with the debug interfaces 252 - * and macros such as __FUNCTION__. 252 + * and macros such as __func__. 253 253 */ 254 254 #define ACPI_FUNCTION_NAME(name) static const char _acpi_function_name[] = #name; 255 255 256 256 #else 257 - /* Compiler supports __FUNCTION__ (or equivalent) -- Ignore this macro */ 257 + /* Compiler supports __func__ (or equivalent) -- Ignore this macro */ 258 258 259 259 #define ACPI_FUNCTION_NAME(name) 260 260 #endif /* ACPI_GET_FUNCTION_NAME */

+9 -144

include/asm-generic/pgtable.h

··· 244 244 # define pte_accessible(mm, pte) ((void)(pte), 1) 245 245 #endif 246 246 247 - #ifndef pte_present_nonuma 248 - #define pte_present_nonuma(pte) pte_present(pte) 249 - #endif 250 - 251 247 #ifndef flush_tlb_fix_spurious_fault 252 248 #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) 253 249 #endif ··· 669 673 #endif 670 674 } 671 675 672 - #ifdef CONFIG_NUMA_BALANCING 676 + #ifndef CONFIG_NUMA_BALANCING 673 677 /* 674 - * _PAGE_NUMA distinguishes between an unmapped page table entry, an entry that 675 - * is protected for PROT_NONE and a NUMA hinting fault entry. If the 676 - * architecture defines __PAGE_PROTNONE then it should take that into account 677 - * but those that do not can rely on the fact that the NUMA hinting scanner 678 - * skips inaccessible VMAs. 679 - * 680 - * pte/pmd_present() returns true if pte/pmd_numa returns true. Page 681 - * fault triggers on those regions if pte/pmd_numa returns true 682 - * (because _PAGE_PRESENT is not set). 678 + * Technically a PTE can be PROTNONE even when not doing NUMA balancing but 679 + * the only case the kernel cares is for NUMA balancing and is only ever set 680 + * when the VMA is accessible. For PROT_NONE VMAs, the PTEs are not marked 681 + * _PAGE_PROTNONE so by by default, implement the helper as "always no". It 682 + * is the responsibility of the caller to distinguish between PROT_NONE 683 + * protections and NUMA hinting fault protections. 683 684 */ 684 - #ifndef pte_numa 685 - static inline int pte_numa(pte_t pte) 686 - { 687 - return ptenuma_flags(pte) == _PAGE_NUMA; 688 - } 689 - #endif 690 - 691 - #ifndef pmd_numa 692 - static inline int pmd_numa(pmd_t pmd) 693 - { 694 - return pmdnuma_flags(pmd) == _PAGE_NUMA; 695 - } 696 - #endif 697 - 698 - /* 699 - * pte/pmd_mknuma sets the _PAGE_ACCESSED bitflag automatically 700 - * because they're called by the NUMA hinting minor page fault. 
If we 701 - * wouldn't set the _PAGE_ACCESSED bitflag here, the TLB miss handler 702 - * would be forced to set it later while filling the TLB after we 703 - * return to userland. That would trigger a second write to memory 704 - * that we optimize away by setting _PAGE_ACCESSED here. 705 - */ 706 - #ifndef pte_mknonnuma 707 - static inline pte_t pte_mknonnuma(pte_t pte) 708 - { 709 - pteval_t val = pte_val(pte); 710 - 711 - val &= ~_PAGE_NUMA; 712 - val |= (_PAGE_PRESENT|_PAGE_ACCESSED); 713 - return __pte(val); 714 - } 715 - #endif 716 - 717 - #ifndef pmd_mknonnuma 718 - static inline pmd_t pmd_mknonnuma(pmd_t pmd) 719 - { 720 - pmdval_t val = pmd_val(pmd); 721 - 722 - val &= ~_PAGE_NUMA; 723 - val |= (_PAGE_PRESENT|_PAGE_ACCESSED); 724 - 725 - return __pmd(val); 726 - } 727 - #endif 728 - 729 - #ifndef pte_mknuma 730 - static inline pte_t pte_mknuma(pte_t pte) 731 - { 732 - pteval_t val = pte_val(pte); 733 - 734 - VM_BUG_ON(!(val & _PAGE_PRESENT)); 735 - 736 - val &= ~_PAGE_PRESENT; 737 - val |= _PAGE_NUMA; 738 - 739 - return __pte(val); 740 - } 741 - #endif 742 - 743 - #ifndef ptep_set_numa 744 - static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, 745 - pte_t *ptep) 746 - { 747 - pte_t ptent = *ptep; 748 - 749 - ptent = pte_mknuma(ptent); 750 - set_pte_at(mm, addr, ptep, ptent); 751 - return; 752 - } 753 - #endif 754 - 755 - #ifndef pmd_mknuma 756 - static inline pmd_t pmd_mknuma(pmd_t pmd) 757 - { 758 - pmdval_t val = pmd_val(pmd); 759 - 760 - val &= ~_PAGE_PRESENT; 761 - val |= _PAGE_NUMA; 762 - 763 - return __pmd(val); 764 - } 765 - #endif 766 - 767 - #ifndef pmdp_set_numa 768 - static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, 769 - pmd_t *pmdp) 770 - { 771 - pmd_t pmd = *pmdp; 772 - 773 - pmd = pmd_mknuma(pmd); 774 - set_pmd_at(mm, addr, pmdp, pmd); 775 - return; 776 - } 777 - #endif 778 - #else 779 - static inline int pmd_numa(pmd_t pmd) 685 + static inline int pte_protnone(pte_t pte) 780 686 { 781 687 
return 0; 782 688 } 783 689 784 - static inline int pte_numa(pte_t pte) 690 + static inline int pmd_protnone(pmd_t pmd) 785 691 { 786 692 return 0; 787 - } 788 - 789 - static inline pte_t pte_mknonnuma(pte_t pte) 790 - { 791 - return pte; 792 - } 793 - 794 - static inline pmd_t pmd_mknonnuma(pmd_t pmd) 795 - { 796 - return pmd; 797 - } 798 - 799 - static inline pte_t pte_mknuma(pte_t pte) 800 - { 801 - return pte; 802 - } 803 - 804 - static inline void ptep_set_numa(struct mm_struct *mm, unsigned long addr, 805 - pte_t *ptep) 806 - { 807 - return; 808 - } 809 - 810 - 811 - static inline pmd_t pmd_mknuma(pmd_t pmd) 812 - { 813 - return pmd; 814 - } 815 - 816 - static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, 817 - pmd_t *pmdp) 818 - { 819 - return ; 820 693 } 821 694 #endif /* CONFIG_NUMA_BALANCING */ 822 695

+11 -11

include/linux/bitmap.h

··· 160 160 extern int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen, 161 161 unsigned long *dst, int nbits); 162 162 extern void bitmap_remap(unsigned long *dst, const unsigned long *src, 163 - const unsigned long *old, const unsigned long *new, int bits); 163 + const unsigned long *old, const unsigned long *new, unsigned int nbits); 164 164 extern int bitmap_bitremap(int oldbit, 165 165 const unsigned long *old, const unsigned long *new, int bits); 166 166 extern void bitmap_onto(unsigned long *dst, const unsigned long *orig, 167 - const unsigned long *relmap, int bits); 167 + const unsigned long *relmap, unsigned int bits); 168 168 extern void bitmap_fold(unsigned long *dst, const unsigned long *orig, 169 - int sz, int bits); 169 + unsigned int sz, unsigned int nbits); 170 170 extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order); 171 171 extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order); 172 172 extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order); 173 173 extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits); 174 - extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits); 174 + extern unsigned int bitmap_ord_to_pos(const unsigned long *bitmap, unsigned int ord, unsigned int nbits); 175 175 extern int bitmap_print_to_pagebuf(bool list, char *buf, 176 176 const unsigned long *maskp, int nmaskbits); 177 177 ··· 185 185 #define small_const_nbits(nbits) \ 186 186 (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG) 187 187 188 - static inline void bitmap_zero(unsigned long *dst, int nbits) 188 + static inline void bitmap_zero(unsigned long *dst, unsigned int nbits) 189 189 { 190 190 if (small_const_nbits(nbits)) 191 191 *dst = 0UL; 192 192 else { 193 - int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); 193 + unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); 194 194 memset(dst, 0, 
len); 195 195 } 196 196 } 197 197 198 - static inline void bitmap_fill(unsigned long *dst, int nbits) 198 + static inline void bitmap_fill(unsigned long *dst, unsigned int nbits) 199 199 { 200 - size_t nlongs = BITS_TO_LONGS(nbits); 200 + unsigned int nlongs = BITS_TO_LONGS(nbits); 201 201 if (!small_const_nbits(nbits)) { 202 - int len = (nlongs - 1) * sizeof(unsigned long); 202 + unsigned int len = (nlongs - 1) * sizeof(unsigned long); 203 203 memset(dst, 0xff, len); 204 204 } 205 205 dst[nlongs - 1] = BITMAP_LAST_WORD_MASK(nbits); 206 206 } 207 207 208 208 static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, 209 - int nbits) 209 + unsigned int nbits) 210 210 { 211 211 if (small_const_nbits(nbits)) 212 212 *dst = *src; 213 213 else { 214 - int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); 214 + unsigned int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long); 215 215 memcpy(dst, src, len); 216 216 } 217 217 }

+11 -11

include/linux/cpumask.h

··· 905 905 } 906 906 907 907 #define cpus_setall(dst) __cpus_setall(&(dst), NR_CPUS) 908 - static inline void __cpus_setall(cpumask_t *dstp, int nbits) 908 + static inline void __cpus_setall(cpumask_t *dstp, unsigned int nbits) 909 909 { 910 910 bitmap_fill(dstp->bits, nbits); 911 911 } 912 912 913 913 #define cpus_clear(dst) __cpus_clear(&(dst), NR_CPUS) 914 - static inline void __cpus_clear(cpumask_t *dstp, int nbits) 914 + static inline void __cpus_clear(cpumask_t *dstp, unsigned int nbits) 915 915 { 916 916 bitmap_zero(dstp->bits, nbits); 917 917 } ··· 927 927 928 928 #define cpus_and(dst, src1, src2) __cpus_and(&(dst), &(src1), &(src2), NR_CPUS) 929 929 static inline int __cpus_and(cpumask_t *dstp, const cpumask_t *src1p, 930 - const cpumask_t *src2p, int nbits) 930 + const cpumask_t *src2p, unsigned int nbits) 931 931 { 932 932 return bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); 933 933 } 934 934 935 935 #define cpus_or(dst, src1, src2) __cpus_or(&(dst), &(src1), &(src2), NR_CPUS) 936 936 static inline void __cpus_or(cpumask_t *dstp, const cpumask_t *src1p, 937 - const cpumask_t *src2p, int nbits) 937 + const cpumask_t *src2p, unsigned int nbits) 938 938 { 939 939 bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); 940 940 } 941 941 942 942 #define cpus_xor(dst, src1, src2) __cpus_xor(&(dst), &(src1), &(src2), NR_CPUS) 943 943 static inline void __cpus_xor(cpumask_t *dstp, const cpumask_t *src1p, 944 - const cpumask_t *src2p, int nbits) 944 + const cpumask_t *src2p, unsigned int nbits) 945 945 { 946 946 bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); 947 947 } ··· 949 949 #define cpus_andnot(dst, src1, src2) \ 950 950 __cpus_andnot(&(dst), &(src1), &(src2), NR_CPUS) 951 951 static inline int __cpus_andnot(cpumask_t *dstp, const cpumask_t *src1p, 952 - const cpumask_t *src2p, int nbits) 952 + const cpumask_t *src2p, unsigned int nbits) 953 953 { 954 954 return bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); 955 955 } 956 
956 957 957 #define cpus_equal(src1, src2) __cpus_equal(&(src1), &(src2), NR_CPUS) 958 958 static inline int __cpus_equal(const cpumask_t *src1p, 959 - const cpumask_t *src2p, int nbits) 959 + const cpumask_t *src2p, unsigned int nbits) 960 960 { 961 961 return bitmap_equal(src1p->bits, src2p->bits, nbits); 962 962 } 963 963 964 964 #define cpus_intersects(src1, src2) __cpus_intersects(&(src1), &(src2), NR_CPUS) 965 965 static inline int __cpus_intersects(const cpumask_t *src1p, 966 - const cpumask_t *src2p, int nbits) 966 + const cpumask_t *src2p, unsigned int nbits) 967 967 { 968 968 return bitmap_intersects(src1p->bits, src2p->bits, nbits); 969 969 } 970 970 971 971 #define cpus_subset(src1, src2) __cpus_subset(&(src1), &(src2), NR_CPUS) 972 972 static inline int __cpus_subset(const cpumask_t *src1p, 973 - const cpumask_t *src2p, int nbits) 973 + const cpumask_t *src2p, unsigned int nbits) 974 974 { 975 975 return bitmap_subset(src1p->bits, src2p->bits, nbits); 976 976 } 977 977 978 978 #define cpus_empty(src) __cpus_empty(&(src), NR_CPUS) 979 - static inline int __cpus_empty(const cpumask_t *srcp, int nbits) 979 + static inline int __cpus_empty(const cpumask_t *srcp, unsigned int nbits) 980 980 { 981 981 return bitmap_empty(srcp->bits, nbits); 982 982 } 983 983 984 984 #define cpus_weight(cpumask) __cpus_weight(&(cpumask), NR_CPUS) 985 - static inline int __cpus_weight(const cpumask_t *srcp, int nbits) 985 + static inline int __cpus_weight(const cpumask_t *srcp, unsigned int nbits) 986 986 { 987 987 return bitmap_weight(srcp->bits, nbits); 988 988 }

+2

include/linux/cryptohash.h

··· 1 1 #ifndef __CRYPTOHASH_H 2 2 #define __CRYPTOHASH_H 3 3 4 + #include <uapi/linux/types.h> 5 + 4 6 #define SHA_DIGEST_WORDS 5 5 7 #define SHA_MESSAGE_BYTES (512 /*bits*/ / 8) 6 8 #define SHA_WORKSPACE_WORDS 16

+4 -2

include/linux/fs.h

··· 1655 1655 struct dquot **(*get_dquots)(struct inode *); 1656 1656 #endif 1657 1657 int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); 1658 - long (*nr_cached_objects)(struct super_block *, int); 1659 - long (*free_cached_objects)(struct super_block *, long, int); 1658 + long (*nr_cached_objects)(struct super_block *, 1659 + struct shrink_control *); 1660 + long (*free_cached_objects)(struct super_block *, 1661 + struct shrink_control *); 1660 1662 }; 1661 1663 1662 1664 /*

+3

include/linux/init_task.h

··· 193 193 .nr_cpus_allowed= NR_CPUS, \ 194 194 .mm = NULL, \ 195 195 .active_mm = &init_mm, \ 196 + .restart_block = { \ 197 + .fn = do_no_restart_syscall, \ 198 + }, \ 196 199 .se = { \ 197 200 .group_node = LIST_HEAD_INIT(tsk.se.group_node), \ 198 201 }, \

-3

include/linux/kernel.h

··· 800 800 const typeof( ((type *)0)->member ) *__mptr = (ptr); \ 801 801 (type *)( (char *)__mptr - offsetof(type,member) );}) 802 802 803 - /* Trap pasters of __FUNCTION__ at compile-time */ 804 - #define __FUNCTION__ (__func__) 805 - 806 803 /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */ 807 804 #ifdef CONFIG_FTRACE_MCOUNT_RECORD 808 805 # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD

+67 -15

include/linux/list_lru.h

··· 9 9 10 10 #include <linux/list.h> 11 11 #include <linux/nodemask.h> 12 + #include <linux/shrinker.h> 13 + 14 + struct mem_cgroup; 12 15 13 16 /* list_lru_walk_cb has to always return one of those */ 14 17 enum lru_status { ··· 24 21 internally, but has to return locked. */ 25 22 }; 26 23 27 - struct list_lru_node { 28 - spinlock_t lock; 24 + struct list_lru_one { 29 25 struct list_head list; 30 - /* kept as signed so we can catch imbalance bugs */ 26 + /* may become negative during memcg reparenting */ 31 27 long nr_items; 28 + }; 29 + 30 + struct list_lru_memcg { 31 + /* array of per cgroup lists, indexed by memcg_cache_id */ 32 + struct list_lru_one *lru[0]; 33 + }; 34 + 35 + struct list_lru_node { 36 + /* protects all lists on the node, including per cgroup */ 37 + spinlock_t lock; 38 + /* global list, used for the root cgroup in cgroup aware lrus */ 39 + struct list_lru_one lru; 40 + #ifdef CONFIG_MEMCG_KMEM 41 + /* for cgroup aware lrus points to per cgroup lists, otherwise NULL */ 42 + struct list_lru_memcg *memcg_lrus; 43 + #endif 32 44 } ____cacheline_aligned_in_smp; 33 45 34 46 struct list_lru { 35 47 struct list_lru_node *node; 36 - nodemask_t active_nodes; 48 + #ifdef CONFIG_MEMCG_KMEM 49 + struct list_head list; 50 + #endif 37 51 }; 38 52 39 53 void list_lru_destroy(struct list_lru *lru); 40 - int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key); 41 - static inline int list_lru_init(struct list_lru *lru) 42 - { 43 - return list_lru_init_key(lru, NULL); 44 - } 54 + int __list_lru_init(struct list_lru *lru, bool memcg_aware, 55 + struct lock_class_key *key); 56 + 57 + #define list_lru_init(lru) __list_lru_init((lru), false, NULL) 58 + #define list_lru_init_key(lru, key) __list_lru_init((lru), false, (key)) 59 + #define list_lru_init_memcg(lru) __list_lru_init((lru), true, NULL) 60 + 61 + int memcg_update_all_list_lrus(int num_memcgs); 62 + void memcg_drain_all_list_lrus(int src_idx, int dst_idx); 45 63 46 64 /** 47 65 * 
list_lru_add: add an element to the lru list's tail ··· 96 72 bool list_lru_del(struct list_lru *lru, struct list_head *item); 97 73 98 74 /** 99 - * list_lru_count_node: return the number of objects currently held by @lru 75 + * list_lru_count_one: return the number of objects currently held by @lru 100 76 * @lru: the lru pointer. 101 77 * @nid: the node id to count from. 78 + * @memcg: the cgroup to count from. 102 79 * 103 80 * Always return a non-negative number, 0 for empty lists. There is no 104 81 * guarantee that the list is not updated while the count is being computed. 105 82 * Callers that want such a guarantee need to provide an outer lock. 106 83 */ 84 + unsigned long list_lru_count_one(struct list_lru *lru, 85 + int nid, struct mem_cgroup *memcg); 107 86 unsigned long list_lru_count_node(struct list_lru *lru, int nid); 87 + 88 + static inline unsigned long list_lru_shrink_count(struct list_lru *lru, 89 + struct shrink_control *sc) 90 + { 91 + return list_lru_count_one(lru, sc->nid, sc->memcg); 92 + } 93 + 108 94 static inline unsigned long list_lru_count(struct list_lru *lru) 109 95 { 110 96 long count = 0; 111 97 int nid; 112 98 113 - for_each_node_mask(nid, lru->active_nodes) 99 + for_each_node_state(nid, N_NORMAL_MEMORY) 114 100 count += list_lru_count_node(lru, nid); 115 101 116 102 return count; 117 103 } 118 104 119 - typedef enum lru_status 120 - (*list_lru_walk_cb)(struct list_head *item, spinlock_t *lock, void *cb_arg); 105 + void list_lru_isolate(struct list_lru_one *list, struct list_head *item); 106 + void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, 107 + struct list_head *head); 108 + 109 + typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, 110 + struct list_lru_one *list, spinlock_t *lock, void *cb_arg); 111 + 121 112 /** 122 - * list_lru_walk_node: walk a list_lru, isolating and disposing freeable items. 113 + * list_lru_walk_one: walk a list_lru, isolating and disposing freeable items. 
123 114 * @lru: the lru pointer. 124 115 * @nid: the node id to scan from. 116 + * @memcg: the cgroup to scan from. 125 117 * @isolate: callback function that is resposible for deciding what to do with 126 118 * the item currently being scanned 127 119 * @cb_arg: opaque type that will be passed to @isolate ··· 155 115 * 156 116 * Return value: the number of objects effectively removed from the LRU. 157 117 */ 118 + unsigned long list_lru_walk_one(struct list_lru *lru, 119 + int nid, struct mem_cgroup *memcg, 120 + list_lru_walk_cb isolate, void *cb_arg, 121 + unsigned long *nr_to_walk); 158 122 unsigned long list_lru_walk_node(struct list_lru *lru, int nid, 159 123 list_lru_walk_cb isolate, void *cb_arg, 160 124 unsigned long *nr_to_walk); 125 + 126 + static inline unsigned long 127 + list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc, 128 + list_lru_walk_cb isolate, void *cb_arg) 129 + { 130 + return list_lru_walk_one(lru, sc->nid, sc->memcg, isolate, cb_arg, 131 + &sc->nr_to_scan); 132 + } 161 133 162 134 static inline unsigned long 163 135 list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate, ··· 178 126 long isolated = 0; 179 127 int nid; 180 128 181 - for_each_node_mask(nid, lru->active_nodes) { 129 + for_each_node_state(nid, N_NORMAL_MEMORY) { 182 130 isolated += list_lru_walk_node(lru, nid, isolate, 183 131 cb_arg, &nr_to_walk); 184 132 if (nr_to_walk <= 0)

+33 -4

include/linux/memcontrol.h

··· 398 398 #ifdef CONFIG_MEMCG_KMEM 399 399 extern struct static_key memcg_kmem_enabled_key; 400 400 401 - extern int memcg_limited_groups_array_size; 401 + extern int memcg_nr_cache_ids; 402 + extern void memcg_get_cache_ids(void); 403 + extern void memcg_put_cache_ids(void); 402 404 403 405 /* 404 406 * Helper macro to loop through all memcg-specific caches. Callers must still ··· 408 406 * the slab_mutex must be held when looping through those caches 409 407 */ 410 408 #define for_each_memcg_cache_index(_idx) \ 411 - for ((_idx) = 0; (_idx) < memcg_limited_groups_array_size; (_idx)++) 409 + for ((_idx) = 0; (_idx) < memcg_nr_cache_ids; (_idx)++) 412 410 413 411 static inline bool memcg_kmem_enabled(void) 414 412 { 415 413 return static_key_false(&memcg_kmem_enabled_key); 416 414 } 415 + 416 + bool memcg_kmem_is_active(struct mem_cgroup *memcg); 417 417 418 418 /* 419 419 * In general, we'll do everything in our power to not incur in any overhead ··· 436 432 437 433 int memcg_cache_id(struct mem_cgroup *memcg); 438 434 439 - void memcg_update_array_size(int num_groups); 440 - 441 435 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep); 442 436 void __memcg_kmem_put_cache(struct kmem_cache *cachep); 437 + 438 + struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr); 443 439 444 440 int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, 445 441 unsigned long nr_pages); ··· 537 533 if (memcg_kmem_enabled()) 538 534 __memcg_kmem_put_cache(cachep); 539 535 } 536 + 537 + static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr) 538 + { 539 + if (!memcg_kmem_enabled()) 540 + return NULL; 541 + return __mem_cgroup_from_kmem(ptr); 542 + } 540 543 #else 541 544 #define for_each_memcg_cache_index(_idx) \ 542 545 for (; NULL; ) 543 546 544 547 static inline bool memcg_kmem_enabled(void) 548 + { 549 + return false; 550 + } 551 + 552 + static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg) 545 553 { 546 554 return false; 547 555 } 
··· 578 562 return -1; 579 563 } 580 564 565 + static inline void memcg_get_cache_ids(void) 566 + { 567 + } 568 + 569 + static inline void memcg_put_cache_ids(void) 570 + { 571 + } 572 + 581 573 static inline struct kmem_cache * 582 574 memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) 583 575 { ··· 594 570 595 571 static inline void memcg_kmem_put_cache(struct kmem_cache *cachep) 596 572 { 573 + } 574 + 575 + static inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr) 576 + { 577 + return NULL; 597 578 } 598 579 #endif /* CONFIG_MEMCG_KMEM */ 599 580 #endif /* _LINUX_MEMCONTROL_H */

-4

include/linux/migrate.h

··· 67 67 68 68 #ifdef CONFIG_NUMA_BALANCING 69 69 extern bool pmd_trans_migrating(pmd_t pmd); 70 - extern void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd); 71 70 extern int migrate_misplaced_page(struct page *page, 72 71 struct vm_area_struct *vma, int node); 73 72 extern bool migrate_ratelimited(int node); ··· 74 75 static inline bool pmd_trans_migrating(pmd_t pmd) 75 76 { 76 77 return false; 77 - } 78 - static inline void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd) 79 - { 80 78 } 81 79 static inline int migrate_misplaced_page(struct page *page, 82 80 struct vm_area_struct *vma, int node)

+15 -4

include/linux/mm.h

··· 1408 1408 mm->hiwater_vm = mm->total_vm; 1409 1409 } 1410 1410 1411 + static inline void reset_mm_hiwater_rss(struct mm_struct *mm) 1412 + { 1413 + mm->hiwater_rss = get_mm_rss(mm); 1414 + } 1415 + 1411 1416 static inline void setmax_mm_hiwater_rss(unsigned long *maxrss, 1412 1417 struct mm_struct *mm) 1413 1418 { ··· 1452 1447 int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); 1453 1448 #endif 1454 1449 1455 - #ifdef __PAGETABLE_PMD_FOLDED 1450 + #if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU) 1456 1451 static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, 1457 1452 unsigned long address) 1458 1453 { 1459 1454 return 0; 1460 1455 } 1456 + 1457 + static inline void mm_nr_pmds_init(struct mm_struct *mm) {} 1461 1458 1462 1459 static inline unsigned long mm_nr_pmds(struct mm_struct *mm) 1463 1460 { ··· 1471 1464 1472 1465 #else 1473 1466 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); 1467 + 1468 + static inline void mm_nr_pmds_init(struct mm_struct *mm) 1469 + { 1470 + atomic_long_set(&mm->nr_pmds, 0); 1471 + } 1474 1472 1475 1473 static inline unsigned long mm_nr_pmds(struct mm_struct *mm) 1476 1474 { ··· 2180 2168 void __user *, size_t *, loff_t *); 2181 2169 #endif 2182 2170 2183 - unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, 2184 - unsigned long nr_scanned, 2185 - unsigned long nr_eligible); 2171 + void drop_slab(void); 2172 + void drop_slab_node(int nid); 2186 2173 2187 2174 #ifndef CONFIG_MMU 2188 2175 #define randomize_va_space 0

+13 -13

include/linux/nodemask.h

··· 120 120 } 121 121 122 122 #define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES) 123 - static inline void __nodes_setall(nodemask_t *dstp, int nbits) 123 + static inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits) 124 124 { 125 125 bitmap_fill(dstp->bits, nbits); 126 126 } 127 127 128 128 #define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES) 129 - static inline void __nodes_clear(nodemask_t *dstp, int nbits) 129 + static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) 130 130 { 131 131 bitmap_zero(dstp->bits, nbits); 132 132 } ··· 144 144 #define nodes_and(dst, src1, src2) \ 145 145 __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES) 146 146 static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, 147 - const nodemask_t *src2p, int nbits) 147 + const nodemask_t *src2p, unsigned int nbits) 148 148 { 149 149 bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); 150 150 } ··· 152 152 #define nodes_or(dst, src1, src2) \ 153 153 __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES) 154 154 static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, 155 - const nodemask_t *src2p, int nbits) 155 + const nodemask_t *src2p, unsigned int nbits) 156 156 { 157 157 bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); 158 158 } ··· 160 160 #define nodes_xor(dst, src1, src2) \ 161 161 __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES) 162 162 static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, 163 - const nodemask_t *src2p, int nbits) 163 + const nodemask_t *src2p, unsigned int nbits) 164 164 { 165 165 bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); 166 166 } ··· 168 168 #define nodes_andnot(dst, src1, src2) \ 169 169 __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES) 170 170 static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, 171 - const nodemask_t *src2p, int nbits) 171 + const nodemask_t *src2p, unsigned int nbits) 172 172 { 173 173 
bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); 174 174 } ··· 176 176 #define nodes_complement(dst, src) \ 177 177 __nodes_complement(&(dst), &(src), MAX_NUMNODES) 178 178 static inline void __nodes_complement(nodemask_t *dstp, 179 - const nodemask_t *srcp, int nbits) 179 + const nodemask_t *srcp, unsigned int nbits) 180 180 { 181 181 bitmap_complement(dstp->bits, srcp->bits, nbits); 182 182 } ··· 184 184 #define nodes_equal(src1, src2) \ 185 185 __nodes_equal(&(src1), &(src2), MAX_NUMNODES) 186 186 static inline int __nodes_equal(const nodemask_t *src1p, 187 - const nodemask_t *src2p, int nbits) 187 + const nodemask_t *src2p, unsigned int nbits) 188 188 { 189 189 return bitmap_equal(src1p->bits, src2p->bits, nbits); 190 190 } ··· 192 192 #define nodes_intersects(src1, src2) \ 193 193 __nodes_intersects(&(src1), &(src2), MAX_NUMNODES) 194 194 static inline int __nodes_intersects(const nodemask_t *src1p, 195 - const nodemask_t *src2p, int nbits) 195 + const nodemask_t *src2p, unsigned int nbits) 196 196 { 197 197 return bitmap_intersects(src1p->bits, src2p->bits, nbits); 198 198 } ··· 200 200 #define nodes_subset(src1, src2) \ 201 201 __nodes_subset(&(src1), &(src2), MAX_NUMNODES) 202 202 static inline int __nodes_subset(const nodemask_t *src1p, 203 - const nodemask_t *src2p, int nbits) 203 + const nodemask_t *src2p, unsigned int nbits) 204 204 { 205 205 return bitmap_subset(src1p->bits, src2p->bits, nbits); 206 206 } 207 207 208 208 #define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES) 209 - static inline int __nodes_empty(const nodemask_t *srcp, int nbits) 209 + static inline int __nodes_empty(const nodemask_t *srcp, unsigned int nbits) 210 210 { 211 211 return bitmap_empty(srcp->bits, nbits); 212 212 } 213 213 214 214 #define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES) 215 - static inline int __nodes_full(const nodemask_t *srcp, int nbits) 215 + static inline int __nodes_full(const nodemask_t *srcp, unsigned int nbits) 216 216 
{ 217 217 return bitmap_full(srcp->bits, nbits); 218 218 } 219 219 220 220 #define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES) 221 - static inline int __nodes_weight(const nodemask_t *srcp, int nbits) 221 + static inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits) 222 222 { 223 223 return bitmap_weight(srcp->bits, nbits); 224 224 }

+3 -3

include/linux/printk.h

··· 417 417 DUMP_PREFIX_ADDRESS, 418 418 DUMP_PREFIX_OFFSET 419 419 }; 420 - extern void hex_dump_to_buffer(const void *buf, size_t len, 421 - int rowsize, int groupsize, 422 - char *linebuf, size_t linebuflen, bool ascii); 420 + extern int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, 421 + int groupsize, char *linebuf, size_t linebuflen, 422 + bool ascii); 423 423 #ifdef CONFIG_PRINTK 424 424 extern void print_hex_dump(const char *level, const char *prefix_str, 425 425 int prefix_type, int rowsize, int groupsize,

+3

include/linux/sched.h

··· 1370 1370 1371 1371 unsigned long atomic_flags; /* Flags needing atomic access. */ 1372 1372 1373 + struct restart_block restart_block; 1374 + 1373 1375 pid_t pid; 1374 1376 pid_t tgid; 1375 1377 ··· 2147 2145 */ 2148 2146 extern u64 cpu_clock(int cpu); 2149 2147 extern u64 local_clock(void); 2148 + extern u64 running_clock(void); 2150 2149 extern u64 sched_clock_cpu(int cpu); 2151 2150 2152 2151

+5 -1

include/linux/shrinker.h

··· 20 20 21 21 /* current node being shrunk (for NUMA aware shrinkers) */ 22 22 int nid; 23 + 24 + /* current memcg being shrunk (for memcg aware shrinkers) */ 25 + struct mem_cgroup *memcg; 23 26 }; 24 27 25 28 #define SHRINK_STOP (~0UL) ··· 64 61 #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */ 65 62 66 63 /* Flags */ 67 - #define SHRINKER_NUMA_AWARE (1 << 0) 64 + #define SHRINKER_NUMA_AWARE (1 << 0) 65 + #define SHRINKER_MEMCG_AWARE (1 << 1) 68 66 69 67 extern int register_shrinker(struct shrinker *); 70 68 extern void unregister_shrinker(struct shrinker *);

+16 -15

include/linux/slab.h

··· 115 115 struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, 116 116 unsigned long, 117 117 void (*)(void *)); 118 - #ifdef CONFIG_MEMCG_KMEM 119 - void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *); 120 - void memcg_destroy_kmem_caches(struct mem_cgroup *); 121 - #endif 122 118 void kmem_cache_destroy(struct kmem_cache *); 123 119 int kmem_cache_shrink(struct kmem_cache *); 124 - void kmem_cache_free(struct kmem_cache *, void *); 120 + 121 + void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *); 122 + void memcg_deactivate_kmem_caches(struct mem_cgroup *); 123 + void memcg_destroy_kmem_caches(struct mem_cgroup *); 125 124 126 125 /* 127 126 * Please use this macro to create slab caches. Simply specify the ··· 287 288 288 289 void *__kmalloc(size_t size, gfp_t flags); 289 290 void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags); 291 + void kmem_cache_free(struct kmem_cache *, void *); 290 292 291 293 #ifdef CONFIG_NUMA 292 294 void *__kmalloc_node(size_t size, gfp_t flags, int node); ··· 473 473 #ifndef ARCH_SLAB_MINALIGN 474 474 #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) 475 475 #endif 476 + 477 + struct memcg_cache_array { 478 + struct rcu_head rcu; 479 + struct kmem_cache *entries[0]; 480 + }; 481 + 476 482 /* 477 483 * This is the main placeholder for memcg-related information in kmem caches. 478 - * struct kmem_cache will hold a pointer to it, so the memory cost while 479 - * disabled is 1 pointer. The runtime cost while enabled, gets bigger than it 480 - * would otherwise be if that would be bundled in kmem_cache: we'll need an 481 - * extra pointer chase. But the trade off clearly lays in favor of not 482 - * penalizing non-users. 483 - * 484 484 * Both the root cache and the child caches will have it. For the root cache, 485 485 * this will hold a dynamically allocated array large enough to hold 486 486 * information about the currently limited memcgs in the system. 
To allow the ··· 491 491 * 492 492 * @memcg: pointer to the memcg this cache belongs to 493 493 * @root_cache: pointer to the global, root cache, this cache was derived from 494 + * 495 + * Both root and child caches of the same kind are linked into a list chained 496 + * through @list. 494 497 */ 495 498 struct memcg_cache_params { 496 499 bool is_root_cache; 500 + struct list_head list; 497 501 union { 498 - struct { 499 - struct rcu_head rcu_head; 500 - struct kmem_cache *memcg_caches[0]; 501 - }; 502 + struct memcg_cache_array __rcu *memcg_caches; 502 503 struct { 503 504 struct mem_cgroup *memcg; 504 505 struct kmem_cache *root_cache;

+1 -1

include/linux/slab_def.h

··· 70 70 int obj_offset; 71 71 #endif /* CONFIG_DEBUG_SLAB */ 72 72 #ifdef CONFIG_MEMCG_KMEM 73 - struct memcg_cache_params *memcg_params; 73 + struct memcg_cache_params memcg_params; 74 74 #endif 75 75 76 76 struct kmem_cache_node *node[MAX_NUMNODES];

+1 -1

include/linux/slub_def.h

··· 85 85 struct kobject kobj; /* For sysfs */ 86 86 #endif 87 87 #ifdef CONFIG_MEMCG_KMEM 88 - struct memcg_cache_params *memcg_params; 88 + struct memcg_cache_params memcg_params; 89 89 int max_attr_size; /* for propagation, maximum size of a stored attr */ 90 90 #ifdef CONFIG_SYSFS 91 91 struct kset *memcg_kset;

-3

include/linux/string.h

··· 40 40 #ifndef __HAVE_ARCH_STRNCMP 41 41 extern int strncmp(const char *,const char *,__kernel_size_t); 42 42 #endif 43 - #ifndef __HAVE_ARCH_STRNICMP 44 - #define strnicmp strncasecmp 45 - #endif 46 43 #ifndef __HAVE_ARCH_STRCASECMP 47 44 extern int strcasecmp(const char *s1, const char *s2); 48 45 #endif

+2 -2

include/linux/string_helpers.h

··· 10 10 STRING_UNITS_2, /* use binary powers of 2^10 */ 11 11 }; 12 12 13 - int string_get_size(u64 size, enum string_size_units units, 14 - char *buf, int len); 13 + void string_get_size(u64 size, enum string_size_units units, 14 + char *buf, int len); 15 15 16 16 #define UNESCAPE_SPACE 0x01 17 17 #define UNESCAPE_OCTAL 0x02

+1 -1

include/linux/swapops.h

··· 54 54 /* check whether a pte points to a swap entry */ 55 55 static inline int is_swap_pte(pte_t pte) 56 56 { 57 - return !pte_none(pte) && !pte_present_nonuma(pte); 57 + return !pte_none(pte) && !pte_present(pte); 58 58 } 59 59 #endif 60 60

+1 -4

include/linux/types.h

··· 135 135 #endif 136 136 137 137 /* 138 - * The type of an index into the pagecache. Use a #define so asm/types.h 139 - * can override it. 138 + * The type of an index into the pagecache. 140 139 */ 141 - #ifndef pgoff_t 142 140 #define pgoff_t unsigned long 143 - #endif 144 141 145 142 /* A dma_addr_t can hold any valid DMA or bus address for the platform */ 146 143 #ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT

+3 -2

include/linux/zpool.h

··· 36 36 ZPOOL_MM_DEFAULT = ZPOOL_MM_RW 37 37 }; 38 38 39 - struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops); 39 + struct zpool *zpool_create_pool(char *type, char *name, 40 + gfp_t gfp, struct zpool_ops *ops); 40 41 41 42 char *zpool_get_type(struct zpool *pool); 42 43 ··· 81 80 atomic_t refcount; 82 81 struct list_head list; 83 82 84 - void *(*create)(gfp_t gfp, struct zpool_ops *ops); 83 + void *(*create)(char *name, gfp_t gfp, struct zpool_ops *ops); 85 84 void (*destroy)(void *pool); 86 85 87 86 int (*malloc)(void *pool, size_t size, gfp_t gfp,

+1 -1

include/linux/zsmalloc.h

··· 36 36 37 37 struct zs_pool; 38 38 39 - struct zs_pool *zs_create_pool(gfp_t flags); 39 + struct zs_pool *zs_create_pool(char *name, gfp_t flags); 40 40 void zs_destroy_pool(struct zs_pool *pool); 41 41 42 42 unsigned long zs_malloc(struct zs_pool *pool, size_t size);

-5

include/net/sock.h

··· 1077 1077 return test_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); 1078 1078 } 1079 1079 1080 - static inline bool memcg_proto_activated(struct cg_proto *cg_proto) 1081 - { 1082 - return test_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags); 1083 - } 1084 - 1085 1080 #ifdef SOCK_REFCNT_DEBUG 1086 1081 static inline void sk_refcnt_debug_inc(struct sock *sk) 1087 1082 {

+1 -1

include/uapi/linux/mempolicy.h

··· 67 67 #define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */ 68 68 #define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */ 69 69 #define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ 70 - #define MPOL_F_MORON (1 << 4) /* Migrate On pte_numa Reference On Node */ 70 + #define MPOL_F_MORON (1 << 4) /* Migrate On protnone Reference On Node */ 71 71 72 72 73 73 #endif /* _UAPI_LINUX_MEMPOLICY_H */

+7 -3

kernel/cgroup.c

··· 4373 4373 { 4374 4374 struct cgroup_subsys_state *css = 4375 4375 container_of(work, struct cgroup_subsys_state, destroy_work); 4376 + struct cgroup_subsys *ss = css->ss; 4376 4377 struct cgroup *cgrp = css->cgroup; 4377 4378 4378 4379 percpu_ref_exit(&css->refcnt); 4379 4380 4380 - if (css->ss) { 4381 + if (ss) { 4381 4382 /* css free path */ 4383 + int id = css->id; 4384 + 4382 4385 if (css->parent) 4383 4386 css_put(css->parent); 4384 4387 4385 - css->ss->css_free(css); 4388 + ss->css_free(css); 4389 + cgroup_idr_remove(&ss->css_idr, id); 4386 4390 cgroup_put(cgrp); 4387 4391 } else { 4388 4392 /* cgroup free path */ ··· 4438 4434 4439 4435 if (ss) { 4440 4436 /* css release path */ 4441 - cgroup_idr_remove(&ss->css_idr, css->id); 4437 + cgroup_idr_replace(&ss->css_idr, NULL, css->id); 4442 4438 if (ss->css_released) 4443 4439 ss->css_released(css); 4444 4440 } else {

+2 -3

kernel/compat.c

··· 276 276 * core implementation decides to return random nonsense. 277 277 */ 278 278 if (ret == -ERESTART_RESTARTBLOCK) { 279 - struct restart_block *restart 280 - = &current_thread_info()->restart_block; 279 + struct restart_block *restart = &current->restart_block; 281 280 282 281 restart->fn = compat_nanosleep_restart; 283 282 restart->nanosleep.compat_rmtp = rmtp; ··· 859 860 return -EFAULT; 860 861 861 862 if (err == -ERESTART_RESTARTBLOCK) { 862 - restart = &current_thread_info()->restart_block; 863 + restart = &current->restart_block; 863 864 restart->fn = compat_clock_nanosleep_restart; 864 865 restart->nanosleep.compat_rmtp = rmtp; 865 866 }

+1 -1

kernel/cpuset.c

··· 2400 2400 */ 2401 2401 } 2402 2402 2403 - void cpuset_init_current_mems_allowed(void) 2403 + void __init cpuset_init_current_mems_allowed(void) 2404 2404 { 2405 2405 nodes_setall(current->mems_allowed); 2406 2406 }

+1 -3

kernel/fork.c

··· 555 555 INIT_LIST_HEAD(&mm->mmlist); 556 556 mm->core_state = NULL; 557 557 atomic_long_set(&mm->nr_ptes, 0); 558 - #ifndef __PAGETABLE_PMD_FOLDED 559 - atomic_long_set(&mm->nr_pmds, 0); 560 - #endif 558 + mm_nr_pmds_init(mm); 561 559 mm->map_count = 0; 562 560 mm->locked_vm = 0; 563 561 mm->pinned_vm = 0;

+1 -1

kernel/futex.c

··· 2217 2217 if (!abs_time) 2218 2218 goto out; 2219 2219 2220 - restart = &current_thread_info()->restart_block; 2220 + restart = &current->restart_block; 2221 2221 restart->fn = futex_wait_restart; 2222 2222 restart->futex.uaddr = uaddr; 2223 2223 restart->futex.val = val;

+6 -6

kernel/printk/printk.c

··· 935 935 936 936 early_param("ignore_loglevel", ignore_loglevel_setup); 937 937 module_param(ignore_loglevel, bool, S_IRUGO | S_IWUSR); 938 - MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to" 939 - "print all kernel messages to the console."); 938 + MODULE_PARM_DESC(ignore_loglevel, 939 + "ignore loglevel setting (prints all kernel messages to the console)"); 940 940 941 941 #ifdef CONFIG_BOOT_PRINTK_DELAY 942 942 ··· 1419 1419 } 1420 1420 1421 1421 /* 1422 - * Zap console related locks when oopsing. Only zap at most once 1423 - * every 10 seconds, to leave time for slow consoles to print a 1424 - * full oops. 1422 + * Zap console related locks when oopsing. 1423 + * To leave time for slow consoles to print a full oops, 1424 + * only zap at most once every 30 seconds. 1425 1425 */ 1426 1426 static void zap_locks(void) 1427 1427 { 1428 1428 static unsigned long oops_timestamp; 1429 1429 1430 1430 if (time_after_eq(jiffies, oops_timestamp) && 1431 - !time_after(jiffies, oops_timestamp + 30 * HZ)) 1431 + !time_after(jiffies, oops_timestamp + 30 * HZ)) 1432 1432 return; 1433 1433 1434 1434 oops_timestamp = jiffies;

+13

kernel/sched/clock.c

··· 420 420 421 421 EXPORT_SYMBOL_GPL(cpu_clock); 422 422 EXPORT_SYMBOL_GPL(local_clock); 423 + 424 + /* 425 + * Running clock - returns the time that has elapsed while a guest has been 426 + * running. 427 + * On a guest this value should be local_clock minus the time the guest was 428 + * suspended by the hypervisor (for any reason). 429 + * On bare metal this function should return the same as local_clock. 430 + * Architectures and sub-architectures can override this. 431 + */ 432 + u64 __weak running_clock(void) 433 + { 434 + return local_clock(); 435 + }

+1 -1

kernel/signal.c

··· 2501 2501 */ 2502 2502 SYSCALL_DEFINE0(restart_syscall) 2503 2503 { 2504 - struct restart_block *restart = &current_thread_info()->restart_block; 2504 + struct restart_block *restart = &current->restart_block; 2505 2505 return restart->fn(restart); 2506 2506 } 2507 2507

+1 -1

kernel/time/alarmtimer.c

··· 788 788 goto out; 789 789 } 790 790 791 - restart = &current_thread_info()->restart_block; 791 + restart = &current->restart_block; 792 792 restart->fn = alarm_timer_nsleep_restart; 793 793 restart->nanosleep.clockid = type; 794 794 restart->nanosleep.expires = exp.tv64;

+1 -1

kernel/time/hrtimer.c

··· 1583 1583 goto out; 1584 1584 } 1585 1585 1586 - restart = &current_thread_info()->restart_block; 1586 + restart = &current->restart_block; 1587 1587 restart->fn = hrtimer_nanosleep_restart; 1588 1588 restart->nanosleep.clockid = t.timer.base->clockid; 1589 1589 restart->nanosleep.rmtp = rmtp;

+1 -2

kernel/time/posix-cpu-timers.c

··· 1334 1334 static int posix_cpu_nsleep(const clockid_t which_clock, int flags, 1335 1335 struct timespec *rqtp, struct timespec __user *rmtp) 1336 1336 { 1337 - struct restart_block *restart_block = 1338 - &current_thread_info()->restart_block; 1337 + struct restart_block *restart_block = &current->restart_block; 1339 1338 struct itimerspec it; 1340 1339 int error; 1341 1340

+1 -1

kernel/watchdog.c

··· 154 154 */ 155 155 static unsigned long get_timestamp(void) 156 156 { 157 - return local_clock() >> 30LL; /* 2^30 ~= 10^9 */ 157 + return running_clock() >> 30LL; /* 2^30 ~= 10^9 */ 158 158 } 159 159 160 160 static void set_sample_period(void)

+3

lib/Kconfig.debug

··· 1580 1580 1581 1581 If unsure, say N. 1582 1582 1583 + config TEST_HEXDUMP 1584 + tristate "Test functions located in the hexdump module at runtime" 1585 + 1583 1586 config TEST_STRING_HELPERS 1584 1587 tristate "Test functions located in the string_helpers module at runtime" 1585 1588

+3 -1

lib/Makefile

··· 23 23 obj-y += lockref.o 24 24 25 25 obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ 26 - bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ 26 + bust_spinlocks.o kasprintf.o bitmap.o scatterlist.o \ 27 27 gcd.o lcm.o list_sort.o uuid.o flex_array.o clz_ctz.o \ 28 28 bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \ 29 29 percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o 30 30 obj-y += string_helpers.o 31 31 obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o 32 + obj-y += hexdump.o 33 + obj-$(CONFIG_TEST_HEXDUMP) += test-hexdump.o 32 34 obj-y += kstrtox.o 33 35 obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o 34 36 obj-$(CONFIG_TEST_LKM) += test_module.o

+32 -48

lib/bitmap.c

··· 744 744 /** 745 745 * bitmap_pos_to_ord - find ordinal of set bit at given position in bitmap 746 746 * @buf: pointer to a bitmap 747 - * @pos: a bit position in @buf (0 <= @pos < @bits) 748 - * @bits: number of valid bit positions in @buf 747 + * @pos: a bit position in @buf (0 <= @pos < @nbits) 748 + * @nbits: number of valid bit positions in @buf 749 749 * 750 - * Map the bit at position @pos in @buf (of length @bits) to the 750 + * Map the bit at position @pos in @buf (of length @nbits) to the 751 751 * ordinal of which set bit it is. If it is not set or if @pos 752 752 * is not a valid bit position, map to -1. 753 753 * ··· 759 759 * 760 760 * The bit positions 0 through @bits are valid positions in @buf. 761 761 */ 762 - static int bitmap_pos_to_ord(const unsigned long *buf, int pos, int bits) 762 + static int bitmap_pos_to_ord(const unsigned long *buf, unsigned int pos, unsigned int nbits) 763 763 { 764 - int i, ord; 765 - 766 - if (pos < 0 || pos >= bits || !test_bit(pos, buf)) 764 + if (pos >= nbits || !test_bit(pos, buf)) 767 765 return -1; 768 766 769 - i = find_first_bit(buf, bits); 770 - ord = 0; 771 - while (i < pos) { 772 - i = find_next_bit(buf, bits, i + 1); 773 - ord++; 774 - } 775 - BUG_ON(i != pos); 776 - 777 - return ord; 767 + return __bitmap_weight(buf, pos); 778 768 } 779 769 780 770 /** 781 771 * bitmap_ord_to_pos - find position of n-th set bit in bitmap 782 772 * @buf: pointer to bitmap 783 773 * @ord: ordinal bit position (n-th set bit, n >= 0) 784 - * @bits: number of valid bit positions in @buf 774 + * @nbits: number of valid bit positions in @buf 785 775 * 786 776 * Map the ordinal offset of bit @ord in @buf to its position in @buf. 787 - * Value of @ord should be in range 0 <= @ord < weight(buf), else 788 - * results are undefined. 777 + * Value of @ord should be in range 0 <= @ord < weight(buf). If @ord 778 + * >= weight(buf), returns @nbits. 
789 779 * 790 780 * If for example, just bits 4 through 7 are set in @buf, then @ord 791 781 * values 0 through 3 will get mapped to 4 through 7, respectively, 792 - * and all other @ord values return undefined values. When @ord value 3 782 + * and all other @ord values returns @nbits. When @ord value 3 793 783 * gets mapped to (returns) @pos value 7 in this example, that means 794 784 * that the 3rd set bit (starting with 0th) is at position 7 in @buf. 795 785 * 796 - * The bit positions 0 through @bits are valid positions in @buf. 786 + * The bit positions 0 through @nbits-1 are valid positions in @buf. 797 787 */ 798 - int bitmap_ord_to_pos(const unsigned long *buf, int ord, int bits) 788 + unsigned int bitmap_ord_to_pos(const unsigned long *buf, unsigned int ord, unsigned int nbits) 799 789 { 800 - int pos = 0; 790 + unsigned int pos; 801 791 802 - if (ord >= 0 && ord < bits) { 803 - int i; 804 - 805 - for (i = find_first_bit(buf, bits); 806 - i < bits && ord > 0; 807 - i = find_next_bit(buf, bits, i + 1)) 808 - ord--; 809 - if (i < bits && ord == 0) 810 - pos = i; 811 - } 792 + for (pos = find_first_bit(buf, nbits); 793 + pos < nbits && ord; 794 + pos = find_next_bit(buf, nbits, pos + 1)) 795 + ord--; 812 796 813 797 return pos; 814 798 } ··· 803 819 * @src: subset to be remapped 804 820 * @old: defines domain of map 805 821 * @new: defines range of map 806 - * @bits: number of bits in each of these bitmaps 822 + * @nbits: number of bits in each of these bitmaps 807 823 * 808 824 * Let @old and @new define a mapping of bit positions, such that 809 825 * whatever position is held by the n-th set bit in @old is mapped ··· 831 847 */ 832 848 void bitmap_remap(unsigned long *dst, const unsigned long *src, 833 849 const unsigned long *old, const unsigned long *new, 834 - int bits) 850 + unsigned int nbits) 835 851 { 836 - int oldbit, w; 852 + unsigned int oldbit, w; 837 853 838 854 if (dst == src) /* following doesn't handle inplace remaps */ 839 855 return; 840 - 
bitmap_zero(dst, bits); 856 + bitmap_zero(dst, nbits); 841 857 842 - w = bitmap_weight(new, bits); 843 - for_each_set_bit(oldbit, src, bits) { 844 - int n = bitmap_pos_to_ord(old, oldbit, bits); 858 + w = bitmap_weight(new, nbits); 859 + for_each_set_bit(oldbit, src, nbits) { 860 + int n = bitmap_pos_to_ord(old, oldbit, nbits); 845 861 846 862 if (n < 0 || w == 0) 847 863 set_bit(oldbit, dst); /* identity map */ 848 864 else 849 - set_bit(bitmap_ord_to_pos(new, n % w, bits), dst); 865 + set_bit(bitmap_ord_to_pos(new, n % w, nbits), dst); 850 866 } 851 867 } 852 868 EXPORT_SYMBOL(bitmap_remap); ··· 990 1006 * All bits in @dst not set by the above rule are cleared. 991 1007 */ 992 1008 void bitmap_onto(unsigned long *dst, const unsigned long *orig, 993 - const unsigned long *relmap, int bits) 1009 + const unsigned long *relmap, unsigned int bits) 994 1010 { 995 - int n, m; /* same meaning as in above comment */ 1011 + unsigned int n, m; /* same meaning as in above comment */ 996 1012 997 1013 if (dst == orig) /* following doesn't handle inplace mappings */ 998 1014 return; ··· 1023 1039 * @dst: resulting smaller bitmap 1024 1040 * @orig: original larger bitmap 1025 1041 * @sz: specified size 1026 - * @bits: number of bits in each of these bitmaps 1042 + * @nbits: number of bits in each of these bitmaps 1027 1043 * 1028 1044 * For each bit oldbit in @orig, set bit oldbit mod @sz in @dst. 1029 1045 * Clear all other bits in @dst. See further the comment and 1030 1046 * Example [2] for bitmap_onto() for why and how to use this. 
1031 1047 */ 1032 1048 void bitmap_fold(unsigned long *dst, const unsigned long *orig, 1033 - int sz, int bits) 1049 + unsigned int sz, unsigned int nbits) 1034 1050 { 1035 - int oldbit; 1051 + unsigned int oldbit; 1036 1052 1037 1053 if (dst == orig) /* following doesn't handle inplace mappings */ 1038 1054 return; 1039 - bitmap_zero(dst, bits); 1055 + bitmap_zero(dst, nbits); 1040 1056 1041 - for_each_set_bit(oldbit, orig, bits) 1057 + for_each_set_bit(oldbit, orig, nbits) 1042 1058 set_bit(oldbit % sz, dst); 1043 1059 } 1044 1060 EXPORT_SYMBOL(bitmap_fold);

+2 -2

lib/dynamic_queue_limits.c

··· 3 3 * 4 4 * Copyright (c) 2011, Tom Herbert <therbert@google.com> 5 5 */ 6 - #include <linux/module.h> 7 6 #include <linux/types.h> 8 - #include <linux/ctype.h> 9 7 #include <linux/kernel.h> 10 8 #include <linux/jiffies.h> 11 9 #include <linux/dynamic_queue_limits.h> 10 + #include <linux/compiler.h> 11 + #include <linux/export.h> 12 12 13 13 #define POSDIFF(A, B) ((int)((A) - (B)) > 0 ? (A) - (B) : 0) 14 14 #define AFTER_EQ(A, B) ((int)((A) - (B)) >= 0)

+1 -2

lib/genalloc.c

··· 34 34 #include <linux/rculist.h> 35 35 #include <linux/interrupt.h> 36 36 #include <linux/genalloc.h> 37 - #include <linux/of_address.h> 38 37 #include <linux/of_device.h> 39 38 40 39 static inline size_t chunk_size(const struct gen_pool_chunk *chunk) ··· 414 415 size_t size) 415 416 { 416 417 bool found = false; 417 - unsigned long end = start + size; 418 + unsigned long end = start + size - 1; 418 419 struct gen_pool_chunk *chunk; 419 420 420 421 rcu_read_lock();

+1 -1

lib/halfmd4.c

··· 1 - #include <linux/kernel.h> 1 + #include <linux/compiler.h> 2 2 #include <linux/export.h> 3 3 #include <linux/cryptohash.h> 4 4

+64 -41

lib/hexdump.c

··· 97 97 * 98 98 * example output buffer: 99 99 * 40 41 42 43 44 45 46 47 48 49 4a 4b 4c 4d 4e 4f @ABCDEFGHIJKLMNO 100 + * 101 + * Return: 102 + * The amount of bytes placed in the buffer without terminating NUL. If the 103 + * output was truncated, then the return value is the number of bytes 104 + * (excluding the terminating NUL) which would have been written to the final 105 + * string if enough space had been available. 100 106 */ 101 - void hex_dump_to_buffer(const void *buf, size_t len, int rowsize, 102 - int groupsize, char *linebuf, size_t linebuflen, 103 - bool ascii) 107 + int hex_dump_to_buffer(const void *buf, size_t len, int rowsize, int groupsize, 108 + char *linebuf, size_t linebuflen, bool ascii) 104 109 { 105 110 const u8 *ptr = buf; 111 + int ngroups; 106 112 u8 ch; 107 113 int j, lx = 0; 108 114 int ascii_column; 115 + int ret; 109 116 110 117 if (rowsize != 16 && rowsize != 32) 111 118 rowsize = 16; 112 119 113 - if (!len) 114 - goto nil; 115 120 if (len > rowsize) /* limit to one line at a time */ 116 121 len = rowsize; 122 + if (!is_power_of_2(groupsize) || groupsize > 8) 123 + groupsize = 1; 117 124 if ((len % groupsize) != 0) /* no mixed size output */ 118 125 groupsize = 1; 119 126 120 - switch (groupsize) { 121 - case 8: { 127 + ngroups = len / groupsize; 128 + ascii_column = rowsize * 2 + rowsize / groupsize + 1; 129 + 130 + if (!linebuflen) 131 + goto overflow1; 132 + 133 + if (!len) 134 + goto nil; 135 + 136 + if (groupsize == 8) { 122 137 const u64 *ptr8 = buf; 123 - int ngroups = len / groupsize; 124 138 125 - for (j = 0; j < ngroups; j++) 126 - lx += scnprintf(linebuf + lx, linebuflen - lx, 127 - "%s%16.16llx", j ? " " : "", 128 - (unsigned long long)*(ptr8 + j)); 129 - ascii_column = 17 * ngroups + 2; 130 - break; 131 - } 132 - 133 - case 4: { 139 + for (j = 0; j < ngroups; j++) { 140 + ret = snprintf(linebuf + lx, linebuflen - lx, 141 + "%s%16.16llx", j ? 
" " : "", 142 + (unsigned long long)*(ptr8 + j)); 143 + if (ret >= linebuflen - lx) 144 + goto overflow1; 145 + lx += ret; 146 + } 147 + } else if (groupsize == 4) { 134 148 const u32 *ptr4 = buf; 135 - int ngroups = len / groupsize; 136 149 137 - for (j = 0; j < ngroups; j++) 138 - lx += scnprintf(linebuf + lx, linebuflen - lx, 139 - "%s%8.8x", j ? " " : "", *(ptr4 + j)); 140 - ascii_column = 9 * ngroups + 2; 141 - break; 142 - } 143 - 144 - case 2: { 150 + for (j = 0; j < ngroups; j++) { 151 + ret = snprintf(linebuf + lx, linebuflen - lx, 152 + "%s%8.8x", j ? " " : "", 153 + *(ptr4 + j)); 154 + if (ret >= linebuflen - lx) 155 + goto overflow1; 156 + lx += ret; 157 + } 158 + } else if (groupsize == 2) { 145 159 const u16 *ptr2 = buf; 146 - int ngroups = len / groupsize; 147 160 148 - for (j = 0; j < ngroups; j++) 149 - lx += scnprintf(linebuf + lx, linebuflen - lx, 150 - "%s%4.4x", j ? " " : "", *(ptr2 + j)); 151 - ascii_column = 5 * ngroups + 2; 152 - break; 153 - } 154 - 155 - default: 156 - for (j = 0; (j < len) && (lx + 3) <= linebuflen; j++) { 161 + for (j = 0; j < ngroups; j++) { 162 + ret = snprintf(linebuf + lx, linebuflen - lx, 163 + "%s%4.4x", j ? 
" " : "", 164 + *(ptr2 + j)); 165 + if (ret >= linebuflen - lx) 166 + goto overflow1; 167 + lx += ret; 168 + } 169 + } else { 170 + for (j = 0; j < len; j++) { 171 + if (linebuflen < lx + 3) 172 + goto overflow2; 157 173 ch = ptr[j]; 158 174 linebuf[lx++] = hex_asc_hi(ch); 159 175 linebuf[lx++] = hex_asc_lo(ch); ··· 177 161 } 178 162 if (j) 179 163 lx--; 180 - 181 - ascii_column = 3 * rowsize + 2; 182 - break; 183 164 } 184 165 if (!ascii) 185 166 goto nil; 186 167 187 - while (lx < (linebuflen - 1) && lx < (ascii_column - 1)) 168 + while (lx < ascii_column) { 169 + if (linebuflen < lx + 2) 170 + goto overflow2; 188 171 linebuf[lx++] = ' '; 189 - for (j = 0; (j < len) && (lx + 2) < linebuflen; j++) { 172 + } 173 + for (j = 0; j < len; j++) { 174 + if (linebuflen < lx + 2) 175 + goto overflow2; 190 176 ch = ptr[j]; 191 177 linebuf[lx++] = (isascii(ch) && isprint(ch)) ? ch : '.'; 192 178 } 193 179 nil: 180 + linebuf[lx] = '\0'; 181 + return lx; 182 + overflow2: 194 183 linebuf[lx++] = '\0'; 184 + overflow1: 185 + return ascii ? ascii_column + len : (groupsize * 2 + 1) * ngroups - 1; 195 186 } 196 187 EXPORT_SYMBOL(hex_dump_to_buffer); 197 188

-1

lib/idr.c

··· 30 30 #include <linux/idr.h> 31 31 #include <linux/spinlock.h> 32 32 #include <linux/percpu.h> 33 - #include <linux/hardirq.h> 34 33 35 34 #define MAX_IDR_SHIFT (sizeof(int) * 8 - 1) 36 35 #define MAX_IDR_BIT (1U << MAX_IDR_SHIFT)

+2 -2

lib/interval_tree.c

··· 1 - #include <linux/init.h> 2 1 #include <linux/interval_tree.h> 3 2 #include <linux/interval_tree_generic.h> 4 - #include <linux/module.h> 3 + #include <linux/compiler.h> 4 + #include <linux/export.h> 5 5 6 6 #define START(node) ((node)->start) 7 7 #define LAST(node) ((node)->last)

-1

lib/kobject_uevent.c

··· 20 20 #include <linux/export.h> 21 21 #include <linux/kmod.h> 22 22 #include <linux/slab.h> 23 - #include <linux/user_namespace.h> 24 23 #include <linux/socket.h> 25 24 #include <linux/skbuff.h> 26 25 #include <linux/netlink.h>

+1 -1

lib/lcm.c

··· 1 - #include <linux/kernel.h> 1 + #include <linux/compiler.h> 2 2 #include <linux/gcd.h> 3 3 #include <linux/export.h> 4 4 #include <linux/lcm.h>

+5 -2

lib/list_sort.c

··· 2 2 #define pr_fmt(fmt) "list_sort_test: " fmt 3 3 4 4 #include <linux/kernel.h> 5 - #include <linux/module.h> 5 + #include <linux/bug.h> 6 + #include <linux/compiler.h> 7 + #include <linux/export.h> 8 + #include <linux/string.h> 6 9 #include <linux/list_sort.h> 7 - #include <linux/slab.h> 8 10 #include <linux/list.h> 9 11 10 12 #define MAX_LIST_LENGTH_BITS 20 ··· 148 146 149 147 #ifdef CONFIG_TEST_LIST_SORT 150 148 149 + #include <linux/slab.h> 151 150 #include <linux/random.h> 152 151 153 152 /*

-1

lib/llist.c

··· 24 24 */ 25 25 #include <linux/kernel.h> 26 26 #include <linux/export.h> 27 - #include <linux/interrupt.h> 28 27 #include <linux/llist.h> 29 28 30 29

+1 -1

lib/md5.c

··· 1 - #include <linux/kernel.h> 1 + #include <linux/compiler.h> 2 2 #include <linux/export.h> 3 3 #include <linux/cryptohash.h> 4 4

-1

lib/nlattr.c

··· 9 9 #include <linux/kernel.h> 10 10 #include <linux/errno.h> 11 11 #include <linux/jiffies.h> 12 - #include <linux/netdevice.h> 13 12 #include <linux/skbuff.h> 14 13 #include <linux/string.h> 15 14 #include <linux/types.h>

-3

lib/percpu_ida.c

··· 19 19 #include <linux/bug.h> 20 20 #include <linux/err.h> 21 21 #include <linux/export.h> 22 - #include <linux/hardirq.h> 23 - #include <linux/idr.h> 24 22 #include <linux/init.h> 25 23 #include <linux/kernel.h> 26 24 #include <linux/percpu.h> 27 25 #include <linux/sched.h> 28 - #include <linux/slab.h> 29 26 #include <linux/string.h> 30 27 #include <linux/spinlock.h> 31 28 #include <linux/percpu_ida.h>

-1

lib/plist.c

··· 25 25 26 26 #include <linux/bug.h> 27 27 #include <linux/plist.h> 28 - #include <linux/spinlock.h> 29 28 30 29 #ifdef CONFIG_DEBUG_PI_LIST 31 30

+1 -1

lib/radix-tree.c

··· 33 33 #include <linux/string.h> 34 34 #include <linux/bitops.h> 35 35 #include <linux/rcupdate.h> 36 - #include <linux/hardirq.h> /* in_interrupt() */ 36 + #include <linux/preempt_mask.h> /* in_interrupt() */ 37 37 38 38 39 39 /*

-1

lib/show_mem.c

··· 6 6 */ 7 7 8 8 #include <linux/mm.h> 9 - #include <linux/nmi.h> 10 9 #include <linux/quicklist.h> 11 10 #include <linux/cma.h> 12 11

+3 -3

lib/sort.c

··· 4 4 * Jan 23 2005 Matt Mackall <mpm@selenic.com> 5 5 */ 6 6 7 - #include <linux/kernel.h> 8 - #include <linux/module.h> 7 + #include <linux/types.h> 8 + #include <linux/export.h> 9 9 #include <linux/sort.h> 10 - #include <linux/slab.h> 11 10 12 11 static void u32_swap(void *a, void *b, int size) 13 12 { ··· 84 85 EXPORT_SYMBOL(sort); 85 86 86 87 #if 0 88 + #include <linux/slab.h> 87 89 /* a simple boot-time regression test */ 88 90 89 91 int cmpint(const void *a, const void *b)

+2 -1

lib/stmp_device.c

··· 15 15 #include <linux/io.h> 16 16 #include <linux/errno.h> 17 17 #include <linux/delay.h> 18 - #include <linux/module.h> 18 + #include <linux/compiler.h> 19 + #include <linux/export.h> 19 20 #include <linux/stmp_device.h> 20 21 21 22 #define STMP_MODULE_CLKGATE (1 << 30)

-8

lib/string.c

··· 58 58 } 59 59 EXPORT_SYMBOL(strncasecmp); 60 60 #endif 61 - #ifndef __HAVE_ARCH_STRNICMP 62 - #undef strnicmp 63 - int strnicmp(const char *s1, const char *s2, size_t len) 64 - { 65 - return strncasecmp(s1, s2, len); 66 - } 67 - EXPORT_SYMBOL(strnicmp); 68 - #endif 69 61 70 62 #ifndef __HAVE_ARCH_STRCASECMP 71 63 int strcasecmp(const char *s1, const char *s2)

+11 -15

lib/string_helpers.c

··· 20 20 * @len: length of buffer 21 21 * 22 22 * This function returns a string formatted to 3 significant figures 23 - * giving the size in the required units. Returns 0 on success or 24 - * error on failure. @buf is always zero terminated. 23 + * giving the size in the required units. @buf should have room for 24 + * at least 9 bytes and will always be zero terminated. 25 25 * 26 26 */ 27 - int string_get_size(u64 size, const enum string_size_units units, 28 - char *buf, int len) 27 + void string_get_size(u64 size, const enum string_size_units units, 28 + char *buf, int len) 29 29 { 30 30 static const char *const units_10[] = { 31 - "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB", NULL 31 + "B", "kB", "MB", "GB", "TB", "PB", "EB" 32 32 }; 33 33 static const char *const units_2[] = { 34 - "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB", 35 - NULL 34 + "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB" 36 35 }; 37 36 static const char *const *const units_str[] = { 38 37 [STRING_UNITS_10] = units_10, ··· 42 43 [STRING_UNITS_2] = 1024, 43 44 }; 44 45 int i, j; 45 - u64 remainder = 0, sf_cap; 46 + u32 remainder = 0, sf_cap; 46 47 char tmp[8]; 47 48 48 49 tmp[0] = '\0'; 49 50 i = 0; 50 51 if (size >= divisor[units]) { 51 - while (size >= divisor[units] && units_str[units][i]) { 52 + while (size >= divisor[units]) { 52 53 remainder = do_div(size, divisor[units]); 53 54 i++; 54 55 } ··· 59 60 60 61 if (j) { 61 62 remainder *= 1000; 62 - do_div(remainder, divisor[units]); 63 - snprintf(tmp, sizeof(tmp), ".%03lld", 64 - (unsigned long long)remainder); 63 + remainder /= divisor[units]; 64 + snprintf(tmp, sizeof(tmp), ".%03u", remainder); 65 65 tmp[j+1] = '\0'; 66 66 } 67 67 } 68 68 69 - snprintf(buf, len, "%lld%s %s", (unsigned long long)size, 69 + snprintf(buf, len, "%u%s %s", (u32)size, 70 70 tmp, units_str[units][i]); 71 - 72 - return 0; 73 71 } 74 72 EXPORT_SYMBOL(string_get_size); 75 73

+2 -1

lib/strncpy_from_user.c

··· 1 - #include <linux/module.h> 1 + #include <linux/compiler.h> 2 + #include <linux/export.h> 2 3 #include <linux/uaccess.h> 3 4 #include <linux/kernel.h> 4 5 #include <linux/errno.h>

+180

lib/test-hexdump.c

··· 1 + /* 2 + * Test cases for lib/hexdump.c module. 3 + */ 4 + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 5 + 6 + #include <linux/init.h> 7 + #include <linux/kernel.h> 8 + #include <linux/module.h> 9 + #include <linux/random.h> 10 + #include <linux/string.h> 11 + 12 + static const unsigned char data_b[] = { 13 + '\xbe', '\x32', '\xdb', '\x7b', '\x0a', '\x18', '\x93', '\xb2', /* 00 - 07 */ 14 + '\x70', '\xba', '\xc4', '\x24', '\x7d', '\x83', '\x34', '\x9b', /* 08 - 0f */ 15 + '\xa6', '\x9c', '\x31', '\xad', '\x9c', '\x0f', '\xac', '\xe9', /* 10 - 17 */ 16 + '\x4c', '\xd1', '\x19', '\x99', '\x43', '\xb1', '\xaf', '\x0c', /* 18 - 1f */ 17 + }; 18 + 19 + static const unsigned char data_a[] = ".2.{....p..$}.4...1.....L...C..."; 20 + 21 + static const char *test_data_1_le[] __initconst = { 22 + "be", "32", "db", "7b", "0a", "18", "93", "b2", 23 + "70", "ba", "c4", "24", "7d", "83", "34", "9b", 24 + "a6", "9c", "31", "ad", "9c", "0f", "ac", "e9", 25 + "4c", "d1", "19", "99", "43", "b1", "af", "0c", 26 + }; 27 + 28 + static const char *test_data_2_le[] __initconst = { 29 + "32be", "7bdb", "180a", "b293", 30 + "ba70", "24c4", "837d", "9b34", 31 + "9ca6", "ad31", "0f9c", "e9ac", 32 + "d14c", "9919", "b143", "0caf", 33 + }; 34 + 35 + static const char *test_data_4_le[] __initconst = { 36 + "7bdb32be", "b293180a", "24c4ba70", "9b34837d", 37 + "ad319ca6", "e9ac0f9c", "9919d14c", "0cafb143", 38 + }; 39 + 40 + static const char *test_data_8_le[] __initconst = { 41 + "b293180a7bdb32be", "9b34837d24c4ba70", 42 + "e9ac0f9cad319ca6", "0cafb1439919d14c", 43 + }; 44 + 45 + static void __init test_hexdump(size_t len, int rowsize, int groupsize, 46 + bool ascii) 47 + { 48 + char test[32 * 3 + 2 + 32 + 1]; 49 + char real[32 * 3 + 2 + 32 + 1]; 50 + char *p; 51 + const char **result; 52 + size_t l = len; 53 + int gs = groupsize, rs = rowsize; 54 + unsigned int i; 55 + 56 + hex_dump_to_buffer(data_b, l, rs, gs, real, sizeof(real), ascii); 57 + 58 + if (rs != 16 && rs != 32) 59 + rs = 16; 
60 + 61 + if (l > rs) 62 + l = rs; 63 + 64 + if (!is_power_of_2(gs) || gs > 8 || (len % gs != 0)) 65 + gs = 1; 66 + 67 + if (gs == 8) 68 + result = test_data_8_le; 69 + else if (gs == 4) 70 + result = test_data_4_le; 71 + else if (gs == 2) 72 + result = test_data_2_le; 73 + else 74 + result = test_data_1_le; 75 + 76 + memset(test, ' ', sizeof(test)); 77 + 78 + /* hex dump */ 79 + p = test; 80 + for (i = 0; i < l / gs; i++) { 81 + const char *q = *result++; 82 + size_t amount = strlen(q); 83 + 84 + strncpy(p, q, amount); 85 + p += amount + 1; 86 + } 87 + if (i) 88 + p--; 89 + 90 + /* ASCII part */ 91 + if (ascii) { 92 + p = test + rs * 2 + rs / gs + 1; 93 + strncpy(p, data_a, l); 94 + p += l; 95 + } 96 + 97 + *p = '\0'; 98 + 99 + if (strcmp(test, real)) { 100 + pr_err("Len: %zu row: %d group: %d\n", len, rowsize, groupsize); 101 + pr_err("Result: '%s'\n", real); 102 + pr_err("Expect: '%s'\n", test); 103 + } 104 + } 105 + 106 + static void __init test_hexdump_set(int rowsize, bool ascii) 107 + { 108 + size_t d = min_t(size_t, sizeof(data_b), rowsize); 109 + size_t len = get_random_int() % d + 1; 110 + 111 + test_hexdump(len, rowsize, 4, ascii); 112 + test_hexdump(len, rowsize, 2, ascii); 113 + test_hexdump(len, rowsize, 8, ascii); 114 + test_hexdump(len, rowsize, 1, ascii); 115 + } 116 + 117 + static void __init test_hexdump_overflow(bool ascii) 118 + { 119 + char buf[56]; 120 + const char *t = test_data_1_le[0]; 121 + size_t l = get_random_int() % sizeof(buf); 122 + bool a; 123 + int e, r; 124 + 125 + memset(buf, ' ', sizeof(buf)); 126 + 127 + r = hex_dump_to_buffer(data_b, 1, 16, 1, buf, l, ascii); 128 + 129 + if (ascii) 130 + e = 50; 131 + else 132 + e = 2; 133 + buf[e + 2] = '\0'; 134 + 135 + if (!l) { 136 + a = r == e && buf[0] == ' '; 137 + } else if (l < 3) { 138 + a = r == e && buf[0] == '\0'; 139 + } else if (l < 4) { 140 + a = r == e && !strcmp(buf, t); 141 + } else if (ascii) { 142 + if (l < 51) 143 + a = r == e && buf[l - 1] == '\0' && buf[l - 2] == ' '; 
144 + else 145 + a = r == e && buf[50] == '\0' && buf[49] == '.'; 146 + } else { 147 + a = r == e && buf[e] == '\0'; 148 + } 149 + 150 + if (!a) { 151 + pr_err("Len: %zu rc: %u strlen: %zu\n", l, r, strlen(buf)); 152 + pr_err("Result: '%s'\n", buf); 153 + } 154 + } 155 + 156 + static int __init test_hexdump_init(void) 157 + { 158 + unsigned int i; 159 + int rowsize; 160 + 161 + pr_info("Running tests...\n"); 162 + 163 + rowsize = (get_random_int() % 2 + 1) * 16; 164 + for (i = 0; i < 16; i++) 165 + test_hexdump_set(rowsize, false); 166 + 167 + rowsize = (get_random_int() % 2 + 1) * 16; 168 + for (i = 0; i < 16; i++) 169 + test_hexdump_set(rowsize, true); 170 + 171 + for (i = 0; i < 16; i++) 172 + test_hexdump_overflow(false); 173 + 174 + for (i = 0; i < 16; i++) 175 + test_hexdump_overflow(true); 176 + 177 + return -EINVAL; 178 + } 179 + module_init(test_hexdump_init); 180 + MODULE_LICENSE("Dual BSD/GPL");

+6 -6

lib/vsprintf.c

··· 114 114 { 115 115 int i = 0; 116 116 117 - while (isdigit(**s)) 117 + do { 118 118 i = i*10 + *((*s)++) - '0'; 119 + } while (isdigit(**s)); 119 120 120 121 return i; 121 122 } ··· 1605 1604 1606 1605 case 'p': 1607 1606 spec->type = FORMAT_TYPE_PTR; 1608 - return fmt - start; 1609 - /* skip alnum */ 1607 + return ++fmt - start; 1610 1608 1611 1609 case '%': 1612 1610 spec->type = FORMAT_TYPE_PERCENT_CHAR; ··· 1728 1728 1729 1729 /* Reject out-of-range values early. Large positive sizes are 1730 1730 used for unknown buffer sizes. */ 1731 - if (WARN_ON_ONCE((int) size < 0)) 1731 + if (WARN_ON_ONCE(size > INT_MAX)) 1732 1732 return 0; 1733 1733 1734 1734 str = buf; ··· 1794 1794 break; 1795 1795 1796 1796 case FORMAT_TYPE_PTR: 1797 - str = pointer(fmt+1, str, end, va_arg(args, void *), 1797 + str = pointer(fmt, str, end, va_arg(args, void *), 1798 1798 spec); 1799 1799 while (isalnum(*fmt)) 1800 1800 fmt++; ··· 2232 2232 } 2233 2233 2234 2234 case FORMAT_TYPE_PTR: 2235 - str = pointer(fmt+1, str, end, get_arg(void *), spec); 2235 + str = pointer(fmt, str, end, get_arg(void *), spec); 2236 2236 while (isalnum(*fmt)) 2237 2237 fmt++; 2238 2238 break;

+10

mm/Kconfig

··· 602 602 You can check speed with zsmalloc benchmark: 603 603 https://github.com/spartacus06/zsmapbench 604 604 605 + config ZSMALLOC_STAT 606 + bool "Export zsmalloc statistics" 607 + depends on ZSMALLOC 608 + select DEBUG_FS 609 + help 610 + This option enables code in the zsmalloc to collect various 611 + statistics about what's happening in zsmalloc and exports that 612 + information to userspace via debugfs. 613 + If unsure, say N. 614 + 605 615 config GENERIC_EARLY_IOREMAP 606 616 bool 607 617

+14 -9

mm/compaction.c

··· 490 490 491 491 /* If a page was split, advance to the end of it */ 492 492 if (isolated) { 493 + cc->nr_freepages += isolated; 494 + if (!strict && 495 + cc->nr_migratepages <= cc->nr_freepages) { 496 + blockpfn += isolated; 497 + break; 498 + } 499 + 493 500 blockpfn += isolated - 1; 494 501 cursor += isolated - 1; 495 502 continue; ··· 906 899 unsigned long isolate_start_pfn; /* exact pfn we start at */ 907 900 unsigned long block_end_pfn; /* end of current pageblock */ 908 901 unsigned long low_pfn; /* lowest pfn scanner is able to scan */ 909 - int nr_freepages = cc->nr_freepages; 910 902 struct list_head *freelist = &cc->freepages; 911 903 912 904 /* ··· 930 924 * pages on cc->migratepages. We stop searching if the migrate 931 925 * and free page scanners meet or enough free pages are isolated. 932 926 */ 933 - for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; 927 + for (; block_start_pfn >= low_pfn && 928 + cc->nr_migratepages > cc->nr_freepages; 934 929 block_end_pfn = block_start_pfn, 935 930 block_start_pfn -= pageblock_nr_pages, 936 931 isolate_start_pfn = block_start_pfn) { 937 - unsigned long isolated; 938 932 939 933 /* 940 934 * This can iterate a massively long zone without finding any ··· 959 953 continue; 960 954 961 955 /* Found a block suitable for isolating free pages from. 
*/ 962 - isolated = isolate_freepages_block(cc, &isolate_start_pfn, 956 + isolate_freepages_block(cc, &isolate_start_pfn, 963 957 block_end_pfn, freelist, false); 964 - nr_freepages += isolated; 965 958 966 959 /* 967 960 * Remember where the free scanner should restart next time, ··· 992 987 */ 993 988 if (block_start_pfn < low_pfn) 994 989 cc->free_pfn = cc->migrate_pfn; 995 - 996 - cc->nr_freepages = nr_freepages; 997 990 } 998 991 999 992 /* ··· 1103 1100 low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, 1104 1101 isolate_mode); 1105 1102 1106 - if (!low_pfn || cc->contended) 1103 + if (!low_pfn || cc->contended) { 1104 + acct_isolated(zone, cc); 1107 1105 return ISOLATE_ABORT; 1106 + } 1108 1107 1109 1108 /* 1110 1109 * Either we isolated something and proceed with migration. Or ··· 1178 1173 return COMPACT_PARTIAL; 1179 1174 1180 1175 /* Job done if allocation would set block type */ 1181 - if (cc->order >= pageblock_order && area->nr_free) 1176 + if (order >= pageblock_order && area->nr_free) 1182 1177 return COMPACT_PARTIAL; 1183 1178 } 1184 1179

+5 -5

mm/gup.c

··· 64 64 migration_entry_wait(mm, pmd, address); 65 65 goto retry; 66 66 } 67 - if ((flags & FOLL_NUMA) && pte_numa(pte)) 67 + if ((flags & FOLL_NUMA) && pte_protnone(pte)) 68 68 goto no_page; 69 69 if ((flags & FOLL_WRITE) && !pte_write(pte)) { 70 70 pte_unmap_unlock(ptep, ptl); ··· 184 184 return page; 185 185 return no_page_table(vma, flags); 186 186 } 187 - if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) 187 + if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) 188 188 return no_page_table(vma, flags); 189 189 if (pmd_trans_huge(*pmd)) { 190 190 if (flags & FOLL_SPLIT) { ··· 906 906 907 907 /* 908 908 * Similar to the PMD case below, NUMA hinting must take slow 909 - * path 909 + * path using the pte_protnone check. 910 910 */ 911 911 if (!pte_present(pte) || pte_special(pte) || 912 - pte_numa(pte) || (write && !pte_write(pte))) 912 + pte_protnone(pte) || (write && !pte_write(pte))) 913 913 goto pte_unmap; 914 914 915 915 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); ··· 1104 1104 * slowpath for accounting purposes and so that they 1105 1105 * can be serialised against THP migration. 1106 1106 */ 1107 - if (pmd_numa(pmd)) 1107 + if (pmd_protnone(pmd)) 1108 1108 return 0; 1109 1109 1110 1110 if (!gup_huge_pmd(pmd, pmdp, addr, next, write,

+24 -26

mm/huge_memory.c

··· 1211 1211 return ERR_PTR(-EFAULT); 1212 1212 1213 1213 /* Full NUMA hinting faults to serialise migration in fault paths */ 1214 - if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) 1214 + if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) 1215 1215 goto out; 1216 1216 1217 1217 page = pmd_page(*pmd); ··· 1262 1262 bool migrated = false; 1263 1263 int flags = 0; 1264 1264 1265 + /* A PROT_NONE fault should not end up here */ 1266 + BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); 1267 + 1265 1268 ptl = pmd_lock(mm, pmdp); 1266 1269 if (unlikely(!pmd_same(pmd, *pmdp))) 1267 1270 goto out_unlock; ··· 1275 1272 * check_same as the page may no longer be mapped. 1276 1273 */ 1277 1274 if (unlikely(pmd_trans_migrating(*pmdp))) { 1275 + page = pmd_page(*pmdp); 1278 1276 spin_unlock(ptl); 1279 - wait_migrate_huge_page(vma->anon_vma, pmdp); 1277 + wait_on_page_locked(page); 1280 1278 goto out; 1281 1279 } 1282 1280 ··· 1345 1341 1346 1342 /* 1347 1343 * Migrate the THP to the requested node, returns with page unlocked 1348 - * and pmd_numa cleared. 1344 + * and access rights restored. 1349 1345 */ 1350 1346 spin_unlock(ptl); 1351 1347 migrated = migrate_misplaced_transhuge_page(mm, vma, ··· 1358 1354 goto out; 1359 1355 clear_pmdnuma: 1360 1356 BUG_ON(!PageLocked(page)); 1361 - pmd = pmd_mknonnuma(pmd); 1357 + pmd = pmd_modify(pmd, vma->vm_page_prot); 1362 1358 set_pmd_at(mm, haddr, pmdp, pmd); 1363 - VM_BUG_ON(pmd_numa(*pmdp)); 1364 1359 update_mmu_cache_pmd(vma, addr, pmdp); 1365 1360 unlock_page(page); 1366 1361 out_unlock: ··· 1482 1479 1483 1480 if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 1484 1481 pmd_t entry; 1485 - ret = 1; 1486 - if (!prot_numa) { 1482 + 1483 + /* 1484 + * Avoid trapping faults against the zero page. The read-only 1485 + * data is likely to be read-cached on the local CPU and 1486 + * local/remote hits to the zero page are not interesting. 
1487 + */ 1488 + if (prot_numa && is_huge_zero_pmd(*pmd)) { 1489 + spin_unlock(ptl); 1490 + return 0; 1491 + } 1492 + 1493 + if (!prot_numa || !pmd_protnone(*pmd)) { 1494 + ret = 1; 1487 1495 entry = pmdp_get_and_clear_notify(mm, addr, pmd); 1488 - if (pmd_numa(entry)) 1489 - entry = pmd_mknonnuma(entry); 1490 1496 entry = pmd_modify(entry, newprot); 1491 1497 ret = HPAGE_PMD_NR; 1492 1498 set_pmd_at(mm, addr, pmd, entry); 1493 1499 BUG_ON(pmd_write(entry)); 1494 - } else { 1495 - struct page *page = pmd_page(*pmd); 1496 - 1497 - /* 1498 - * Do not trap faults against the zero page. The 1499 - * read-only data is likely to be read-cached on the 1500 - * local CPU cache and it is less useful to know about 1501 - * local vs remote hits on the zero page. 1502 - */ 1503 - if (!is_huge_zero_page(page) && 1504 - !pmd_numa(*pmd)) { 1505 - pmdp_set_numa(mm, addr, pmd); 1506 - ret = HPAGE_PMD_NR; 1507 - } 1508 1500 } 1509 1501 spin_unlock(ptl); 1510 1502 } ··· 1764 1766 pte_t *pte, entry; 1765 1767 BUG_ON(PageCompound(page+i)); 1766 1768 /* 1767 - * Note that pmd_numa is not transferred deliberately 1768 - * to avoid any possibility that pte_numa leaks to 1769 - * a PROT_NONE VMA by accident. 1769 + * Note that NUMA hinting access restrictions are not 1770 + * transferred to avoid any possibility of altering 1771 + * permissions across VMAs. 1770 1772 */ 1771 1773 entry = mk_pte(page + i, vma->vm_page_prot); 1772 1774 entry = maybe_mkwrite(pte_mkdirty(entry), vma);

+4 -2

mm/internal.h

··· 351 351 #define mminit_dprintk(level, prefix, fmt, arg...) \ 352 352 do { \ 353 353 if (level < mminit_loglevel) { \ 354 - printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \ 355 - printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \ 354 + if (level <= MMINIT_WARNING) \ 355 + printk(KERN_WARNING "mminit::" prefix " " fmt, ##arg); \ 356 + else \ 357 + printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \ 356 358 } \ 357 359 } while (0) 358 360

+438 -29

mm/list_lru.c

··· 9 9 #include <linux/mm.h> 10 10 #include <linux/list_lru.h> 11 11 #include <linux/slab.h> 12 + #include <linux/mutex.h> 13 + #include <linux/memcontrol.h> 14 + 15 + #ifdef CONFIG_MEMCG_KMEM 16 + static LIST_HEAD(list_lrus); 17 + static DEFINE_MUTEX(list_lrus_mutex); 18 + 19 + static void list_lru_register(struct list_lru *lru) 20 + { 21 + mutex_lock(&list_lrus_mutex); 22 + list_add(&lru->list, &list_lrus); 23 + mutex_unlock(&list_lrus_mutex); 24 + } 25 + 26 + static void list_lru_unregister(struct list_lru *lru) 27 + { 28 + mutex_lock(&list_lrus_mutex); 29 + list_del(&lru->list); 30 + mutex_unlock(&list_lrus_mutex); 31 + } 32 + #else 33 + static void list_lru_register(struct list_lru *lru) 34 + { 35 + } 36 + 37 + static void list_lru_unregister(struct list_lru *lru) 38 + { 39 + } 40 + #endif /* CONFIG_MEMCG_KMEM */ 41 + 42 + #ifdef CONFIG_MEMCG_KMEM 43 + static inline bool list_lru_memcg_aware(struct list_lru *lru) 44 + { 45 + return !!lru->node[0].memcg_lrus; 46 + } 47 + 48 + static inline struct list_lru_one * 49 + list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) 50 + { 51 + /* 52 + * The lock protects the array of per cgroup lists from relocation 53 + * (see memcg_update_list_lru_node). 
54 + */ 55 + lockdep_assert_held(&nlru->lock); 56 + if (nlru->memcg_lrus && idx >= 0) 57 + return nlru->memcg_lrus->lru[idx]; 58 + 59 + return &nlru->lru; 60 + } 61 + 62 + static inline struct list_lru_one * 63 + list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) 64 + { 65 + struct mem_cgroup *memcg; 66 + 67 + if (!nlru->memcg_lrus) 68 + return &nlru->lru; 69 + 70 + memcg = mem_cgroup_from_kmem(ptr); 71 + if (!memcg) 72 + return &nlru->lru; 73 + 74 + return list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg)); 75 + } 76 + #else 77 + static inline bool list_lru_memcg_aware(struct list_lru *lru) 78 + { 79 + return false; 80 + } 81 + 82 + static inline struct list_lru_one * 83 + list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx) 84 + { 85 + return &nlru->lru; 86 + } 87 + 88 + static inline struct list_lru_one * 89 + list_lru_from_kmem(struct list_lru_node *nlru, void *ptr) 90 + { 91 + return &nlru->lru; 92 + } 93 + #endif /* CONFIG_MEMCG_KMEM */ 12 94 13 95 bool list_lru_add(struct list_lru *lru, struct list_head *item) 14 96 { 15 97 int nid = page_to_nid(virt_to_page(item)); 16 98 struct list_lru_node *nlru = &lru->node[nid]; 99 + struct list_lru_one *l; 17 100 18 101 spin_lock(&nlru->lock); 19 - WARN_ON_ONCE(nlru->nr_items < 0); 102 + l = list_lru_from_kmem(nlru, item); 20 103 if (list_empty(item)) { 21 - list_add_tail(item, &nlru->list); 22 - if (nlru->nr_items++ == 0) 23 - node_set(nid, lru->active_nodes); 104 + list_add_tail(item, &l->list); 105 + l->nr_items++; 24 106 spin_unlock(&nlru->lock); 25 107 return true; 26 108 } ··· 115 33 { 116 34 int nid = page_to_nid(virt_to_page(item)); 117 35 struct list_lru_node *nlru = &lru->node[nid]; 36 + struct list_lru_one *l; 118 37 119 38 spin_lock(&nlru->lock); 39 + l = list_lru_from_kmem(nlru, item); 120 40 if (!list_empty(item)) { 121 41 list_del_init(item); 122 - if (--nlru->nr_items == 0) 123 - node_clear(nid, lru->active_nodes); 124 - WARN_ON_ONCE(nlru->nr_items < 0); 42 + l->nr_items--; 125 43 
spin_unlock(&nlru->lock); 126 44 return true; 127 45 } ··· 130 48 } 131 49 EXPORT_SYMBOL_GPL(list_lru_del); 132 50 133 - unsigned long 134 - list_lru_count_node(struct list_lru *lru, int nid) 51 + void list_lru_isolate(struct list_lru_one *list, struct list_head *item) 135 52 { 136 - unsigned long count = 0; 53 + list_del_init(item); 54 + list->nr_items--; 55 + } 56 + EXPORT_SYMBOL_GPL(list_lru_isolate); 57 + 58 + void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, 59 + struct list_head *head) 60 + { 61 + list_move(item, head); 62 + list->nr_items--; 63 + } 64 + EXPORT_SYMBOL_GPL(list_lru_isolate_move); 65 + 66 + static unsigned long __list_lru_count_one(struct list_lru *lru, 67 + int nid, int memcg_idx) 68 + { 137 69 struct list_lru_node *nlru = &lru->node[nid]; 70 + struct list_lru_one *l; 71 + unsigned long count; 138 72 139 73 spin_lock(&nlru->lock); 140 - WARN_ON_ONCE(nlru->nr_items < 0); 141 - count += nlru->nr_items; 74 + l = list_lru_from_memcg_idx(nlru, memcg_idx); 75 + count = l->nr_items; 142 76 spin_unlock(&nlru->lock); 143 77 144 78 return count; 145 79 } 80 + 81 + unsigned long list_lru_count_one(struct list_lru *lru, 82 + int nid, struct mem_cgroup *memcg) 83 + { 84 + return __list_lru_count_one(lru, nid, memcg_cache_id(memcg)); 85 + } 86 + EXPORT_SYMBOL_GPL(list_lru_count_one); 87 + 88 + unsigned long list_lru_count_node(struct list_lru *lru, int nid) 89 + { 90 + long count = 0; 91 + int memcg_idx; 92 + 93 + count += __list_lru_count_one(lru, nid, -1); 94 + if (list_lru_memcg_aware(lru)) { 95 + for_each_memcg_cache_index(memcg_idx) 96 + count += __list_lru_count_one(lru, nid, memcg_idx); 97 + } 98 + return count; 99 + } 146 100 EXPORT_SYMBOL_GPL(list_lru_count_node); 147 101 148 - unsigned long 149 - list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, 150 - void *cb_arg, unsigned long *nr_to_walk) 102 + static unsigned long 103 + __list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, 104 + 
list_lru_walk_cb isolate, void *cb_arg, 105 + unsigned long *nr_to_walk) 151 106 { 152 107 153 - struct list_lru_node *nlru = &lru->node[nid]; 108 + struct list_lru_node *nlru = &lru->node[nid]; 109 + struct list_lru_one *l; 154 110 struct list_head *item, *n; 155 111 unsigned long isolated = 0; 156 112 157 113 spin_lock(&nlru->lock); 114 + l = list_lru_from_memcg_idx(nlru, memcg_idx); 158 115 restart: 159 - list_for_each_safe(item, n, &nlru->list) { 116 + list_for_each_safe(item, n, &l->list) { 160 117 enum lru_status ret; 161 118 162 119 /* ··· 206 85 break; 207 86 --*nr_to_walk; 208 87 209 - ret = isolate(item, &nlru->lock, cb_arg); 88 + ret = isolate(item, l, &nlru->lock, cb_arg); 210 89 switch (ret) { 211 90 case LRU_REMOVED_RETRY: 212 91 assert_spin_locked(&nlru->lock); 213 92 case LRU_REMOVED: 214 - if (--nlru->nr_items == 0) 215 - node_clear(nid, lru->active_nodes); 216 - WARN_ON_ONCE(nlru->nr_items < 0); 217 93 isolated++; 218 94 /* 219 95 * If the lru lock has been dropped, our list ··· 221 103 goto restart; 222 104 break; 223 105 case LRU_ROTATE: 224 - list_move_tail(item, &nlru->list); 106 + list_move_tail(item, &l->list); 225 107 break; 226 108 case LRU_SKIP: 227 109 break; ··· 240 122 spin_unlock(&nlru->lock); 241 123 return isolated; 242 124 } 125 + 126 + unsigned long 127 + list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, 128 + list_lru_walk_cb isolate, void *cb_arg, 129 + unsigned long *nr_to_walk) 130 + { 131 + return __list_lru_walk_one(lru, nid, memcg_cache_id(memcg), 132 + isolate, cb_arg, nr_to_walk); 133 + } 134 + EXPORT_SYMBOL_GPL(list_lru_walk_one); 135 + 136 + unsigned long list_lru_walk_node(struct list_lru *lru, int nid, 137 + list_lru_walk_cb isolate, void *cb_arg, 138 + unsigned long *nr_to_walk) 139 + { 140 + long isolated = 0; 141 + int memcg_idx; 142 + 143 + isolated += __list_lru_walk_one(lru, nid, -1, isolate, cb_arg, 144 + nr_to_walk); 145 + if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { 146 + 
for_each_memcg_cache_index(memcg_idx) { 147 + isolated += __list_lru_walk_one(lru, nid, memcg_idx, 148 + isolate, cb_arg, nr_to_walk); 149 + if (*nr_to_walk <= 0) 150 + break; 151 + } 152 + } 153 + return isolated; 154 + } 243 155 EXPORT_SYMBOL_GPL(list_lru_walk_node); 244 156 245 - int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key) 157 + static void init_one_lru(struct list_lru_one *l) 158 + { 159 + INIT_LIST_HEAD(&l->list); 160 + l->nr_items = 0; 161 + } 162 + 163 + #ifdef CONFIG_MEMCG_KMEM 164 + static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus, 165 + int begin, int end) 166 + { 167 + int i; 168 + 169 + for (i = begin; i < end; i++) 170 + kfree(memcg_lrus->lru[i]); 171 + } 172 + 173 + static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus, 174 + int begin, int end) 175 + { 176 + int i; 177 + 178 + for (i = begin; i < end; i++) { 179 + struct list_lru_one *l; 180 + 181 + l = kmalloc(sizeof(struct list_lru_one), GFP_KERNEL); 182 + if (!l) 183 + goto fail; 184 + 185 + init_one_lru(l); 186 + memcg_lrus->lru[i] = l; 187 + } 188 + return 0; 189 + fail: 190 + __memcg_destroy_list_lru_node(memcg_lrus, begin, i - 1); 191 + return -ENOMEM; 192 + } 193 + 194 + static int memcg_init_list_lru_node(struct list_lru_node *nlru) 195 + { 196 + int size = memcg_nr_cache_ids; 197 + 198 + nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL); 199 + if (!nlru->memcg_lrus) 200 + return -ENOMEM; 201 + 202 + if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) { 203 + kfree(nlru->memcg_lrus); 204 + return -ENOMEM; 205 + } 206 + 207 + return 0; 208 + } 209 + 210 + static void memcg_destroy_list_lru_node(struct list_lru_node *nlru) 211 + { 212 + __memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids); 213 + kfree(nlru->memcg_lrus); 214 + } 215 + 216 + static int memcg_update_list_lru_node(struct list_lru_node *nlru, 217 + int old_size, int new_size) 218 + { 219 + struct list_lru_memcg *old, *new; 
220 + 221 + BUG_ON(old_size > new_size); 222 + 223 + old = nlru->memcg_lrus; 224 + new = kmalloc(new_size * sizeof(void *), GFP_KERNEL); 225 + if (!new) 226 + return -ENOMEM; 227 + 228 + if (__memcg_init_list_lru_node(new, old_size, new_size)) { 229 + kfree(new); 230 + return -ENOMEM; 231 + } 232 + 233 + memcpy(new, old, old_size * sizeof(void *)); 234 + 235 + /* 236 + * The lock guarantees that we won't race with a reader 237 + * (see list_lru_from_memcg_idx). 238 + * 239 + * Since list_lru_{add,del} may be called under an IRQ-safe lock, 240 + * we have to use IRQ-safe primitives here to avoid deadlock. 241 + */ 242 + spin_lock_irq(&nlru->lock); 243 + nlru->memcg_lrus = new; 244 + spin_unlock_irq(&nlru->lock); 245 + 246 + kfree(old); 247 + return 0; 248 + } 249 + 250 + static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru, 251 + int old_size, int new_size) 252 + { 253 + /* do not bother shrinking the array back to the old size, because we 254 + * cannot handle allocation failures here */ 255 + __memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size); 256 + } 257 + 258 + static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) 259 + { 260 + int i; 261 + 262 + for (i = 0; i < nr_node_ids; i++) { 263 + if (!memcg_aware) 264 + lru->node[i].memcg_lrus = NULL; 265 + else if (memcg_init_list_lru_node(&lru->node[i])) 266 + goto fail; 267 + } 268 + return 0; 269 + fail: 270 + for (i = i - 1; i >= 0; i--) 271 + memcg_destroy_list_lru_node(&lru->node[i]); 272 + return -ENOMEM; 273 + } 274 + 275 + static void memcg_destroy_list_lru(struct list_lru *lru) 276 + { 277 + int i; 278 + 279 + if (!list_lru_memcg_aware(lru)) 280 + return; 281 + 282 + for (i = 0; i < nr_node_ids; i++) 283 + memcg_destroy_list_lru_node(&lru->node[i]); 284 + } 285 + 286 + static int memcg_update_list_lru(struct list_lru *lru, 287 + int old_size, int new_size) 288 + { 289 + int i; 290 + 291 + if (!list_lru_memcg_aware(lru)) 292 + return 0; 293 + 294 + for (i = 
0; i < nr_node_ids; i++) { 295 + if (memcg_update_list_lru_node(&lru->node[i], 296 + old_size, new_size)) 297 + goto fail; 298 + } 299 + return 0; 300 + fail: 301 + for (i = i - 1; i >= 0; i--) 302 + memcg_cancel_update_list_lru_node(&lru->node[i], 303 + old_size, new_size); 304 + return -ENOMEM; 305 + } 306 + 307 + static void memcg_cancel_update_list_lru(struct list_lru *lru, 308 + int old_size, int new_size) 309 + { 310 + int i; 311 + 312 + if (!list_lru_memcg_aware(lru)) 313 + return; 314 + 315 + for (i = 0; i < nr_node_ids; i++) 316 + memcg_cancel_update_list_lru_node(&lru->node[i], 317 + old_size, new_size); 318 + } 319 + 320 + int memcg_update_all_list_lrus(int new_size) 321 + { 322 + int ret = 0; 323 + struct list_lru *lru; 324 + int old_size = memcg_nr_cache_ids; 325 + 326 + mutex_lock(&list_lrus_mutex); 327 + list_for_each_entry(lru, &list_lrus, list) { 328 + ret = memcg_update_list_lru(lru, old_size, new_size); 329 + if (ret) 330 + goto fail; 331 + } 332 + out: 333 + mutex_unlock(&list_lrus_mutex); 334 + return ret; 335 + fail: 336 + list_for_each_entry_continue_reverse(lru, &list_lrus, list) 337 + memcg_cancel_update_list_lru(lru, old_size, new_size); 338 + goto out; 339 + } 340 + 341 + static void memcg_drain_list_lru_node(struct list_lru_node *nlru, 342 + int src_idx, int dst_idx) 343 + { 344 + struct list_lru_one *src, *dst; 345 + 346 + /* 347 + * Since list_lru_{add,del} may be called under an IRQ-safe lock, 348 + * we have to use IRQ-safe primitives here to avoid deadlock. 
349 + */ 350 + spin_lock_irq(&nlru->lock); 351 + 352 + src = list_lru_from_memcg_idx(nlru, src_idx); 353 + dst = list_lru_from_memcg_idx(nlru, dst_idx); 354 + 355 + list_splice_init(&src->list, &dst->list); 356 + dst->nr_items += src->nr_items; 357 + src->nr_items = 0; 358 + 359 + spin_unlock_irq(&nlru->lock); 360 + } 361 + 362 + static void memcg_drain_list_lru(struct list_lru *lru, 363 + int src_idx, int dst_idx) 364 + { 365 + int i; 366 + 367 + if (!list_lru_memcg_aware(lru)) 368 + return; 369 + 370 + for (i = 0; i < nr_node_ids; i++) 371 + memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx); 372 + } 373 + 374 + void memcg_drain_all_list_lrus(int src_idx, int dst_idx) 375 + { 376 + struct list_lru *lru; 377 + 378 + mutex_lock(&list_lrus_mutex); 379 + list_for_each_entry(lru, &list_lrus, list) 380 + memcg_drain_list_lru(lru, src_idx, dst_idx); 381 + mutex_unlock(&list_lrus_mutex); 382 + } 383 + #else 384 + static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) 385 + { 386 + return 0; 387 + } 388 + 389 + static void memcg_destroy_list_lru(struct list_lru *lru) 390 + { 391 + } 392 + #endif /* CONFIG_MEMCG_KMEM */ 393 + 394 + int __list_lru_init(struct list_lru *lru, bool memcg_aware, 395 + struct lock_class_key *key) 246 396 { 247 397 int i; 248 398 size_t size = sizeof(*lru->node) * nr_node_ids; 399 + int err = -ENOMEM; 400 + 401 + memcg_get_cache_ids(); 249 402 250 403 lru->node = kzalloc(size, GFP_KERNEL); 251 404 if (!lru->node) 252 - return -ENOMEM; 405 + goto out; 253 406 254 - nodes_clear(lru->active_nodes); 255 407 for (i = 0; i < nr_node_ids; i++) { 256 408 spin_lock_init(&lru->node[i].lock); 257 409 if (key) 258 410 lockdep_set_class(&lru->node[i].lock, key); 259 - INIT_LIST_HEAD(&lru->node[i].list); 260 - lru->node[i].nr_items = 0; 411 + init_one_lru(&lru->node[i].lru); 261 412 } 262 - return 0; 413 + 414 + err = memcg_init_list_lru(lru, memcg_aware); 415 + if (err) { 416 + kfree(lru->node); 417 + goto out; 418 + } 419 + 420 + 
list_lru_register(lru); 421 + out: 422 + memcg_put_cache_ids(); 423 + return err; 263 424 } 264 - EXPORT_SYMBOL_GPL(list_lru_init_key); 425 + EXPORT_SYMBOL_GPL(__list_lru_init); 265 426 266 427 void list_lru_destroy(struct list_lru *lru) 267 428 { 429 + /* Already destroyed or not yet initialized? */ 430 + if (!lru->node) 431 + return; 432 + 433 + memcg_get_cache_ids(); 434 + 435 + list_lru_unregister(lru); 436 + 437 + memcg_destroy_list_lru(lru); 268 438 kfree(lru->node); 439 + lru->node = NULL; 440 + 441 + memcg_put_cache_ids(); 269 442 } 270 443 EXPORT_SYMBOL_GPL(list_lru_destroy);

+121 -67

mm/memcontrol.c

··· 332 332 struct cg_proto tcp_mem; 333 333 #endif 334 334 #if defined(CONFIG_MEMCG_KMEM) 335 - /* Index in the kmem_cache->memcg_params->memcg_caches array */ 335 + /* Index in the kmem_cache->memcg_params.memcg_caches array */ 336 336 int kmemcg_id; 337 + bool kmem_acct_activated; 338 + bool kmem_acct_active; 337 339 #endif 338 340 339 341 int last_scanned_node; ··· 354 352 }; 355 353 356 354 #ifdef CONFIG_MEMCG_KMEM 357 - static bool memcg_kmem_is_active(struct mem_cgroup *memcg) 355 + bool memcg_kmem_is_active(struct mem_cgroup *memcg) 358 356 { 359 - return memcg->kmemcg_id >= 0; 357 + return memcg->kmem_acct_active; 360 358 } 361 359 #endif 362 360 ··· 519 517 } 520 518 EXPORT_SYMBOL(tcp_proto_cgroup); 521 519 522 - static void disarm_sock_keys(struct mem_cgroup *memcg) 523 - { 524 - if (!memcg_proto_activated(&memcg->tcp_mem)) 525 - return; 526 - static_key_slow_dec(&memcg_socket_limit_enabled); 527 - } 528 - #else 529 - static void disarm_sock_keys(struct mem_cgroup *memcg) 530 - { 531 - } 532 520 #endif 533 521 534 522 #ifdef CONFIG_MEMCG_KMEM 535 523 /* 536 - * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. 524 + * This will be the memcg's index in each cache's ->memcg_params.memcg_caches. 537 525 * The main reason for not using cgroup id for this: 538 526 * this works better in sparse environments, where we have a lot of memcgs, 539 527 * but only a few kmem-limited. Or also, if we have, for instance, 200 540 528 * memcgs, and none but the 200th is kmem-limited, we'd have to have a 541 529 * 200 entry array for that. 542 530 * 543 - * The current size of the caches array is stored in 544 - * memcg_limited_groups_array_size. It will double each time we have to 545 - * increase it. 531 + * The current size of the caches array is stored in memcg_nr_cache_ids. It 532 + * will double each time we have to increase it. 
546 533 */ 547 - static DEFINE_IDA(kmem_limited_groups); 548 - int memcg_limited_groups_array_size; 534 + static DEFINE_IDA(memcg_cache_ida); 535 + int memcg_nr_cache_ids; 536 + 537 + /* Protects memcg_nr_cache_ids */ 538 + static DECLARE_RWSEM(memcg_cache_ids_sem); 539 + 540 + void memcg_get_cache_ids(void) 541 + { 542 + down_read(&memcg_cache_ids_sem); 543 + } 544 + 545 + void memcg_put_cache_ids(void) 546 + { 547 + up_read(&memcg_cache_ids_sem); 548 + } 549 549 550 550 /* 551 551 * MIN_SIZE is different than 1, because we would like to avoid going through ··· 573 569 struct static_key memcg_kmem_enabled_key; 574 570 EXPORT_SYMBOL(memcg_kmem_enabled_key); 575 571 576 - static void memcg_free_cache_id(int id); 577 - 578 - static void disarm_kmem_keys(struct mem_cgroup *memcg) 579 - { 580 - if (memcg_kmem_is_active(memcg)) { 581 - static_key_slow_dec(&memcg_kmem_enabled_key); 582 - memcg_free_cache_id(memcg->kmemcg_id); 583 - } 584 - /* 585 - * This check can't live in kmem destruction function, 586 - * since the charges will outlive the cgroup 587 - */ 588 - WARN_ON(page_counter_read(&memcg->kmem)); 589 - } 590 - #else 591 - static void disarm_kmem_keys(struct mem_cgroup *memcg) 592 - { 593 - } 594 572 #endif /* CONFIG_MEMCG_KMEM */ 595 - 596 - static void disarm_static_keys(struct mem_cgroup *memcg) 597 - { 598 - disarm_sock_keys(memcg); 599 - disarm_kmem_keys(memcg); 600 - } 601 573 602 574 static struct mem_cgroup_per_zone * 603 575 mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) ··· 2518 2538 int id, size; 2519 2539 int err; 2520 2540 2521 - id = ida_simple_get(&kmem_limited_groups, 2541 + id = ida_simple_get(&memcg_cache_ida, 2522 2542 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 2523 2543 if (id < 0) 2524 2544 return id; 2525 2545 2526 - if (id < memcg_limited_groups_array_size) 2546 + if (id < memcg_nr_cache_ids) 2527 2547 return id; 2528 2548 2529 2549 /* 2530 2550 * There's no space for the new id in memcg_caches arrays, 2531 2551 * so we 
have to grow them. 2532 2552 */ 2553 + down_write(&memcg_cache_ids_sem); 2533 2554 2534 2555 size = 2 * (id + 1); 2535 2556 if (size < MEMCG_CACHES_MIN_SIZE) ··· 2539 2558 size = MEMCG_CACHES_MAX_SIZE; 2540 2559 2541 2560 err = memcg_update_all_caches(size); 2561 + if (!err) 2562 + err = memcg_update_all_list_lrus(size); 2563 + if (!err) 2564 + memcg_nr_cache_ids = size; 2565 + 2566 + up_write(&memcg_cache_ids_sem); 2567 + 2542 2568 if (err) { 2543 - ida_simple_remove(&kmem_limited_groups, id); 2569 + ida_simple_remove(&memcg_cache_ida, id); 2544 2570 return err; 2545 2571 } 2546 2572 return id; ··· 2555 2567 2556 2568 static void memcg_free_cache_id(int id) 2557 2569 { 2558 - ida_simple_remove(&kmem_limited_groups, id); 2559 - } 2560 - 2561 - /* 2562 - * We should update the current array size iff all caches updates succeed. This 2563 - * can only be done from the slab side. The slab mutex needs to be held when 2564 - * calling this. 2565 - */ 2566 - void memcg_update_array_size(int num) 2567 - { 2568 - memcg_limited_groups_array_size = num; 2570 + ida_simple_remove(&memcg_cache_ida, id); 2569 2571 } 2570 2572 2571 2573 struct memcg_kmem_cache_create_work { ··· 2634 2656 { 2635 2657 struct mem_cgroup *memcg; 2636 2658 struct kmem_cache *memcg_cachep; 2659 + int kmemcg_id; 2637 2660 2638 - VM_BUG_ON(!cachep->memcg_params); 2639 - VM_BUG_ON(!cachep->memcg_params->is_root_cache); 2661 + VM_BUG_ON(!is_root_cache(cachep)); 2640 2662 2641 2663 if (current->memcg_kmem_skip_account) 2642 2664 return cachep; 2643 2665 2644 2666 memcg = get_mem_cgroup_from_mm(current->mm); 2645 - if (!memcg_kmem_is_active(memcg)) 2667 + kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id); 2668 + if (kmemcg_id < 0) 2646 2669 goto out; 2647 2670 2648 - memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); 2671 + memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id); 2649 2672 if (likely(memcg_cachep)) 2650 2673 return memcg_cachep; 2651 2674 ··· 2671 2692 void 
__memcg_kmem_put_cache(struct kmem_cache *cachep) 2672 2693 { 2673 2694 if (!is_root_cache(cachep)) 2674 - css_put(&cachep->memcg_params->memcg->css); 2695 + css_put(&cachep->memcg_params.memcg->css); 2675 2696 } 2676 2697 2677 2698 /* ··· 2735 2756 2736 2757 memcg_uncharge_kmem(memcg, 1 << order); 2737 2758 page->mem_cgroup = NULL; 2759 + } 2760 + 2761 + struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr) 2762 + { 2763 + struct mem_cgroup *memcg = NULL; 2764 + struct kmem_cache *cachep; 2765 + struct page *page; 2766 + 2767 + page = virt_to_head_page(ptr); 2768 + if (PageSlab(page)) { 2769 + cachep = page->slab_cache; 2770 + if (!is_root_cache(cachep)) 2771 + memcg = cachep->memcg_params.memcg; 2772 + } else 2773 + /* page allocated by alloc_kmem_pages */ 2774 + memcg = page->mem_cgroup; 2775 + 2776 + return memcg; 2738 2777 } 2739 2778 #endif /* CONFIG_MEMCG_KMEM */ 2740 2779 ··· 3288 3291 int err = 0; 3289 3292 int memcg_id; 3290 3293 3291 - if (memcg_kmem_is_active(memcg)) 3292 - return 0; 3294 + BUG_ON(memcg->kmemcg_id >= 0); 3295 + BUG_ON(memcg->kmem_acct_activated); 3296 + BUG_ON(memcg->kmem_acct_active); 3293 3297 3294 3298 /* 3295 3299 * For simplicity, we won't allow this to be disabled. It also can't ··· 3333 3335 * patched. 3334 3336 */ 3335 3337 memcg->kmemcg_id = memcg_id; 3338 + memcg->kmem_acct_activated = true; 3339 + memcg->kmem_acct_active = true; 3336 3340 out: 3337 3341 return err; 3338 3342 } ··· 4014 4014 return mem_cgroup_sockets_init(memcg, ss); 4015 4015 } 4016 4016 4017 + static void memcg_deactivate_kmem(struct mem_cgroup *memcg) 4018 + { 4019 + struct cgroup_subsys_state *css; 4020 + struct mem_cgroup *parent, *child; 4021 + int kmemcg_id; 4022 + 4023 + if (!memcg->kmem_acct_active) 4024 + return; 4025 + 4026 + /* 4027 + * Clear the 'active' flag before clearing memcg_caches arrays entries. 
4028 + * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it 4029 + * guarantees no cache will be created for this cgroup after we are 4030 + * done (see memcg_create_kmem_cache()). 4031 + */ 4032 + memcg->kmem_acct_active = false; 4033 + 4034 + memcg_deactivate_kmem_caches(memcg); 4035 + 4036 + kmemcg_id = memcg->kmemcg_id; 4037 + BUG_ON(kmemcg_id < 0); 4038 + 4039 + parent = parent_mem_cgroup(memcg); 4040 + if (!parent) 4041 + parent = root_mem_cgroup; 4042 + 4043 + /* 4044 + * Change kmemcg_id of this cgroup and all its descendants to the 4045 + * parent's id, and then move all entries from this cgroup's list_lrus 4046 + * to ones of the parent. After we have finished, all list_lrus 4047 + * corresponding to this cgroup are guaranteed to remain empty. The 4048 + * ordering is imposed by list_lru_node->lock taken by 4049 + * memcg_drain_all_list_lrus(). 4050 + */ 4051 + css_for_each_descendant_pre(css, &memcg->css) { 4052 + child = mem_cgroup_from_css(css); 4053 + BUG_ON(child->kmemcg_id != kmemcg_id); 4054 + child->kmemcg_id = parent->kmemcg_id; 4055 + if (!memcg->use_hierarchy) 4056 + break; 4057 + } 4058 + memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); 4059 + 4060 + memcg_free_cache_id(kmemcg_id); 4061 + } 4062 + 4017 4063 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4018 4064 { 4019 - memcg_destroy_kmem_caches(memcg); 4065 + if (memcg->kmem_acct_activated) { 4066 + memcg_destroy_kmem_caches(memcg); 4067 + static_key_slow_dec(&memcg_kmem_enabled_key); 4068 + WARN_ON(page_counter_read(&memcg->kmem)); 4069 + } 4020 4070 mem_cgroup_sockets_destroy(memcg); 4021 4071 } 4022 4072 #else 4023 4073 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 4024 4074 { 4025 4075 return 0; 4076 + } 4077 + 4078 + static void memcg_deactivate_kmem(struct mem_cgroup *memcg) 4079 + { 4026 4080 } 4027 4081 4028 4082 static void memcg_destroy_kmem(struct mem_cgroup *memcg) ··· 4497 4443 
free_mem_cgroup_per_zone_info(memcg, node); 4498 4444 4499 4445 free_percpu(memcg->stat); 4500 - 4501 - disarm_static_keys(memcg); 4502 4446 kfree(memcg); 4503 4447 } 4504 4448 ··· 4633 4581 spin_unlock(&memcg->event_list_lock); 4634 4582 4635 4583 vmpressure_cleanup(&memcg->vmpressure); 4584 + 4585 + memcg_deactivate_kmem(memcg); 4636 4586 } 4637 4587 4638 4588 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)

+2 -11

mm/memory-failure.c

··· 242 242 * Only call shrink_node_slabs here (which would also shrink 243 243 * other caches) if access is not potentially fatal. 244 244 */ 245 - if (access) { 246 - int nr; 247 - int nid = page_to_nid(p); 248 - do { 249 - nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000); 250 - if (page_count(p) == 1) 251 - break; 252 - } while (nr > 10); 253 - } 245 + if (access) 246 + drop_slab_node(page_to_nid(p)); 254 247 } 255 248 EXPORT_SYMBOL_GPL(shake_page); 256 249 ··· 1646 1653 * source page should be freed back to buddy before 1647 1654 * setting PG_hwpoison. 1648 1655 */ 1649 - if (!is_free_buddy_page(page)) 1650 - lru_add_drain_all(); 1651 1656 if (!is_free_buddy_page(page)) 1652 1657 drain_all_pages(page_zone(page)); 1653 1658 SetPageHWPoison(page);

+12 -8

mm/memory.c

··· 3013 3013 bool migrated = false; 3014 3014 int flags = 0; 3015 3015 3016 + /* A PROT_NONE fault should not end up here */ 3017 + BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))); 3018 + 3016 3019 /* 3017 3020 * The "pte" at this point cannot be used safely without 3018 3021 * validation through pte_unmap_same(). It's of NUMA type but 3019 3022 * the pfn may be screwed if the read is non atomic. 3020 3023 * 3021 - * ptep_modify_prot_start is not called as this is clearing 3022 - * the _PAGE_NUMA bit and it is not really expected that there 3023 - * would be concurrent hardware modifications to the PTE. 3024 + * We can safely just do a "set_pte_at()", because the old 3025 + * page table entry is not accessible, so there would be no 3026 + * concurrent hardware modifications to the PTE. 3024 3027 */ 3025 3028 ptl = pte_lockptr(mm, pmd); 3026 3029 spin_lock(ptl); ··· 3032 3029 goto out; 3033 3030 } 3034 3031 3035 - pte = pte_mknonnuma(pte); 3032 + /* Make it present again */ 3033 + pte = pte_modify(pte, vma->vm_page_prot); 3034 + pte = pte_mkyoung(pte); 3036 3035 set_pte_at(mm, addr, ptep, pte); 3037 3036 update_mmu_cache(vma, addr, ptep); 3038 3037 ··· 3043 3038 pte_unmap_unlock(ptep, ptl); 3044 3039 return 0; 3045 3040 } 3046 - BUG_ON(is_zero_pfn(page_to_pfn(page))); 3047 3041 3048 3042 /* 3049 3043 * Avoid grouping on DSO/COW pages in specific and RO pages ··· 3128 3124 pte, pmd, flags, entry); 3129 3125 } 3130 3126 3131 - if (pte_numa(entry)) 3127 + if (pte_protnone(entry)) 3132 3128 return do_numa_page(mm, vma, address, entry, pte, pmd); 3133 3129 3134 3130 ptl = pte_lockptr(mm, pmd); ··· 3206 3202 if (pmd_trans_splitting(orig_pmd)) 3207 3203 return 0; 3208 3204 3209 - if (pmd_numa(orig_pmd)) 3205 + if (pmd_protnone(orig_pmd)) 3210 3206 return do_huge_pmd_numa_page(mm, vma, address, 3211 3207 orig_pmd, pmd); 3212 3208 ··· 3462 3458 if (follow_phys(vma, addr, write, &prot, &phys_addr)) 3463 3459 return -EINVAL; 3464 3460 3465 - maddr = 
ioremap_prot(phys_addr, PAGE_SIZE, prot); 3461 + maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot); 3466 3462 if (write) 3467 3463 memcpy_toio(maddr + offset, buf, len); 3468 3464 else

+1 -1

mm/mempolicy.c

··· 569 569 { 570 570 int nr_updated; 571 571 572 - nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); 572 + nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1); 573 573 if (nr_updated) 574 574 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); 575 575

+1 -7

mm/migrate.c

··· 1654 1654 return PageLocked(page); 1655 1655 } 1656 1656 1657 - void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd) 1658 - { 1659 - struct page *page = pmd_page(*pmd); 1660 - wait_on_page_locked(page); 1661 - } 1662 - 1663 1657 /* 1664 1658 * Attempt to migrate a misplaced page to the specified destination 1665 1659 * node. Caller is expected to have an elevated reference count on ··· 1847 1853 out_dropref: 1848 1854 ptl = pmd_lock(mm, pmd); 1849 1855 if (pmd_same(*pmd, entry)) { 1850 - entry = pmd_mknonnuma(entry); 1856 + entry = pmd_modify(entry, vma->vm_page_prot); 1851 1857 set_pmd_at(mm, mmun_start, pmd, entry); 1852 1858 update_mmu_cache_pmd(vma, address, &entry); 1853 1859 }

+2 -2

mm/mm_init.c

··· 14 14 #include "internal.h" 15 15 16 16 #ifdef CONFIG_DEBUG_MEMORY_INIT 17 - int mminit_loglevel; 17 + int __meminitdata mminit_loglevel; 18 18 19 19 #ifndef SECTIONS_SHIFT 20 20 #define SECTIONS_SHIFT 0 21 21 #endif 22 22 23 23 /* The zonelists are simply reported, validation is manual. */ 24 - void mminit_verify_zonelist(void) 24 + void __init mminit_verify_zonelist(void) 25 25 { 26 26 int nid; 27 27

+23 -25

mm/mprotect.c

··· 75 75 oldpte = *pte; 76 76 if (pte_present(oldpte)) { 77 77 pte_t ptent; 78 - bool updated = false; 79 78 80 - if (!prot_numa) { 81 - ptent = ptep_modify_prot_start(mm, addr, pte); 82 - if (pte_numa(ptent)) 83 - ptent = pte_mknonnuma(ptent); 84 - ptent = pte_modify(ptent, newprot); 85 - /* 86 - * Avoid taking write faults for pages we 87 - * know to be dirty. 88 - */ 89 - if (dirty_accountable && pte_dirty(ptent) && 90 - (pte_soft_dirty(ptent) || 91 - !(vma->vm_flags & VM_SOFTDIRTY))) 92 - ptent = pte_mkwrite(ptent); 93 - ptep_modify_prot_commit(mm, addr, pte, ptent); 94 - updated = true; 95 - } else { 79 + /* 80 + * Avoid trapping faults against the zero or KSM 81 + * pages. See similar comment in change_huge_pmd. 82 + */ 83 + if (prot_numa) { 96 84 struct page *page; 97 85 98 86 page = vm_normal_page(vma, addr, oldpte); 99 - if (page && !PageKsm(page)) { 100 - if (!pte_numa(oldpte)) { 101 - ptep_set_numa(mm, addr, pte); 102 - updated = true; 103 - } 104 - } 87 + if (!page || PageKsm(page)) 88 + continue; 89 + 90 + /* Avoid TLB flush if possible */ 91 + if (pte_protnone(oldpte)) 92 + continue; 105 93 } 106 - if (updated) 107 - pages++; 94 + 95 + ptent = ptep_modify_prot_start(mm, addr, pte); 96 + ptent = pte_modify(ptent, newprot); 97 + 98 + /* Avoid taking write faults for known dirty pages */ 99 + if (dirty_accountable && pte_dirty(ptent) && 100 + (pte_soft_dirty(ptent) || 101 + !(vma->vm_flags & VM_SOFTDIRTY))) { 102 + ptent = pte_mkwrite(ptent); 103 + } 104 + ptep_modify_prot_commit(mm, addr, pte, ptent); 105 + pages++; 108 106 } else if (IS_ENABLED(CONFIG_MIGRATION)) { 109 107 swp_entry_t entry = pte_to_swp_entry(oldpte); 110 108

+15 -4

mm/page_alloc.c

··· 172 172 * 1G machine -> (16M dma, 784M normal, 224M high) 173 173 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 174 174 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 175 - * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 175 + * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA 176 176 * 177 177 * TBD: should special case ZONE_DMA32 machines here - in those we normally 178 178 * don't need any ZONE_NORMAL reservation ··· 3871 3871 return 0; 3872 3872 } 3873 3873 3874 + static noinline void __init 3875 + build_all_zonelists_init(void) 3876 + { 3877 + __build_all_zonelists(NULL); 3878 + mminit_verify_zonelist(); 3879 + cpuset_init_current_mems_allowed(); 3880 + } 3881 + 3874 3882 /* 3875 3883 * Called with zonelists_mutex held always 3876 3884 * unless system_state == SYSTEM_BOOTING. 3885 + * 3886 + * __ref due to (1) call of __meminit annotated setup_zone_pageset 3887 + * [we're only called with non-NULL zone through __meminit paths] and 3888 + * (2) call of __init annotated helper build_all_zonelists_init 3889 + * [protected by SYSTEM_BOOTING]. 3877 3890 */ 3878 3891 void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 3879 3892 { 3880 3893 set_zonelist_order(); 3881 3894 3882 3895 if (system_state == SYSTEM_BOOTING) { 3883 - __build_all_zonelists(NULL); 3884 - mminit_verify_zonelist(); 3885 - cpuset_init_current_mems_allowed(); 3896 + build_all_zonelists_init(); 3886 3897 } else { 3887 3898 #ifdef CONFIG_MEMORY_HOTPLUG 3888 3899 if (zone)

-2

mm/pgtable-generic.c

··· 193 193 pmd_t *pmdp) 194 194 { 195 195 pmd_t entry = *pmdp; 196 - if (pmd_numa(entry)) 197 - entry = pmd_mknonnuma(entry); 198 196 set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry)); 199 197 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); 200 198 }

+7 -10

mm/slab.c

··· 2382 2382 return nr_freed; 2383 2383 } 2384 2384 2385 - int __kmem_cache_shrink(struct kmem_cache *cachep) 2385 + int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) 2386 2386 { 2387 2387 int ret = 0; 2388 2388 int node; ··· 2404 2404 { 2405 2405 int i; 2406 2406 struct kmem_cache_node *n; 2407 - int rc = __kmem_cache_shrink(cachep); 2407 + int rc = __kmem_cache_shrink(cachep, false); 2408 2408 2409 2409 if (rc) 2410 2410 return rc; ··· 3708 3708 int batchcount, int shared, gfp_t gfp) 3709 3709 { 3710 3710 int ret; 3711 - struct kmem_cache *c = NULL; 3712 - int i = 0; 3711 + struct kmem_cache *c; 3713 3712 3714 3713 ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); 3715 3714 ··· 3718 3719 if ((ret < 0) || !is_root_cache(cachep)) 3719 3720 return ret; 3720 3721 3721 - VM_BUG_ON(!mutex_is_locked(&slab_mutex)); 3722 - for_each_memcg_cache_index(i) { 3723 - c = cache_from_memcg_idx(cachep, i); 3724 - if (c) 3725 - /* return value determined by the parent cache only */ 3726 - __do_tune_cpucache(c, limit, batchcount, shared, gfp); 3722 + lockdep_assert_held(&slab_mutex); 3723 + for_each_memcg_cache(c, cachep) { 3724 + /* return value determined by the root cache only */ 3725 + __do_tune_cpucache(c, limit, batchcount, shared, gfp); 3727 3726 } 3728 3727 3729 3728 return ret;

+41 -26

mm/slab.h

··· 86 86 extern void create_boot_cache(struct kmem_cache *, const char *name, 87 87 size_t size, unsigned long flags); 88 88 89 - struct mem_cgroup; 90 - 91 89 int slab_unmergeable(struct kmem_cache *s); 92 90 struct kmem_cache *find_mergeable(size_t size, size_t align, 93 91 unsigned long flags, const char *name, void (*ctor)(void *)); ··· 138 140 #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) 139 141 140 142 int __kmem_cache_shutdown(struct kmem_cache *); 141 - int __kmem_cache_shrink(struct kmem_cache *); 143 + int __kmem_cache_shrink(struct kmem_cache *, bool); 142 144 void slab_kmem_cache_release(struct kmem_cache *); 143 145 144 146 struct seq_file; ··· 163 165 size_t count, loff_t *ppos); 164 166 165 167 #ifdef CONFIG_MEMCG_KMEM 168 + /* 169 + * Iterate over all memcg caches of the given root cache. The caller must hold 170 + * slab_mutex. 171 + */ 172 + #define for_each_memcg_cache(iter, root) \ 173 + list_for_each_entry(iter, &(root)->memcg_params.list, \ 174 + memcg_params.list) 175 + 176 + #define for_each_memcg_cache_safe(iter, tmp, root) \ 177 + list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \ 178 + memcg_params.list) 179 + 166 180 static inline bool is_root_cache(struct kmem_cache *s) 167 181 { 168 - return !s->memcg_params || s->memcg_params->is_root_cache; 182 + return s->memcg_params.is_root_cache; 169 183 } 170 184 171 185 static inline bool slab_equal_or_root(struct kmem_cache *s, 172 - struct kmem_cache *p) 186 + struct kmem_cache *p) 173 187 { 174 - return (p == s) || 175 - (s->memcg_params && (p == s->memcg_params->root_cache)); 188 + return p == s || p == s->memcg_params.root_cache; 176 189 } 177 190 178 191 /* ··· 194 185 static inline const char *cache_name(struct kmem_cache *s) 195 186 { 196 187 if (!is_root_cache(s)) 197 - return s->memcg_params->root_cache->name; 188 + s = s->memcg_params.root_cache; 198 189 return s->name; 199 190 } 200 191 201 192 /* 202 193 * Note, we protect with 
RCU only the memcg_caches array, not per-memcg caches. 203 - * That said the caller must assure the memcg's cache won't go away. Since once 204 - * created a memcg's cache is destroyed only along with the root cache, it is 205 - * true if we are going to allocate from the cache or hold a reference to the 206 - * root cache by other means. Otherwise, we should hold either the slab_mutex 207 - * or the memcg's slab_caches_mutex while calling this function and accessing 208 - * the returned value. 194 + * That said the caller must assure the memcg's cache won't go away by either 195 + * taking a css reference to the owner cgroup, or holding the slab_mutex. 209 196 */ 210 197 static inline struct kmem_cache * 211 198 cache_from_memcg_idx(struct kmem_cache *s, int idx) 212 199 { 213 200 struct kmem_cache *cachep; 214 - struct memcg_cache_params *params; 215 - 216 - if (!s->memcg_params) 217 - return NULL; 201 + struct memcg_cache_array *arr; 218 202 219 203 rcu_read_lock(); 220 - params = rcu_dereference(s->memcg_params); 204 + arr = rcu_dereference(s->memcg_params.memcg_caches); 221 205 222 206 /* 223 207 * Make sure we will access the up-to-date value. The code updating 224 208 * memcg_caches issues a write barrier to match this (see 225 - * memcg_register_cache()). 209 + * memcg_create_kmem_cache()). 
226 210 */ 227 - cachep = lockless_dereference(params->memcg_caches[idx]); 211 + cachep = lockless_dereference(arr->entries[idx]); 228 212 rcu_read_unlock(); 229 213 230 214 return cachep; ··· 227 225 { 228 226 if (is_root_cache(s)) 229 227 return s; 230 - return s->memcg_params->root_cache; 228 + return s->memcg_params.root_cache; 231 229 } 232 230 233 231 static __always_inline int memcg_charge_slab(struct kmem_cache *s, ··· 237 235 return 0; 238 236 if (is_root_cache(s)) 239 237 return 0; 240 - return memcg_charge_kmem(s->memcg_params->memcg, gfp, 1 << order); 238 + return memcg_charge_kmem(s->memcg_params.memcg, gfp, 1 << order); 241 239 } 242 240 243 241 static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) ··· 246 244 return; 247 245 if (is_root_cache(s)) 248 246 return; 249 - memcg_uncharge_kmem(s->memcg_params->memcg, 1 << order); 247 + memcg_uncharge_kmem(s->memcg_params.memcg, 1 << order); 250 248 } 251 - #else 249 + 250 + extern void slab_init_memcg_params(struct kmem_cache *); 251 + 252 + #else /* !CONFIG_MEMCG_KMEM */ 253 + 254 + #define for_each_memcg_cache(iter, root) \ 255 + for ((void)(iter), (void)(root); 0; ) 256 + #define for_each_memcg_cache_safe(iter, tmp, root) \ 257 + for ((void)(iter), (void)(tmp), (void)(root); 0; ) 258 + 252 259 static inline bool is_root_cache(struct kmem_cache *s) 253 260 { 254 261 return true; ··· 293 282 static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) 294 283 { 295 284 } 296 - #endif 285 + 286 + static inline void slab_init_memcg_params(struct kmem_cache *s) 287 + { 288 + } 289 + #endif /* CONFIG_MEMCG_KMEM */ 297 290 298 291 static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) 299 292 {

+120 -81

mm/slab_common.c

··· 106 106 #endif 107 107 108 108 #ifdef CONFIG_MEMCG_KMEM 109 - static int memcg_alloc_cache_params(struct mem_cgroup *memcg, 110 - struct kmem_cache *s, struct kmem_cache *root_cache) 109 + void slab_init_memcg_params(struct kmem_cache *s) 111 110 { 112 - size_t size; 111 + s->memcg_params.is_root_cache = true; 112 + INIT_LIST_HEAD(&s->memcg_params.list); 113 + RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL); 114 + } 113 115 114 - if (!memcg_kmem_enabled()) 115 - return 0; 116 - 117 - if (!memcg) { 118 - size = offsetof(struct memcg_cache_params, memcg_caches); 119 - size += memcg_limited_groups_array_size * sizeof(void *); 120 - } else 121 - size = sizeof(struct memcg_cache_params); 122 - 123 - s->memcg_params = kzalloc(size, GFP_KERNEL); 124 - if (!s->memcg_params) 125 - return -ENOMEM; 116 + static int init_memcg_params(struct kmem_cache *s, 117 + struct mem_cgroup *memcg, struct kmem_cache *root_cache) 118 + { 119 + struct memcg_cache_array *arr; 126 120 127 121 if (memcg) { 128 - s->memcg_params->memcg = memcg; 129 - s->memcg_params->root_cache = root_cache; 130 - } else 131 - s->memcg_params->is_root_cache = true; 122 + s->memcg_params.is_root_cache = false; 123 + s->memcg_params.memcg = memcg; 124 + s->memcg_params.root_cache = root_cache; 125 + return 0; 126 + } 132 127 128 + slab_init_memcg_params(s); 129 + 130 + if (!memcg_nr_cache_ids) 131 + return 0; 132 + 133 + arr = kzalloc(sizeof(struct memcg_cache_array) + 134 + memcg_nr_cache_ids * sizeof(void *), 135 + GFP_KERNEL); 136 + if (!arr) 137 + return -ENOMEM; 138 + 139 + RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr); 133 140 return 0; 134 141 } 135 142 136 - static void memcg_free_cache_params(struct kmem_cache *s) 143 + static void destroy_memcg_params(struct kmem_cache *s) 137 144 { 138 - kfree(s->memcg_params); 145 + if (is_root_cache(s)) 146 + kfree(rcu_access_pointer(s->memcg_params.memcg_caches)); 139 147 } 140 148 141 - static int memcg_update_cache_params(struct kmem_cache *s, int 
num_memcgs) 149 + static int update_memcg_params(struct kmem_cache *s, int new_array_size) 142 150 { 143 - int size; 144 - struct memcg_cache_params *new_params, *cur_params; 151 + struct memcg_cache_array *old, *new; 145 152 146 - BUG_ON(!is_root_cache(s)); 153 + if (!is_root_cache(s)) 154 + return 0; 147 155 148 - size = offsetof(struct memcg_cache_params, memcg_caches); 149 - size += num_memcgs * sizeof(void *); 150 - 151 - new_params = kzalloc(size, GFP_KERNEL); 152 - if (!new_params) 156 + new = kzalloc(sizeof(struct memcg_cache_array) + 157 + new_array_size * sizeof(void *), GFP_KERNEL); 158 + if (!new) 153 159 return -ENOMEM; 154 160 155 - cur_params = s->memcg_params; 156 - memcpy(new_params->memcg_caches, cur_params->memcg_caches, 157 - memcg_limited_groups_array_size * sizeof(void *)); 161 + old = rcu_dereference_protected(s->memcg_params.memcg_caches, 162 + lockdep_is_held(&slab_mutex)); 163 + if (old) 164 + memcpy(new->entries, old->entries, 165 + memcg_nr_cache_ids * sizeof(void *)); 158 166 159 - new_params->is_root_cache = true; 160 - 161 - rcu_assign_pointer(s->memcg_params, new_params); 162 - if (cur_params) 163 - kfree_rcu(cur_params, rcu_head); 164 - 167 + rcu_assign_pointer(s->memcg_params.memcg_caches, new); 168 + if (old) 169 + kfree_rcu(old, rcu); 165 170 return 0; 166 171 } 167 172 ··· 174 169 { 175 170 struct kmem_cache *s; 176 171 int ret = 0; 172 + 177 173 mutex_lock(&slab_mutex); 178 - 179 174 list_for_each_entry(s, &slab_caches, list) { 180 - if (!is_root_cache(s)) 181 - continue; 182 - 183 - ret = memcg_update_cache_params(s, num_memcgs); 175 + ret = update_memcg_params(s, num_memcgs); 184 176 /* 185 177 * Instead of freeing the memory, we'll just leave the caches 186 178 * up to this point in an updated state. 
187 179 */ 188 180 if (ret) 189 - goto out; 181 + break; 190 182 } 191 - 192 - memcg_update_array_size(num_memcgs); 193 - out: 194 183 mutex_unlock(&slab_mutex); 195 184 return ret; 196 185 } 197 186 #else 198 - static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg, 199 - struct kmem_cache *s, struct kmem_cache *root_cache) 187 + static inline int init_memcg_params(struct kmem_cache *s, 188 + struct mem_cgroup *memcg, struct kmem_cache *root_cache) 200 189 { 201 190 return 0; 202 191 } 203 192 204 - static inline void memcg_free_cache_params(struct kmem_cache *s) 193 + static inline void destroy_memcg_params(struct kmem_cache *s) 205 194 { 206 195 } 207 196 #endif /* CONFIG_MEMCG_KMEM */ ··· 313 314 s->align = align; 314 315 s->ctor = ctor; 315 316 316 - err = memcg_alloc_cache_params(memcg, s, root_cache); 317 + err = init_memcg_params(s, memcg, root_cache); 317 318 if (err) 318 319 goto out_free_cache; 319 320 ··· 329 330 return s; 330 331 331 332 out_free_cache: 332 - memcg_free_cache_params(s); 333 + destroy_memcg_params(s); 333 334 kmem_cache_free(kmem_cache, s); 334 335 goto out; 335 336 } ··· 368 369 369 370 get_online_cpus(); 370 371 get_online_mems(); 372 + memcg_get_cache_ids(); 371 373 372 374 mutex_lock(&slab_mutex); 373 375 ··· 407 407 out_unlock: 408 408 mutex_unlock(&slab_mutex); 409 409 410 + memcg_put_cache_ids(); 410 411 put_online_mems(); 411 412 put_online_cpus(); 412 413 ··· 440 439 *need_rcu_barrier = true; 441 440 442 441 #ifdef CONFIG_MEMCG_KMEM 443 - if (!is_root_cache(s)) { 444 - struct kmem_cache *root_cache = s->memcg_params->root_cache; 445 - int memcg_id = memcg_cache_id(s->memcg_params->memcg); 446 - 447 - BUG_ON(root_cache->memcg_params->memcg_caches[memcg_id] != s); 448 - root_cache->memcg_params->memcg_caches[memcg_id] = NULL; 449 - } 442 + if (!is_root_cache(s)) 443 + list_del(&s->memcg_params.list); 450 444 #endif 451 445 list_move(&s->list, release); 452 446 return 0; ··· 478 482 struct kmem_cache *root_cache) 479 
483 { 480 484 static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ 481 - int memcg_id = memcg_cache_id(memcg); 485 + struct cgroup_subsys_state *css = mem_cgroup_css(memcg); 486 + struct memcg_cache_array *arr; 482 487 struct kmem_cache *s = NULL; 483 488 char *cache_name; 489 + int idx; 484 490 485 491 get_online_cpus(); 486 492 get_online_mems(); ··· 490 492 mutex_lock(&slab_mutex); 491 493 492 494 /* 495 + * The memory cgroup could have been deactivated while the cache 496 + * creation work was pending. 497 + */ 498 + if (!memcg_kmem_is_active(memcg)) 499 + goto out_unlock; 500 + 501 + idx = memcg_cache_id(memcg); 502 + arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches, 503 + lockdep_is_held(&slab_mutex)); 504 + 505 + /* 493 506 * Since per-memcg caches are created asynchronously on first 494 507 * allocation (see memcg_kmem_get_cache()), several threads can try to 495 508 * create the same cache, but only one of them may succeed. 496 509 */ 497 - if (cache_from_memcg_idx(root_cache, memcg_id)) 510 + if (arr->entries[idx]) 498 511 goto out_unlock; 499 512 500 - cgroup_name(mem_cgroup_css(memcg)->cgroup, 501 - memcg_name_buf, sizeof(memcg_name_buf)); 513 + cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf)); 502 514 cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, 503 - memcg_cache_id(memcg), memcg_name_buf); 515 + css->id, memcg_name_buf); 504 516 if (!cache_name) 505 517 goto out_unlock; 506 518 ··· 528 520 goto out_unlock; 529 521 } 530 522 523 + list_add(&s->memcg_params.list, &root_cache->memcg_params.list); 524 + 531 525 /* 532 526 * Since readers won't lock (see cache_from_memcg_idx()), we need a 533 527 * barrier here to ensure nobody will see the kmem_cache partially 534 528 * initialized. 
535 529 */ 536 530 smp_wmb(); 537 - root_cache->memcg_params->memcg_caches[memcg_id] = s; 531 + arr->entries[idx] = s; 538 532 539 533 out_unlock: 534 + mutex_unlock(&slab_mutex); 535 + 536 + put_online_mems(); 537 + put_online_cpus(); 538 + } 539 + 540 + void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) 541 + { 542 + int idx; 543 + struct memcg_cache_array *arr; 544 + struct kmem_cache *s, *c; 545 + 546 + idx = memcg_cache_id(memcg); 547 + 548 + get_online_cpus(); 549 + get_online_mems(); 550 + 551 + mutex_lock(&slab_mutex); 552 + list_for_each_entry(s, &slab_caches, list) { 553 + if (!is_root_cache(s)) 554 + continue; 555 + 556 + arr = rcu_dereference_protected(s->memcg_params.memcg_caches, 557 + lockdep_is_held(&slab_mutex)); 558 + c = arr->entries[idx]; 559 + if (!c) 560 + continue; 561 + 562 + __kmem_cache_shrink(c, true); 563 + arr->entries[idx] = NULL; 564 + } 540 565 mutex_unlock(&slab_mutex); 541 566 542 567 put_online_mems(); ··· 587 546 588 547 mutex_lock(&slab_mutex); 589 548 list_for_each_entry_safe(s, s2, &slab_caches, list) { 590 - if (is_root_cache(s) || s->memcg_params->memcg != memcg) 549 + if (is_root_cache(s) || s->memcg_params.memcg != memcg) 591 550 continue; 592 551 /* 593 552 * The cgroup is about to be freed and therefore has no charges ··· 606 565 607 566 void slab_kmem_cache_release(struct kmem_cache *s) 608 567 { 609 - memcg_free_cache_params(s); 568 + destroy_memcg_params(s); 610 569 kfree(s->name); 611 570 kmem_cache_free(kmem_cache, s); 612 571 } 613 572 614 573 void kmem_cache_destroy(struct kmem_cache *s) 615 574 { 616 - int i; 575 + struct kmem_cache *c, *c2; 617 576 LIST_HEAD(release); 618 577 bool need_rcu_barrier = false; 619 578 bool busy = false; 579 + 580 + BUG_ON(!is_root_cache(s)); 620 581 621 582 get_online_cpus(); 622 583 get_online_mems(); ··· 629 586 if (s->refcount) 630 587 goto out_unlock; 631 588 632 - for_each_memcg_cache_index(i) { 633 - struct kmem_cache *c = cache_from_memcg_idx(s, i); 634 - 635 - if (c 
&& do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) 589 + for_each_memcg_cache_safe(c, c2, s) { 590 + if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier)) 636 591 busy = true; 637 592 } 638 593 ··· 660 619 661 620 get_online_cpus(); 662 621 get_online_mems(); 663 - ret = __kmem_cache_shrink(cachep); 622 + ret = __kmem_cache_shrink(cachep, false); 664 623 put_online_mems(); 665 624 put_online_cpus(); 666 625 return ret; ··· 682 641 s->name = name; 683 642 s->size = s->object_size = size; 684 643 s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); 644 + 645 + slab_init_memcg_params(s); 646 + 685 647 err = __kmem_cache_create(s, flags); 686 648 687 649 if (err) ··· 964 920 { 965 921 struct kmem_cache *c; 966 922 struct slabinfo sinfo; 967 - int i; 968 923 969 924 if (!is_root_cache(s)) 970 925 return; 971 926 972 - for_each_memcg_cache_index(i) { 973 - c = cache_from_memcg_idx(s, i); 974 - if (!c) 975 - continue; 976 - 927 + for_each_memcg_cache(c, s) { 977 928 memset(&sinfo, 0, sizeof(sinfo)); 978 929 get_slabinfo(c, &sinfo); 979 930 ··· 1020 981 1021 982 if (p == slab_caches.next) 1022 983 print_slabinfo_header(m); 1023 - if (!is_root_cache(s) && s->memcg_params->memcg == memcg) 984 + if (!is_root_cache(s) && s->memcg_params.memcg == memcg) 1024 985 cache_show(s, m); 1025 986 return 0; 1026 987 }

+1 -1

mm/slob.c

··· 618 618 return 0; 619 619 } 620 620 621 - int __kmem_cache_shrink(struct kmem_cache *d) 621 + int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate) 622 622 { 623 623 return 0; 624 624 }

+69 -48

mm/slub.c

··· 2007 2007 int pages; 2008 2008 int pobjects; 2009 2009 2010 + preempt_disable(); 2010 2011 do { 2011 2012 pages = 0; 2012 2013 pobjects = 0; ··· 2041 2040 2042 2041 } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) 2043 2042 != oldpage); 2043 + if (unlikely(!s->cpu_partial)) { 2044 + unsigned long flags; 2045 + 2046 + local_irq_save(flags); 2047 + unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); 2048 + local_irq_restore(flags); 2049 + } 2050 + preempt_enable(); 2044 2051 #endif 2045 2052 } 2046 2053 ··· 3367 3358 } 3368 3359 EXPORT_SYMBOL(kfree); 3369 3360 3361 + #define SHRINK_PROMOTE_MAX 32 3362 + 3370 3363 /* 3371 - * kmem_cache_shrink removes empty slabs from the partial lists and sorts 3372 - * the remaining slabs by the number of items in use. The slabs with the 3373 - * most items in use come first. New allocations will then fill those up 3374 - * and thus they can be removed from the partial lists. 3364 + * kmem_cache_shrink discards empty slabs and promotes the slabs filled 3365 + * up most to the head of the partial lists. New allocations will then 3366 + * fill those up and thus they can be removed from the partial lists. 3375 3367 * 3376 3368 * The slabs with the least items are placed last. This results in them 3377 3369 * being allocated from last increasing the chance that the last objects 3378 3370 * are freed in them. 
3379 3371 */ 3380 - int __kmem_cache_shrink(struct kmem_cache *s) 3372 + int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) 3381 3373 { 3382 3374 int node; 3383 3375 int i; 3384 3376 struct kmem_cache_node *n; 3385 3377 struct page *page; 3386 3378 struct page *t; 3387 - int objects = oo_objects(s->max); 3388 - struct list_head *slabs_by_inuse = 3389 - kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); 3379 + struct list_head discard; 3380 + struct list_head promote[SHRINK_PROMOTE_MAX]; 3390 3381 unsigned long flags; 3382 + int ret = 0; 3391 3383 3392 - if (!slabs_by_inuse) 3393 - return -ENOMEM; 3384 + if (deactivate) { 3385 + /* 3386 + * Disable empty slabs caching. Used to avoid pinning offline 3387 + * memory cgroups by kmem pages that can be freed. 3388 + */ 3389 + s->cpu_partial = 0; 3390 + s->min_partial = 0; 3391 + 3392 + /* 3393 + * s->cpu_partial is checked locklessly (see put_cpu_partial), 3394 + * so we have to make sure the change is visible. 3395 + */ 3396 + kick_all_cpus_sync(); 3397 + } 3394 3398 3395 3399 flush_all(s); 3396 3400 for_each_kmem_cache_node(s, node, n) { 3397 - if (!n->nr_partial) 3398 - continue; 3399 - 3400 - for (i = 0; i < objects; i++) 3401 - INIT_LIST_HEAD(slabs_by_inuse + i); 3401 + INIT_LIST_HEAD(&discard); 3402 + for (i = 0; i < SHRINK_PROMOTE_MAX; i++) 3403 + INIT_LIST_HEAD(promote + i); 3402 3404 3403 3405 spin_lock_irqsave(&n->list_lock, flags); 3404 3406 3405 3407 /* 3406 - * Build lists indexed by the items in use in each slab. 3408 + * Build lists of slabs to discard or promote. 3407 3409 * 3408 3410 * Note that concurrent frees may occur while we hold the 3409 3411 * list_lock. page->inuse here is the upper limit. 
3410 3412 */ 3411 3413 list_for_each_entry_safe(page, t, &n->partial, lru) { 3412 - list_move(&page->lru, slabs_by_inuse + page->inuse); 3413 - if (!page->inuse) 3414 + int free = page->objects - page->inuse; 3415 + 3416 + /* Do not reread page->inuse */ 3417 + barrier(); 3418 + 3419 + /* We do not keep full slabs on the list */ 3420 + BUG_ON(free <= 0); 3421 + 3422 + if (free == page->objects) { 3423 + list_move(&page->lru, &discard); 3414 3424 n->nr_partial--; 3425 + } else if (free <= SHRINK_PROMOTE_MAX) 3426 + list_move(&page->lru, promote + free - 1); 3415 3427 } 3416 3428 3417 3429 /* 3418 - * Rebuild the partial list with the slabs filled up most 3419 - * first and the least used slabs at the end. 3430 + * Promote the slabs filled up most to the head of the 3431 + * partial list. 3420 3432 */ 3421 - for (i = objects - 1; i > 0; i--) 3422 - list_splice(slabs_by_inuse + i, n->partial.prev); 3433 + for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--) 3434 + list_splice(promote + i, &n->partial); 3423 3435 3424 3436 spin_unlock_irqrestore(&n->list_lock, flags); 3425 3437 3426 3438 /* Release empty slabs */ 3427 - list_for_each_entry_safe(page, t, slabs_by_inuse, lru) 3439 + list_for_each_entry_safe(page, t, &discard, lru) 3428 3440 discard_slab(s, page); 3441 + 3442 + if (slabs_node(s, node)) 3443 + ret = 1; 3429 3444 } 3430 3445 3431 - kfree(slabs_by_inuse); 3432 - return 0; 3446 + return ret; 3433 3447 } 3434 3448 3435 3449 static int slab_mem_going_offline_callback(void *arg) ··· 3461 3429 3462 3430 mutex_lock(&slab_mutex); 3463 3431 list_for_each_entry(s, &slab_caches, list) 3464 - __kmem_cache_shrink(s); 3432 + __kmem_cache_shrink(s, false); 3465 3433 mutex_unlock(&slab_mutex); 3466 3434 3467 3435 return 0; ··· 3609 3577 p->slab_cache = s; 3610 3578 #endif 3611 3579 } 3580 + slab_init_memcg_params(s); 3612 3581 list_add(&s->list, &slab_caches); 3613 3582 return s; 3614 3583 } ··· 3668 3635 __kmem_cache_alias(const char *name, size_t size, size_t align, 3669 3636 
unsigned long flags, void (*ctor)(void *)) 3670 3637 { 3671 - struct kmem_cache *s; 3638 + struct kmem_cache *s, *c; 3672 3639 3673 3640 s = find_mergeable(size, align, flags, name, ctor); 3674 3641 if (s) { 3675 - int i; 3676 - struct kmem_cache *c; 3677 - 3678 3642 s->refcount++; 3679 3643 3680 3644 /* ··· 3681 3651 s->object_size = max(s->object_size, (int)size); 3682 3652 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3683 3653 3684 - for_each_memcg_cache_index(i) { 3685 - c = cache_from_memcg_idx(s, i); 3686 - if (!c) 3687 - continue; 3654 + for_each_memcg_cache(c, s) { 3688 3655 c->object_size = s->object_size; 3689 3656 c->inuse = max_t(int, c->inuse, 3690 3657 ALIGN(size, sizeof(void *))); ··· 4718 4691 static ssize_t shrink_store(struct kmem_cache *s, 4719 4692 const char *buf, size_t length) 4720 4693 { 4721 - if (buf[0] == '1') { 4722 - int rc = kmem_cache_shrink(s); 4723 - 4724 - if (rc) 4725 - return rc; 4726 - } else 4694 + if (buf[0] == '1') 4695 + kmem_cache_shrink(s); 4696 + else 4727 4697 return -EINVAL; 4728 4698 return length; 4729 4699 } ··· 4944 4920 err = attribute->store(s, buf, len); 4945 4921 #ifdef CONFIG_MEMCG_KMEM 4946 4922 if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { 4947 - int i; 4923 + struct kmem_cache *c; 4948 4924 4949 4925 mutex_lock(&slab_mutex); 4950 4926 if (s->max_attr_size < len) ··· 4967 4943 * directly either failed or succeeded, in which case we loop 4968 4944 * through the descendants with best-effort propagation. 
4969 4945 */ 4970 - for_each_memcg_cache_index(i) { 4971 - struct kmem_cache *c = cache_from_memcg_idx(s, i); 4972 - if (c) 4973 - attribute->store(c, buf, len); 4974 - } 4946 + for_each_memcg_cache(c, s) 4947 + attribute->store(c, buf, len); 4975 4948 mutex_unlock(&slab_mutex); 4976 4949 } 4977 4950 #endif ··· 4985 4964 if (is_root_cache(s)) 4986 4965 return; 4987 4966 4988 - root_cache = s->memcg_params->root_cache; 4967 + root_cache = s->memcg_params.root_cache; 4989 4968 4990 4969 /* 4991 4970 * This mean this cache had no attribute written. Therefore, no point ··· 5065 5044 { 5066 5045 #ifdef CONFIG_MEMCG_KMEM 5067 5046 if (!is_root_cache(s)) 5068 - return s->memcg_params->root_cache->memcg_kset; 5047 + return s->memcg_params.root_cache->memcg_kset; 5069 5048 #endif 5070 5049 return slab_kset; 5071 5050 }

+62 -21

mm/vmscan.c

··· 232 232 233 233 #define SHRINK_BATCH 128 234 234 235 - static unsigned long shrink_slabs(struct shrink_control *shrinkctl, 236 - struct shrinker *shrinker, 237 - unsigned long nr_scanned, 238 - unsigned long nr_eligible) 235 + static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, 236 + struct shrinker *shrinker, 237 + unsigned long nr_scanned, 238 + unsigned long nr_eligible) 239 239 { 240 240 unsigned long freed = 0; 241 241 unsigned long long delta; ··· 344 344 } 345 345 346 346 /** 347 - * shrink_node_slabs - shrink slab caches of a given node 347 + * shrink_slab - shrink slab caches 348 348 * @gfp_mask: allocation context 349 349 * @nid: node whose slab caches to target 350 + * @memcg: memory cgroup whose slab caches to target 350 351 * @nr_scanned: pressure numerator 351 352 * @nr_eligible: pressure denominator 352 353 * ··· 355 354 * 356 355 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, 357 356 * unaware shrinkers will receive a node id of 0 instead. 357 + * 358 + * @memcg specifies the memory cgroup to target. If it is not NULL, 359 + * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan 360 + * objects from the memory cgroup specified. Otherwise all shrinkers 361 + * are called, and memcg aware shrinkers are supposed to scan the 362 + * global list then. 358 363 * 359 364 * @nr_scanned and @nr_eligible form a ratio that indicate how much of 360 365 * the available objects should be scanned. Page reclaim for example ··· 372 365 * 373 366 * Returns the number of reclaimed slab objects. 
374 367 */ 375 - unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, 376 - unsigned long nr_scanned, 377 - unsigned long nr_eligible) 368 + static unsigned long shrink_slab(gfp_t gfp_mask, int nid, 369 + struct mem_cgroup *memcg, 370 + unsigned long nr_scanned, 371 + unsigned long nr_eligible) 378 372 { 379 373 struct shrinker *shrinker; 380 374 unsigned long freed = 0; 375 + 376 + if (memcg && !memcg_kmem_is_active(memcg)) 377 + return 0; 381 378 382 379 if (nr_scanned == 0) 383 380 nr_scanned = SWAP_CLUSTER_MAX; ··· 401 390 struct shrink_control sc = { 402 391 .gfp_mask = gfp_mask, 403 392 .nid = nid, 393 + .memcg = memcg, 404 394 }; 395 + 396 + if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE)) 397 + continue; 405 398 406 399 if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) 407 400 sc.nid = 0; 408 401 409 - freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible); 402 + freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible); 410 403 } 411 404 412 405 up_read(&shrinker_rwsem); 413 406 out: 414 407 cond_resched(); 415 408 return freed; 409 + } 410 + 411 + void drop_slab_node(int nid) 412 + { 413 + unsigned long freed; 414 + 415 + do { 416 + struct mem_cgroup *memcg = NULL; 417 + 418 + freed = 0; 419 + do { 420 + freed += shrink_slab(GFP_KERNEL, nid, memcg, 421 + 1000, 1000); 422 + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); 423 + } while (freed > 10); 424 + } 425 + 426 + void drop_slab(void) 427 + { 428 + int nid; 429 + 430 + for_each_online_node(nid) 431 + drop_slab_node(nid); 416 432 } 417 433 418 434 static inline int is_page_cache_freeable(struct page *page) ··· 2314 2276 static bool shrink_zone(struct zone *zone, struct scan_control *sc, 2315 2277 bool is_classzone) 2316 2278 { 2279 + struct reclaim_state *reclaim_state = current->reclaim_state; 2317 2280 unsigned long nr_reclaimed, nr_scanned; 2318 2281 bool reclaimable = false; 2319 2282 ··· 2333 2294 memcg = mem_cgroup_iter(root, NULL, &reclaim); 2334 2295 do { 2335 
2296 unsigned long lru_pages; 2297 + unsigned long scanned; 2336 2298 struct lruvec *lruvec; 2337 2299 int swappiness; 2338 2300 ··· 2345 2305 2346 2306 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2347 2307 swappiness = mem_cgroup_swappiness(memcg); 2308 + scanned = sc->nr_scanned; 2348 2309 2349 2310 shrink_lruvec(lruvec, swappiness, sc, &lru_pages); 2350 2311 zone_lru_pages += lru_pages; 2312 + 2313 + if (memcg && is_classzone) 2314 + shrink_slab(sc->gfp_mask, zone_to_nid(zone), 2315 + memcg, sc->nr_scanned - scanned, 2316 + lru_pages); 2351 2317 2352 2318 /* 2353 2319 * Direct reclaim and kswapd have to scan all memory ··· 2376 2330 * Shrink the slab caches in the same proportion that 2377 2331 * the eligible LRU pages were scanned. 2378 2332 */ 2379 - if (global_reclaim(sc) && is_classzone) { 2380 - struct reclaim_state *reclaim_state; 2333 + if (global_reclaim(sc) && is_classzone) 2334 + shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL, 2335 + sc->nr_scanned - nr_scanned, 2336 + zone_lru_pages); 2381 2337 2382 - shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone), 2383 - sc->nr_scanned - nr_scanned, 2384 - zone_lru_pages); 2385 - 2386 - reclaim_state = current->reclaim_state; 2387 - if (reclaim_state) { 2388 - sc->nr_reclaimed += 2389 - reclaim_state->reclaimed_slab; 2390 - reclaim_state->reclaimed_slab = 0; 2391 - } 2338 + if (reclaim_state) { 2339 + sc->nr_reclaimed += reclaim_state->reclaimed_slab; 2340 + reclaim_state->reclaimed_slab = 0; 2392 2341 } 2393 2342 2394 2343 vmpressure(sc->gfp_mask, sc->target_mem_cgroup,

+5 -4

mm/workingset.c

···
275 275
276 276     /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
277 277     local_irq_disable();
278 -     shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid);
278 +     shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
279 279     local_irq_enable();
280 280
281 281     pages = node_present_pages(sc->nid);
···
302 302 }
303 303
304 304 static enum lru_status shadow_lru_isolate(struct list_head *item,
305 +                                           struct list_lru_one *lru,
305 306                                           spinlock_t *lru_lock,
306 307                                           void *arg)
307 308 {
···
333 332         goto out;
334 333     }
335 334
336 -     list_del_init(item);
335 +     list_lru_isolate(lru, item);
337 336     spin_unlock(lru_lock);
338 337
339 338     /*
···
377 376
378 377     /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
379 378     local_irq_disable();
380 -     ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid,
381 -                              shadow_lru_isolate, NULL, &sc->nr_to_scan);
379 +     ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc,
380 +                                shadow_lru_isolate, NULL);
382 381     local_irq_enable();
383 382     return ret;
384 383 }

+2 -1

mm/zbud.c

···
130 130     .evict = zbud_zpool_evict
131 131 };
132 132
133 - static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops)
133 + static void *zbud_zpool_create(char *name, gfp_t gfp,
134 +                                struct zpool_ops *zpool_ops)
134 135 {
135 136     return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
136 137 }

+4 -2

mm/zpool.c

···
129 129 /**
130 130  * zpool_create_pool() - Create a new zpool
131 131  * @type The type of the zpool to create (e.g. zbud, zsmalloc)
132 +  * @name The name of the zpool (e.g. zram0, zswap)
132 133  * @gfp The GFP flags to use when allocating the pool.
133 134  * @ops The optional ops callback.
134 135  *
···
141 140  *
142 141  * Returns: New zpool on success, NULL on failure.
143 142  */
144 - struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops)
143 + struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
144 +                                 struct zpool_ops *ops)
145 145 {
146 146     struct zpool_driver *driver;
147 147     struct zpool *zpool;
···
170 168
171 169     zpool->type = driver->type;
172 170     zpool->driver = driver;
173 -     zpool->pool = driver->create(gfp, ops);
171 +     zpool->pool = driver->create(name, gfp, ops);
174 172     zpool->ops = ops;
175 173
176 174     if (!zpool->pool) {

+232 -7

mm/zsmalloc.c

··· 91 91 #include <linux/hardirq.h> 92 92 #include <linux/spinlock.h> 93 93 #include <linux/types.h> 94 + #include <linux/debugfs.h> 94 95 #include <linux/zsmalloc.h> 95 96 #include <linux/zpool.h> 96 97 ··· 169 168 ZS_FULL 170 169 }; 171 170 171 + enum zs_stat_type { 172 + OBJ_ALLOCATED, 173 + OBJ_USED, 174 + NR_ZS_STAT_TYPE, 175 + }; 176 + 177 + #ifdef CONFIG_ZSMALLOC_STAT 178 + 179 + static struct dentry *zs_stat_root; 180 + 181 + struct zs_size_stat { 182 + unsigned long objs[NR_ZS_STAT_TYPE]; 183 + }; 184 + 185 + #endif 186 + 172 187 /* 173 188 * number of size_classes 174 189 */ ··· 217 200 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ 218 201 int pages_per_zspage; 219 202 203 + #ifdef CONFIG_ZSMALLOC_STAT 204 + struct zs_size_stat stats; 205 + #endif 206 + 220 207 spinlock_t lock; 221 208 222 209 struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; ··· 238 217 }; 239 218 240 219 struct zs_pool { 220 + char *name; 221 + 241 222 struct size_class **size_class; 242 223 243 224 gfp_t flags; /* allocation flags used when growing pool */ 244 225 atomic_long_t pages_allocated; 226 + 227 + #ifdef CONFIG_ZSMALLOC_STAT 228 + struct dentry *stat_dentry; 229 + #endif 245 230 }; 246 231 247 232 /* ··· 273 246 274 247 #ifdef CONFIG_ZPOOL 275 248 276 - static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) 249 + static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops) 277 250 { 278 - return zs_create_pool(gfp); 251 + return zs_create_pool(name, gfp); 279 252 } 280 253 281 254 static void zs_zpool_destroy(void *pool) ··· 969 942 return true; 970 943 } 971 944 945 + #ifdef CONFIG_ZSMALLOC_STAT 946 + 947 + static inline void zs_stat_inc(struct size_class *class, 948 + enum zs_stat_type type, unsigned long cnt) 949 + { 950 + class->stats.objs[type] += cnt; 951 + } 952 + 953 + static inline void zs_stat_dec(struct size_class *class, 954 + enum zs_stat_type type, unsigned long cnt) 955 + { 956 + class->stats.objs[type] 
-= cnt; 957 + } 958 + 959 + static inline unsigned long zs_stat_get(struct size_class *class, 960 + enum zs_stat_type type) 961 + { 962 + return class->stats.objs[type]; 963 + } 964 + 965 + static int __init zs_stat_init(void) 966 + { 967 + if (!debugfs_initialized()) 968 + return -ENODEV; 969 + 970 + zs_stat_root = debugfs_create_dir("zsmalloc", NULL); 971 + if (!zs_stat_root) 972 + return -ENOMEM; 973 + 974 + return 0; 975 + } 976 + 977 + static void __exit zs_stat_exit(void) 978 + { 979 + debugfs_remove_recursive(zs_stat_root); 980 + } 981 + 982 + static int zs_stats_size_show(struct seq_file *s, void *v) 983 + { 984 + int i; 985 + struct zs_pool *pool = s->private; 986 + struct size_class *class; 987 + int objs_per_zspage; 988 + unsigned long obj_allocated, obj_used, pages_used; 989 + unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; 990 + 991 + seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size", 992 + "obj_allocated", "obj_used", "pages_used"); 993 + 994 + for (i = 0; i < zs_size_classes; i++) { 995 + class = pool->size_class[i]; 996 + 997 + if (class->index != i) 998 + continue; 999 + 1000 + spin_lock(&class->lock); 1001 + obj_allocated = zs_stat_get(class, OBJ_ALLOCATED); 1002 + obj_used = zs_stat_get(class, OBJ_USED); 1003 + spin_unlock(&class->lock); 1004 + 1005 + objs_per_zspage = get_maxobj_per_zspage(class->size, 1006 + class->pages_per_zspage); 1007 + pages_used = obj_allocated / objs_per_zspage * 1008 + class->pages_per_zspage; 1009 + 1010 + seq_printf(s, " %5u %5u %10lu %10lu %10lu\n", i, 1011 + class->size, obj_allocated, obj_used, pages_used); 1012 + 1013 + total_objs += obj_allocated; 1014 + total_used_objs += obj_used; 1015 + total_pages += pages_used; 1016 + } 1017 + 1018 + seq_puts(s, "\n"); 1019 + seq_printf(s, " %5s %5s %10lu %10lu %10lu\n", "Total", "", 1020 + total_objs, total_used_objs, total_pages); 1021 + 1022 + return 0; 1023 + } 1024 + 1025 + static int zs_stats_size_open(struct inode *inode, struct file 
*file) 1026 + { 1027 + return single_open(file, zs_stats_size_show, inode->i_private); 1028 + } 1029 + 1030 + static const struct file_operations zs_stat_size_ops = { 1031 + .open = zs_stats_size_open, 1032 + .read = seq_read, 1033 + .llseek = seq_lseek, 1034 + .release = single_release, 1035 + }; 1036 + 1037 + static int zs_pool_stat_create(char *name, struct zs_pool *pool) 1038 + { 1039 + struct dentry *entry; 1040 + 1041 + if (!zs_stat_root) 1042 + return -ENODEV; 1043 + 1044 + entry = debugfs_create_dir(name, zs_stat_root); 1045 + if (!entry) { 1046 + pr_warn("debugfs dir <%s> creation failed\n", name); 1047 + return -ENOMEM; 1048 + } 1049 + pool->stat_dentry = entry; 1050 + 1051 + entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO, 1052 + pool->stat_dentry, pool, &zs_stat_size_ops); 1053 + if (!entry) { 1054 + pr_warn("%s: debugfs file entry <%s> creation failed\n", 1055 + name, "obj_in_classes"); 1056 + return -ENOMEM; 1057 + } 1058 + 1059 + return 0; 1060 + } 1061 + 1062 + static void zs_pool_stat_destroy(struct zs_pool *pool) 1063 + { 1064 + debugfs_remove_recursive(pool->stat_dentry); 1065 + } 1066 + 1067 + #else /* CONFIG_ZSMALLOC_STAT */ 1068 + 1069 + static inline void zs_stat_inc(struct size_class *class, 1070 + enum zs_stat_type type, unsigned long cnt) 1071 + { 1072 + } 1073 + 1074 + static inline void zs_stat_dec(struct size_class *class, 1075 + enum zs_stat_type type, unsigned long cnt) 1076 + { 1077 + } 1078 + 1079 + static inline unsigned long zs_stat_get(struct size_class *class, 1080 + enum zs_stat_type type) 1081 + { 1082 + return 0; 1083 + } 1084 + 1085 + static int __init zs_stat_init(void) 1086 + { 1087 + return 0; 1088 + } 1089 + 1090 + static void __exit zs_stat_exit(void) 1091 + { 1092 + } 1093 + 1094 + static inline int zs_pool_stat_create(char *name, struct zs_pool *pool) 1095 + { 1096 + return 0; 1097 + } 1098 + 1099 + static inline void zs_pool_stat_destroy(struct zs_pool *pool) 1100 + { 1101 + } 1102 + 1103 + #endif 
1104 + 972 1105 unsigned long zs_get_total_pages(struct zs_pool *pool) 973 1106 { 974 1107 return atomic_long_read(&pool->pages_allocated); ··· 1261 1074 set_zspage_mapping(first_page, class->index, ZS_EMPTY); 1262 1075 atomic_long_add(class->pages_per_zspage, 1263 1076 &pool->pages_allocated); 1077 + 1264 1078 spin_lock(&class->lock); 1079 + zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage( 1080 + class->size, class->pages_per_zspage)); 1265 1081 } 1266 1082 1267 1083 obj = (unsigned long)first_page->freelist; ··· 1278 1088 kunmap_atomic(vaddr); 1279 1089 1280 1090 first_page->inuse++; 1091 + zs_stat_inc(class, OBJ_USED, 1); 1281 1092 /* Now move the zspage to another fullness group, if required */ 1282 1093 fix_fullness_group(pool, first_page); 1283 1094 spin_unlock(&class->lock); ··· 1319 1128 1320 1129 first_page->inuse--; 1321 1130 fullness = fix_fullness_group(pool, first_page); 1131 + 1132 + zs_stat_dec(class, OBJ_USED, 1); 1133 + if (fullness == ZS_EMPTY) 1134 + zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage( 1135 + class->size, class->pages_per_zspage)); 1136 + 1322 1137 spin_unlock(&class->lock); 1323 1138 1324 1139 if (fullness == ZS_EMPTY) { ··· 1345 1148 * On success, a pointer to the newly created pool is returned, 1346 1149 * otherwise NULL. 
1347 1150 */ 1348 - struct zs_pool *zs_create_pool(gfp_t flags) 1151 + struct zs_pool *zs_create_pool(char *name, gfp_t flags) 1349 1152 { 1350 1153 int i; 1351 1154 struct zs_pool *pool; ··· 1355 1158 if (!pool) 1356 1159 return NULL; 1357 1160 1161 + pool->name = kstrdup(name, GFP_KERNEL); 1162 + if (!pool->name) { 1163 + kfree(pool); 1164 + return NULL; 1165 + } 1166 + 1358 1167 pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), 1359 1168 GFP_KERNEL); 1360 1169 if (!pool->size_class) { 1170 + kfree(pool->name); 1361 1171 kfree(pool); 1362 1172 return NULL; 1363 1173 } ··· 1414 1210 1415 1211 pool->flags = flags; 1416 1212 1213 + if (zs_pool_stat_create(name, pool)) 1214 + goto err; 1215 + 1417 1216 return pool; 1418 1217 1419 1218 err: ··· 1428 1221 void zs_destroy_pool(struct zs_pool *pool) 1429 1222 { 1430 1223 int i; 1224 + 1225 + zs_pool_stat_destroy(pool); 1431 1226 1432 1227 for (i = 0; i < zs_size_classes; i++) { 1433 1228 int fg; ··· 1451 1242 } 1452 1243 1453 1244 kfree(pool->size_class); 1245 + kfree(pool->name); 1454 1246 kfree(pool); 1455 1247 } 1456 1248 EXPORT_SYMBOL_GPL(zs_destroy_pool); ··· 1460 1250 { 1461 1251 int ret = zs_register_cpu_notifier(); 1462 1252 1463 - if (ret) { 1464 - zs_unregister_cpu_notifier(); 1465 - return ret; 1466 - } 1253 + if (ret) 1254 + goto notifier_fail; 1467 1255 1468 1256 init_zs_size_classes(); 1469 1257 1470 1258 #ifdef CONFIG_ZPOOL 1471 1259 zpool_register_driver(&zs_zpool_driver); 1472 1260 #endif 1261 + 1262 + ret = zs_stat_init(); 1263 + if (ret) { 1264 + pr_err("zs stat initialization failed\n"); 1265 + goto stat_fail; 1266 + } 1473 1267 return 0; 1268 + 1269 + stat_fail: 1270 + #ifdef CONFIG_ZPOOL 1271 + zpool_unregister_driver(&zs_zpool_driver); 1272 + #endif 1273 + notifier_fail: 1274 + zs_unregister_cpu_notifier(); 1275 + 1276 + return ret; 1474 1277 } 1475 1278 1476 1279 static void __exit zs_exit(void) ··· 1492 1269 zpool_unregister_driver(&zs_zpool_driver); 1493 1270 #endif 1494 
1271 zs_unregister_cpu_notifier(); 1272 + 1273 + zs_stat_exit(); 1495 1274 } 1496 1275 1497 1276 module_init(zs_init);

+3 -2

mm/zswap.c

···
906 906
907 907     pr_info("loading zswap\n");
908 908
909 -     zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops);
909 +     zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
910 +                                    &zswap_zpool_ops);
910 911     if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
911 912         pr_info("%s zpool not available\n", zswap_zpool_type);
912 913         zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
913 -         zswap_pool = zpool_create_pool(zswap_zpool_type, gfp,
914 +         zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
914 915                                        &zswap_zpool_ops);
915 916     }
916 917     if (!zswap_pool) {

+4

net/ipv4/tcp_memcontrol.c

···
47 47         return;
48 48
49 49     percpu_counter_destroy(&cg_proto->sockets_allocated);
50 +
51 +     if (test_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags))
52 +         static_key_slow_dec(&memcg_socket_limit_enabled);
53 +
50 54 }
51 55 EXPORT_SYMBOL(tcp_destroy_cgroup);
52 56