Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

procfs: provide stack information for threads

A patch to give a better overview of the userland application stack usage,
especially for embedded linux.

Currently you are only able to dump the main process/thread stack usage,
which is shown in /proc/<pid>/status by the "VmStk" value. But you get no
information about the consumed stack memory of the threads.

This patch enhances /proc/<pid>/{task/*,}/*maps so that it marks the vm
mapping in which the thread stack pointer resides with
"[threadstack:xxxxxxxx]". xxxxxxxx is the maximum size of the stack. This is
valuable information, because libpthread doesn't set the start of the stack
to the top of the mapped area, depending on the pthread usage.

A sample output of /proc/<pid>/task/<tid>/maps looks like:

08048000-08049000 r-xp 00000000 03:00 8312 /opt/z
08049000-0804a000 rw-p 00001000 03:00 8312 /opt/z
0804a000-0806b000 rw-p 00000000 00:00 0 [heap]
a7d12000-a7d13000 ---p 00000000 00:00 0
a7d13000-a7f13000 rw-p 00000000 00:00 0 [threadstack:001ff4b4]
a7f13000-a7f14000 ---p 00000000 00:00 0
a7f14000-a7f36000 rw-p 00000000 00:00 0
a7f36000-a8069000 r-xp 00000000 03:00 4222 /lib/libc.so.6
a8069000-a806b000 r--p 00133000 03:00 4222 /lib/libc.so.6
a806b000-a806c000 rw-p 00135000 03:00 4222 /lib/libc.so.6
a806c000-a806f000 rw-p 00000000 00:00 0
a806f000-a8083000 r-xp 00000000 03:00 14462 /lib/libpthread.so.0
a8083000-a8084000 r--p 00013000 03:00 14462 /lib/libpthread.so.0
a8084000-a8085000 rw-p 00014000 03:00 14462 /lib/libpthread.so.0
a8085000-a8088000 rw-p 00000000 00:00 0
a8088000-a80a4000 r-xp 00000000 03:00 8317 /lib/ld-linux.so.2
a80a4000-a80a5000 r--p 0001b000 03:00 8317 /lib/ld-linux.so.2
a80a5000-a80a6000 rw-p 0001c000 03:00 8317 /lib/ld-linux.so.2
afaf5000-afb0a000 rw-p 00000000 00:00 0 [stack]
ffffe000-fffff000 r-xp 00000000 00:00 0 [vdso]

There is also a new entry "Stack usage" in /proc/<pid>/{task/*,}/status
which gives you the current stack usage in kB.

A sample output of /proc/self/status looks like:

Name: cat
State: R (running)
Tgid: 507
Pid: 507
.
.
.
CapBnd: fffffffffffffeff
voluntary_ctxt_switches: 0
nonvoluntary_ctxt_switches: 0
Stack usage: 12 kB

I also fixed the stack base address in /proc/<pid>/{task/*,}/stat to be the
base address of the associated thread stack and not that of the main
process. This makes more sense.

[akpm@linux-foundation.org: fs/proc/array.c now needs walk_page_range()]
Signed-off-by: Stefani Seibold <stefani@seibold.net>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

Stefani Seibold and committed by
Linus Torvalds
d899bf7b cba8aafe

+114 -4
+4 -1
Documentation/filesystems/proc.txt
··· 176 176 CapBnd: ffffffffffffffff 177 177 voluntary_ctxt_switches: 0 178 178 nonvoluntary_ctxt_switches: 1 179 + Stack usage: 12 kB 179 180 180 181 This shows you nearly the same information you would get if you viewed it with 181 182 the ps command. In fact, ps uses the proc file system to obtain its ··· 230 229 Mems_allowed_list Same as previous, but in "list format" 231 230 voluntary_ctxt_switches number of voluntary context switches 232 231 nonvoluntary_ctxt_switches number of non voluntary context switches 232 + Stack usage: stack usage high water mark (round up to page size) 233 233 .............................................................................. 234 234 235 235 Table 1-3: Contents of the statm files (as of 2.6.8-rc3) ··· 309 307 08049000-0804a000 rw-p 00001000 03:00 8312 /opt/test 310 308 0804a000-0806b000 rw-p 00000000 00:00 0 [heap] 311 309 a7cb1000-a7cb2000 ---p 00000000 00:00 0 312 - a7cb2000-a7eb2000 rw-p 00000000 00:00 0 310 + a7cb2000-a7eb2000 rw-p 00000000 00:00 0 [threadstack:001ff4b4] 313 311 a7eb2000-a7eb3000 ---p 00000000 00:00 0 314 312 a7eb3000-a7ed5000 rw-p 00000000 00:00 0 315 313 a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6 ··· 345 343 [stack] = the stack of the main process 346 344 [vdso] = the "virtual dynamic shared object", 347 345 the kernel system call handler 346 + [threadstack:xxxxxxxx] = the stack of the thread, xxxxxxxx is the stack size 348 347 349 348 or if empty, the mapping is anonymous. 350 349
+2
fs/exec.c
··· 1357 1357 if (retval < 0) 1358 1358 goto out; 1359 1359 1360 + current->stack_start = current->mm->start_stack; 1361 + 1360 1362 /* execve succeeded */ 1361 1363 current->fs->in_exec = 0; 1362 1364 current->in_execve = 0;
+84 -1
fs/proc/array.c
··· 82 82 #include <linux/pid_namespace.h> 83 83 #include <linux/ptrace.h> 84 84 #include <linux/tracehook.h> 85 + #include <linux/swapops.h> 85 86 86 87 #include <asm/pgtable.h> 87 88 #include <asm/processor.h> ··· 322 321 p->nivcsw); 323 322 } 324 323 324 + struct stack_stats { 325 + struct vm_area_struct *vma; 326 + unsigned long startpage; 327 + unsigned long usage; 328 + }; 329 + 330 + static int stack_usage_pte_range(pmd_t *pmd, unsigned long addr, 331 + unsigned long end, struct mm_walk *walk) 332 + { 333 + struct stack_stats *ss = walk->private; 334 + struct vm_area_struct *vma = ss->vma; 335 + pte_t *pte, ptent; 336 + spinlock_t *ptl; 337 + int ret = 0; 338 + 339 + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 340 + for (; addr != end; pte++, addr += PAGE_SIZE) { 341 + ptent = *pte; 342 + 343 + #ifdef CONFIG_STACK_GROWSUP 344 + if (pte_present(ptent) || is_swap_pte(ptent)) 345 + ss->usage = addr - ss->startpage + PAGE_SIZE; 346 + #else 347 + if (pte_present(ptent) || is_swap_pte(ptent)) { 348 + ss->usage = ss->startpage - addr + PAGE_SIZE; 349 + pte++; 350 + ret = 1; 351 + break; 352 + } 353 + #endif 354 + } 355 + pte_unmap_unlock(pte - 1, ptl); 356 + cond_resched(); 357 + return ret; 358 + } 359 + 360 + static inline unsigned long get_stack_usage_in_bytes(struct vm_area_struct *vma, 361 + struct task_struct *task) 362 + { 363 + struct stack_stats ss; 364 + struct mm_walk stack_walk = { 365 + .pmd_entry = stack_usage_pte_range, 366 + .mm = vma->vm_mm, 367 + .private = &ss, 368 + }; 369 + 370 + if (!vma->vm_mm || is_vm_hugetlb_page(vma)) 371 + return 0; 372 + 373 + ss.vma = vma; 374 + ss.startpage = task->stack_start & PAGE_MASK; 375 + ss.usage = 0; 376 + 377 + #ifdef CONFIG_STACK_GROWSUP 378 + walk_page_range(KSTK_ESP(task) & PAGE_MASK, vma->vm_end, 379 + &stack_walk); 380 + #else 381 + walk_page_range(vma->vm_start, (KSTK_ESP(task) & PAGE_MASK) + PAGE_SIZE, 382 + &stack_walk); 383 + #endif 384 + return ss.usage; 385 + } 386 + 387 + static inline 
void task_show_stack_usage(struct seq_file *m, 388 + struct task_struct *task) 389 + { 390 + struct vm_area_struct *vma; 391 + struct mm_struct *mm = get_task_mm(task); 392 + 393 + if (mm) { 394 + down_read(&mm->mmap_sem); 395 + vma = find_vma(mm, task->stack_start); 396 + if (vma) 397 + seq_printf(m, "Stack usage:\t%lu kB\n", 398 + get_stack_usage_in_bytes(vma, task) >> 10); 399 + 400 + up_read(&mm->mmap_sem); 401 + mmput(mm); 402 + } 403 + } 404 + 325 405 int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, 326 406 struct pid *pid, struct task_struct *task) 327 407 { ··· 422 340 task_show_regs(m, task); 423 341 #endif 424 342 task_context_switch_counts(m, task); 343 + task_show_stack_usage(m, task); 425 344 return 0; 426 345 } 427 346 ··· 564 481 rsslim, 565 482 mm ? mm->start_code : 0, 566 483 mm ? mm->end_code : 0, 567 - (permitted && mm) ? mm->start_stack : 0, 484 + (permitted) ? task->stack_start : 0, 568 485 esp, 569 486 eip, 570 487 /* The signal information here is obsolete.
+19
fs/proc/task_mmu.c
··· 243 243 } else if (vma->vm_start <= mm->start_stack && 244 244 vma->vm_end >= mm->start_stack) { 245 245 name = "[stack]"; 246 + } else { 247 + unsigned long stack_start; 248 + struct proc_maps_private *pmp; 249 + 250 + pmp = m->private; 251 + stack_start = pmp->task->stack_start; 252 + 253 + if (vma->vm_start <= stack_start && 254 + vma->vm_end >= stack_start) { 255 + pad_len_spaces(m, len); 256 + seq_printf(m, 257 + "[threadstack:%08lx]", 258 + #ifdef CONFIG_STACK_GROWSUP 259 + vma->vm_end - stack_start 260 + #else 261 + stack_start - vma->vm_start 262 + #endif 263 + ); 264 + } 246 265 } 247 266 } else { 248 267 name = "[vdso]";
+1
include/linux/sched.h
··· 1529 1529 /* bitmask of trace recursion */ 1530 1530 unsigned long trace_recursion; 1531 1531 #endif /* CONFIG_TRACING */ 1532 + unsigned long stack_start; 1532 1533 }; 1533 1534 1534 1535 /* Future-safe accessor for struct task_struct's cpus_allowed. */
+2
kernel/fork.c
··· 1095 1095 1096 1096 p->bts = NULL; 1097 1097 1098 + p->stack_start = stack_start; 1099 + 1098 1100 /* Perform scheduler related setup. Assign this task to a CPU. */ 1099 1101 sched_fork(p, clone_flags); 1100 1102
+2 -2
mm/Makefile
··· 11 11 maccess.o page_alloc.o page-writeback.o \ 12 12 readahead.o swap.o truncate.o vmscan.o shmem.o \ 13 13 prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ 14 - page_isolation.o mm_init.o mmu_context.o $(mmu-y) 14 + page_isolation.o mm_init.o mmu_context.o \ 15 + pagewalk.o $(mmu-y) 15 16 obj-y += init-mm.o 16 17 17 - obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o 18 18 obj-$(CONFIG_BOUNCE) += bounce.o 19 19 obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o 20 20 obj-$(CONFIG_HAS_DMA) += dmapool.o