Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
Tags: kernel, os, linux

binfmt_elf, binfmt_elf_fdpic: use a VMA list snapshot

In both binfmt_elf and binfmt_elf_fdpic, use a new helper
dump_vma_snapshot() to take a snapshot of the VMA list (including the gate
VMA, if we have one) while protected by the mmap_lock, and then use that
snapshot instead of walking the VMA list without locking.

An alternative approach would be to keep the mmap_lock held across the
entire core dumping operation; however, keeping the mmap_lock locked while
we may be blocked for an unbounded amount of time (e.g. because we're
dumping to a FUSE filesystem or so) isn't really optimal; the mmap_lock
blocks things like the ->release handler of userfaultfd, and we don't
really want critical system daemons to grind to a halt just because
someone "gifted" them SCM_RIGHTS to an eternally-locked userfaultfd, or
something like that.

Since both the normal ELF code and the FDPIC ELF code need this
functionality (and if any other binfmt wants to add coredump support in
the future, they'd probably need it, too), implement this with a common
helper in fs/coredump.c.

A downside of this approach is that we now need a bigger amount of kernel
memory per userspace VMA in the normal ELF case, and that the FDPIC ELF
case now needs O(n) kernel memory where it previously needed none; but 40
bytes per VMA shouldn't be terribly bad.

There currently is a data race between stack expansion and anything that
reads ->vm_start or ->vm_end under the mmap_lock held in read mode; to
mitigate that for core dumping, take the mmap_lock in write mode when
taking a snapshot of the VMA hierarchy. (If we only took the mmap_lock in
read mode, we could end up with a corrupted core dump if someone does
get_user_pages_remote() concurrently. Not really a major problem, but
taking the mmap_lock either way works here, so we might as well avoid the
issue.) (This doesn't do anything about the existing data races with stack
expansion in other mm code.)

Signed-off-by: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: "Eric W . Biederman" <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Link: http://lkml.kernel.org/r/20200827114932.3572699-6-jannh@google.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Jann Horn; committed by Linus Torvalds.
Commit: a07279c9 (parent: 429a22e7)

+138 -120
+22 -78
fs/binfmt_elf.c
··· 2125 2125 2126 2126 #endif 2127 2127 2128 - static struct vm_area_struct *first_vma(struct task_struct *tsk, 2129 - struct vm_area_struct *gate_vma) 2130 - { 2131 - struct vm_area_struct *ret = tsk->mm->mmap; 2132 - 2133 - if (ret) 2134 - return ret; 2135 - return gate_vma; 2136 - } 2137 - /* 2138 - * Helper function for iterating across a vma list. It ensures that the caller 2139 - * will visit `gate_vma' prior to terminating the search. 2140 - */ 2141 - static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, 2142 - struct vm_area_struct *gate_vma) 2143 - { 2144 - struct vm_area_struct *ret; 2145 - 2146 - ret = this_vma->vm_next; 2147 - if (ret) 2148 - return ret; 2149 - if (this_vma == gate_vma) 2150 - return NULL; 2151 - return gate_vma; 2152 - } 2153 - 2154 2128 static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, 2155 2129 elf_addr_t e_shoff, int segs) 2156 2130 { ··· 2151 2177 static int elf_core_dump(struct coredump_params *cprm) 2152 2178 { 2153 2179 int has_dumped = 0; 2154 - int segs, i; 2155 - size_t vma_data_size = 0; 2156 - struct vm_area_struct *vma, *gate_vma; 2180 + int vma_count, segs, i; 2181 + size_t vma_data_size; 2157 2182 struct elfhdr elf; 2158 2183 loff_t offset = 0, dataoff; 2159 2184 struct elf_note_info info = { }; ··· 2160 2187 struct elf_shdr *shdr4extnum = NULL; 2161 2188 Elf_Half e_phnum; 2162 2189 elf_addr_t e_shoff; 2163 - elf_addr_t *vma_filesz = NULL; 2190 + struct core_vma_metadata *vma_meta; 2164 2191 2165 - /* 2166 - * We no longer stop all VM operations. 2167 - * 2168 - * This is because those proceses that could possibly change map_count 2169 - * or the mmap / vma pages are now blocked in do_exit on current 2170 - * finishing this core dump. 2171 - * 2172 - * Only ptrace can touch these memory addresses, but it doesn't change 2173 - * the map_count or the pages allocated. So no possibility of crashing 2174 - * exists while dumping the mm->vm_next areas to the core file. 
2175 - */ 2176 - 2192 + if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size)) 2193 + return 0; 2194 + 2177 2195 /* 2178 2196 * The number of segs are recored into ELF header as 16bit value. 2179 2197 * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. 2180 2198 */ 2181 - segs = current->mm->map_count; 2182 - segs += elf_core_extra_phdrs(); 2183 - 2184 - gate_vma = get_gate_vma(current->mm); 2185 - if (gate_vma != NULL) 2186 - segs++; 2199 + segs = vma_count + elf_core_extra_phdrs(); 2187 2200 2188 2201 /* for notes section */ 2189 2202 segs++; ··· 2207 2248 2208 2249 dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); 2209 2250 2210 - /* 2211 - * Zero vma process will get ZERO_SIZE_PTR here. 2212 - * Let coredump continue for register state at least. 2213 - */ 2214 - vma_filesz = kvmalloc(array_size(sizeof(*vma_filesz), (segs - 1)), 2215 - GFP_KERNEL); 2216 - if (!vma_filesz) 2217 - goto end_coredump; 2218 - 2219 - for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; 2220 - vma = next_vma(vma, gate_vma)) { 2221 - unsigned long dump_size; 2222 - 2223 - dump_size = vma_dump_size(vma, cprm->mm_flags); 2224 - vma_filesz[i++] = dump_size; 2225 - vma_data_size += dump_size; 2226 - } 2227 - 2228 2251 offset += vma_data_size; 2229 2252 offset += elf_core_extra_data_size(); 2230 2253 e_shoff = offset; ··· 2227 2286 goto end_coredump; 2228 2287 2229 2288 /* Write program headers for segments dump */ 2230 - for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; 2231 - vma = next_vma(vma, gate_vma)) { 2289 + for (i = 0; i < vma_count; i++) { 2290 + struct core_vma_metadata *meta = vma_meta + i; 2232 2291 struct elf_phdr phdr; 2233 2292 2234 2293 phdr.p_type = PT_LOAD; 2235 2294 phdr.p_offset = offset; 2236 - phdr.p_vaddr = vma->vm_start; 2295 + phdr.p_vaddr = meta->start; 2237 2296 phdr.p_paddr = 0; 2238 - phdr.p_filesz = vma_filesz[i++]; 2239 - phdr.p_memsz = vma->vm_end - vma->vm_start; 2297 + phdr.p_filesz = meta->dump_size; 
2298 + phdr.p_memsz = meta->end - meta->start; 2240 2299 offset += phdr.p_filesz; 2241 - phdr.p_flags = vma->vm_flags & VM_READ ? PF_R : 0; 2242 - if (vma->vm_flags & VM_WRITE) 2300 + phdr.p_flags = 0; 2301 + if (meta->flags & VM_READ) 2302 + phdr.p_flags |= PF_R; 2303 + if (meta->flags & VM_WRITE) 2243 2304 phdr.p_flags |= PF_W; 2244 - if (vma->vm_flags & VM_EXEC) 2305 + if (meta->flags & VM_EXEC) 2245 2306 phdr.p_flags |= PF_X; 2246 2307 phdr.p_align = ELF_EXEC_PAGESIZE; 2247 2308 ··· 2265 2322 if (!dump_skip(cprm, dataoff - cprm->pos)) 2266 2323 goto end_coredump; 2267 2324 2268 - for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; 2269 - vma = next_vma(vma, gate_vma)) { 2270 - if (!dump_user_range(cprm, vma->vm_start, vma_filesz[i++])) 2325 + for (i = 0; i < vma_count; i++) { 2326 + struct core_vma_metadata *meta = vma_meta + i; 2327 + 2328 + if (!dump_user_range(cprm, meta->start, meta->dump_size)) 2271 2329 goto end_coredump; 2272 2330 } 2273 2331 dump_truncate(cprm); ··· 2284 2340 end_coredump: 2285 2341 free_note_info(&info); 2286 2342 kfree(shdr4extnum); 2287 - kvfree(vma_filesz); 2343 + kvfree(vma_meta); 2288 2344 kfree(phdr4note); 2289 2345 return has_dumped; 2290 2346 }
+27 -40
fs/binfmt_elf_fdpic.c
··· 1454 1454 /* 1455 1455 * dump the segments for an MMU process 1456 1456 */ 1457 - static bool elf_fdpic_dump_segments(struct coredump_params *cprm) 1457 + static bool elf_fdpic_dump_segments(struct coredump_params *cprm, 1458 + struct core_vma_metadata *vma_meta, 1459 + int vma_count) 1458 1460 { 1459 - struct vm_area_struct *vma; 1461 + int i; 1460 1462 1461 - for (vma = current->mm->mmap; vma; vma = vma->vm_next) { 1462 - unsigned long size = vma_dump_size(vma, cprm->mm_flags); 1463 + for (i = 0; i < vma_count; i++) { 1464 + struct core_vma_metadata *meta = vma_meta + i; 1463 1465 1464 - if (!dump_user_range(cprm, vma->vm_start, size)) 1466 + if (!dump_user_range(cprm, meta->start, meta->dump_size)) 1465 1467 return false; 1466 1468 } 1467 1469 return true; 1468 - } 1469 - 1470 - static size_t elf_core_vma_data_size(unsigned long mm_flags) 1471 - { 1472 - struct vm_area_struct *vma; 1473 - size_t size = 0; 1474 - 1475 - for (vma = current->mm->mmap; vma; vma = vma->vm_next) 1476 - size += vma_dump_size(vma, mm_flags); 1477 - return size; 1478 1470 } 1479 1471 1480 1472 /* ··· 1479 1487 static int elf_fdpic_core_dump(struct coredump_params *cprm) 1480 1488 { 1481 1489 int has_dumped = 0; 1482 - int segs; 1490 + int vma_count, segs; 1483 1491 int i; 1484 - struct vm_area_struct *vma; 1485 1492 struct elfhdr *elf = NULL; 1486 1493 loff_t offset = 0, dataoff; 1487 1494 struct memelfnote psinfo_note, auxv_note; ··· 1494 1503 elf_addr_t e_shoff; 1495 1504 struct core_thread *ct; 1496 1505 struct elf_thread_status *tmp; 1497 - 1498 - /* 1499 - * We no longer stop all VM operations. 1500 - * 1501 - * This is because those proceses that could possibly change map_count 1502 - * or the mmap / vma pages are now blocked in do_exit on current 1503 - * finishing this core dump. 1504 - * 1505 - * Only ptrace can touch these memory addresses, but it doesn't change 1506 - * the map_count or the pages allocated. 
So no possibility of crashing 1507 - * exists while dumping the mm->vm_next areas to the core file. 1508 - */ 1506 + struct core_vma_metadata *vma_meta = NULL; 1507 + size_t vma_data_size; 1509 1508 1510 1509 /* alloc memory for large data structures: too large to be on stack */ 1511 1510 elf = kmalloc(sizeof(*elf), GFP_KERNEL); ··· 1503 1522 goto end_coredump; 1504 1523 psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); 1505 1524 if (!psinfo) 1525 + goto end_coredump; 1526 + 1527 + if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size)) 1506 1528 goto end_coredump; 1507 1529 1508 1530 for (ct = current->mm->core_state->dumper.next; ··· 1527 1543 tmp->next = thread_list; 1528 1544 thread_list = tmp; 1529 1545 1530 - segs = current->mm->map_count; 1531 - segs += elf_core_extra_phdrs(); 1546 + segs = vma_count + elf_core_extra_phdrs(); 1532 1547 1533 1548 /* for notes section */ 1534 1549 segs++; ··· 1572 1589 /* Page-align dumped data */ 1573 1590 dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); 1574 1591 1575 - offset += elf_core_vma_data_size(cprm->mm_flags); 1592 + offset += vma_data_size; 1576 1593 offset += elf_core_extra_data_size(); 1577 1594 e_shoff = offset; 1578 1595 ··· 1592 1609 goto end_coredump; 1593 1610 1594 1611 /* write program headers for segments dump */ 1595 - for (vma = current->mm->mmap; vma; vma = vma->vm_next) { 1612 + for (i = 0; i < vma_count; i++) { 1613 + struct core_vma_metadata *meta = vma_meta + i; 1596 1614 struct elf_phdr phdr; 1597 1615 size_t sz; 1598 1616 1599 - sz = vma->vm_end - vma->vm_start; 1617 + sz = meta->end - meta->start; 1600 1618 1601 1619 phdr.p_type = PT_LOAD; 1602 1620 phdr.p_offset = offset; 1603 - phdr.p_vaddr = vma->vm_start; 1621 + phdr.p_vaddr = meta->start; 1604 1622 phdr.p_paddr = 0; 1605 - phdr.p_filesz = vma_dump_size(vma, cprm->mm_flags); 1623 + phdr.p_filesz = meta->dump_size; 1606 1624 phdr.p_memsz = sz; 1607 1625 offset += phdr.p_filesz; 1608 - phdr.p_flags = vma->vm_flags & VM_READ ? 
PF_R : 0; 1609 - if (vma->vm_flags & VM_WRITE) 1626 + phdr.p_flags = 0; 1627 + if (meta->flags & VM_READ) 1628 + phdr.p_flags |= PF_R; 1629 + if (meta->flags & VM_WRITE) 1610 1630 phdr.p_flags |= PF_W; 1611 - if (vma->vm_flags & VM_EXEC) 1631 + if (meta->flags & VM_EXEC) 1612 1632 phdr.p_flags |= PF_X; 1613 1633 phdr.p_align = ELF_EXEC_PAGESIZE; 1614 1634 ··· 1643 1657 if (!dump_skip(cprm, dataoff - cprm->pos)) 1644 1658 goto end_coredump; 1645 1659 1646 - if (!elf_fdpic_dump_segments(cprm)) 1660 + if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count)) 1647 1661 goto end_coredump; 1648 1662 1649 1663 if (!elf_core_write_extra_data(cprm)) ··· 1667 1681 thread_list = thread_list->next; 1668 1682 kfree(tmp); 1669 1683 } 1684 + kvfree(vma_meta); 1670 1685 kfree(phdr4note); 1671 1686 kfree(elf); 1672 1687 kfree(psinfo);
+80 -1
fs/coredump.c
··· 971 971 /* 972 972 * Decide how much of @vma's contents should be included in a core dump. 973 973 */ 974 - unsigned long vma_dump_size(struct vm_area_struct *vma, unsigned long mm_flags) 974 + static unsigned long vma_dump_size(struct vm_area_struct *vma, 975 + unsigned long mm_flags) 975 976 { 976 977 #define FILTER(type) (mm_flags & (1UL << MMF_DUMP_##type)) 977 978 ··· 1037 1036 1038 1037 whole: 1039 1038 return vma->vm_end - vma->vm_start; 1039 + } 1040 + 1041 + static struct vm_area_struct *first_vma(struct task_struct *tsk, 1042 + struct vm_area_struct *gate_vma) 1043 + { 1044 + struct vm_area_struct *ret = tsk->mm->mmap; 1045 + 1046 + if (ret) 1047 + return ret; 1048 + return gate_vma; 1049 + } 1050 + 1051 + /* 1052 + * Helper function for iterating across a vma list. It ensures that the caller 1053 + * will visit `gate_vma' prior to terminating the search. 1054 + */ 1055 + static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, 1056 + struct vm_area_struct *gate_vma) 1057 + { 1058 + struct vm_area_struct *ret; 1059 + 1060 + ret = this_vma->vm_next; 1061 + if (ret) 1062 + return ret; 1063 + if (this_vma == gate_vma) 1064 + return NULL; 1065 + return gate_vma; 1066 + } 1067 + 1068 + /* 1069 + * Under the mmap_lock, take a snapshot of relevant information about the task's 1070 + * VMAs. 1071 + */ 1072 + int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, 1073 + struct core_vma_metadata **vma_meta, 1074 + size_t *vma_data_size_ptr) 1075 + { 1076 + struct vm_area_struct *vma, *gate_vma; 1077 + struct mm_struct *mm = current->mm; 1078 + int i; 1079 + size_t vma_data_size = 0; 1080 + 1081 + /* 1082 + * Once the stack expansion code is fixed to not change VMA bounds 1083 + * under mmap_lock in read mode, this can be changed to take the 1084 + * mmap_lock in read mode. 1085 + */ 1086 + if (mmap_write_lock_killable(mm)) 1087 + return -EINTR; 1088 + 1089 + gate_vma = get_gate_vma(mm); 1090 + *vma_count = mm->map_count + (gate_vma ? 
1 : 0); 1091 + 1092 + *vma_meta = kvmalloc_array(*vma_count, sizeof(**vma_meta), GFP_KERNEL); 1093 + if (!*vma_meta) { 1094 + mmap_write_unlock(mm); 1095 + return -ENOMEM; 1096 + } 1097 + 1098 + for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; 1099 + vma = next_vma(vma, gate_vma), i++) { 1100 + struct core_vma_metadata *m = (*vma_meta) + i; 1101 + 1102 + m->start = vma->vm_start; 1103 + m->end = vma->vm_end; 1104 + m->flags = vma->vm_flags; 1105 + m->dump_size = vma_dump_size(vma, cprm->mm_flags); 1106 + 1107 + vma_data_size += m->dump_size; 1108 + } 1109 + 1110 + mmap_write_unlock(mm); 1111 + 1112 + if (WARN_ON(i != *vma_count)) 1113 + return -EFAULT; 1114 + 1115 + *vma_data_size_ptr = vma_data_size; 1116 + return 0; 1040 1117 }
+9 -1
include/linux/coredump.h
··· 7 7 #include <linux/fs.h> 8 8 #include <asm/siginfo.h> 9 9 10 + struct core_vma_metadata { 11 + unsigned long start, end; 12 + unsigned long flags; 13 + unsigned long dump_size; 14 + }; 15 + 10 16 /* 11 17 * These are the only things you should do on a core-file: use only these 12 18 * functions to write out all the necessary info. ··· 22 16 extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr); 23 17 extern int dump_align(struct coredump_params *cprm, int align); 24 18 extern void dump_truncate(struct coredump_params *cprm); 25 - unsigned long vma_dump_size(struct vm_area_struct *vma, unsigned long mm_flags); 26 19 int dump_user_range(struct coredump_params *cprm, unsigned long start, 27 20 unsigned long len); 21 + int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, 22 + struct core_vma_metadata **vma_meta, 23 + size_t *vma_data_size_ptr); 28 24 #ifdef CONFIG_COREDUMP 29 25 extern void do_coredump(const kernel_siginfo_t *siginfo); 30 26 #else