procfs: avoid fetching build ID while holding VMA lock

Fix PROCMAP_QUERY to fetch the optional build ID only after dropping the
mmap_lock or per-VMA lock (whichever was used to lock the VMA in
question), to avoid the deadlock reported by syzbot:

-> #1 (&mm->mmap_lock){++++}-{4:4}:
__might_fault+0xed/0x170
_copy_to_iter+0x118/0x1720
copy_page_to_iter+0x12d/0x1e0
filemap_read+0x720/0x10a0
blkdev_read_iter+0x2b5/0x4e0
vfs_read+0x7f4/0xae0
ksys_read+0x12a/0x250
do_syscall_64+0xcb/0xf80
entry_SYSCALL_64_after_hwframe+0x77/0x7f

-> #0 (&sb->s_type->i_mutex_key#8){++++}-{4:4}:
__lock_acquire+0x1509/0x26d0
lock_acquire+0x185/0x340
down_read+0x98/0x490
blkdev_read_iter+0x2a7/0x4e0
__kernel_read+0x39a/0xa90
freader_fetch+0x1d5/0xa80
__build_id_parse.isra.0+0xea/0x6a0
do_procmap_query+0xd75/0x1050
procfs_procmap_ioctl+0x7a/0xb0
__x64_sys_ioctl+0x18e/0x210
do_syscall_64+0xcb/0xf80
entry_SYSCALL_64_after_hwframe+0x77/0x7f

other info that might help us debug this:

Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  rlock(&mm->mmap_lock);
                               lock(&sb->s_type->i_mutex_key#8);
                               lock(&mm->mmap_lock);
  rlock(&sb->s_type->i_mutex_key#8);

*** DEADLOCK ***

This seems to have been exacerbated (we hadn't seen these syzbot reports
before it) by the recent commit:

777a8560fd29 ("lib/buildid: use __kernel_read() for sleepable context")

To make this safe, we need to grab the file's refcount while the VMA is
still locked; other than that, everything is pretty straightforward. The
internal build_id_parse() API assumes a VMA is passed, but it only needs
the underlying file, so add another variant, build_id_parse_file(), that
takes a file directly.
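
In do_procmap_query() terms the resulting ordering is roughly as follows
(a condensed sketch of the change below, with error handling elided):

  /* with the per-VMA lock or mmap_lock still held */
  if (karg.build_id_size && vma->vm_file)
  	vm_file = get_file(vma->vm_file);	/* pin the file */

  /* drop whichever lock was taken and release the mm */
  query_vma_teardown(&lock_ctx);
  mmput(mm);

  /* faulting in file data for the build ID is safe now */
  if (vm_file) {
  	err = build_id_parse_file(vm_file, build_id_buf, &build_id_sz);
  	fput(vm_file);
  }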

[akpm@linux-foundation.org: fix up kerneldoc]
Link: https://lkml.kernel.org/r/20260129215340.3742283-1-andrii@kernel.org
Fixes: ed5d583a88a9 ("fs/procfs: implement efficient VMA querying API for /proc/<pid>/maps")
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Reported-by: <syzbot+4e70c8e0a2017b432f7a@syzkaller.appspotmail.com>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Tested-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Eduard Zingerman <eddyz87@gmail.com>
Cc: Hao Luo <haoluo@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: KP Singh <kpsingh@kernel.org>
Cc: Martin KaFai Lau <martin.lau@linux.dev>
Cc: Song Liu <song@kernel.org>
Cc: Stanislav Fomichev <sdf@fomichev.me>
Cc: Yonghong Song <yonghong.song@linux.dev>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

 fs/proc/task_mmu.c      | +27 -15
 include/linux/buildid.h |  +3
 lib/buildid.c           | +30 -12
 3 files changed, 60 insertions(+), 27 deletions(-)

fs/proc/task_mmu.c

···
 	struct proc_maps_locking_ctx lock_ctx = { .mm = mm };
 	struct procmap_query karg;
 	struct vm_area_struct *vma;
+	struct file *vm_file = NULL;
 	const char *name = NULL;
 	char build_id_buf[BUILD_ID_SIZE_MAX], *name_buf = NULL;
 	__u64 usize;
···
 		karg.inode = 0;
 	}
 
-	if (karg.build_id_size) {
-		__u32 build_id_sz;
-
-		err = build_id_parse(vma, build_id_buf, &build_id_sz);
-		if (err) {
-			karg.build_id_size = 0;
-		} else {
-			if (karg.build_id_size < build_id_sz) {
-				err = -ENAMETOOLONG;
-				goto out;
-			}
-			karg.build_id_size = build_id_sz;
-		}
-	}
-
 	if (karg.vma_name_size) {
 		size_t name_buf_sz = min_t(size_t, PATH_MAX, karg.vma_name_size);
 		const struct path *path;
···
 		karg.vma_name_size = name_sz;
 	}
 
+	if (karg.build_id_size && vma->vm_file)
+		vm_file = get_file(vma->vm_file);
+
 	/* unlock vma or mmap_lock, and put mm_struct before copying data to user */
 	query_vma_teardown(&lock_ctx);
 	mmput(mm);
+
+	if (karg.build_id_size) {
+		__u32 build_id_sz;
+
+		if (vm_file)
+			err = build_id_parse_file(vm_file, build_id_buf, &build_id_sz);
+		else
+			err = -ENOENT;
+		if (err) {
+			karg.build_id_size = 0;
+		} else {
+			if (karg.build_id_size < build_id_sz) {
+				err = -ENAMETOOLONG;
+				goto out;
+			}
+			karg.build_id_size = build_id_sz;
+		}
+	}
+
+	if (vm_file)
+		fput(vm_file);
 
 	if (karg.vma_name_size && copy_to_user(u64_to_user_ptr(karg.vma_name_addr),
 					       name, karg.vma_name_size)) {
···
 out:
 	query_vma_teardown(&lock_ctx);
 	mmput(mm);
+	if (vm_file)
+		fput(vm_file);
 	kfree(name_buf);
 	return err;
 }
include/linux/buildid.h

···
 #define BUILD_ID_SIZE_MAX 20
 
 struct vm_area_struct;
+struct file;
+
 int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size);
+int build_id_parse_file(struct file *file, unsigned char *build_id, __u32 *size);
 int build_id_parse_nofault(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size);
 int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size);
lib/buildid.c

···
 /* enough for Elf64_Ehdr, Elf64_Phdr, and all the smaller requests */
 #define MAX_FREADER_BUF_SZ 64
 
-static int __build_id_parse(struct vm_area_struct *vma, unsigned char *build_id,
+static int __build_id_parse(struct file *file, unsigned char *build_id,
 			    __u32 *size, bool may_fault)
 {
 	const Elf32_Ehdr *ehdr;
···
 	char buf[MAX_FREADER_BUF_SZ];
 	int ret;
 
-	/* only works for page backed storage */
-	if (!vma->vm_file)
-		return -EINVAL;
-
-	freader_init_from_file(&r, buf, sizeof(buf), vma->vm_file, may_fault);
+	freader_init_from_file(&r, buf, sizeof(buf), file, may_fault);
 
 	/* fetch first 18 bytes of ELF header for checks */
 	ehdr = freader_fetch(&r, 0, offsetofend(Elf32_Ehdr, e_type));
···
 	return ret;
 }
 
-/*
- * Parse build ID of ELF file mapped to vma
+/**
+ * build_id_parse_nofault() - Parse build ID of ELF file mapped to vma
  * @vma: vma object
  * @build_id: buffer to store build id, at least BUILD_ID_SIZE long
  * @size: returns actual build id size in case of success
···
  */
 int build_id_parse_nofault(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size)
 {
-	return __build_id_parse(vma, build_id, size, false /* !may_fault */);
+	if (!vma->vm_file)
+		return -EINVAL;
+
+	return __build_id_parse(vma->vm_file, build_id, size, false /* !may_fault */);
 }
 
-/*
- * Parse build ID of ELF file mapped to VMA
+/**
+ * build_id_parse() - Parse build ID of ELF file mapped to VMA
  * @vma: vma object
  * @build_id: buffer to store build id, at least BUILD_ID_SIZE long
  * @size: returns actual build id size in case of success
···
  */
 int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size)
 {
-	return __build_id_parse(vma, build_id, size, true /* may_fault */);
+	if (!vma->vm_file)
+		return -EINVAL;
+
+	return __build_id_parse(vma->vm_file, build_id, size, true /* may_fault */);
+}
+
+/**
+ * build_id_parse_file() - Parse build ID of ELF file
+ * @file: file object
+ * @build_id: buffer to store build id, at least BUILD_ID_SIZE long
+ * @size: returns actual build id size in case of success
+ *
+ * Assumes faultable context and can cause page faults to bring in file data
+ * into page cache.
+ *
+ * Return: 0 on success; negative error, otherwise
+ */
+int build_id_parse_file(struct file *file, unsigned char *build_id, __u32 *size)
+{
+	return __build_id_parse(file, build_id, size, true /* may_fault */);
 }
 
 /**