Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork

Configure Feed

Select the types of activity you want to include in your feed.

at v5.18-rc6 670 lines 17 kB view raw
1// SPDX-License-Identifier: GPL-2.0-only 2/* Copyright (c) 2020 Facebook */ 3 4#include <linux/init.h> 5#include <linux/namei.h> 6#include <linux/pid_namespace.h> 7#include <linux/fs.h> 8#include <linux/fdtable.h> 9#include <linux/filter.h> 10#include <linux/btf_ids.h> 11#include "mmap_unlock_work.h" 12 13struct bpf_iter_seq_task_common { 14 struct pid_namespace *ns; 15}; 16 17struct bpf_iter_seq_task_info { 18 /* The first field must be struct bpf_iter_seq_task_common. 19 * this is assumed by {init, fini}_seq_pidns() callback functions. 20 */ 21 struct bpf_iter_seq_task_common common; 22 u32 tid; 23}; 24 25static struct task_struct *task_seq_get_next(struct pid_namespace *ns, 26 u32 *tid, 27 bool skip_if_dup_files) 28{ 29 struct task_struct *task = NULL; 30 struct pid *pid; 31 32 rcu_read_lock(); 33retry: 34 pid = find_ge_pid(*tid, ns); 35 if (pid) { 36 *tid = pid_nr_ns(pid, ns); 37 task = get_pid_task(pid, PIDTYPE_PID); 38 if (!task) { 39 ++*tid; 40 goto retry; 41 } else if (skip_if_dup_files && !thread_group_leader(task) && 42 task->files == task->group_leader->files) { 43 put_task_struct(task); 44 task = NULL; 45 ++*tid; 46 goto retry; 47 } 48 } 49 rcu_read_unlock(); 50 51 return task; 52} 53 54static void *task_seq_start(struct seq_file *seq, loff_t *pos) 55{ 56 struct bpf_iter_seq_task_info *info = seq->private; 57 struct task_struct *task; 58 59 task = task_seq_get_next(info->common.ns, &info->tid, false); 60 if (!task) 61 return NULL; 62 63 if (*pos == 0) 64 ++*pos; 65 return task; 66} 67 68static void *task_seq_next(struct seq_file *seq, void *v, loff_t *pos) 69{ 70 struct bpf_iter_seq_task_info *info = seq->private; 71 struct task_struct *task; 72 73 ++*pos; 74 ++info->tid; 75 put_task_struct((struct task_struct *)v); 76 task = task_seq_get_next(info->common.ns, &info->tid, false); 77 if (!task) 78 return NULL; 79 80 return task; 81} 82 83struct bpf_iter__task { 84 __bpf_md_ptr(struct bpf_iter_meta *, meta); 85 __bpf_md_ptr(struct task_struct *, task); 86}; 87 88DEFINE_BPF_ITER_FUNC(task, struct bpf_iter_meta *meta, struct task_struct *task) 89 90static int __task_seq_show(struct seq_file *seq, struct task_struct *task, 91 bool in_stop) 92{ 93 struct bpf_iter_meta meta; 94 struct bpf_iter__task ctx; 95 struct bpf_prog *prog; 96 97 meta.seq = seq; 98 prog = bpf_iter_get_info(&meta, in_stop); 99 if (!prog) 100 return 0; 101 102 meta.seq = seq; 103 ctx.meta = &meta; 104 ctx.task = task; 105 return bpf_iter_run_prog(prog, &ctx); 106} 107 108static int task_seq_show(struct seq_file *seq, void *v) 109{ 110 return __task_seq_show(seq, v, false); 111} 112 113static void task_seq_stop(struct seq_file *seq, void *v) 114{ 115 if (!v) 116 (void)__task_seq_show(seq, v, true); 117 else 118 put_task_struct((struct task_struct *)v); 119} 120 121static const struct seq_operations task_seq_ops = { 122 .start = task_seq_start, 123 .next = task_seq_next, 124 .stop = task_seq_stop, 125 .show = task_seq_show, 126}; 127 128struct bpf_iter_seq_task_file_info { 129 /* The first field must be struct bpf_iter_seq_task_common. 130 * this is assumed by {init, fini}_seq_pidns() callback functions. 131 */ 132 struct bpf_iter_seq_task_common common; 133 struct task_struct *task; 134 u32 tid; 135 u32 fd; 136}; 137 138static struct file * 139task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) 140{ 141 struct pid_namespace *ns = info->common.ns; 142 u32 curr_tid = info->tid; 143 struct task_struct *curr_task; 144 unsigned int curr_fd = info->fd; 145 146 /* If this function returns a non-NULL file object, 147 * it held a reference to the task/file. 148 * Otherwise, it does not hold any reference. 149 */ 150again: 151 if (info->task) { 152 curr_task = info->task; 153 curr_fd = info->fd; 154 } else { 155 curr_task = task_seq_get_next(ns, &curr_tid, true); 156 if (!curr_task) { 157 info->task = NULL; 158 info->tid = curr_tid; 159 return NULL; 160 } 161 162 /* set info->task and info->tid */ 163 info->task = curr_task; 164 if (curr_tid == info->tid) { 165 curr_fd = info->fd; 166 } else { 167 info->tid = curr_tid; 168 curr_fd = 0; 169 } 170 } 171 172 rcu_read_lock(); 173 for (;; curr_fd++) { 174 struct file *f; 175 f = task_lookup_next_fd_rcu(curr_task, &curr_fd); 176 if (!f) 177 break; 178 if (!get_file_rcu(f)) 179 continue; 180 181 /* set info->fd */ 182 info->fd = curr_fd; 183 rcu_read_unlock(); 184 return f; 185 } 186 187 /* the current task is done, go to the next task */ 188 rcu_read_unlock(); 189 put_task_struct(curr_task); 190 info->task = NULL; 191 info->fd = 0; 192 curr_tid = ++(info->tid); 193 goto again; 194} 195 196static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) 197{ 198 struct bpf_iter_seq_task_file_info *info = seq->private; 199 struct file *file; 200 201 info->task = NULL; 202 file = task_file_seq_get_next(info); 203 if (file && *pos == 0) 204 ++*pos; 205 206 return file; 207} 208 209static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos) 210{ 211 struct bpf_iter_seq_task_file_info *info = seq->private; 212 213 ++*pos; 214 ++info->fd; 215 fput((struct file *)v); 216 return task_file_seq_get_next(info); 217} 218 219struct bpf_iter__task_file { 220 __bpf_md_ptr(struct bpf_iter_meta *, meta); 221 __bpf_md_ptr(struct task_struct *, task); 222 u32 fd __aligned(8); 223 __bpf_md_ptr(struct file *, file); 224}; 225 226DEFINE_BPF_ITER_FUNC(task_file, struct bpf_iter_meta *meta, 227 struct task_struct *task, u32 fd, 228 struct file *file) 229 230static int __task_file_seq_show(struct seq_file *seq, struct file *file, 231 bool in_stop) 232{ 233 struct bpf_iter_seq_task_file_info *info = seq->private; 234 struct bpf_iter__task_file ctx; 235 struct bpf_iter_meta meta; 236 struct bpf_prog *prog; 237 238 meta.seq = seq; 239 prog = bpf_iter_get_info(&meta, in_stop); 240 if (!prog) 241 return 0; 242 243 ctx.meta = &meta; 244 ctx.task = info->task; 245 ctx.fd = info->fd; 246 ctx.file = file; 247 return bpf_iter_run_prog(prog, &ctx); 248} 249 250static int task_file_seq_show(struct seq_file *seq, void *v) 251{ 252 return __task_file_seq_show(seq, v, false); 253} 254 255static void task_file_seq_stop(struct seq_file *seq, void *v) 256{ 257 struct bpf_iter_seq_task_file_info *info = seq->private; 258 259 if (!v) { 260 (void)__task_file_seq_show(seq, v, true); 261 } else { 262 fput((struct file *)v); 263 put_task_struct(info->task); 264 info->task = NULL; 265 } 266} 267 268static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux) 269{ 270 struct bpf_iter_seq_task_common *common = priv_data; 271 272 common->ns = get_pid_ns(task_active_pid_ns(current)); 273 return 0; 274} 275 276static void fini_seq_pidns(void *priv_data) 277{ 278 struct bpf_iter_seq_task_common *common = priv_data; 279 280 put_pid_ns(common->ns); 281} 282 283static const struct seq_operations task_file_seq_ops = { 284 .start = task_file_seq_start, 285 .next = task_file_seq_next, 286 .stop = task_file_seq_stop, 287 .show = task_file_seq_show, 288}; 289 290struct bpf_iter_seq_task_vma_info { 291 /* The first field must be struct bpf_iter_seq_task_common. 292 * this is assumed by {init, fini}_seq_pidns() callback functions. 293 */ 294 struct bpf_iter_seq_task_common common; 295 struct task_struct *task; 296 struct vm_area_struct *vma; 297 u32 tid; 298 unsigned long prev_vm_start; 299 unsigned long prev_vm_end; 300}; 301 302enum bpf_task_vma_iter_find_op { 303 task_vma_iter_first_vma, /* use mm->mmap */ 304 task_vma_iter_next_vma, /* use curr_vma->vm_next */ 305 task_vma_iter_find_vma, /* use find_vma() to find next vma */ 306}; 307 308static struct vm_area_struct * 309task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) 310{ 311 struct pid_namespace *ns = info->common.ns; 312 enum bpf_task_vma_iter_find_op op; 313 struct vm_area_struct *curr_vma; 314 struct task_struct *curr_task; 315 u32 curr_tid = info->tid; 316 317 /* If this function returns a non-NULL vma, it holds a reference to 318 * the task_struct, and holds read lock on vma->mm->mmap_lock. 319 * If this function returns NULL, it does not hold any reference or 320 * lock. 321 */ 322 if (info->task) { 323 curr_task = info->task; 324 curr_vma = info->vma; 325 /* In case of lock contention, drop mmap_lock to unblock 326 * the writer. 327 * 328 * After relock, call find(mm, prev_vm_end - 1) to find 329 * new vma to process. 330 * 331 * +------+------+-----------+ 332 * | VMA1 | VMA2 | VMA3 | 333 * +------+------+-----------+ 334 * | | | | 335 * 4k 8k 16k 400k 336 * 337 * For example, curr_vma == VMA2. Before unlock, we set 338 * 339 * prev_vm_start = 8k 340 * prev_vm_end = 16k 341 * 342 * There are a few cases: 343 * 344 * 1) VMA2 is freed, but VMA3 exists. 345 * 346 * find_vma() will return VMA3, just process VMA3. 347 * 348 * 2) VMA2 still exists. 349 * 350 * find_vma() will return VMA2, process VMA2->next. 351 * 352 * 3) no more vma in this mm. 353 * 354 * Process the next task. 355 * 356 * 4) find_vma() returns a different vma, VMA2'. 357 * 358 * 4.1) If VMA2 covers same range as VMA2', skip VMA2', 359 * because we already covered the range; 360 * 4.2) VMA2 and VMA2' covers different ranges, process 361 * VMA2'. 362 */ 363 if (mmap_lock_is_contended(curr_task->mm)) { 364 info->prev_vm_start = curr_vma->vm_start; 365 info->prev_vm_end = curr_vma->vm_end; 366 op = task_vma_iter_find_vma; 367 mmap_read_unlock(curr_task->mm); 368 if (mmap_read_lock_killable(curr_task->mm)) 369 goto finish; 370 } else { 371 op = task_vma_iter_next_vma; 372 } 373 } else { 374again: 375 curr_task = task_seq_get_next(ns, &curr_tid, true); 376 if (!curr_task) { 377 info->tid = curr_tid + 1; 378 goto finish; 379 } 380 381 if (curr_tid != info->tid) { 382 info->tid = curr_tid; 383 /* new task, process the first vma */ 384 op = task_vma_iter_first_vma; 385 } else { 386 /* Found the same tid, which means the user space 387 * finished data in previous buffer and read more. 388 * We dropped mmap_lock before returning to user 389 * space, so it is necessary to use find_vma() to 390 * find the next vma to process. 391 */ 392 op = task_vma_iter_find_vma; 393 } 394 395 if (!curr_task->mm) 396 goto next_task; 397 398 if (mmap_read_lock_killable(curr_task->mm)) 399 goto finish; 400 } 401 402 switch (op) { 403 case task_vma_iter_first_vma: 404 curr_vma = curr_task->mm->mmap; 405 break; 406 case task_vma_iter_next_vma: 407 curr_vma = curr_vma->vm_next; 408 break; 409 case task_vma_iter_find_vma: 410 /* We dropped mmap_lock so it is necessary to use find_vma 411 * to find the next vma. This is similar to the mechanism 412 * in show_smaps_rollup(). 413 */ 414 curr_vma = find_vma(curr_task->mm, info->prev_vm_end - 1); 415 /* case 1) and 4.2) above just use curr_vma */ 416 417 /* check for case 2) or case 4.1) above */ 418 if (curr_vma && 419 curr_vma->vm_start == info->prev_vm_start && 420 curr_vma->vm_end == info->prev_vm_end) 421 curr_vma = curr_vma->vm_next; 422 break; 423 } 424 if (!curr_vma) { 425 /* case 3) above, or case 2) 4.1) with vma->next == NULL */ 426 mmap_read_unlock(curr_task->mm); 427 goto next_task; 428 } 429 info->task = curr_task; 430 info->vma = curr_vma; 431 return curr_vma; 432 433next_task: 434 put_task_struct(curr_task); 435 info->task = NULL; 436 curr_tid++; 437 goto again; 438 439finish: 440 if (curr_task) 441 put_task_struct(curr_task); 442 info->task = NULL; 443 info->vma = NULL; 444 return NULL; 445} 446 447static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos) 448{ 449 struct bpf_iter_seq_task_vma_info *info = seq->private; 450 struct vm_area_struct *vma; 451 452 vma = task_vma_seq_get_next(info); 453 if (vma && *pos == 0) 454 ++*pos; 455 456 return vma; 457} 458 459static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos) 460{ 461 struct bpf_iter_seq_task_vma_info *info = seq->private; 462 463 ++*pos; 464 return task_vma_seq_get_next(info); 465} 466 467struct bpf_iter__task_vma { 468 __bpf_md_ptr(struct bpf_iter_meta *, meta); 469 __bpf_md_ptr(struct task_struct *, task); 470 __bpf_md_ptr(struct vm_area_struct *, vma); 471}; 472 473DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta, 474 struct task_struct *task, struct vm_area_struct *vma) 475 476static int __task_vma_seq_show(struct seq_file *seq, bool in_stop) 477{ 478 struct bpf_iter_seq_task_vma_info *info = seq->private; 479 struct bpf_iter__task_vma ctx; 480 struct bpf_iter_meta meta; 481 struct bpf_prog *prog; 482 483 meta.seq = seq; 484 prog = bpf_iter_get_info(&meta, in_stop); 485 if (!prog) 486 return 0; 487 488 ctx.meta = &meta; 489 ctx.task = info->task; 490 ctx.vma = info->vma; 491 return bpf_iter_run_prog(prog, &ctx); 492} 493 494static int task_vma_seq_show(struct seq_file *seq, void *v) 495{ 496 return __task_vma_seq_show(seq, false); 497} 498 499static void task_vma_seq_stop(struct seq_file *seq, void *v) 500{ 501 struct bpf_iter_seq_task_vma_info *info = seq->private; 502 503 if (!v) { 504 (void)__task_vma_seq_show(seq, true); 505 } else { 506 /* info->vma has not been seen by the BPF program. If the 507 * user space reads more, task_vma_seq_get_next should 508 * return this vma again. Set prev_vm_start to ~0UL, 509 * so that we don't skip the vma returned by the next 510 * find_vma() (case task_vma_iter_find_vma in 511 * task_vma_seq_get_next()). 512 */ 513 info->prev_vm_start = ~0UL; 514 info->prev_vm_end = info->vma->vm_end; 515 mmap_read_unlock(info->task->mm); 516 put_task_struct(info->task); 517 info->task = NULL; 518 } 519} 520 521static const struct seq_operations task_vma_seq_ops = { 522 .start = task_vma_seq_start, 523 .next = task_vma_seq_next, 524 .stop = task_vma_seq_stop, 525 .show = task_vma_seq_show, 526}; 527 528static const struct bpf_iter_seq_info task_seq_info = { 529 .seq_ops = &task_seq_ops, 530 .init_seq_private = init_seq_pidns, 531 .fini_seq_private = fini_seq_pidns, 532 .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), 533}; 534 535static struct bpf_iter_reg task_reg_info = { 536 .target = "task", 537 .feature = BPF_ITER_RESCHED, 538 .ctx_arg_info_size = 1, 539 .ctx_arg_info = { 540 { offsetof(struct bpf_iter__task, task), 541 PTR_TO_BTF_ID_OR_NULL }, 542 }, 543 .seq_info = &task_seq_info, 544}; 545 546static const struct bpf_iter_seq_info task_file_seq_info = { 547 .seq_ops = &task_file_seq_ops, 548 .init_seq_private = init_seq_pidns, 549 .fini_seq_private = fini_seq_pidns, 550 .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), 551}; 552 553static struct bpf_iter_reg task_file_reg_info = { 554 .target = "task_file", 555 .feature = BPF_ITER_RESCHED, 556 .ctx_arg_info_size = 2, 557 .ctx_arg_info = { 558 { offsetof(struct bpf_iter__task_file, task), 559 PTR_TO_BTF_ID_OR_NULL }, 560 { offsetof(struct bpf_iter__task_file, file), 561 PTR_TO_BTF_ID_OR_NULL }, 562 }, 563 .seq_info = &task_file_seq_info, 564}; 565 566static const struct bpf_iter_seq_info task_vma_seq_info = { 567 .seq_ops = &task_vma_seq_ops, 568 .init_seq_private = init_seq_pidns, 569 .fini_seq_private = fini_seq_pidns, 570 .seq_priv_size = sizeof(struct bpf_iter_seq_task_vma_info), 571}; 572 573static struct bpf_iter_reg task_vma_reg_info = { 574 .target = "task_vma", 575 .feature = BPF_ITER_RESCHED, 576 .ctx_arg_info_size = 2, 577 .ctx_arg_info = { 578 { offsetof(struct bpf_iter__task_vma, task), 579 PTR_TO_BTF_ID_OR_NULL }, 580 { offsetof(struct bpf_iter__task_vma, vma), 581 PTR_TO_BTF_ID_OR_NULL }, 582 }, 583 .seq_info = &task_vma_seq_info, 584}; 585 586BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start, 587 bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags) 588{ 589 struct mmap_unlock_irq_work *work = NULL; 590 struct vm_area_struct *vma; 591 bool irq_work_busy = false; 592 struct mm_struct *mm; 593 int ret = -ENOENT; 594 595 if (flags) 596 return -EINVAL; 597 598 if (!task) 599 return -ENOENT; 600 601 mm = task->mm; 602 if (!mm) 603 return -ENOENT; 604 605 irq_work_busy = bpf_mmap_unlock_get_irq_work(&work); 606 607 if (irq_work_busy || !mmap_read_trylock(mm)) 608 return -EBUSY; 609 610 vma = find_vma(mm, start); 611 612 if (vma && vma->vm_start <= start && vma->vm_end > start) { 613 callback_fn((u64)(long)task, (u64)(long)vma, 614 (u64)(long)callback_ctx, 0, 0); 615 ret = 0; 616 } 617 bpf_mmap_unlock_mm(work, mm); 618 return ret; 619} 620 621const struct bpf_func_proto bpf_find_vma_proto = { 622 .func = bpf_find_vma, 623 .ret_type = RET_INTEGER, 624 .arg1_type = ARG_PTR_TO_BTF_ID, 625 .arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK], 626 .arg2_type = ARG_ANYTHING, 627 .arg3_type = ARG_PTR_TO_FUNC, 628 .arg4_type = ARG_PTR_TO_STACK_OR_NULL, 629 .arg5_type = ARG_ANYTHING, 630}; 631 632DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work); 633 634static void do_mmap_read_unlock(struct irq_work *entry) 635{ 636 struct mmap_unlock_irq_work *work; 637 638 if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT))) 639 return; 640 641 work = container_of(entry, struct mmap_unlock_irq_work, irq_work); 642 mmap_read_unlock_non_owner(work->mm); 643} 644 645static int __init task_iter_init(void) 646{ 647 struct mmap_unlock_irq_work *work; 648 int ret, cpu; 649 650 for_each_possible_cpu(cpu) { 651 work = per_cpu_ptr(&mmap_unlock_work, cpu); 652 init_irq_work(&work->irq_work, do_mmap_read_unlock); 653 } 654 655 task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK]; 656 ret = bpf_iter_reg_target(&task_reg_info); 657 if (ret) 658 return ret; 659 660 task_file_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK]; 661 task_file_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_FILE]; 662 ret = bpf_iter_reg_target(&task_file_reg_info); 663 if (ret) 664 return ret; 665 666 task_vma_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK]; 667 task_vma_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA]; 668 return bpf_iter_reg_target(&task_vma_reg_info); 669} 670late_initcall(task_iter_init);