Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
Topics: kernel, os, linux

Merge tag 'perf-urgent-2026-03-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf events fixes from Ingo Molnar:

- Fix lock ordering bug found by lockdep in perf_event_wakeup()

- Fix uncore counter enumeration on Granite Rapids and Sierra Forest

- Fix perf_mmap() refcount bug found by Syzkaller

- Fix __perf_event_overflow() vs perf_remove_from_context() race

* tag 'perf-urgent-2026-03-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
perf: Fix __perf_event_overflow() vs perf_remove_from_context() race
perf/core: Fix refcount bug and potential UAF in perf_mmap
perf/x86/intel/uncore: Add per-scheduler IMC CAS count events
perf/core: Fix invalid wait context in ctx_sched_in()

Diffstat (total): +91 −24
arch/x86/events/intel/uncore_snbep.c: +27 −1
··· 6497 6497 .attr_update = uncore_alias_groups, 6498 6498 }; 6499 6499 6500 + static struct uncore_event_desc gnr_uncore_imc_events[] = { 6501 + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x01,umask=0x00"), 6502 + INTEL_UNCORE_EVENT_DESC(cas_count_read_sch0, "event=0x05,umask=0xcf"), 6503 + INTEL_UNCORE_EVENT_DESC(cas_count_read_sch0.scale, "6.103515625e-5"), 6504 + INTEL_UNCORE_EVENT_DESC(cas_count_read_sch0.unit, "MiB"), 6505 + INTEL_UNCORE_EVENT_DESC(cas_count_read_sch1, "event=0x06,umask=0xcf"), 6506 + INTEL_UNCORE_EVENT_DESC(cas_count_read_sch1.scale, "6.103515625e-5"), 6507 + INTEL_UNCORE_EVENT_DESC(cas_count_read_sch1.unit, "MiB"), 6508 + INTEL_UNCORE_EVENT_DESC(cas_count_write_sch0, "event=0x05,umask=0xf0"), 6509 + INTEL_UNCORE_EVENT_DESC(cas_count_write_sch0.scale, "6.103515625e-5"), 6510 + INTEL_UNCORE_EVENT_DESC(cas_count_write_sch0.unit, "MiB"), 6511 + INTEL_UNCORE_EVENT_DESC(cas_count_write_sch1, "event=0x06,umask=0xf0"), 6512 + INTEL_UNCORE_EVENT_DESC(cas_count_write_sch1.scale, "6.103515625e-5"), 6513 + INTEL_UNCORE_EVENT_DESC(cas_count_write_sch1.unit, "MiB"), 6514 + { /* end: all zeroes */ }, 6515 + }; 6516 + 6517 + static struct intel_uncore_type gnr_uncore_imc = { 6518 + SPR_UNCORE_MMIO_COMMON_FORMAT(), 6519 + .name = "imc", 6520 + .fixed_ctr_bits = 48, 6521 + .fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR, 6522 + .fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL, 6523 + .event_descs = gnr_uncore_imc_events, 6524 + }; 6525 + 6500 6526 static struct intel_uncore_type gnr_uncore_pciex8 = { 6501 6527 SPR_UNCORE_PCI_COMMON_FORMAT(), 6502 6528 .name = "pciex8", ··· 6570 6544 NULL, 6571 6545 &spr_uncore_pcu, 6572 6546 &gnr_uncore_ubox, 6573 - &spr_uncore_imc, 6547 + &gnr_uncore_imc, 6574 6548 NULL, 6575 6549 &gnr_uncore_upi, 6576 6550 NULL,
kernel/events/core.c: +64 −23
··· 4138 4138 if (*perf_event_fasync(event)) 4139 4139 event->pending_kill = POLL_ERR; 4140 4140 4141 - perf_event_wakeup(event); 4141 + event->pending_wakeup = 1; 4142 + irq_work_queue(&event->pending_irq); 4142 4143 } else { 4143 4144 struct perf_cpu_pmu_context *cpc = this_cpc(event->pmu_ctx->pmu); 4144 4145 ··· 7465 7464 ret = perf_mmap_aux(vma, event, nr_pages); 7466 7465 if (ret) 7467 7466 return ret; 7467 + 7468 + /* 7469 + * Since pinned accounting is per vm we cannot allow fork() to copy our 7470 + * vma. 7471 + */ 7472 + vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); 7473 + vma->vm_ops = &perf_mmap_vmops; 7474 + 7475 + mapped = get_mapped(event, event_mapped); 7476 + if (mapped) 7477 + mapped(event, vma->vm_mm); 7478 + 7479 + /* 7480 + * Try to map it into the page table. On fail, invoke 7481 + * perf_mmap_close() to undo the above, as the callsite expects 7482 + * full cleanup in this case and therefore does not invoke 7483 + * vmops::close(). 7484 + */ 7485 + ret = map_range(event->rb, vma); 7486 + if (ret) 7487 + perf_mmap_close(vma); 7468 7488 } 7469 - 7470 - /* 7471 - * Since pinned accounting is per vm we cannot allow fork() to copy our 7472 - * vma. 7473 - */ 7474 - vm_flags_set(vma, VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP); 7475 - vma->vm_ops = &perf_mmap_vmops; 7476 - 7477 - mapped = get_mapped(event, event_mapped); 7478 - if (mapped) 7479 - mapped(event, vma->vm_mm); 7480 - 7481 - /* 7482 - * Try to map it into the page table. On fail, invoke 7483 - * perf_mmap_close() to undo the above, as the callsite expects 7484 - * full cleanup in this case and therefore does not invoke 7485 - * vmops::close(). 7486 - */ 7487 - ret = map_range(event->rb, vma); 7488 - if (ret) 7489 - perf_mmap_close(vma); 7490 7489 7491 7490 return ret; 7492 7491 } ··· 10777 10776 struct perf_sample_data *data, 10778 10777 struct pt_regs *regs) 10779 10778 { 10779 + /* 10780 + * Entry point from hardware PMI, interrupts should be disabled here. 
10781 + * This serializes us against perf_event_remove_from_context() in 10782 + * things like perf_event_release_kernel(). 10783 + */ 10784 + lockdep_assert_irqs_disabled(); 10785 + 10780 10786 return __perf_event_overflow(event, 1, data, regs); 10781 10787 } 10782 10788 ··· 10860 10852 { 10861 10853 struct hw_perf_event *hwc = &event->hw; 10862 10854 10855 + /* 10856 + * This is: 10857 + * - software preempt 10858 + * - tracepoint preempt 10859 + * - tp_target_task irq (ctx->lock) 10860 + * - uprobes preempt/irq 10861 + * - kprobes preempt/irq 10862 + * - hw_breakpoint irq 10863 + * 10864 + * Any of these are sufficient to hold off RCU and thus ensure @event 10865 + * exists. 10866 + */ 10867 + lockdep_assert_preemption_disabled(); 10863 10868 local64_add(nr, &event->count); 10864 10869 10865 10870 if (!regs) 10866 10871 return; 10867 10872 10868 10873 if (!is_sampling_event(event)) 10874 + return; 10875 + 10876 + /* 10877 + * Serialize against event_function_call() IPIs like normal overflow 10878 + * event handling. Specifically, must not allow 10879 + * perf_event_release_kernel() -> perf_remove_from_context() to make 10880 + * progress and 'release' the event from under us. 10881 + */ 10882 + guard(irqsave)(); 10883 + if (event->state != PERF_EVENT_STATE_ACTIVE) 10869 10884 return; 10870 10885 10871 10886 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) { ··· 11389 11358 struct perf_sample_data data; 11390 11359 struct perf_event *event; 11391 11360 11361 + /* 11362 + * Per being a tracepoint, this runs with preemption disabled. 11363 + */ 11364 + lockdep_assert_preemption_disabled(); 11365 + 11392 11366 struct perf_raw_record raw = { 11393 11367 .frag = { 11394 11368 .size = entry_size, ··· 11725 11689 { 11726 11690 struct perf_sample_data sample; 11727 11691 struct pt_regs *regs = data; 11692 + 11693 + /* 11694 + * Exception context, will have interrupts disabled. 
11695 + */ 11696 + lockdep_assert_irqs_disabled(); 11728 11697 11729 11698 perf_sample_data_init(&sample, bp->attr.bp_addr, 0); 11730 11699 ··· 12195 12154 12196 12155 if (regs && !perf_exclude_event(event, regs)) { 12197 12156 if (!(event->attr.exclude_idle && is_idle_task(current))) 12198 - if (__perf_event_overflow(event, 1, &data, regs)) 12157 + if (perf_event_overflow(event, &data, regs)) 12199 12158 ret = HRTIMER_NORESTART; 12200 12159 } 12201 12160