Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf fixes from Thomas Gleixner:
"A bunch of fixes for perf and kprobes:
- revert a commit that caused a perf group regression
- silence dmesg spam
- fix kprobe probing errors on ia64 and ppc64
- filter kprobe faults from userspace
- lockdep fix for perf exit path
- prevent perf #GP in KVM guest
- correct perf event and filters"

* 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
kprobes: Fix "Failed to find blacklist" probing errors on ia64 and ppc64
kprobes/x86: Don't try to resolve kprobe faults from userspace
perf/x86/intel: Avoid spamming kernel log for BTS buffer failure
perf/x86/intel: Protect LBR and extra_regs against KVM lying
perf: Fix lockdep warning on process exit
perf/x86/intel/uncore: Fix SNB-EP/IVT Cbox filter mappings
perf/x86/intel: Use proper dTLB-load-misses event on IvyBridge
perf: Revert ("perf: Always destroy groups on exit")

+3
arch/x86/kernel/cpu/perf_event.c
···
                         continue;
                 if (event->attr.config1 & ~er->valid_mask)
                         return -EINVAL;
+                /* Check if the extra msrs can be safely accessed*/
+                if (!er->extra_msr_access)
+                        return -ENXIO;
 
                 reg->idx = er->idx;
                 reg->config = event->attr.config1;
+7 -5
arch/x86/kernel/cpu/perf_event.h
···
         u64                     config_mask;
         u64                     valid_mask;
         int                     idx;  /* per_xxx->regs[] reg index */
+        bool                    extra_msr_access;
 };
 
 #define EVENT_EXTRA_REG(e, ms, m, vm, i) {      \
-        .event = (e),           \
-        .msr = (ms),            \
-        .config_mask = (m),     \
-        .valid_mask = (vm),     \
-        .idx = EXTRA_REG_##i,   \
+        .event = (e),                   \
+        .msr = (ms),                    \
+        .config_mask = (m),             \
+        .valid_mask = (vm),             \
+        .idx = EXTRA_REG_##i,           \
+        .extra_msr_access = true,       \
         }
 
 #define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx)      \
+68 -1
arch/x86/kernel/cpu/perf_event_intel.c
···
         }
 }
 
+/*
+ * Under certain circumstances, access certain MSR may cause #GP.
+ * The function tests if the input MSR can be safely accessed.
+ */
+static bool check_msr(unsigned long msr, u64 mask)
+{
+        u64 val_old, val_new, val_tmp;
+
+        /*
+         * Read the current value, change it and read it back to see if it
+         * matches, this is needed to detect certain hardware emulators
+         * (qemu/kvm) that don't trap on the MSR access and always return 0s.
+         */
+        if (rdmsrl_safe(msr, &val_old))
+                return false;
+
+        /*
+         * Only change the bits which can be updated by wrmsrl.
+         */
+        val_tmp = val_old ^ mask;
+        if (wrmsrl_safe(msr, val_tmp) ||
+            rdmsrl_safe(msr, &val_new))
+                return false;
+
+        if (val_new != val_tmp)
+                return false;
+
+        /* Here it's sure that the MSR can be safely accessed.
+         * Restore the old value and return.
+         */
+        wrmsrl(msr, val_old);
+
+        return true;
+}
+
 static __init void intel_sandybridge_quirk(void)
 {
         x86_pmu.check_microcode = intel_snb_check_microcode;
···
         union cpuid10_ebx ebx;
         struct event_constraint *c;
         unsigned int unused;
-        int version;
+        struct extra_reg *er;
+        int version, i;
 
         if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
                 switch (boot_cpu_data.x86) {
···
         case 62: /* IvyBridge EP */
                 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
                        sizeof(hw_cache_event_ids));
+                /* dTLB-load-misses on IVB is different than SNB */
+                hw_cache_event_ids[C(DTLB)][C(OP_READ)][C(RESULT_MISS)] = 0x8108; /* DTLB_LOAD_MISSES.DEMAND_LD_MISS_CAUSES_A_WALK */
+
                 memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
                        sizeof(hw_cache_extra_regs));
 
···
 
                         c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
                         c->weight += x86_pmu.num_counters;
+                }
+        }
+
+        /*
+         * Access LBR MSR may cause #GP under certain circumstances.
+         * E.g. KVM doesn't support LBR MSR
+         * Check all LBT MSR here.
+         * Disable LBR access if any LBR MSRs can not be accessed.
+         */
+        if (x86_pmu.lbr_nr && !check_msr(x86_pmu.lbr_tos, 0x3UL))
+                x86_pmu.lbr_nr = 0;
+        for (i = 0; i < x86_pmu.lbr_nr; i++) {
+                if (!(check_msr(x86_pmu.lbr_from + i, 0xffffUL) &&
+                      check_msr(x86_pmu.lbr_to + i, 0xffffUL)))
+                        x86_pmu.lbr_nr = 0;
+        }
+
+        /*
+         * Access extra MSR may cause #GP under certain circumstances.
+         * E.g. KVM doesn't support offcore event
+         * Check all extra_regs here.
+         */
+        if (x86_pmu.extra_regs) {
+                for (er = x86_pmu.extra_regs; er->msr; er++) {
+                        er->extra_msr_access = check_msr(er->msr, 0x1ffUL);
+                        /* Disable LBR select mapping */
+                        if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
+                                x86_pmu.lbr_sel_map = NULL;
                 }
         }
 
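The check_msr() probe above is the heart of the KVM-guest #GP fix: writing a flipped value and reading it back distinguishes a real MSR both from one that faults and from an emulated one that silently swallows writes and always returns zeros. As a rough, hypothetical user-space analogue (not part of the patch), the same read/modify/read-back pattern can be reproduced through the msr(4) character device, assuming root and the msr module:

/*
 * Illustration only: a user-space analogue of the read/modify/read-back
 * probe that check_msr() performs above, via /dev/cpu/0/msr (needs root
 * and the msr module loaded; the kernel uses rdmsrl_safe()/wrmsrl_safe()).
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static int msr_usable(int fd, uint32_t msr, uint64_t mask)
{
        uint64_t old, tmp, cur;

        if (pread(fd, &old, sizeof(old), msr) != sizeof(old))
                return 0;                       /* read faulted: MSR not there */

        tmp = old ^ mask;                       /* flip only the probe bits */
        if (pwrite(fd, &tmp, sizeof(tmp), msr) != sizeof(tmp) ||
            pread(fd, &cur, sizeof(cur), msr) != sizeof(cur))
                return 0;                       /* write or read-back faulted */

        if (cur != tmp)
                return 0;                       /* emulator ignored the write */

        pwrite(fd, &old, sizeof(old), msr);     /* restore the original value */
        return 1;
}

int main(void)
{
        int fd = open("/dev/cpu/0/msr", O_RDWR);

        if (fd < 0) {
                perror("open /dev/cpu/0/msr");
                return 1;
        }
        /* 0x1a6 is MSR_OFFCORE_RSP_0; 0x1ff mirrors the extra_regs probe above. */
        printf("MSR 0x1a6 usable: %d\n", msr_usable(fd, 0x1a6, 0x1ffULL));
        close(fd);
        return 0;
}

The 0x1ff mask mirrors check_msr(er->msr, 0x1ffUL) used for the offcore extra registers above; any mismatch marks the register as inaccessible up front instead of risking a #GP later.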
+4 -2
arch/x86/kernel/cpu/perf_event_intel_ds.c
···
         if (!x86_pmu.bts)
                 return 0;
 
-        buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL, node);
-        if (unlikely(!buffer))
+        buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+        if (unlikely(!buffer)) {
+                WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
                 return -ENOMEM;
+        }
 
         max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
         thresh = max / 16;
+6 -5
arch/x86/kernel/cpu/perf_event_intel_uncore.c
···
         SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0x6),
         SNBEP_CBO_EVENT_EXTRA_REG(0x0135, 0xffff, 0x8),
         SNBEP_CBO_EVENT_EXTRA_REG(0x0335, 0xffff, 0x8),
-        SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xc),
-        SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xc),
+        SNBEP_CBO_EVENT_EXTRA_REG(0x4135, 0xffff, 0xa),
+        SNBEP_CBO_EVENT_EXTRA_REG(0x4335, 0xffff, 0xa),
         SNBEP_CBO_EVENT_EXTRA_REG(0x4435, 0xffff, 0x2),
         SNBEP_CBO_EVENT_EXTRA_REG(0x4835, 0xffff, 0x2),
         SNBEP_CBO_EVENT_EXTRA_REG(0x4a35, 0xffff, 0x2),
         SNBEP_CBO_EVENT_EXTRA_REG(0x5035, 0xffff, 0x2),
         SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x8),
         SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x8),
-        SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xc),
-        SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xc),
+        SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0xa),
+        SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0xa),
         SNBEP_CBO_EVENT_EXTRA_REG(0x4436, 0xffff, 0x2),
         SNBEP_CBO_EVENT_EXTRA_REG(0x4836, 0xffff, 0x2),
         SNBEP_CBO_EVENT_EXTRA_REG(0x4a36, 0xffff, 0x2),
···
         SNBEP_CBO_EVENT_EXTRA_REG(SNBEP_CBO_PMON_CTL_TID_EN,
                                   SNBEP_CBO_PMON_CTL_TID_EN, 0x1),
         SNBEP_CBO_EVENT_EXTRA_REG(0x1031, 0x10ff, 0x2),
+
         SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4),
         SNBEP_CBO_EVENT_EXTRA_REG(0x4134, 0xffff, 0xc),
         SNBEP_CBO_EVENT_EXTRA_REG(0x5134, 0xffff, 0xc),
···
         SNBEP_CBO_EVENT_EXTRA_REG(0x8335, 0xffff, 0x10),
         SNBEP_CBO_EVENT_EXTRA_REG(0x0136, 0xffff, 0x10),
         SNBEP_CBO_EVENT_EXTRA_REG(0x0336, 0xffff, 0x10),
-        SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
+        SNBEP_CBO_EVENT_EXTRA_REG(0x2136, 0xffff, 0x10),
         SNBEP_CBO_EVENT_EXTRA_REG(0x2336, 0xffff, 0x10),
         SNBEP_CBO_EVENT_EXTRA_REG(0x4136, 0xffff, 0x18),
         SNBEP_CBO_EVENT_EXTRA_REG(0x4336, 0xffff, 0x18),
+3
arch/x86/kernel/kprobes/core.c
···
         struct kprobe *p;
         struct kprobe_ctlblk *kcb;
 
+        if (user_mode_vm(regs))
+                return 0;
+
         addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
         /*
          * We don't want to be preempted for the entire
+30 -2
kernel/events/core.c
···
                                 struct perf_event_context *child_ctx,
                                 struct task_struct *child)
 {
-        perf_remove_from_context(child_event, true);
+        /*
+         * Do not destroy the 'original' grouping; because of the context
+         * switch optimization the original events could've ended up in a
+         * random child task.
+         *
+         * If we were to destroy the original group, all group related
+         * operations would cease to function properly after this random
+         * child dies.
+         *
+         * Do destroy all inherited groups, we don't care about those
+         * and being thorough is better.
+         */
+        perf_remove_from_context(child_event, !!child_event->parent);
 
         /*
          * It can happen that the parent exits first, and has events
···
 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 {
         struct perf_event *child_event, *next;
-        struct perf_event_context *child_ctx;
+        struct perf_event_context *child_ctx, *parent_ctx;
         unsigned long flags;
 
         if (likely(!child->perf_event_ctxp[ctxn])) {
···
         raw_spin_lock(&child_ctx->lock);
         task_ctx_sched_out(child_ctx);
         child->perf_event_ctxp[ctxn] = NULL;
+
+        /*
+         * In order to avoid freeing: child_ctx->parent_ctx->task
+         * under perf_event_context::lock, grab another reference.
+         */
+        parent_ctx = child_ctx->parent_ctx;
+        if (parent_ctx)
+                get_ctx(parent_ctx);
+
         /*
          * If this context is a clone; unclone it so it can't get
          * swapped to another process while we're removing all
···
         unclone_ctx(child_ctx);
         update_context_time(child_ctx);
         raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
+
+        /*
+         * Now that we no longer hold perf_event_context::lock, drop
+         * our extra child_ctx->parent_ctx reference.
+         */
+        if (parent_ctx)
+                put_ctx(parent_ctx);
 
         /*
          * Report the task dead after unscheduling the events so that we
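The core.c change cures the lockdep warning by ordering the reference counting around the context lock: the parent context is pinned with get_ctx() while child_ctx->lock is still held, and the reference is dropped only after raw_spin_unlock_irqrestore(), because the final put may free task structures. A minimal, hypothetical kref-based sketch of that take-ref-under-lock / put-after-unlock pattern (illustration only, not the perf code):

/* Hypothetical sketch of the take-ref-under-lock pattern used above. */
#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct node {
        struct kref refcount;
        spinlock_t lock;
        struct node *parent;            /* the last kref_put() may free it */
};

static void node_release(struct kref *kref)
{
        kfree(container_of(kref, struct node, refcount));
}

static void node_detach_parent(struct node *n)
{
        struct node *parent;
        unsigned long flags;

        spin_lock_irqsave(&n->lock, flags);
        parent = n->parent;
        if (parent)
                kref_get(&parent->refcount);    /* pin it while the lock is held */
        n->parent = NULL;
        spin_unlock_irqrestore(&n->lock, flags);

        /* Safe to drop the reference here: no spinlock is held anymore. */
        if (parent)
                kref_put(&parent->refcount, node_release);
}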
+9 -5
kernel/kprobes.c
···
 {
         unsigned long *iter;
         struct kprobe_blacklist_entry *ent;
-        unsigned long offset = 0, size = 0;
+        unsigned long entry, offset = 0, size = 0;
 
         for (iter = start; iter < end; iter++) {
-                if (!kallsyms_lookup_size_offset(*iter, &size, &offset)) {
-                        pr_err("Failed to find blacklist %p\n", (void *)*iter);
+                entry = arch_deref_entry_point((void *)*iter);
+
+                if (!kernel_text_address(entry) ||
+                    !kallsyms_lookup_size_offset(entry, &size, &offset)) {
+                        pr_err("Failed to find blacklist at %p\n",
+                                (void *)entry);
                         continue;
                 }
 
                 ent = kmalloc(sizeof(*ent), GFP_KERNEL);
                 if (!ent)
                         return -ENOMEM;
-                ent->start_addr = *iter;
-                ent->end_addr = *iter + size;
+                ent->start_addr = entry;
+                ent->end_addr = entry + size;
                 INIT_LIST_HEAD(&ent->list);
                 list_add_tail(&ent->list, &kprobe_blacklist);
         }
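Why the blacklist lookup failed only on ia64 and ppc64: on those architectures a function symbol resolves to a function descriptor rather than to the first instruction, so the raw address stored in the blacklist is not a kernel text address and kallsyms cannot size it. arch_deref_entry_point() folds that difference away, and the added kernel_text_address() check guards against anything that still does not point into kernel text. A rough sketch of what the dereference amounts to on ppc64 with the ELFv1 ABI (hypothetical type and field names, not the in-tree definitions):

/*
 * Hypothetical sketch of a ppc64 ELFv1 function descriptor; the in-tree
 * type and arch_deref_entry_point() differ in naming but do the same thing.
 */
struct func_descriptor {
        unsigned long entry;    /* address of the function's first instruction */
        unsigned long toc;      /* TOC base the callee expects in r2 */
        unsigned long env;      /* environment pointer, unused by C code */
};

static unsigned long deref_entry_point(void *ptr)
{
        /* The blacklist wants the text address, not the descriptor address. */
        return ((struct func_descriptor *)ptr)->entry;
}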