Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

kprobes/x86: Call out into INT3 handler directly instead of using notifier

In fd4363fff3d96 ("x86: Introduce int3 (breakpoint)-based
instruction patching"), the mechanism that was introduced for
notifying alternatives code from the int3 exception handler that an
exception occurred was die_notifier.

This is however problematic, as early code might be using jump
labels even before the notifier registration has been performed,
which will then lead to an oops due to an unhandled exception. One
such occurrence has been encountered by Fengguang:

int3: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC
Modules linked in:
CPU: 1 PID: 0 Comm: swapper/1 Not tainted 3.11.0-rc1-01429-g04bf576 #8
task: ffff88000da1b040 ti: ffff88000da1c000 task.ti: ffff88000da1c000
RIP: 0010:[<ffffffff811098cc>] [<ffffffff811098cc>] ttwu_do_wakeup+0x28/0x225
RSP: 0000:ffff88000dd03f10 EFLAGS: 00000006
RAX: 0000000000000000 RBX: ffff88000dd12940 RCX: ffffffff81769c40
RDX: 0000000000000002 RSI: 0000000000000000 RDI: 0000000000000001
RBP: ffff88000dd03f28 R08: ffffffff8176a8c0 R09: 0000000000000002
R10: ffffffff810ff484 R11: ffff88000dd129e8 R12: ffff88000dbc90c0
R13: ffff88000dbc90c0 R14: ffff88000da1dfd8 R15: ffff88000da1dfd8
FS: 0000000000000000(0000) GS:ffff88000dd00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b
CR2: 00000000ffffffff CR3: 0000000001c88000 CR4: 00000000000006e0
Stack:
ffff88000dd12940 ffff88000dbc90c0 ffff88000da1dfd8 ffff88000dd03f48
ffffffff81109e2b ffff88000dd12940 0000000000000000 ffff88000dd03f68
ffffffff81109e9e 0000000000000000 0000000000012940 ffff88000dd03f98
Call Trace:
<IRQ>
[<ffffffff81109e2b>] ttwu_do_activate.constprop.56+0x6d/0x79
[<ffffffff81109e9e>] sched_ttwu_pending+0x67/0x84
[<ffffffff8110c845>] scheduler_ipi+0x15a/0x2b0
[<ffffffff8104dfb4>] smp_reschedule_interrupt+0x38/0x41
[<ffffffff8173bf5d>] reschedule_interrupt+0x6d/0x80
<EOI>
[<ffffffff810ff484>] ? __atomic_notifier_call_chain+0x5/0xc1
[<ffffffff8105cc30>] ? native_safe_halt+0xd/0x16
[<ffffffff81015f10>] default_idle+0x147/0x282
[<ffffffff81017026>] arch_cpu_idle+0x3d/0x5d
[<ffffffff81127d6a>] cpu_idle_loop+0x46d/0x5db
[<ffffffff81127f5c>] cpu_startup_entry+0x84/0x84
[<ffffffff8104f4f8>] start_secondary+0x3c8/0x3d5
[...]

Fix this by directly calling poke_int3_handler() from the int3
exception handler (analogously to what ftrace has been doing
already), instead of relying on the notifier, registration of which
might not have yet been finalized by the time of the first trap.

Reported-and-tested-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Acked-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/alpine.LNX.2.00.1307231007490.14024@pobox.suse.cz
Signed-off-by: Ingo Molnar <mingo@kernel.org>

authored by

Jiri Kosina and committed by
Ingo Molnar
17f41571 4f16d61f

+15 -24
+2
arch/x86/include/asm/alternative.h
··· 5 5 #include <linux/stddef.h> 6 6 #include <linux/stringify.h> 7 7 #include <asm/asm.h> 8 + #include <asm/ptrace.h> 8 9 9 10 /* 10 11 * Alternative inline assembly for SMP. ··· 225 224 * inconsistent instruction while you patch. 226 225 */ 227 226 extern void *text_poke(void *addr, const void *opcode, size_t len); 227 + extern int poke_int3_handler(struct pt_regs *regs); 228 228 extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); 229 229 230 230 #endif /* _ASM_X86_ALTERNATIVE_H */
+8 -23
arch/x86/kernel/alternative.c
··· 605 605 static bool bp_patching_in_progress; 606 606 static void *bp_int3_handler, *bp_int3_addr; 607 607 608 - static int int3_notify(struct notifier_block *self, unsigned long val, void *data) 608 + int poke_int3_handler(struct pt_regs *regs) 609 609 { 610 - struct die_args *args = data; 611 - 612 610 /* bp_patching_in_progress */ 613 611 smp_rmb(); 614 612 615 613 if (likely(!bp_patching_in_progress)) 616 - return NOTIFY_DONE; 614 + return 0; 617 615 618 - /* we are not interested in non-int3 faults and ring > 0 faults */ 619 - if (val != DIE_INT3 || !args->regs || user_mode_vm(args->regs) 620 - || args->regs->ip != (unsigned long)bp_int3_addr) 621 - return NOTIFY_DONE; 616 + if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr) 617 + return 0; 622 618 623 619 /* set up the specified breakpoint handler */ 624 - args->regs->ip = (unsigned long) bp_int3_handler; 620 + regs->ip = (unsigned long) bp_int3_handler; 625 621 626 - return NOTIFY_STOP; 622 + return 1; 623 + 627 624 } 625 + 628 626 /** 629 627 * text_poke_bp() -- update instructions on live kernel on SMP 630 628 * @addr: address to patch ··· 687 689 return addr; 688 690 } 689 691 690 - /* this one needs to run before anything else handles it as a 691 - * regular exception */ 692 - static struct notifier_block int3_nb = { 693 - .priority = 0x7fffffff, 694 - .notifier_call = int3_notify 695 - }; 696 - 697 - static int __init int3_init(void) 698 - { 699 - return register_die_notifier(&int3_nb); 700 - } 701 - 702 - arch_initcall(int3_init);
+4
arch/x86/kernel/traps.c
··· 58 58 #include <asm/mce.h> 59 59 #include <asm/fixmap.h> 60 60 #include <asm/mach_traps.h> 61 + #include <asm/alternative.h> 61 62 62 63 #ifdef CONFIG_X86_64 63 64 #include <asm/x86_init.h> ··· 328 327 ftrace_int3_handler(regs)) 329 328 return; 330 329 #endif 330 + if (poke_int3_handler(regs)) 331 + return; 332 + 331 333 prev_state = exception_enter(); 332 334 #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP 333 335 if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
+1 -1
kernel/kprobes.c
··· 1709 1709 1710 1710 static struct notifier_block kprobe_exceptions_nb = { 1711 1711 .notifier_call = kprobe_exceptions_notify, 1712 - .priority = 0x7ffffff0 /* High priority, but not first. */ 1712 + .priority = 0x7fffffff /* we need to be notified first */ 1713 1713 }; 1714 1714 1715 1715 unsigned long __weak arch_deref_entry_point(void *entry)