Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

s390/ftrace,kprobes: allow to patch first instruction

If the function tracer is enabled, allow setting kprobes on the first
instruction of a function (which is the function trace caller):

If no kprobe is set, handling of enabling and disabling function tracing
of a function simply patches the first instruction. Either it is a nop
(right now it's an unconditional branch, which skips the mcount block),
or it's a branch to the ftrace_caller() function.

If a kprobe is being placed on a function tracer calling instruction
we encode if we actually have a nop or branch in the remaining bytes
after the breakpoint instruction (illegal opcode).
This is possible, since the size of the instruction used for the nop
and branch is six bytes, while the size of the breakpoint is only
two bytes.
Therefore the first two bytes contain the illegal opcode and the last
four bytes contain either "0" for nop or "1" for branch. The kprobes
code will then execute/simulate the correct instruction.

Instruction patching for kprobes and function tracer is always done
with stop_machine(). Therefore we don't have any races where an
instruction is patched concurrently on a different cpu.
Besides that also the program check handler which executes the function
trace caller instruction won't be executed concurrently to any
stop_machine() execution.

This allows keeping the full fault-based kprobes handling, which generates
correct pt_regs contents automatically.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>

authored by

Heiko Carstens and committed by
Martin Schwidefsky
c933146a f7f242ff

+214 -92
+49 -7
arch/s390/include/asm/ftrace.h
··· 1 1 #ifndef _ASM_S390_FTRACE_H 2 2 #define _ASM_S390_FTRACE_H 3 3 4 + #define ARCH_SUPPORTS_FTRACE_OPS 1 5 + 6 + #define MCOUNT_INSN_SIZE 24 7 + #define MCOUNT_RETURN_FIXUP 18 8 + 4 9 #ifndef __ASSEMBLY__ 5 10 6 - extern void _mcount(void); 11 + void _mcount(void); 12 + void ftrace_caller(void); 13 + 7 14 extern char ftrace_graph_caller_end; 15 + extern unsigned long ftrace_plt; 8 16 9 17 struct dyn_arch_ftrace { }; 10 18 11 - #define MCOUNT_ADDR ((long)_mcount) 19 + #define MCOUNT_ADDR ((unsigned long)_mcount) 20 + #define FTRACE_ADDR ((unsigned long)ftrace_caller) 12 21 22 + #define KPROBE_ON_FTRACE_NOP 0 23 + #define KPROBE_ON_FTRACE_CALL 1 13 24 14 25 static inline unsigned long ftrace_call_adjust(unsigned long addr) 15 26 { 16 27 return addr; 17 28 } 18 29 30 + struct ftrace_insn { 31 + u16 opc; 32 + s32 disp; 33 + } __packed; 34 + 35 + static inline void ftrace_generate_nop_insn(struct ftrace_insn *insn) 36 + { 37 + #ifdef CONFIG_FUNCTION_TRACER 38 + /* jg .+24 */ 39 + insn->opc = 0xc0f4; 40 + insn->disp = MCOUNT_INSN_SIZE / 2; 41 + #endif 42 + } 43 + 44 + static inline int is_ftrace_nop(struct ftrace_insn *insn) 45 + { 46 + #ifdef CONFIG_FUNCTION_TRACER 47 + if (insn->disp == MCOUNT_INSN_SIZE / 2) 48 + return 1; 49 + #endif 50 + return 0; 51 + } 52 + 53 + static inline void ftrace_generate_call_insn(struct ftrace_insn *insn, 54 + unsigned long ip) 55 + { 56 + #ifdef CONFIG_FUNCTION_TRACER 57 + unsigned long target; 58 + 59 + /* brasl r0,ftrace_caller */ 60 + target = is_module_addr((void *) ip) ? ftrace_plt : FTRACE_ADDR; 61 + insn->opc = 0xc005; 62 + insn->disp = (target - ip) / 2; 63 + #endif 64 + } 65 + 19 66 #endif /* __ASSEMBLY__ */ 20 - 21 - #define MCOUNT_INSN_SIZE 18 22 - 23 - #define ARCH_SUPPORTS_FTRACE_OPS 1 24 - 25 67 #endif /* _ASM_S390_FTRACE_H */
+1
arch/s390/include/asm/kprobes.h
··· 60 60 struct arch_specific_insn { 61 61 /* copy of original instruction */ 62 62 kprobe_opcode_t *insn; 63 + unsigned int is_ftrace_insn : 1; 63 64 }; 64 65 65 66 struct prev_kprobe {
+2 -2
arch/s390/include/asm/lowcore.h
··· 147 147 __u32 softirq_pending; /* 0x02ec */ 148 148 __u32 percpu_offset; /* 0x02f0 */ 149 149 __u32 machine_flags; /* 0x02f4 */ 150 - __u32 ftrace_func; /* 0x02f8 */ 150 + __u8 pad_0x02f8[0x02fc-0x02f8]; /* 0x02f8 */ 151 151 __u32 spinlock_lockval; /* 0x02fc */ 152 152 153 153 __u8 pad_0x0300[0x0e00-0x0300]; /* 0x0300 */ ··· 297 297 __u64 percpu_offset; /* 0x0378 */ 298 298 __u64 vdso_per_cpu_data; /* 0x0380 */ 299 299 __u64 machine_flags; /* 0x0388 */ 300 - __u64 ftrace_func; /* 0x0390 */ 300 + __u8 pad_0x0390[0x0398-0x0390]; /* 0x0390 */ 301 301 __u64 gmap; /* 0x0398 */ 302 302 __u32 spinlock_lockval; /* 0x03a0 */ 303 303 __u8 pad_0x03a0[0x0400-0x03a4]; /* 0x03a4 */
+12
arch/s390/include/asm/pgtable.h
··· 133 133 #define MODULES_LEN (1UL << 31) 134 134 #endif 135 135 136 + static inline int is_module_addr(void *addr) 137 + { 138 + #ifdef CONFIG_64BIT 139 + BUILD_BUG_ON(MODULES_LEN > (1UL << 31)); 140 + if (addr < (void *)MODULES_VADDR) 141 + return 0; 142 + if (addr > (void *)MODULES_END) 143 + return 0; 144 + #endif 145 + return 1; 146 + } 147 + 136 148 /* 137 149 * A 31 bit pagetable entry of S390 has following format: 138 150 * | PFRA | | OS |
-1
arch/s390/kernel/asm-offsets.c
··· 156 156 DEFINE(__LC_INT_CLOCK, offsetof(struct _lowcore, int_clock)); 157 157 DEFINE(__LC_MCCK_CLOCK, offsetof(struct _lowcore, mcck_clock)); 158 158 DEFINE(__LC_MACHINE_FLAGS, offsetof(struct _lowcore, machine_flags)); 159 - DEFINE(__LC_FTRACE_FUNC, offsetof(struct _lowcore, ftrace_func)); 160 159 DEFINE(__LC_DUMP_REIPL, offsetof(struct _lowcore, ipib)); 161 160 BLANK(); 162 161 DEFINE(__LC_CPU_TIMER_SAVE_AREA, offsetof(struct _lowcore, cpu_timer_save_area));
-4
arch/s390/kernel/early.c
··· 12 12 #include <linux/errno.h> 13 13 #include <linux/string.h> 14 14 #include <linux/ctype.h> 15 - #include <linux/ftrace.h> 16 15 #include <linux/lockdep.h> 17 16 #include <linux/module.h> 18 17 #include <linux/pfn.h> ··· 489 490 detect_machine_facilities(); 490 491 setup_topology(); 491 492 sclp_early_detect(); 492 - #ifdef CONFIG_DYNAMIC_FTRACE 493 - S390_lowcore.ftrace_func = (unsigned long)ftrace_caller; 494 - #endif 495 493 lockdep_on(); 496 494 }
+83 -47
arch/s390/kernel/ftrace.c
··· 7 7 * Martin Schwidefsky <schwidefsky@de.ibm.com> 8 8 */ 9 9 10 + #include <linux/moduleloader.h> 10 11 #include <linux/hardirq.h> 11 12 #include <linux/uaccess.h> 12 13 #include <linux/ftrace.h> ··· 16 15 #include <linux/kprobes.h> 17 16 #include <trace/syscall.h> 18 17 #include <asm/asm-offsets.h> 18 + #include <asm/cacheflush.h> 19 19 #include "entry.h" 20 - 21 - void mcount_replace_code(void); 22 - void ftrace_disable_code(void); 23 - void ftrace_enable_insn(void); 24 20 25 21 /* 26 22 * The mcount code looks like this: ··· 25 27 * larl %r1,<&counter> # offset 6 26 28 * brasl %r14,_mcount # offset 12 27 29 * lg %r14,8(%r15) # offset 18 28 - * Total length is 24 bytes. The complete mcount block initially gets replaced 29 - * by ftrace_make_nop. Subsequent calls to ftrace_make_call / ftrace_make_nop 30 - * only patch the jg/lg instruction within the block. 31 - * Note: we do not patch the first instruction to an unconditional branch, 32 - * since that would break kprobes/jprobes. It is easier to leave the larl 33 - * instruction in and only modify the second instruction. 30 + * Total length is 24 bytes. Only the first instruction will be patched 31 + * by ftrace_make_call / ftrace_make_nop. 34 32 * The enabled ftrace code block looks like this: 35 - * larl %r0,.+24 # offset 0 36 - * > lg %r1,__LC_FTRACE_FUNC # offset 6 37 - * br %r1 # offset 12 38 - * brcl 0,0 # offset 14 39 - * brc 0,0 # offset 20 33 + * > brasl %r0,ftrace_caller # offset 0 34 + * larl %r1,<&counter> # offset 6 35 + * brasl %r14,_mcount # offset 12 36 + * lg %r14,8(%r15) # offset 18 40 37 * The ftrace function gets called with a non-standard C function call ABI 41 38 * where r0 contains the return address. It is also expected that the called 42 39 * function only clobbers r0 and r1, but restores r2-r15. 40 + * For module code we can't directly jump to ftrace caller, but need a 41 + * trampoline (ftrace_plt), which clobbers also r1. 
43 42 * The return point of the ftrace function has offset 24, so execution 44 43 * continues behind the mcount block. 45 - * larl %r0,.+24 # offset 0 46 - * > jg .+18 # offset 6 47 - * br %r1 # offset 12 48 - * brcl 0,0 # offset 14 49 - * brc 0,0 # offset 20 44 + * The disabled ftrace code block looks like this: 45 + * > jg .+24 # offset 0 46 + * larl %r1,<&counter> # offset 6 47 + * brasl %r14,_mcount # offset 12 48 + * lg %r14,8(%r15) # offset 18 50 49 * The jg instruction branches to offset 24 to skip as many instructions 51 50 * as possible. 52 51 */ 53 - asm( 54 - " .align 4\n" 55 - "mcount_replace_code:\n" 56 - " larl %r0,0f\n" 57 - "ftrace_disable_code:\n" 58 - " jg 0f\n" 59 - " br %r1\n" 60 - " brcl 0,0\n" 61 - " brc 0,0\n" 62 - "0:\n" 63 - " .align 4\n" 64 - "ftrace_enable_insn:\n" 65 - " lg %r1,"__stringify(__LC_FTRACE_FUNC)"\n"); 66 52 67 - #define MCOUNT_BLOCK_SIZE 24 68 - #define MCOUNT_INSN_OFFSET 6 69 - #define FTRACE_INSN_SIZE 6 53 + unsigned long ftrace_plt; 70 54 71 55 int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, 72 56 unsigned long addr) ··· 59 79 int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, 60 80 unsigned long addr) 61 81 { 62 - /* Initial replacement of the whole mcount block */ 63 - if (addr == MCOUNT_ADDR) { 64 - if (probe_kernel_write((void *) rec->ip - MCOUNT_INSN_OFFSET, 65 - mcount_replace_code, 66 - MCOUNT_BLOCK_SIZE)) 67 - return -EPERM; 68 - return 0; 82 + struct ftrace_insn insn; 83 + unsigned short op; 84 + void *from, *to; 85 + size_t size; 86 + 87 + ftrace_generate_nop_insn(&insn); 88 + size = sizeof(insn); 89 + from = &insn; 90 + to = (void *) rec->ip; 91 + if (probe_kernel_read(&op, (void *) rec->ip, sizeof(op))) 92 + return -EFAULT; 93 + /* 94 + * If we find a breakpoint instruction, a kprobe has been placed 95 + * at the beginning of the function. 
We write the constant 96 + * KPROBE_ON_FTRACE_NOP into the remaining four bytes of the original 97 + * instruction so that the kprobes handler can execute a nop, if it 98 + * reaches this breakpoint. 99 + */ 100 + if (op == BREAKPOINT_INSTRUCTION) { 101 + size -= 2; 102 + from += 2; 103 + to += 2; 104 + insn.disp = KPROBE_ON_FTRACE_NOP; 69 105 } 70 - if (probe_kernel_write((void *) rec->ip, ftrace_disable_code, 71 - MCOUNT_INSN_SIZE)) 106 + if (probe_kernel_write(to, from, size)) 72 107 return -EPERM; 73 108 return 0; 74 109 } 75 110 76 111 int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) 77 112 { 78 - if (probe_kernel_write((void *) rec->ip, ftrace_enable_insn, 79 - FTRACE_INSN_SIZE)) 113 + struct ftrace_insn insn; 114 + unsigned short op; 115 + void *from, *to; 116 + size_t size; 117 + 118 + ftrace_generate_call_insn(&insn, rec->ip); 119 + size = sizeof(insn); 120 + from = &insn; 121 + to = (void *) rec->ip; 122 + if (probe_kernel_read(&op, (void *) rec->ip, sizeof(op))) 123 + return -EFAULT; 124 + /* 125 + * If we find a breakpoint instruction, a kprobe has been placed 126 + * at the beginning of the function. We write the constant 127 + * KPROBE_ON_FTRACE_CALL into the remaining four bytes of the original 128 + * instruction so that the kprobes handler can execute a brasl if it 129 + * reaches this breakpoint. 
130 + */ 131 + if (op == BREAKPOINT_INSTRUCTION) { 132 + size -= 2; 133 + from += 2; 134 + to += 2; 135 + insn.disp = KPROBE_ON_FTRACE_CALL; 136 + } 137 + if (probe_kernel_write(to, from, size)) 80 138 return -EPERM; 81 139 return 0; 82 140 } ··· 128 110 { 129 111 return 0; 130 112 } 113 + 114 + static int __init ftrace_plt_init(void) 115 + { 116 + unsigned int *ip; 117 + 118 + ftrace_plt = (unsigned long) module_alloc(PAGE_SIZE); 119 + if (!ftrace_plt) 120 + panic("cannot allocate ftrace plt\n"); 121 + ip = (unsigned int *) ftrace_plt; 122 + ip[0] = 0x0d10e310; /* basr 1,0; lg 1,10(1); br 1 */ 123 + ip[1] = 0x100a0004; 124 + ip[2] = 0x07f10000; 125 + ip[3] = FTRACE_ADDR >> 32; 126 + ip[4] = FTRACE_ADDR & 0xffffffff; 127 + set_memory_ro(ftrace_plt, 1); 128 + return 0; 129 + } 130 + device_initcall(ftrace_plt_init); 131 131 132 132 #ifdef CONFIG_FUNCTION_GRAPH_TRACER 133 133 /*
+64 -26
arch/s390/kernel/kprobes.c
··· 29 29 #include <linux/module.h> 30 30 #include <linux/slab.h> 31 31 #include <linux/hardirq.h> 32 + #include <linux/ftrace.h> 32 33 #include <asm/cacheflush.h> 33 34 #include <asm/sections.h> 34 35 #include <asm/dis.h> ··· 61 60 62 61 static void __kprobes copy_instruction(struct kprobe *p) 63 62 { 63 + unsigned long ip = (unsigned long) p->addr; 64 64 s64 disp, new_disp; 65 65 u64 addr, new_addr; 66 66 67 - memcpy(p->ainsn.insn, p->addr, insn_length(p->opcode >> 8)); 67 + if (ftrace_location(ip) == ip) { 68 + /* 69 + * If kprobes patches the instruction that is morphed by 70 + * ftrace make sure that kprobes always sees the branch 71 + * "jg .+24" that skips the mcount block 72 + */ 73 + ftrace_generate_nop_insn((struct ftrace_insn *)p->ainsn.insn); 74 + p->ainsn.is_ftrace_insn = 1; 75 + } else 76 + memcpy(p->ainsn.insn, p->addr, insn_length(p->opcode >> 8)); 77 + p->opcode = p->ainsn.insn[0]; 68 78 if (!probe_is_insn_relative_long(p->ainsn.insn)) 69 79 return; 70 80 /* ··· 95 83 static inline int is_kernel_addr(void *addr) 96 84 { 97 85 return addr < (void *)_end; 98 - } 99 - 100 - static inline int is_module_addr(void *addr) 101 - { 102 - #ifdef CONFIG_64BIT 103 - BUILD_BUG_ON(MODULES_LEN > (1UL << 31)); 104 - if (addr < (void *)MODULES_VADDR) 105 - return 0; 106 - if (addr > (void *)MODULES_END) 107 - return 0; 108 - #endif 109 - return 1; 110 86 } 111 87 112 88 static int __kprobes s390_get_insn_slot(struct kprobe *p) ··· 132 132 return -EINVAL; 133 133 if (s390_get_insn_slot(p)) 134 134 return -ENOMEM; 135 - p->opcode = *p->addr; 136 135 copy_instruction(p); 137 136 return 0; 138 137 } 139 138 140 - struct ins_replace_args { 141 - kprobe_opcode_t *ptr; 142 - kprobe_opcode_t opcode; 139 + int arch_check_ftrace_location(struct kprobe *p) 140 + { 141 + return 0; 142 + } 143 + 144 + struct swap_insn_args { 145 + struct kprobe *p; 146 + unsigned int arm_kprobe : 1; 143 147 }; 144 148 145 - static int __kprobes swap_instruction(void *aref) 149 + static int 
__kprobes swap_instruction(void *data) 146 150 { 147 151 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); 148 152 unsigned long status = kcb->kprobe_status; 149 - struct ins_replace_args *args = aref; 153 + struct swap_insn_args *args = data; 154 + struct ftrace_insn new_insn, *insn; 155 + struct kprobe *p = args->p; 156 + size_t len; 150 157 158 + new_insn.opc = args->arm_kprobe ? BREAKPOINT_INSTRUCTION : p->opcode; 159 + len = sizeof(new_insn.opc); 160 + if (!p->ainsn.is_ftrace_insn) 161 + goto skip_ftrace; 162 + len = sizeof(new_insn); 163 + insn = (struct ftrace_insn *) p->addr; 164 + if (args->arm_kprobe) { 165 + if (is_ftrace_nop(insn)) 166 + new_insn.disp = KPROBE_ON_FTRACE_NOP; 167 + else 168 + new_insn.disp = KPROBE_ON_FTRACE_CALL; 169 + } else { 170 + ftrace_generate_call_insn(&new_insn, (unsigned long)p->addr); 171 + if (insn->disp == KPROBE_ON_FTRACE_NOP) 172 + ftrace_generate_nop_insn(&new_insn); 173 + } 174 + skip_ftrace: 151 175 kcb->kprobe_status = KPROBE_SWAP_INST; 152 - probe_kernel_write(args->ptr, &args->opcode, sizeof(args->opcode)); 176 + probe_kernel_write(p->addr, &new_insn, len); 153 177 kcb->kprobe_status = status; 154 178 return 0; 155 179 } 156 180 157 181 void __kprobes arch_arm_kprobe(struct kprobe *p) 158 182 { 159 - struct ins_replace_args args; 183 + struct swap_insn_args args = {.p = p, .arm_kprobe = 1}; 160 184 161 - args.ptr = p->addr; 162 - args.opcode = BREAKPOINT_INSTRUCTION; 163 185 stop_machine(swap_instruction, &args, NULL); 164 186 } 165 187 166 188 void __kprobes arch_disarm_kprobe(struct kprobe *p) 167 189 { 168 - struct ins_replace_args args; 190 + struct swap_insn_args args = {.p = p, .arm_kprobe = 0}; 169 191 170 - args.ptr = p->addr; 171 - args.opcode = p->opcode; 172 192 stop_machine(swap_instruction, &args, NULL); 173 193 } 174 194 ··· 478 458 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); 479 459 unsigned long ip = regs->psw.addr & PSW_ADDR_INSN; 480 460 int fixup = probe_get_fixup_type(p->ainsn.insn); 461 + 462 
+ /* Check if the kprobes location is an enabled ftrace caller */ 463 + if (p->ainsn.is_ftrace_insn) { 464 + struct ftrace_insn *insn = (struct ftrace_insn *) p->addr; 465 + struct ftrace_insn call_insn; 466 + 467 + ftrace_generate_call_insn(&call_insn, (unsigned long) p->addr); 468 + /* 469 + * A kprobe on an enabled ftrace call site actually single 470 + * stepped an unconditional branch (ftrace nop equivalent). 471 + * Now we need to fixup things and pretend that a brasl r0,... 472 + * was executed instead. 473 + */ 474 + if (insn->disp == KPROBE_ON_FTRACE_CALL) { 475 + ip += call_insn.disp * 2 - MCOUNT_INSN_SIZE; 476 + regs->gprs[0] = (unsigned long)p->addr + sizeof(*insn); 477 + } 478 + } 481 479 482 480 if (fixup & FIXUP_PSW_NORMAL) 483 481 ip += (unsigned long) p->addr - (unsigned long) p->ainsn.insn;
+1
arch/s390/kernel/mcount.S
··· 27 27 .globl ftrace_regs_caller 28 28 .set ftrace_regs_caller,ftrace_caller 29 29 lgr %r1,%r15 30 + aghi %r0,MCOUNT_RETURN_FIXUP 30 31 aghi %r15,-STACK_FRAME_SIZE 31 32 stg %r1,__SF_BACKCHAIN(%r15) 32 33 stg %r1,(STACK_PTREGS_GPRS+15*8)(%r15)
-2
arch/s390/kernel/setup.c
··· 41 41 #include <linux/ctype.h> 42 42 #include <linux/reboot.h> 43 43 #include <linux/topology.h> 44 - #include <linux/ftrace.h> 45 44 #include <linux/kexec.h> 46 45 #include <linux/crash_dump.h> 47 46 #include <linux/memory.h> ··· 355 356 lc->steal_timer = S390_lowcore.steal_timer; 356 357 lc->last_update_timer = S390_lowcore.last_update_timer; 357 358 lc->last_update_clock = S390_lowcore.last_update_clock; 358 - lc->ftrace_func = S390_lowcore.ftrace_func; 359 359 360 360 restart_stack = __alloc_bootmem(ASYNC_SIZE, ASYNC_SIZE, 0); 361 361 restart_stack += ASYNC_SIZE;
-1
arch/s390/kernel/smp.c
··· 236 236 lc->percpu_offset = __per_cpu_offset[cpu]; 237 237 lc->kernel_asce = S390_lowcore.kernel_asce; 238 238 lc->machine_flags = S390_lowcore.machine_flags; 239 - lc->ftrace_func = S390_lowcore.ftrace_func; 240 239 lc->user_timer = lc->system_timer = lc->steal_timer = 0; 241 240 __ctl_store(lc->cregs_save_area, 0, 15); 242 241 save_access_regs((unsigned int *) lc->access_regs_save_area);
+1 -1
scripts/recordmcount.c
··· 404 404 } 405 405 if (w2(ghdr->e_machine) == EM_S390) { 406 406 reltype = R_390_64; 407 - mcount_adjust_64 = -8; 407 + mcount_adjust_64 = -14; 408 408 } 409 409 if (w2(ghdr->e_machine) == EM_MIPS) { 410 410 reltype = R_MIPS_64;
+1 -1
scripts/recordmcount.pl
··· 243 243 244 244 } elsif ($arch eq "s390" && $bits == 64) { 245 245 $mcount_regex = "^\\s*([0-9a-fA-F]+):\\s*R_390_(PC|PLT)32DBL\\s+_mcount\\+0x2\$"; 246 - $mcount_adjust = -8; 246 + $mcount_adjust = -14; 247 247 $alignment = 8; 248 248 $type = ".quad"; 249 249 $ld .= " -m elf64_s390";