Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

ARM: kprobes: enable OPTPROBES for ARM 32

This patch introduces kprobe jump optimization (OPTPROBES) for 32-bit ARM.

Limitations:
- Currently only kernels compiled with the ARM ISA are supported.

- The offset between the probe point and the optinsn slot must not be
larger than 32MiB; a worked example of the branch range follows this
list. Masami Hiramatsu suggested replacing 2 words to lift this limit,
but that would make things more complex, so a further patch can add
such an optimization.
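
To see where 32MiB comes from: the 'b' encoding holds a signed 24-bit
immediate that is shifted left by two bits, so the reachable offset is
SignExtend(imm24:'00', 32) relative to pc+8. A minimal userspace sketch
of the resulting range (illustrative only, not part of the patch):

	#include <stdio.h>

	int main(void)
	{
		/* 'b' offset = SignExtend(imm24:'00', 32), from pc + 8 */
		long max_fwd = 0x007fffffL << 2;    /*  0x01fffffc, ~ +32MiB */
		long max_bwd = -(0x00800000L << 2); /* -0x02000000,   -32MiB */

		printf("forward : %ld bytes\n", max_fwd);
		printf("backward: %ld bytes\n", max_bwd);
		return 0;
	}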

Kprobe optimization on ARM is relatively simpler than on x86 because an
ARM instruction is always 4 bytes long and 4-byte aligned. This patch
replaces the probed instruction with a 'b' instruction that branches to
trampoline code, which then calls optimized_callback().
optimized_callback() calls opt_pre_handler() to execute the kprobe
handler, and then emulates/simulates the replaced instruction.
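
Nothing changes for kprobes users: probes are registered as usual, and
the kprobes core converts them to optimized probes in the background
once arch_prepare_optimized_kprobe() succeeds. A minimal module sketch,
where the probed symbol is a hypothetical example target:

	#include <linux/module.h>
	#include <linux/kprobes.h>

	static int demo_pre(struct kprobe *p, struct pt_regs *regs)
	{
		pr_info("probe hit at %p\n", p->addr);
		return 0;
	}

	static struct kprobe demo_kp = {
		.symbol_name = "do_fork",	/* example target only */
		.pre_handler = demo_pre,
	};

	static int __init demo_init(void)
	{
		/* If the detour slot is in branch range and the insn is
		 * optimizable, the core later replaces the breakpoint
		 * with a 'b' to the trampoline. */
		return register_kprobe(&demo_kp);
	}

	static void __exit demo_exit(void)
	{
		unregister_kprobe(&demo_kp);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");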

When a kprobe is unregistered, the deferred manner of the unoptimizer
may leave the branch instruction in place until the optimizer runs.
Unlike x86_64, which copies the probed instructions to the area after
optprobe_template_end and re-executes them there, this patch calls the
single-step path to emulate/simulate the instruction directly. A
further patch can optimize this behavior.

Signed-off-by: Wang Nan <wangnan0@huawei.com>
Acked-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Will Deacon <will.deacon@arm.com>
Reviewed-by: Jon Medhurst (Tixy) <tixy@linaro.org>
Signed-off-by: Jon Medhurst <tixy@linaro.org>

Authored by Wang Nan, committed by Jon Medhurst
0dc016db cbf6ab52

+377 -12
+1
arch/arm/Kconfig
···
 	select HAVE_MEMBLOCK
 	select HAVE_MOD_ARCH_SPECIFIC if ARM_UNWIND
 	select HAVE_OPROFILE if (HAVE_PERF_EVENTS)
+	select HAVE_OPTPROBES if !THUMB2_KERNEL
 	select HAVE_PERF_EVENTS
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
+29
arch/arm/include/asm/kprobes.h
···
 int kprobe_exceptions_notify(struct notifier_block *self,
 			     unsigned long val, void *data);
 
+/* optinsn template addresses */
+extern __visible kprobe_opcode_t optprobe_template_entry;
+extern __visible kprobe_opcode_t optprobe_template_val;
+extern __visible kprobe_opcode_t optprobe_template_call;
+extern __visible kprobe_opcode_t optprobe_template_end;
+extern __visible kprobe_opcode_t optprobe_template_sub_sp;
+extern __visible kprobe_opcode_t optprobe_template_add_sp;
+
+#define MAX_OPTIMIZED_LENGTH	4
+#define MAX_OPTINSN_SIZE				\
+	((unsigned long)&optprobe_template_end -	\
+	 (unsigned long)&optprobe_template_entry)
+#define RELATIVEJUMP_SIZE	4
+
+struct arch_optimized_insn {
+	/*
+	 * copy of the original instructions.
+	 * Different from x86, ARM kprobe_opcode_t is u32.
+	 */
+#define MAX_COPIED_INSN	DIV_ROUND_UP(RELATIVEJUMP_SIZE, sizeof(kprobe_opcode_t))
+	kprobe_opcode_t copied_insn[MAX_COPIED_INSN];
+	/* detour code buffer */
+	kprobe_opcode_t *insn;
+	/*
+	 * We always copy one instruction on ARM,
+	 * so size will always be 4, and unlike x86, there is no
+	 * need for a size field.
+	 */
+};
 
 #endif /* _ARM_KPROBES_H */
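
Since the template markers are ordinary symbols exported from the
assembly template in opt-arm.c, slot sizing and patching reduce to
pointer arithmetic. A sketch of how a detour buffer is prepared,
mirroring arch_prepare_optimized_kprobe() further down:

	/* Size and fill one optinsn slot from the template symbols. */
	kprobe_opcode_t *code = get_optinsn_slot();

	memcpy(code, &optprobe_template_entry, MAX_OPTINSN_SIZE);
	code[TMPL_VAL_IDX]  = (unsigned long)op;	/* probe data */
	code[TMPL_CALL_IDX] = (unsigned long)optimized_callback;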
+1 -1
arch/arm/kernel/Makefile
···
 obj-$(CONFIG_JUMP_LABEL)	+= jump_label.o insn.o patch.o
 obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o
 # Main staffs in KPROBES are in arch/arm/probes/ .
-obj-$(CONFIG_KPROBES)		+= patch.o
+obj-$(CONFIG_KPROBES)		+= patch.o insn.o
 obj-$(CONFIG_OABI_COMPAT)	+= sys_oabi-compat.o
 obj-$(CONFIG_ARM_THUMBEE)	+= thumbee.o
 obj-$(CONFIG_KGDB)		+= kgdb.o patch.o
+1 -2
arch/arm/kernel/ftrace.c
···
 #include <asm/cacheflush.h>
 #include <asm/opcodes.h>
 #include <asm/ftrace.h>
-
-#include "insn.h"
+#include <asm/insn.h>
 
 #ifdef CONFIG_THUMB2_KERNEL
 #define	NOP		0xf85deb04	/* pop.w {lr} */
arch/arm/kernel/insn.h → arch/arm/include/asm/insn.h (renamed)
+1 -2
arch/arm/kernel/jump_label.c
···
 #include <linux/kernel.h>
 #include <linux/jump_label.h>
 #include <asm/patch.h>
-
-#include "insn.h"
+#include <asm/insn.h>
 
 #ifdef HAVE_JUMP_LABEL
 
+1
arch/arm/probes/kprobes/Makefile
···
 test-kprobes-objs		+= test-thumb.o
 else
 obj-$(CONFIG_KPROBES)		+= actions-arm.o checkers-arm.o
+obj-$(CONFIG_OPTPROBES)		+= opt-arm.o
 test-kprobes-objs		+= test-arm.o
 endif
+19 -7
arch/arm/probes/kprobes/core.c
···
  * memory. It is also needed to atomically set the two half-words of a 32-bit
  * Thumb breakpoint.
  */
-int __kprobes __arch_disarm_kprobe(void *p)
+struct patch {
+	void *addr;
+	unsigned int insn;
+};
+
+static int __kprobes_remove_breakpoint(void *data)
 {
-	struct kprobe *kp = p;
-	void *addr = (void *)((uintptr_t)kp->addr & ~1);
-
-	__patch_text(addr, kp->opcode);
-
+	struct patch *p = data;
+	__patch_text(p->addr, p->insn);
 	return 0;
+}
+
+void __kprobes kprobes_remove_breakpoint(void *addr, unsigned int insn)
+{
+	struct patch p = {
+		.addr = addr,
+		.insn = insn,
+	};
+	stop_machine(__kprobes_remove_breakpoint, &p, cpu_online_mask);
 }
 
 void __kprobes arch_disarm_kprobe(struct kprobe *p)
 {
-	stop_machine(__arch_disarm_kprobe, p, cpu_online_mask);
+	kprobes_remove_breakpoint((void *)((uintptr_t)p->addr & ~1),
+				  p->opcode);
 }
 
 void __kprobes arch_remove_kprobe(struct kprobe *p)
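
This refactoring exists because optprobes needs the same "patch text
under stop_machine()" service with a different payload: instead of
restoring the original opcode, the optimizer writes a branch. A sketch
of the call as made from opt-arm.c below:

	/* Swap the probed breakpoint for a branch to the detour slot.
	 * stop_machine() keeps other CPUs out of the instruction
	 * stream while the word is rewritten. */
	unsigned long insn = arm_gen_branch((unsigned long)op->kp.addr,
					    (unsigned long)op->optinsn.insn);
	kprobes_remove_breakpoint(op->kp.addr, insn);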
+2
arch/arm/probes/kprobes/core.h
···
 #define KPROBE_THUMB16_BREAKPOINT_INSTRUCTION	0xde18
 #define KPROBE_THUMB32_BREAKPOINT_INSTRUCTION	0xf7f0a018
 
+extern void kprobes_remove_breakpoint(void *addr, unsigned int insn);
+
 enum probes_insn __kprobes
 kprobe_decode_ldmstm(kprobe_opcode_t insn, struct arch_probes_insn *asi,
 		     const struct decode_header *h);
+322
arch/arm/probes/kprobes/opt-arm.c
+/*
+ * Kernel Probes Jump Optimization (Optprobes)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2002, 2004
+ * Copyright (C) Hitachi Ltd., 2012
+ * Copyright (C) Huawei Inc., 2014
+ */
+
+#include <linux/kprobes.h>
+#include <linux/jump_label.h>
+#include <asm/kprobes.h>
+#include <asm/cacheflush.h>
+/* for arm_gen_branch */
+#include <asm/insn.h>
+/* for patch_text */
+#include <asm/patch.h>
+
+#include "core.h"
+
+/*
+ * NOTE: the first sub and add instruction will be modified according
+ * to the stack cost of the instruction.
+ */
+asm (
+	".global optprobe_template_entry\n"
+	"optprobe_template_entry:\n"
+	".global optprobe_template_sub_sp\n"
+	"optprobe_template_sub_sp:"
+	"	sub	sp, sp, #0xff\n"
+	"	stmia	sp, {r0 - r14}\n"
+	".global optprobe_template_add_sp\n"
+	"optprobe_template_add_sp:"
+	"	add	r3, sp, #0xff\n"
+	"	str	r3, [sp, #52]\n"
+	"	mrs	r4, cpsr\n"
+	"	str	r4, [sp, #64]\n"
+	"	mov	r1, sp\n"
+	"	ldr	r0, 1f\n"
+	"	ldr	r2, 2f\n"
+	/*
+	 * AEABI requires an 8-byte-aligned stack. If
+	 * SP % 8 != 0 (SP % 4 == 0 should be ensured),
+	 * alloc more bytes here.
+	 */
+	"	and	r4, sp, #4\n"
+	"	sub	sp, sp, r4\n"
+#if __LINUX_ARM_ARCH__ >= 5
+	"	blx	r2\n"
+#else
+	"	mov	lr, pc\n"
+	"	mov	pc, r2\n"
+#endif
+	"	add	sp, sp, r4\n"
+	"	ldr	r1, [sp, #64]\n"
+	"	tst	r1, #"__stringify(PSR_T_BIT)"\n"
+	"	ldrne	r2, [sp, #60]\n"
+	"	orrne	r2, #1\n"
+	"	strne	r2, [sp, #60]	@ set bit0 of PC for thumb\n"
+	"	msr	cpsr_cxsf, r1\n"
+	"	ldmia	sp, {r0 - r15}\n"
+	".global optprobe_template_val\n"
+	"optprobe_template_val:\n"
+	"1:	.long 0\n"
+	".global optprobe_template_call\n"
+	"optprobe_template_call:\n"
+	"2:	.long 0\n"
+	".global optprobe_template_end\n"
+	"optprobe_template_end:\n");
+
+#define TMPL_VAL_IDX \
+	((unsigned long *)&optprobe_template_val - (unsigned long *)&optprobe_template_entry)
+#define TMPL_CALL_IDX \
+	((unsigned long *)&optprobe_template_call - (unsigned long *)&optprobe_template_entry)
+#define TMPL_END_IDX \
+	((unsigned long *)&optprobe_template_end - (unsigned long *)&optprobe_template_entry)
+#define TMPL_ADD_SP \
+	((unsigned long *)&optprobe_template_add_sp - (unsigned long *)&optprobe_template_entry)
+#define TMPL_SUB_SP \
+	((unsigned long *)&optprobe_template_sub_sp - (unsigned long *)&optprobe_template_entry)
+
+/*
+ * ARM can always optimize an instruction when using the ARM ISA, except
+ * for instructions like 'str r0, [sp, r1]', which store to the stack
+ * and whose stack space consumption cannot be determined statically.
+ */
+int arch_prepared_optinsn(struct arch_optimized_insn *optinsn)
+{
+	return optinsn->insn != NULL;
+}
+
+/*
+ * In the ARM ISA, kprobe opt always replaces one instruction (4 bytes
+ * aligned and 4 bytes long). It is impossible to encounter another
+ * kprobe in the address range. So always return 0.
+ */
+int arch_check_optimized_kprobe(struct optimized_kprobe *op)
+{
+	return 0;
+}
+
+/* Caller must ensure addr & 3 == 0 */
+static int can_optimize(struct kprobe *kp)
+{
+	if (kp->ainsn.stack_space < 0)
+		return 0;
+	/*
+	 * 255 is the biggest imm that can be used in 'sub r0, r0, #<imm>'.
+	 * Numbers larger than 255 need special encoding.
+	 */
+	if (kp->ainsn.stack_space > 255 - sizeof(struct pt_regs))
+		return 0;
+	return 1;
+}
+
+/* Free optimized instruction slot */
+static void
+__arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
+{
+	if (op->optinsn.insn) {
+		free_optinsn_slot(op->optinsn.insn, dirty);
+		op->optinsn.insn = NULL;
+	}
+}
+
+extern void kprobe_handler(struct pt_regs *regs);
+
+static void
+optimized_callback(struct optimized_kprobe *op, struct pt_regs *regs)
+{
+	unsigned long flags;
+	struct kprobe *p = &op->kp;
+	struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
+
+	/* Save skipped registers */
+	regs->ARM_pc = (unsigned long)op->kp.addr;
+	regs->ARM_ORIG_r0 = ~0UL;
+
+	local_irq_save(flags);
+
+	if (kprobe_running()) {
+		kprobes_inc_nmissed_count(&op->kp);
+	} else {
+		__this_cpu_write(current_kprobe, &op->kp);
+		kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+		opt_pre_handler(&op->kp, regs);
+		__this_cpu_write(current_kprobe, NULL);
+	}
+
+	/* In each case, we must singlestep the replaced instruction. */
+	op->kp.ainsn.insn_singlestep(p->opcode, &p->ainsn, regs);
+
+	local_irq_restore(flags);
+}
+
+int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *orig)
+{
+	kprobe_opcode_t *code;
+	unsigned long rel_chk;
+	unsigned long val;
+	unsigned long stack_protect = sizeof(struct pt_regs);
+
+	if (!can_optimize(orig))
+		return -EILSEQ;
+
+	code = get_optinsn_slot();
+	if (!code)
+		return -ENOMEM;
+
+	/*
+	 * Verify that the address gap is within the 32MiB range, because
+	 * this uses a relative jump.
+	 *
+	 * kprobe opt uses a 'b' instruction to branch to optinsn.insn.
+	 * According to the ARM manual, the branch instruction is:
+	 *
+	 *   31  28 27           24 23             0
+	 *  +------+---+---+---+---+----------------+
+	 *  | cond | 1 | 0 | 1 | 0 |      imm24     |
+	 *  +------+---+---+---+---+----------------+
+	 *
+	 * imm24 is a signed 24-bit integer. The real branch offset is
+	 * computed by: imm32 = SignExtend(imm24:'00', 32);
+	 *
+	 * So the maximum forward branch should be:
+	 *   (0x007fffff << 2) = 0x01fffffc
+	 * The maximum backward branch should be:
+	 *   (0xff800000 << 2) = 0xfe000000 = -0x2000000
+	 *
+	 * We can simply check (rel & 0xfe000003):
+	 * if rel is positive, (rel & 0xfe000000) should be 0;
+	 * if rel is negative, (rel & 0xfe000000) should be 0xfe000000;
+	 * the last '3' is used for alignment checking.
+	 */
+	rel_chk = (unsigned long)((long)code -
+			(long)orig->addr + 8) & 0xfe000003;
+
+	if ((rel_chk != 0) && (rel_chk != 0xfe000000)) {
+		/*
+		 * Different from x86, we free the code buffer directly
+		 * instead of calling __arch_remove_optimized_kprobe()
+		 * because we have not filled in any field of op yet.
+		 */
+		free_optinsn_slot(code, 0);
+		return -ERANGE;
+	}
+
+	/* Copy arch-dep-instance from template. */
+	memcpy(code, &optprobe_template_entry,
+	       TMPL_END_IDX * sizeof(kprobe_opcode_t));
+
+	/* Adjust buffer according to instruction. */
+	BUG_ON(orig->ainsn.stack_space < 0);
+
+	stack_protect += orig->ainsn.stack_space;
+
+	/* Should have been filtered by can_optimize(). */
+	BUG_ON(stack_protect > 255);
+
+	/* Create a 'sub sp, sp, #<stack_protect>' */
+	code[TMPL_SUB_SP] = __opcode_to_mem_arm(0xe24dd000 | stack_protect);
+	/* Create an 'add r3, sp, #<stack_protect>' */
+	code[TMPL_ADD_SP] = __opcode_to_mem_arm(0xe28d3000 | stack_protect);
+
+	/* Set probe information */
+	val = (unsigned long)op;
+	code[TMPL_VAL_IDX] = val;
+
+	/* Set probe function call */
+	val = (unsigned long)optimized_callback;
+	code[TMPL_CALL_IDX] = val;
+
+	flush_icache_range((unsigned long)code,
+			   (unsigned long)(&code[TMPL_END_IDX]));
+
+	/* Setting op->optinsn.insn means prepared. */
+	op->optinsn.insn = code;
+	return 0;
+}
+
+void __kprobes arch_optimize_kprobes(struct list_head *oplist)
+{
+	struct optimized_kprobe *op, *tmp;
+
+	list_for_each_entry_safe(op, tmp, oplist, list) {
+		unsigned long insn;
+		WARN_ON(kprobe_disabled(&op->kp));
+
+		/*
+		 * Back up the instructions which will be replaced
+		 * by the jump address
+		 */
+		memcpy(op->optinsn.copied_insn, op->kp.addr,
+		       RELATIVEJUMP_SIZE);
+
+		insn = arm_gen_branch((unsigned long)op->kp.addr,
+				      (unsigned long)op->optinsn.insn);
+		BUG_ON(insn == 0);
+
+		/*
+		 * Make it a conditional branch if the replaced insn
+		 * is conditional
+		 */
+		insn = (__mem_to_opcode_arm(
+			op->optinsn.copied_insn[0]) & 0xf0000000) |
+			(insn & 0x0fffffff);
+
+		/*
+		 * Similar to __arch_disarm_kprobe, operations which
+		 * remove breakpoints must be wrapped by stop_machine
+		 * to avoid racing.
+		 */
+		kprobes_remove_breakpoint(op->kp.addr, insn);
+
+		list_del_init(&op->list);
+	}
+}
+
+void arch_unoptimize_kprobe(struct optimized_kprobe *op)
+{
+	arch_arm_kprobe(&op->kp);
+}
+
+/*
+ * Recover original instructions and breakpoints from relative jumps.
+ * Caller must hold kprobe_mutex.
+ */
+void arch_unoptimize_kprobes(struct list_head *oplist,
+			     struct list_head *done_list)
+{
+	struct optimized_kprobe *op, *tmp;
+
+	list_for_each_entry_safe(op, tmp, oplist, list) {
+		arch_unoptimize_kprobe(op);
+		list_move(&op->list, done_list);
+	}
+}
+
+int arch_within_optimized_kprobe(struct optimized_kprobe *op,
+				 unsigned long addr)
+{
+	return ((unsigned long)op->kp.addr <= addr &&
+		(unsigned long)op->kp.addr + RELATIVEJUMP_SIZE > addr);
+}
+
+void arch_remove_optimized_kprobe(struct optimized_kprobe *op)
+{
+	__arch_remove_optimized_kprobe(op, 1);
+}
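
The rel_chk expression above folds the range check and the alignment
check into a single mask. A small userspace sketch with hypothetical
addresses, mirroring that test:

	#include <stdio.h>

	/* Accept only 4-byte-aligned gaps inside the +/-32MiB range. */
	static int branch_reachable(unsigned long code, unsigned long addr)
	{
		unsigned long rel_chk = (unsigned long)((long)code -
				(long)addr + 8) & 0xfe000003;

		return rel_chk == 0 || rel_chk == 0xfe000000;
	}

	int main(void)
	{
		printf("%d\n", branch_reachable(0xc0200000UL, 0xc0100000UL)); /* 1: +1MiB */
		printf("%d\n", branch_reachable(0xc8000000UL, 0xc0100000UL)); /* 0: too far */
		printf("%d\n", branch_reachable(0xc0200002UL, 0xc0100000UL)); /* 0: unaligned */
		return 0;
	}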