Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86: Remove arbitrary instruction size limit in instruction decoder

The current x86 instruction decoder steps along through the
instruction stream but always ensures that it never steps farther
than the largest possible instruction size (MAX_INSN_SIZE).

The MPX code is now going to be doing some decoding of userspace
instructions. We copy those from userspace into the kernel and
they're obviously completely untrusted coming from userspace. In
addition to the constraint that instructions can only be so long,
we also have to be aware of how long the buffer is that came in
from userspace. This _looks_ similar to what the perf and
kprobes code is doing, but it's unclear to me whether they are
affected.

The whole reason we need this is that it is perfectly valid to be
executing an instruction within MAX_INSN_SIZE bytes of an
unreadable page. We should be able to gracefully handle short
reads in those cases.

This adds support to the decoder to record how long the buffer
being decoded is and to refuse to "validate" the instruction if
we would have gone over the end of the buffer to decode it.

The kprobes code probably needs to be looked at here a bit more
carefully. This patch still respects the MAX_INSN_SIZE limit
there but the kprobes code does look like it might be able to
be a bit more strict than it currently is.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Jim Keniston <jkenisto@us.ibm.com>
Acked-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: x86@kernel.org
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: "David S. Miller" <davem@davemloft.net>
Link: http://lkml.kernel.org/r/20141114153957.E6B01535@viggo.jf.intel.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

authored by

Dave Hansen and committed by
Thomas Gleixner
6ba48ff4 206c5f60

+53 -22
+6 -4
arch/x86/include/asm/insn.h
··· 65 65 unsigned char x86_64; 66 66 67 67 const insn_byte_t *kaddr; /* kernel address of insn to analyze */ 68 + const insn_byte_t *end_kaddr; /* kernel address of last insn in buffer */ 68 69 const insn_byte_t *next_byte; 69 70 }; 70 71 ··· 97 96 #define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ 98 97 #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ 99 98 100 - extern void insn_init(struct insn *insn, const void *kaddr, int x86_64); 99 + extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64); 101 100 extern void insn_get_prefixes(struct insn *insn); 102 101 extern void insn_get_opcode(struct insn *insn); 103 102 extern void insn_get_modrm(struct insn *insn); ··· 116 115 extern int insn_rip_relative(struct insn *insn); 117 116 118 117 /* Init insn for kernel text */ 119 - static inline void kernel_insn_init(struct insn *insn, const void *kaddr) 118 + static inline void kernel_insn_init(struct insn *insn, 119 + const void *kaddr, int buf_len) 120 120 { 121 121 #ifdef CONFIG_X86_64 122 - insn_init(insn, kaddr, 1); 122 + insn_init(insn, kaddr, buf_len, 1); 123 123 #else /* CONFIG_X86_32 */ 124 - insn_init(insn, kaddr, 0); 124 + insn_init(insn, kaddr, buf_len, 0); 125 125 #endif 126 126 } 127 127
+14 -3
arch/x86/kernel/cpu/perf_event_intel_ds.c
··· 724 724 unsigned long ip = regs->ip; 725 725 int is_64bit = 0; 726 726 void *kaddr; 727 + int size; 727 728 728 729 /* 729 730 * We don't need to fixup if the PEBS assist is fault like ··· 759 758 return 1; 760 759 } 761 760 761 + size = ip - to; 762 762 if (!kernel_ip(ip)) { 763 - int size, bytes; 763 + int bytes; 764 764 u8 *buf = this_cpu_read(insn_buffer); 765 765 766 - size = ip - to; /* Must fit our buffer, see above */ 766 + /* 'size' must fit our buffer, see above */ 767 767 bytes = copy_from_user_nmi(buf, (void __user *)to, size); 768 768 if (bytes != 0) 769 769 return 0; ··· 782 780 #ifdef CONFIG_X86_64 783 781 is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32); 784 782 #endif 785 - insn_init(&insn, kaddr, is_64bit); 783 + insn_init(&insn, kaddr, size, is_64bit); 786 784 insn_get_length(&insn); 785 + /* 786 + * Make sure there was not a problem decoding the 787 + * instruction and getting the length. This is 788 + * doubly important because we have an infinite 789 + * loop if insn.length=0. 790 + */ 791 + if (!insn.length) 792 + break; 787 793 788 794 to += insn.length; 789 795 kaddr += insn.length; 796 + size -= insn.length; 790 797 } while (to < ip); 791 798 792 799 if (to == ip) {
+19 -6
arch/x86/kernel/cpu/perf_event_intel_lbr.c
··· 465 465 { 466 466 struct insn insn; 467 467 void *addr; 468 - int bytes, size = MAX_INSN_SIZE; 468 + int bytes_read, bytes_left; 469 469 int ret = X86_BR_NONE; 470 470 int ext, to_plm, from_plm; 471 471 u8 buf[MAX_INSN_SIZE]; ··· 493 493 return X86_BR_NONE; 494 494 495 495 /* may fail if text not present */ 496 - bytes = copy_from_user_nmi(buf, (void __user *)from, size); 497 - if (bytes != 0) 496 + bytes_left = copy_from_user_nmi(buf, (void __user *)from, 497 + MAX_INSN_SIZE); 498 + bytes_read = MAX_INSN_SIZE - bytes_left; 499 + if (!bytes_read) 498 500 return X86_BR_NONE; 499 501 500 502 addr = buf; ··· 507 505 * Ensure we don't blindy read any address by validating it is 508 506 * a known text address. 509 507 */ 510 - if (kernel_text_address(from)) 508 + if (kernel_text_address(from)) { 511 509 addr = (void *)from; 512 - else 510 + /* 511 + * Assume we can get the maximum possible size 512 + * when grabbing kernel data. This is not 513 + * _strictly_ true since we could possibly be 514 + * executing up next to a memory hole, but 515 + * it is very unlikely to be a problem. 516 + */ 517 + bytes_read = MAX_INSN_SIZE; 518 + } else { 513 519 return X86_BR_NONE; 520 + } 514 521 } 515 522 516 523 /* ··· 529 518 #ifdef CONFIG_X86_64 530 519 is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32); 531 520 #endif 532 - insn_init(&insn, addr, is64); 521 + insn_init(&insn, addr, bytes_read, is64); 533 522 insn_get_opcode(&insn); 523 + if (!insn.opcode.got) 524 + return X86_BR_ABORT; 534 525 535 526 switch (insn.opcode.bytes[0]) { 536 527 case 0xf:
+5 -3
arch/x86/kernel/kprobes/core.c
··· 285 285 * normally used, we just go through if there is no kprobe. 286 286 */ 287 287 __addr = recover_probed_instruction(buf, addr); 288 - kernel_insn_init(&insn, (void *)__addr); 288 + kernel_insn_init(&insn, (void *)__addr, MAX_INSN_SIZE); 289 289 insn_get_length(&insn); 290 290 291 291 /* ··· 330 330 { 331 331 struct insn insn; 332 332 kprobe_opcode_t buf[MAX_INSN_SIZE]; 333 + unsigned long recovered_insn = 334 + recover_probed_instruction(buf, (unsigned long)src); 333 335 334 - kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, (unsigned long)src)); 336 + kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); 335 337 insn_get_length(&insn); 336 338 /* Another subsystem puts a breakpoint, failed to recover */ 337 339 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) ··· 344 342 if (insn_rip_relative(&insn)) { 345 343 s64 newdisp; 346 344 u8 *disp; 347 - kernel_insn_init(&insn, dest); 345 + kernel_insn_init(&insn, dest, insn.length); 348 346 insn_get_displacement(&insn); 349 347 /* 350 348 * The copied instruction uses the %rip-relative addressing
+3 -1
arch/x86/kernel/kprobes/opt.c
··· 251 251 /* Decode instructions */ 252 252 addr = paddr - offset; 253 253 while (addr < paddr - offset + size) { /* Decode until function end */ 254 + unsigned long recovered_insn; 254 255 if (search_exception_tables(addr)) 255 256 /* 256 257 * Since some fixup code will jumps into this function, 257 258 * we can't optimize kprobe in this function. 258 259 */ 259 260 return 0; 260 - kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr)); 261 + recovered_insn = recover_probed_instruction(buf, addr); 262 + kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); 261 263 insn_get_length(&insn); 262 264 /* Another subsystem puts a breakpoint */ 263 265 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
+1 -1
arch/x86/kernel/uprobes.c
··· 219 219 { 220 220 u32 volatile *good_insns; 221 221 222 - insn_init(insn, auprobe->insn, x86_64); 222 + insn_init(insn, auprobe->insn, sizeof(auprobe->insn), x86_64); 223 223 /* has the side-effect of processing the entire instruction */ 224 224 insn_get_length(insn); 225 225 if (WARN_ON_ONCE(!insn_complete(insn)))
+3 -2
arch/x86/lib/insn.c
··· 28 28 29 29 /* Verify next sizeof(t) bytes can be on the same instruction */ 30 30 #define validate_next(t, insn, n) \ 31 - ((insn)->next_byte + sizeof(t) + n - (insn)->kaddr <= MAX_INSN_SIZE) 31 + ((insn)->next_byte + sizeof(t) + n < (insn)->end_kaddr) 32 32 33 33 #define __get_next(t, insn) \ 34 34 ({ t r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; }) ··· 50 50 * @kaddr: address (in kernel memory) of instruction (or copy thereof) 51 51 * @x86_64: !0 for 64-bit kernel or 64-bit app 52 52 */ 53 - void insn_init(struct insn *insn, const void *kaddr, int x86_64) 53 + void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) 54 54 { 55 55 memset(insn, 0, sizeof(*insn)); 56 56 insn->kaddr = kaddr; 57 + insn->end_kaddr = kaddr + buf_len; 57 58 insn->next_byte = kaddr; 58 59 insn->x86_64 = x86_64 ? 1 : 0; 59 60 insn->opnd_bytes = 4;
+1 -1
arch/x86/tools/insn_sanity.c
··· 254 254 continue; 255 255 256 256 /* Decode an instruction */ 257 - insn_init(&insn, insn_buf, x86_64); 257 + insn_init(&insn, insn_buf, sizeof(insn_buf), x86_64); 258 258 insn_get_length(&insn); 259 259 260 260 if (insn.next_byte <= insn.kaddr ||
+1 -1
arch/x86/tools/test_get_len.c
··· 149 149 break; 150 150 } 151 151 /* Decode an instruction */ 152 - insn_init(&insn, insn_buf, x86_64); 152 + insn_init(&insn, insn_buf, sizeof(insn_buf), x86_64); 153 153 insn_get_length(&insn); 154 154 if (insn.length != nb) { 155 155 warnings++;