Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86/alternative: Batch of patch operations

Currently, the patch of an address is done in three steps:

-- Pseudo-code #1 - Current implementation ---

1) add an int3 trap to the address that will be patched
sync cores (send IPI to all other CPUs)
2) update all but the first byte of the patched range
sync cores (send IPI to all other CPUs)
3) replace the first byte (int3) by the first byte of replacing opcode
sync cores (send IPI to all other CPUs)

-- Pseudo-code #1 ---

When a static key has more than one entry, these steps are called once for
each entry. The number of IPIs is then linear with regard to the number 'n'
of entries of a key: O(n*3), which is O(n).

This algorithm works fine for the update of a single key. But we think
it is possible to optimize the case in which a static key has more than
one entry. For instance, the sched_schedstats jump label has 56 entries
in my (updated) Fedora kernel, resulting in 168 IPIs to each CPU on
which the thread that is enabling the key is _not_ running.

With this patch, rather than receiving a single patch to be processed, a vector
of patches is passed, enabling the rewrite of the pseudo-code #1 in this
way:

-- Pseudo-code #2 - This patch ---
1) for each patch in the vector:
add an int3 trap to the address that will be patched

sync cores (send IPI to all other CPUs)

2) for each patch in the vector:
update all but the first byte of the patched range

sync cores (send IPI to all other CPUs)

3) for each patch in the vector:
replace the first byte (int3) by the first byte of replacing opcode

sync cores (send IPI to all other CPUs)
-- Pseudo-code #2 - This patch ---

Doing the update in this way, the number of IPIs becomes O(3) with regard
to the number of keys, which is O(1).

The batch mode is done with the function text_poke_bp_batch(), which receives
two arguments: a vector of "struct text_poke_loc", and the number of entries
in the vector.

The vector must be sorted by the addr field of the text_poke_loc structure,
enabling a binary search for the handler in the poke_int3_handler() function
(a fast path).

Signed-off-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris von Recklinghausen <crecklin@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott Wood <swood@redhat.com>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/ca506ed52584c80f64de23f6f55ca288e5d079de.1560325897.git.bristot@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

authored by

Daniel Bristot de Oliveira and committed by
Ingo Molnar
c0213b0a 0f133021

+135 -34
+15
arch/x86/include/asm/text-patching.h
··· 18 18 #define __parainstructions_end NULL 19 19 #endif 20 20 21 + /* 22 + * Currently, the max observed size in the kernel code is 23 + * JUMP_LABEL_NOP_SIZE/RELATIVEJUMP_SIZE, which are 5. 24 + * Raise it if needed. 25 + */ 26 + #define POKE_MAX_OPCODE_SIZE 5 27 + 28 + struct text_poke_loc { 29 + void *detour; 30 + void *addr; 31 + size_t len; 32 + const char opcode[POKE_MAX_OPCODE_SIZE]; 33 + }; 34 + 21 35 extern void text_poke_early(void *addr, const void *opcode, size_t len); 22 36 23 37 /* ··· 52 38 extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); 53 39 extern int poke_int3_handler(struct pt_regs *regs); 54 40 extern void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); 41 + extern void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries); 55 42 extern int after_bootmem; 56 43 extern __ro_after_init struct mm_struct *poking_mm; 57 44 extern __ro_after_init unsigned long poking_addr;
+120 -34
arch/x86/kernel/alternative.c
··· 14 14 #include <linux/kdebug.h> 15 15 #include <linux/kprobes.h> 16 16 #include <linux/mmu_context.h> 17 + #include <linux/bsearch.h> 17 18 #include <asm/text-patching.h> 18 19 #include <asm/alternative.h> 19 20 #include <asm/sections.h> ··· 849 848 sync_core(); 850 849 } 851 850 852 - static bool bp_patching_in_progress; 853 - static void *bp_int3_handler, *bp_int3_addr; 851 + static struct bp_patching_desc { 852 + struct text_poke_loc *vec; 853 + int nr_entries; 854 + } bp_patching; 855 + 856 + static int patch_cmp(const void *key, const void *elt) 857 + { 858 + struct text_poke_loc *tp = (struct text_poke_loc *) elt; 859 + 860 + if (key < tp->addr) 861 + return -1; 862 + if (key > tp->addr) 863 + return 1; 864 + return 0; 865 + } 866 + NOKPROBE_SYMBOL(patch_cmp); 854 867 855 868 int poke_int3_handler(struct pt_regs *regs) 856 869 { 870 + struct text_poke_loc *tp; 871 + unsigned char int3 = 0xcc; 872 + void *ip; 873 + 857 874 /* 858 875 * Having observed our INT3 instruction, we now must observe 859 - * bp_patching_in_progress. 876 + * bp_patching.nr_entries. 860 877 * 861 - * in_progress = TRUE INT3 878 + * nr_entries != 0 INT3 862 879 * WMB RMB 863 - * write INT3 if (in_progress) 880 + * write INT3 if (nr_entries) 864 881 * 865 - * Idem for bp_int3_handler. 882 + * Idem for other elements in bp_patching. 866 883 */ 867 884 smp_rmb(); 868 885 869 - if (likely(!bp_patching_in_progress)) 886 + if (likely(!bp_patching.nr_entries)) 870 887 return 0; 871 888 872 - if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr) 889 + if (user_mode(regs)) 873 890 return 0; 874 891 875 - /* set up the specified breakpoint handler */ 876 - regs->ip = (unsigned long) bp_int3_handler; 892 + /* 893 + * Discount the sizeof(int3). See text_poke_bp_batch(). 894 + */ 895 + ip = (void *) regs->ip - sizeof(int3); 896 + 897 + /* 898 + * Skip the binary search if there is a single member in the vector. 
899 + */ 900 + if (unlikely(bp_patching.nr_entries > 1)) { 901 + tp = bsearch(ip, bp_patching.vec, bp_patching.nr_entries, 902 + sizeof(struct text_poke_loc), 903 + patch_cmp); 904 + if (!tp) 905 + return 0; 906 + } else { 907 + tp = bp_patching.vec; 908 + if (tp->addr != ip) 909 + return 0; 910 + } 911 + 912 + /* set up the specified breakpoint detour */ 913 + regs->ip = (unsigned long) tp->detour; 877 914 878 915 return 1; 879 916 } 880 917 NOKPROBE_SYMBOL(poke_int3_handler); 881 918 882 919 /** 883 - * text_poke_bp() -- update instructions on live kernel on SMP 884 - * @addr: address to patch 885 - * @opcode: opcode of new instruction 886 - * @len: length to copy 887 - * @handler: address to jump to when the temporary breakpoint is hit 920 + * text_poke_bp_batch() -- update instructions on live kernel on SMP 921 + * @tp: vector of instructions to patch 922 + * @nr_entries: number of entries in the vector 888 923 * 889 924 * Modify multi-byte instruction by using int3 breakpoint on SMP. 890 925 * We completely avoid stop_machine() here, and achieve the 891 926 * synchronization using int3 breakpoint. 
892 927 * 893 928 * The way it is done: 894 - * - add a int3 trap to the address that will be patched 929 + * - For each entry in the vector: 930 + * - add a int3 trap to the address that will be patched 895 931 * - sync cores 896 - * - update all but the first byte of the patched range 932 + * - For each entry in the vector: 933 + * - update all but the first byte of the patched range 897 934 * - sync cores 898 - * - replace the first byte (int3) by the first byte of 899 - * replacing opcode 935 + * - For each entry in the vector: 936 + * - replace the first byte (int3) by the first byte of 937 + * replacing opcode 900 938 * - sync cores 901 939 */ 902 - void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) 940 + void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) 903 941 { 942 + int patched_all_but_first = 0; 904 943 unsigned char int3 = 0xcc; 905 - 906 - bp_int3_handler = handler; 907 - bp_int3_addr = (u8 *)addr + sizeof(int3); 908 - bp_patching_in_progress = true; 944 + unsigned int i; 909 945 910 946 lockdep_assert_held(&text_mutex); 911 947 948 + bp_patching.vec = tp; 949 + bp_patching.nr_entries = nr_entries; 950 + 912 951 /* 913 952 * Corresponding read barrier in int3 notifier for making sure the 914 - * in_progress and handler are correctly ordered wrt. patching. 953 + * nr_entries and handler are correctly ordered wrt. patching. 915 954 */ 916 955 smp_wmb(); 917 956 918 - text_poke(addr, &int3, sizeof(int3)); 957 + /* 958 + * First step: add a int3 trap to the address that will be patched. 
959 + */ 960 + for (i = 0; i < nr_entries; i++) 961 + text_poke(tp[i].addr, &int3, sizeof(int3)); 919 962 920 963 on_each_cpu(do_sync_core, NULL, 1); 921 964 922 - if (len - sizeof(int3) > 0) { 923 - /* patch all but the first byte */ 924 - text_poke((char *)addr + sizeof(int3), 925 - (const char *) opcode + sizeof(int3), 926 - len - sizeof(int3)); 965 + /* 966 + * Second step: update all but the first byte of the patched range. 967 + */ 968 + for (i = 0; i < nr_entries; i++) { 969 + if (tp[i].len - sizeof(int3) > 0) { 970 + text_poke((char *)tp[i].addr + sizeof(int3), 971 + (const char *)tp[i].opcode + sizeof(int3), 972 + tp[i].len - sizeof(int3)); 973 + patched_all_but_first++; 974 + } 975 + } 976 + 977 + if (patched_all_but_first) { 927 978 /* 928 979 * According to Intel, this core syncing is very likely 929 980 * not necessary and we'd be safe even without it. But ··· 984 931 on_each_cpu(do_sync_core, NULL, 1); 985 932 } 986 933 987 - /* patch the first byte */ 988 - text_poke(addr, opcode, sizeof(int3)); 934 + /* 935 + * Third step: replace the first byte (int3) by the first byte of 936 + * replacing opcode. 937 + */ 938 + for (i = 0; i < nr_entries; i++) 939 + text_poke(tp[i].addr, tp[i].opcode, sizeof(int3)); 989 940 990 941 on_each_cpu(do_sync_core, NULL, 1); 991 942 /* 992 943 * sync_core() implies an smp_mb() and orders this store against 993 944 * the writing of the new instruction. 994 945 */ 995 - bp_patching_in_progress = false; 946 + bp_patching.vec = NULL; 947 + bp_patching.nr_entries = 0; 996 948 } 997 949 950 + /** 951 + * text_poke_bp() -- update instructions on live kernel on SMP 952 + * @addr: address to patch 953 + * @opcode: opcode of new instruction 954 + * @len: length to copy 955 + * @handler: address to jump to when the temporary breakpoint is hit 956 + * 957 + * Update a single instruction with the vector in the stack, avoiding 958 + * dynamically allocated memory. 
This function should be used when it is 959 + * not possible to allocate memory. 960 + */ 961 + void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) 962 + { 963 + struct text_poke_loc tp = { 964 + .detour = handler, 965 + .addr = addr, 966 + .len = len, 967 + }; 968 + 969 + if (len > POKE_MAX_OPCODE_SIZE) { 970 + WARN_ONCE(1, "len is larger than %d\n", POKE_MAX_OPCODE_SIZE); 971 + return; 972 + } 973 + 974 + memcpy((void *)tp.opcode, opcode, len); 975 + 976 + text_poke_bp_batch(&tp, 1); 977 + }