Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86/alternatives: Implement a better poke_int3_handler() completion scheme

Commit:

285a54efe386 ("x86/alternatives: Sync bp_patching update for avoiding NULL pointer exception")

added an additional text_poke_sync() IPI to text_poke_bp_batch() to
handle the rare case where another CPU is still inside an INT3 handler
while we clear the global state.

Instead of spraying IPIs around, count the active INT3 handlers and
wait for them to go away before proceeding to clear/reuse the data.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>

Authored by Peter Zijlstra; committed by Ingo Molnar.
1f676247 46f5cfc1

+53 -31
+53 -31
arch/x86/kernel/alternative.c
··· 948 948 const u8 text[POKE_MAX_OPCODE_SIZE]; 949 949 }; 950 950 951 - static struct bp_patching_desc { 951 + struct bp_patching_desc { 952 952 struct text_poke_loc *vec; 953 953 int nr_entries; 954 - } bp_patching; 954 + atomic_t refs; 955 + }; 956 + 957 + static struct bp_patching_desc *bp_desc; 958 + 959 + static inline struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp) 960 + { 961 + struct bp_patching_desc *desc = READ_ONCE(*descp); /* rcu_dereference */ 962 + 963 + if (!desc || !atomic_inc_not_zero(&desc->refs)) 964 + return NULL; 965 + 966 + return desc; 967 + } 968 + 969 + static inline void put_desc(struct bp_patching_desc *desc) 970 + { 971 + smp_mb__before_atomic(); 972 + atomic_dec(&desc->refs); 973 + } 955 974 956 975 static inline void *text_poke_addr(struct text_poke_loc *tp) 957 976 { ··· 991 972 992 973 int notrace poke_int3_handler(struct pt_regs *regs) 993 974 { 975 + struct bp_patching_desc *desc; 994 976 struct text_poke_loc *tp; 977 + int len, ret = 0; 995 978 void *ip; 996 - int len; 979 + 980 + if (user_mode(regs)) 981 + return 0; 997 982 998 983 /* 999 984 * Having observed our INT3 instruction, we now must observe 1000 - * bp_patching.nr_entries. 985 + * bp_desc: 1001 986 * 1002 - * nr_entries != 0 INT3 987 + * bp_desc = desc INT3 1003 988 * WMB RMB 1004 - * write INT3 if (nr_entries) 1005 - * 1006 - * Idem for other elements in bp_patching. 989 + * write INT3 if (desc) 1007 990 */ 1008 991 smp_rmb(); 1009 992 1010 - if (likely(!bp_patching.nr_entries)) 1011 - return 0; 1012 - 1013 - if (user_mode(regs)) 993 + desc = try_get_desc(&bp_desc); 994 + if (!desc) 1014 995 return 0; 1015 996 1016 997 /* ··· 1021 1002 /* 1022 1003 * Skip the binary search if there is a single member in the vector. 
1023 1004 */ 1024 - if (unlikely(bp_patching.nr_entries > 1)) { 1025 - tp = bsearch(ip, bp_patching.vec, bp_patching.nr_entries, 1005 + if (unlikely(desc->nr_entries > 1)) { 1006 + tp = bsearch(ip, desc->vec, desc->nr_entries, 1026 1007 sizeof(struct text_poke_loc), 1027 1008 patch_cmp); 1028 1009 if (!tp) 1029 - return 0; 1010 + goto out_put; 1030 1011 } else { 1031 - tp = bp_patching.vec; 1012 + tp = desc->vec; 1032 1013 if (text_poke_addr(tp) != ip) 1033 - return 0; 1014 + goto out_put; 1034 1015 } 1035 1016 1036 1017 len = text_opcode_size(tp->opcode); ··· 1042 1023 * Someone poked an explicit INT3, they'll want to handle it, 1043 1024 * do not consume. 1044 1025 */ 1045 - return 0; 1026 + goto out_put; 1046 1027 1047 1028 case CALL_INSN_OPCODE: 1048 1029 int3_emulate_call(regs, (long)ip + tp->rel32); ··· 1057 1038 BUG(); 1058 1039 } 1059 1040 1060 - return 1; 1041 + ret = 1; 1042 + 1043 + out_put: 1044 + put_desc(desc); 1045 + return ret; 1061 1046 } 1062 1047 NOKPROBE_SYMBOL(poke_int3_handler); 1063 1048 ··· 1092 1069 */ 1093 1070 static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) 1094 1071 { 1072 + struct bp_patching_desc desc = { 1073 + .vec = tp, 1074 + .nr_entries = nr_entries, 1075 + .refs = ATOMIC_INIT(1), 1076 + }; 1095 1077 unsigned char int3 = INT3_INSN_OPCODE; 1096 1078 unsigned int i; 1097 1079 int do_sync; 1098 1080 1099 1081 lockdep_assert_held(&text_mutex); 1100 1082 1101 - bp_patching.vec = tp; 1102 - bp_patching.nr_entries = nr_entries; 1083 + smp_store_release(&bp_desc, &desc); /* rcu_assign_pointer */ 1103 1084 1104 1085 /* 1105 1086 * Corresponding read barrier in int3 notifier for making sure the ··· 1158 1131 text_poke_sync(); 1159 1132 1160 1133 /* 1161 - * sync_core() implies an smp_mb() and orders this store against 1162 - * the writing of the new instruction. 1134 + * Remove and synchronize_rcu(), except we have a very primitive 1135 + * refcount based completion. 
1163 1136 */ 1164 - bp_patching.nr_entries = 0; 1165 - /* 1166 - * This sync_core () call ensures that all INT3 handlers in progress 1167 - * have finished. This allows poke_int3_handler() after this to 1168 - * avoid touching bp_paching.vec by checking nr_entries == 0. 1169 - */ 1170 - text_poke_sync(); 1171 - bp_patching.vec = NULL; 1137 + WRITE_ONCE(bp_desc, NULL); /* RCU_INIT_POINTER */ 1138 + if (!atomic_dec_and_test(&desc.refs)) 1139 + atomic_cond_read_acquire(&desc.refs, !VAL); 1172 1140 } 1173 1141 1174 1142 void text_poke_loc_init(struct text_poke_loc *tp, void *addr,