Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'x86/mm'

Depends on the simplifications from commit 1d7e707af446 ("Revert "x86/module: prepare module loading for ROX allocations of text"")

Signed-off-by: Peter Zijlstra <peterz@infradead.org>

+426 -271
+5 -6
arch/um/kernel/um_arch.c
··· 440 440 os_check_bugs(); 441 441 } 442 442 443 - void apply_seal_endbr(s32 *start, s32 *end, struct module *mod) 443 + void apply_seal_endbr(s32 *start, s32 *end) 444 444 { 445 445 } 446 446 447 - void apply_retpolines(s32 *start, s32 *end, struct module *mod) 447 + void apply_retpolines(s32 *start, s32 *end) 448 448 { 449 449 } 450 450 451 - void apply_returns(s32 *start, s32 *end, struct module *mod) 451 + void apply_returns(s32 *start, s32 *end) 452 452 { 453 453 } 454 454 455 455 void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, 456 - s32 *start_cfi, s32 *end_cfi, struct module *mod) 456 + s32 *start_cfi, s32 *end_cfi) 457 457 { 458 458 } 459 459 460 - void apply_alternatives(struct alt_instr *start, struct alt_instr *end, 461 - struct module *mod) 460 + void apply_alternatives(struct alt_instr *start, struct alt_instr *end) 462 461 { 463 462 } 464 463
+1
arch/x86/Kconfig
··· 85 85 select ARCH_HAS_DMA_OPS if GART_IOMMU || XEN 86 86 select ARCH_HAS_EARLY_DEBUG if KGDB 87 87 select ARCH_HAS_ELF_RANDOMIZE 88 + select ARCH_HAS_EXECMEM_ROX if X86_64 88 89 select ARCH_HAS_FAST_MULTIPLIER 89 90 select ARCH_HAS_FORTIFY_SOURCE 90 91 select ARCH_HAS_GCOV_PROFILE_ALL
+1 -2
arch/x86/entry/vdso/vma.c
··· 48 48 49 49 apply_alternatives((struct alt_instr *)(image->data + image->alt), 50 50 (struct alt_instr *)(image->data + image->alt + 51 - image->alt_len), 52 - NULL); 51 + image->alt_len)); 53 52 54 53 return 0; 55 54 }
+7 -7
arch/x86/include/asm/alternative.h
··· 87 87 * instructions were patched in already: 88 88 */ 89 89 extern int alternatives_patched; 90 - struct module; 91 90 92 91 extern void alternative_instructions(void); 93 - extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end, 94 - struct module *mod); 95 - extern void apply_retpolines(s32 *start, s32 *end, struct module *mod); 96 - extern void apply_returns(s32 *start, s32 *end, struct module *mod); 97 - extern void apply_seal_endbr(s32 *start, s32 *end, struct module *mod); 92 + extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); 93 + extern void apply_retpolines(s32 *start, s32 *end); 94 + extern void apply_returns(s32 *start, s32 *end); 95 + extern void apply_seal_endbr(s32 *start, s32 *end); 98 96 extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine, 99 - s32 *start_cfi, s32 *end_cfi, struct module *mod); 97 + s32 *start_cfi, s32 *end_cfi); 98 + 99 + struct module; 100 100 101 101 struct callthunk_sites { 102 102 s32 *call_start, *call_end;
+2
arch/x86/include/asm/pgtable_types.h
··· 33 33 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 34 34 #define _PAGE_BIT_UFFD_WP _PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */ 35 35 #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ 36 + #define _PAGE_BIT_KERNEL_4K _PAGE_BIT_SOFTW3 /* page must not be converted to large */ 36 37 #define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4 37 38 38 39 #ifdef CONFIG_X86_64 ··· 65 64 #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) 66 65 #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) 67 66 #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) 67 + #define _PAGE_KERNEL_4K (_AT(pteval_t, 1) << _PAGE_BIT_KERNEL_4K) 68 68 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS 69 69 #define _PAGE_PKEY_BIT0 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0) 70 70 #define _PAGE_PKEY_BIT1 (_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1)
+1 -1
arch/x86/include/asm/tlbflush.h
··· 242 242 flush_tlb_mm_range((vma)->vm_mm, start, end, \ 243 243 ((vma)->vm_flags & VM_HUGETLB) \ 244 244 ? huge_page_shift(hstate_vma(vma)) \ 245 - : PAGE_SHIFT, false) 245 + : PAGE_SHIFT, true) 246 246 247 247 extern void flush_tlb_all(void); 248 248 extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
+76 -105
arch/x86/kernel/alternative.c
··· 392 392 * Rewrite the "call BUG_func" replacement to point to the target of the 393 393 * indirect pv_ops call "call *disp(%ip)". 394 394 */ 395 - static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a, 396 - struct module *mod) 395 + static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a) 397 396 { 398 - u8 *wr_instr = module_writable_address(mod, instr); 399 397 void *target, *bug = &BUG_func; 400 398 s32 disp; 401 399 ··· 403 405 } 404 406 405 407 if (a->instrlen != 6 || 406 - wr_instr[0] != CALL_RIP_REL_OPCODE || 407 - wr_instr[1] != CALL_RIP_REL_MODRM) { 408 + instr[0] != CALL_RIP_REL_OPCODE || 409 + instr[1] != CALL_RIP_REL_MODRM) { 408 410 pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n"); 409 411 BUG(); 410 412 } 411 413 412 414 /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */ 413 - disp = *(s32 *)(wr_instr + 2); 415 + disp = *(s32 *)(instr + 2); 414 416 #ifdef CONFIG_X86_64 415 417 /* ff 15 00 00 00 00 call *0x0(%rip) */ 416 418 /* target address is stored at "next instruction + disp". */ ··· 448 450 * to refetch changed I$ lines. 449 451 */ 450 452 void __init_or_module noinline apply_alternatives(struct alt_instr *start, 451 - struct alt_instr *end, 452 - struct module *mod) 453 + struct alt_instr *end) 453 454 { 454 455 u8 insn_buff[MAX_PATCH_LEN]; 455 456 u8 *instr, *replacement; ··· 477 480 */ 478 481 for (a = start; a < end; a++) { 479 482 int insn_buff_sz = 0; 480 - u8 *wr_instr, *wr_replacement; 481 483 482 484 /* 483 485 * In case of nested ALTERNATIVE()s the outer alternative might ··· 490 494 } 491 495 492 496 instr = instr_va(a); 493 - wr_instr = module_writable_address(mod, instr); 494 - 495 497 replacement = (u8 *)&a->repl_offset + a->repl_offset; 496 - wr_replacement = module_writable_address(mod, replacement); 497 - 498 498 BUG_ON(a->instrlen > sizeof(insn_buff)); 499 499 BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); 500 500 ··· 501 509 * patch if feature is *NOT* present. 502 510 */ 503 511 if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) { 504 - memcpy(insn_buff, wr_instr, a->instrlen); 512 + memcpy(insn_buff, instr, a->instrlen); 505 513 optimize_nops(instr, insn_buff, a->instrlen); 506 - text_poke_early(wr_instr, insn_buff, a->instrlen); 514 + text_poke_early(instr, insn_buff, a->instrlen); 507 515 continue; 508 516 } 509 517 ··· 513 521 instr, instr, a->instrlen, 514 522 replacement, a->replacementlen, a->flags); 515 523 516 - memcpy(insn_buff, wr_replacement, a->replacementlen); 524 + memcpy(insn_buff, replacement, a->replacementlen); 517 525 insn_buff_sz = a->replacementlen; 518 526 519 527 if (a->flags & ALT_FLAG_DIRECT_CALL) { 520 - insn_buff_sz = alt_replace_call(instr, insn_buff, a, 521 - mod); 528 + insn_buff_sz = alt_replace_call(instr, insn_buff, a); 522 529 if (insn_buff_sz < 0) 523 530 continue; 524 531 } ··· 527 536 528 537 apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen); 529 538 530 - DUMP_BYTES(ALT, wr_instr, a->instrlen, "%px: old_insn: ", instr); 539 + DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); 531 540 DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); 532 541 DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr); 533 542 534 - text_poke_early(wr_instr, insn_buff, insn_buff_sz); 543 + text_poke_early(instr, insn_buff, insn_buff_sz); 535 544 } 536 545 537 546 kasan_enable_current(); ··· 722 731 /* 723 732 * Generated by 'objtool --retpoline'. 724 733 */ 725 - void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, 726 - struct module *mod) 734 + void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) 727 735 { 728 736 s32 *s; 729 737 730 738 for (s = start; s < end; s++) { 731 739 void *addr = (void *)s + *s; 732 - void *wr_addr = module_writable_address(mod, addr); 733 740 struct insn insn; 734 741 int len, ret; 735 742 u8 bytes[16]; 736 743 u8 op1, op2; 737 744 738 - ret = insn_decode_kernel(&insn, wr_addr); 745 + ret = insn_decode_kernel(&insn, addr); 739 746 if (WARN_ON_ONCE(ret < 0)) 740 747 continue; 741 748 ··· 761 772 len = patch_retpoline(addr, &insn, bytes); 762 773 if (len == insn.length) { 763 774 optimize_nops(addr, bytes, len); 764 - DUMP_BYTES(RETPOLINE, ((u8*)wr_addr), len, "%px: orig: ", addr); 775 + DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr); 765 776 DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr); 766 - text_poke_early(wr_addr, bytes, len); 777 + text_poke_early(addr, bytes, len); 767 778 } 768 779 } 769 780 } ··· 799 810 return i; 800 811 } 801 812 802 - void __init_or_module noinline apply_returns(s32 *start, s32 *end, 803 - struct module *mod) 813 + void __init_or_module noinline apply_returns(s32 *start, s32 *end) 804 814 { 805 815 s32 *s; 806 816 ··· 808 820 809 821 for (s = start; s < end; s++) { 810 822 void *dest = NULL, *addr = (void *)s + *s; 811 - void *wr_addr = module_writable_address(mod, addr); 812 823 struct insn insn; 813 824 int len, ret; 814 825 u8 bytes[16]; 815 826 u8 op; 816 827 817 - ret = insn_decode_kernel(&insn, wr_addr); 828 + ret = insn_decode_kernel(&insn, addr); 818 829 if (WARN_ON_ONCE(ret < 0)) 819 830 continue; 820 831 ··· 833 846 834 847 len = patch_return(addr, &insn, bytes); 835 848 if (len == insn.length) { 836 - DUMP_BYTES(RET, ((u8*)wr_addr), len, "%px: orig: ", addr); 849 + DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr); 837 850 DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr); 838 - text_poke_early(wr_addr, bytes, len); 851 + text_poke_early(addr, bytes, len); 839 852 } 840 853 } 841 854 } 842 855 #else 843 - void __init_or_module noinline apply_returns(s32 *start, s32 *end, 844 - struct module *mod) { } 856 + void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } 845 857 #endif /* CONFIG_MITIGATION_RETHUNK */ 846 858 847 859 #else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ 848 860 849 - void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, 850 - struct module *mod) { } 851 - void __init_or_module noinline apply_returns(s32 *start, s32 *end, 852 - struct module *mod) { } 861 + void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } 862 + void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } 853 863 854 864 #endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */ 855 865 856 866 #ifdef CONFIG_X86_KERNEL_IBT 857 867 858 - static void poison_cfi(void *addr, void *wr_addr); 868 + static void poison_cfi(void *addr); 859 869 860 - static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn) 870 + static void __init_or_module poison_endbr(void *addr, bool warn) 861 871 { 862 872 u32 endbr, poison = gen_endbr_poison(); 863 873 864 - if (WARN_ON_ONCE(get_kernel_nofault(endbr, wr_addr))) 874 + if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) 865 875 return; 866 876 867 877 if (!is_endbr(endbr)) { ··· 873 889 */ 874 890 DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr); 875 891 DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr); 876 - text_poke_early(wr_addr, &poison, 4); 892 + text_poke_early(addr, &poison, 4); 877 893 } 878 894 879 895 /* ··· 882 898 * Seal the functions for indirect calls by clobbering the ENDBR instructions 883 899 * and the kCFI hash value. 884 900 */ 885 - void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end, struct module *mod) 901 + void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end) 886 902 { 887 903 s32 *s; 888 904 889 905 for (s = start; s < end; s++) { 890 906 void *addr = (void *)s + *s; 891 - void *wr_addr = module_writable_address(mod, addr); 892 907 893 - poison_endbr(addr, wr_addr, true); 908 + poison_endbr(addr, true); 894 909 if (IS_ENABLED(CONFIG_FINEIBT)) 895 - poison_cfi(addr - 16, wr_addr - 16); 910 + poison_cfi(addr - 16); 896 911 } 897 912 } 898 913 899 914 #else 900 915 901 - void __init_or_module apply_seal_endbr(s32 *start, s32 *end, struct module *mod) { } 916 + void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } 902 917 903 918 #endif /* CONFIG_X86_KERNEL_IBT */ 904 919 ··· 1119 1136 } 1120 1137 1121 1138 /* .retpoline_sites */ 1122 - static int cfi_disable_callers(s32 *start, s32 *end, struct module *mod) 1139 + static int cfi_disable_callers(s32 *start, s32 *end) 1123 1140 { 1124 1141 /* 1125 1142 * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate ··· 1131 1148 1132 1149 for (s = start; s < end; s++) { 1133 1150 void *addr = (void *)s + *s; 1134 - void *wr_addr; 1135 1151 u32 hash; 1136 1152 1137 1153 addr -= fineibt_caller_size; 1138 - wr_addr = module_writable_address(mod, addr); 1139 - hash = decode_caller_hash(wr_addr); 1140 - 1154 + hash = decode_caller_hash(addr); 1141 1155 if (!hash) /* nocfi callers */ 1142 1156 continue; 1143 1157 1144 - text_poke_early(wr_addr, jmp, 2); 1158 + text_poke_early(addr, jmp, 2); 1145 1159 } 1146 1160 1147 1161 return 0; 1148 1162 } 1149 1163 1150 - static int cfi_enable_callers(s32 *start, s32 *end, struct module *mod) 1164 + static int cfi_enable_callers(s32 *start, s32 *end) 1151 1165 { 1152 1166 /* 1153 1167 * Re-enable kCFI, undo what cfi_disable_callers() did. ··· 1154 1174 1155 1175 for (s = start; s < end; s++) { 1156 1176 void *addr = (void *)s + *s; 1157 - void *wr_addr; 1158 1177 u32 hash; 1159 1178 1160 1179 addr -= fineibt_caller_size; 1161 - wr_addr = module_writable_address(mod, addr); 1162 - hash = decode_caller_hash(wr_addr); 1180 + hash = decode_caller_hash(addr); 1163 1181 if (!hash) /* nocfi callers */ 1164 1182 continue; 1165 1183 1166 - text_poke_early(wr_addr, mov, 2); 1184 + text_poke_early(addr, mov, 2); 1167 1185 } 1168 1186 1169 1187 return 0; 1170 1188 } 1171 1189 1172 1190 /* .cfi_sites */ 1173 - static int cfi_rand_preamble(s32 *start, s32 *end, struct module *mod) 1191 + static int cfi_rand_preamble(s32 *start, s32 *end) 1174 1192 { 1175 1193 s32 *s; 1176 1194 1177 1195 for (s = start; s < end; s++) { 1178 1196 void *addr = (void *)s + *s; 1179 - void *wr_addr = module_writable_address(mod, addr); 1180 1197 u32 hash; 1181 1198 1182 - hash = decode_preamble_hash(wr_addr); 1199 + hash = decode_preamble_hash(addr); 1183 1200 if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", 1184 1201 addr, addr, 5, addr)) 1185 1202 return -EINVAL; 1186 1203 1187 1204 hash = cfi_rehash(hash); 1188 - text_poke_early(wr_addr + 1, &hash, 4); 1205 + text_poke_early(addr + 1, &hash, 4); 1189 1206 } 1190 1207 1191 1208 return 0; 1192 1209 } 1193 1210 1194 - static int cfi_rewrite_preamble(s32 *start, s32 *end, struct module *mod) 1211 + static int cfi_rewrite_preamble(s32 *start, s32 *end) 1195 1212 { 1196 1213 s32 *s; 1197 1214 1198 1215 for (s = start; s < end; s++) { 1199 1216 void *addr = (void *)s + *s; 1200 - void *wr_addr = module_writable_address(mod, addr); 1201 1217 u32 hash; 1202 1218 1203 - hash = decode_preamble_hash(wr_addr); 1219 + hash = decode_preamble_hash(addr); 1204 1220 if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", 1205 1221 addr, addr, 5, addr)) 1206 1222 return -EINVAL; 1207 1223 1208 - text_poke_early(wr_addr, fineibt_preamble_start, fineibt_preamble_size); 1209 - WARN_ON(*(u32 *)(wr_addr + fineibt_preamble_hash) != 0x12345678); 1210 - text_poke_early(wr_addr + fineibt_preamble_hash, &hash, 4); 1224 + text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); 1225 + WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); 1226 + text_poke_early(addr + fineibt_preamble_hash, &hash, 4); 1211 1227 } 1212 1228 1213 1229 return 0; 1214 1230 } 1215 1231 1216 - static void cfi_rewrite_endbr(s32 *start, s32 *end, struct module *mod) 1232 + static void cfi_rewrite_endbr(s32 *start, s32 *end) 1217 1233 { 1218 1234 s32 *s; 1219 1235 1220 1236 for (s = start; s < end; s++) { 1221 1237 void *addr = (void *)s + *s; 1222 - void *wr_addr = module_writable_address(mod, addr); 1223 1238 1224 - poison_endbr(addr + 16, wr_addr + 16, false); 1239 + poison_endbr(addr+16, false); 1225 1240 } 1226 1241 } 1227 1242 1228 1243 /* .retpoline_sites */ 1229 - static int cfi_rand_callers(s32 *start, s32 *end, struct module *mod) 1244 + static int cfi_rand_callers(s32 *start, s32 *end) 1230 1245 { 1231 1246 s32 *s; 1232 1247 1233 1248 for (s = start; s < end; s++) { 1234 1249 void *addr = (void *)s + *s; 1235 - void *wr_addr; 1236 1250 u32 hash; 1237 1251 1238 1252 addr -= fineibt_caller_size; 1239 - wr_addr = module_writable_address(mod, addr); 1240 - hash = decode_caller_hash(wr_addr); 1253 + hash = decode_caller_hash(addr); 1241 1254 if (hash) { 1242 1255 hash = -cfi_rehash(hash); 1243 - text_poke_early(wr_addr + 2, &hash, 4); 1256 + text_poke_early(addr + 2, &hash, 4); 1244 1257 } 1245 1258 } 1246 1259 1247 1260 return 0; 1248 1261 } 1249 1262 1250 - static int cfi_rewrite_callers(s32 *start, s32 *end, struct module *mod) 1263 + static int cfi_rewrite_callers(s32 *start, s32 *end) 1251 1264 { 1252 1265 s32 *s; 1253 1266 1254 1267 for (s = start; s < end; s++) { 1255 1268 void *addr = (void *)s + *s; 1256 - void *wr_addr; 1257 1269 u32 hash; 1258 1270 1259 1271 addr -= fineibt_caller_size; 1260 - wr_addr = module_writable_address(mod, addr); 1261 - hash = decode_caller_hash(wr_addr); 1272 + hash = decode_caller_hash(addr); 1262 1273 if (hash) { 1263 - text_poke_early(wr_addr, fineibt_caller_start, fineibt_caller_size); 1264 - WARN_ON(*(u32 *)(wr_addr + fineibt_caller_hash) != 0x12345678); 1265 - text_poke_early(wr_addr + fineibt_caller_hash, &hash, 4); 1274 + text_poke_early(addr, fineibt_caller_start, fineibt_caller_size); 1275 + WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678); 1276 + text_poke_early(addr + fineibt_caller_hash, &hash, 4); 1266 1277 } 1267 1278 /* rely on apply_retpolines() */ 1268 1279 } ··· 1262 1291 } 1263 1292 1264 1293 static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, 1265 - s32 *start_cfi, s32 *end_cfi, struct module *mod) 1294 + s32 *start_cfi, s32 *end_cfi, bool builtin) 1266 1295 { 1267 - bool builtin = mod ? false : true; 1268 1296 int ret; 1269 1297 1270 1298 if (WARN_ONCE(fineibt_preamble_size != 16, ··· 1281 1311 * rewrite them. This disables all CFI. If this succeeds but any of the 1282 1312 * later stages fails, we're without CFI. 1283 1313 */ 1284 - ret = cfi_disable_callers(start_retpoline, end_retpoline, mod); 1314 + ret = cfi_disable_callers(start_retpoline, end_retpoline); 1285 1315 if (ret) 1286 1316 goto err; 1287 1317 ··· 1292 1322 cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash); 1293 1323 } 1294 1324 1295 - ret = cfi_rand_preamble(start_cfi, end_cfi, mod); 1325 + ret = cfi_rand_preamble(start_cfi, end_cfi); 1296 1326 if (ret) 1297 1327 goto err; 1298 1328 1299 - ret = cfi_rand_callers(start_retpoline, end_retpoline, mod); 1329 + ret = cfi_rand_callers(start_retpoline, end_retpoline); 1300 1330 if (ret) 1301 1331 goto err; 1302 1332 } ··· 1308 1338 return; 1309 1339 1310 1340 case CFI_KCFI: 1311 - ret = cfi_enable_callers(start_retpoline, end_retpoline, mod); 1341 + ret = cfi_enable_callers(start_retpoline, end_retpoline); 1312 1342 if (ret) 1313 1343 goto err; 1314 1344 ··· 1318 1348 1319 1349 case CFI_FINEIBT: 1320 1350 /* place the FineIBT preamble at func()-16 */ 1321 - ret = cfi_rewrite_preamble(start_cfi, end_cfi, mod); 1351 + ret = cfi_rewrite_preamble(start_cfi, end_cfi); 1322 1352 if (ret) 1323 1353 goto err; 1324 1354 1325 1355 /* rewrite the callers to target func()-16 */ 1326 - ret = cfi_rewrite_callers(start_retpoline, end_retpoline, mod); 1356 + ret = cfi_rewrite_callers(start_retpoline, end_retpoline); 1327 1357 if (ret) 1328 1358 goto err; 1329 1359 1330 1360 /* now that nobody targets func()+0, remove ENDBR there */ 1331 - cfi_rewrite_endbr(start_cfi, end_cfi, mod); 1361 + cfi_rewrite_endbr(start_cfi, end_cfi); 1332 1362 1333 1363 if (builtin) 1334 1364 pr_info("Using FineIBT CFI\n"); ··· 1347 1377 *(u32 *)addr = 0; 1348 1378 } 1349 1379 1350 - static void poison_cfi(void *addr, void *wr_addr) 1380 + static void poison_cfi(void *addr) 1351 1381 { 1352 1382 switch (cfi_mode) { 1353 1383 case CFI_FINEIBT: ··· 1359 1389 * ud2 1360 1390 * 1: nop 1361 1391 */ 1362 - poison_endbr(addr, wr_addr, false); 1363 - poison_hash(wr_addr + fineibt_preamble_hash); 1392 + poison_endbr(addr, false); 1393 + poison_hash(addr + fineibt_preamble_hash); 1364 1394 break; 1365 1395 1366 1396 case CFI_KCFI: ··· 1369 1399 * movl $0, %eax 1370 1400 * .skip 11, 0x90 1371 1401 */ 1372 - poison_hash(wr_addr + 1); 1402 + poison_hash(addr + 1); 1373 1403 break; 1374 1404 1375 1405 default: ··· 1380 1410 #else 1381 1411 1382 1412 static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, 1383 - s32 *start_cfi, s32 *end_cfi, struct module *mod) 1413 + s32 *start_cfi, s32 *end_cfi, bool builtin) 1384 1414 { 1385 1415 } 1386 1416 1387 1417 #ifdef CONFIG_X86_KERNEL_IBT 1388 - static void poison_cfi(void *addr, void *wr_addr) { } 1418 + static void poison_cfi(void *addr) { } 1389 1419 #endif 1390 1420 1391 1421 #endif 1392 1422 1393 1423 void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, 1394 - s32 *start_cfi, s32 *end_cfi, struct module *mod) 1424 + s32 *start_cfi, s32 *end_cfi) 1395 1425 { 1396 1426 return __apply_fineibt(start_retpoline, end_retpoline, 1397 - start_cfi, end_cfi, mod); 1427 + start_cfi, end_cfi, 1428 + /* .builtin = */ false); 1398 1429 } 1399 1430 1400 1431 #ifdef CONFIG_SMP ··· 1692 1721 paravirt_set_cap(); 1693 1722 1694 1723 __apply_fineibt(__retpoline_sites, __retpoline_sites_end, 1695 - __cfi_sites, __cfi_sites_end, NULL); 1724 + __cfi_sites, __cfi_sites_end, true); 1696 1725 1697 1726 /* 1698 1727 * Rewrite the retpolines, must be done before alternatives since 1699 1728 * those can rewrite the retpoline thunks. 1700 1729 */ 1701 - apply_retpolines(__retpoline_sites, __retpoline_sites_end, NULL); 1702 - apply_returns(__return_sites, __return_sites_end, NULL); 1730 + apply_retpolines(__retpoline_sites, __retpoline_sites_end); 1731 + apply_returns(__return_sites, __return_sites_end); 1703 1732 1704 - apply_alternatives(__alt_instructions, __alt_instructions_end, NULL); 1733 + apply_alternatives(__alt_instructions, __alt_instructions_end); 1705 1734 1706 1735 /* 1707 1736 * Now all calls are established. Apply the call thunks if ··· 1712 1741 /* 1713 1742 * Seal all functions that do not have their address taken. 1714 1743 */ 1715 - apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end, NULL); 1744 + apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); 1716 1745 1717 1746 #ifdef CONFIG_SMP 1718 1747 /* Patch to UP if other cpus not imminent. */
+14 -16
arch/x86/kernel/ftrace.c
··· 118 118 return ret; 119 119 120 120 /* replace the text with the new text */ 121 - if (ftrace_poke_late) { 121 + if (ftrace_poke_late) 122 122 text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); 123 - } else { 124 - mutex_lock(&text_mutex); 125 - text_poke((void *)ip, new_code, MCOUNT_INSN_SIZE); 126 - mutex_unlock(&text_mutex); 127 - } 123 + else 124 + text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); 128 125 return 0; 129 126 } 130 127 ··· 318 321 unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 }; 319 322 unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE }; 320 323 union ftrace_op_code_union op_ptr; 321 - void *ret; 324 + int ret; 322 325 323 326 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { 324 327 start_offset = (unsigned long)ftrace_regs_caller; ··· 349 352 npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE); 350 353 351 354 /* Copy ftrace_caller onto the trampoline memory */ 352 - ret = text_poke_copy(trampoline, (void *)start_offset, size); 353 - if (WARN_ON(!ret)) 355 + ret = copy_from_kernel_nofault(trampoline, (void *)start_offset, size); 356 + if (WARN_ON(ret < 0)) 354 357 goto fail; 355 358 356 359 ip = trampoline + size; 357 360 if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) 358 361 __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE); 359 362 else 360 - text_poke_copy(ip, retq, sizeof(retq)); 363 + memcpy(ip, retq, sizeof(retq)); 361 364 362 365 /* No need to test direct calls on created trampolines */ 363 366 if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { ··· 365 368 ip = trampoline + (jmp_offset - start_offset); 366 369 if (WARN_ON(*(char *)ip != 0x75)) 367 370 goto fail; 368 - if (!text_poke_copy(ip, x86_nops[2], 2)) 371 + ret = copy_from_kernel_nofault(ip, x86_nops[2], 2); 372 + if (ret < 0) 369 373 goto fail; 370 374 } 371 375 ··· 379 381 */ 380 382 381 383 ptr = (unsigned long *)(trampoline + size + RET_SIZE); 382 - text_poke_copy(ptr, &ops, sizeof(unsigned long)); 384 + *ptr = (unsigned long)ops; 383 385 384 386 op_offset -= start_offset; 385 387 memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE); ··· 395 397 op_ptr.offset = offset; 396 398 397 399 /* put in the new offset to the ftrace_ops */ 398 - text_poke_copy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); 400 + memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); 399 401 400 402 /* put in the call to the function */ 401 403 mutex_lock(&text_mutex); ··· 405 407 * the depth accounting before the call already. 406 408 */ 407 409 dest = ftrace_ops_get_func(ops); 408 - text_poke_copy_locked(trampoline + call_offset, 409 - text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), 410 - CALL_INSN_SIZE, false); 410 + memcpy(trampoline + call_offset, 411 + text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), 412 + CALL_INSN_SIZE); 411 413 mutex_unlock(&text_mutex); 412 414 413 415 /* ALLOC_TRAMP flags lets us know we created it */
+14 -31
arch/x86/kernel/module.c
··· 146 146 } 147 147 148 148 if (apply) { 149 - void *wr_loc = module_writable_address(me, loc); 150 - 151 - if (memcmp(wr_loc, &zero, size)) { 149 + if (memcmp(loc, &zero, size)) { 152 150 pr_err("x86/modules: Invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n", 153 151 (int)ELF64_R_TYPE(rel[i].r_info), loc, val); 154 152 return -ENOEXEC; 155 153 } 156 - write(wr_loc, &val, size); 154 + write(loc, &val, size); 157 155 } else { 158 156 if (memcmp(loc, &val, size)) { 159 157 pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for type %d, loc %p, val %Lx\n", 160 158 (int)ELF64_R_TYPE(rel[i].r_info), loc, val); 161 159 return -ENOEXEC; 162 160 } 163 - /* FIXME: needs care for ROX module allocations */ 164 161 write(loc, &zero, size); 165 162 } 166 163 } ··· 224 227 const Elf_Shdr *sechdrs, 225 228 struct module *me) 226 229 { 227 - const Elf_Shdr *s, *alt = NULL, 230 + const Elf_Shdr *s, *alt = NULL, *locks = NULL, 228 231 *orc = NULL, *orc_ip = NULL, 229 232 *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL, 230 233 *calls = NULL, *cfi = NULL; ··· 233 236 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 234 237 if (!strcmp(".altinstructions", secstrings + s->sh_name)) 235 238 alt = s; 239 + if (!strcmp(".smp_locks", secstrings + s->sh_name)) 240 + locks = s; 236 241 if (!strcmp(".orc_unwind", secstrings + s->sh_name)) 237 242 orc = s; 238 243 if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) ··· 265 266 csize = cfi->sh_size; 266 267 } 267 268 268 - apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize, me); 269 + apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize); 269 270 } 270 271 if (retpolines) { 271 272 void *rseg = (void *)retpolines->sh_addr; 272 - apply_retpolines(rseg, rseg + retpolines->sh_size, me); 273 + apply_retpolines(rseg, rseg + retpolines->sh_size); 273 274 } 274 275 if (returns) { 275 276 void *rseg = (void *)returns->sh_addr; 276 - apply_returns(rseg, rseg + returns->sh_size, me); 277 + apply_returns(rseg, rseg + returns->sh_size); 277 278 } 278 279 if (alt) { 279 280 /* patch .altinstructions */ 280 281 void *aseg = (void *)alt->sh_addr; 281 - apply_alternatives(aseg, aseg + alt->sh_size, me); 282 + apply_alternatives(aseg, aseg + alt->sh_size); 282 283 } 283 284 if (calls || alt) { 284 285 struct callthunk_sites cs = {}; ··· 297 298 } 298 299 if (ibt_endbr) { 299 300 void *iseg = (void *)ibt_endbr->sh_addr; 300 - apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size, me); 301 + apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size); 301 302 } 302 - 303 - if (orc && orc_ip) 304 - unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, 305 - (void *)orc->sh_addr, orc->sh_size); 306 - 307 - return 0; 308 - } 309 - 310 - int module_post_finalize(const Elf_Ehdr *hdr, 311 - const Elf_Shdr *sechdrs, 312 - struct module *me) 313 - { 314 - const Elf_Shdr *s, *locks = NULL; 315 - char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 316 - 317 - for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 318 - if (!strcmp(".smp_locks", secstrings + s->sh_name)) 319 - locks = s; 320 - } 321 - 322 303 if (locks) { 323 304 void *lseg = (void *)locks->sh_addr; 324 305 void *text = me->mem[MOD_TEXT].base; ··· 307 328 lseg, lseg + locks->sh_size, 308 329 text, text_end); 309 330 } 331 + 332 + if (orc && orc_ip) 333 + unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, 334 + (void *)orc->sh_addr, orc->sh_size); 310 335 311 336 return 0; 312 337 }
+1 -1
arch/x86/mm/pat/cpa-test.c
··· 183 183 break; 184 184 185 185 case 1: 186 - err = change_page_attr_set(addrs, len[1], PAGE_CPA_TEST, 1); 186 + err = change_page_attr_set(addrs, len[i], PAGE_CPA_TEST, 1); 187 187 break; 188 188 189 189 case 2:
+214 -6
arch/x86/mm/pat/set_memory.c
··· 73 73 #define CPA_ARRAY 2 74 74 #define CPA_PAGES_ARRAY 4 75 75 #define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */ 76 + #define CPA_COLLAPSE 16 /* try to collapse large pages */ 76 77 77 78 static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) 78 79 { ··· 106 105 direct_pages_count[level - 1] += PTRS_PER_PTE; 107 106 } 108 107 108 + static void collapse_page_count(int level) 109 + { 110 + direct_pages_count[level]++; 111 + if (system_state == SYSTEM_RUNNING) { 112 + if (level == PG_LEVEL_2M) 113 + count_vm_event(DIRECT_MAP_LEVEL2_COLLAPSE); 114 + else if (level == PG_LEVEL_1G) 115 + count_vm_event(DIRECT_MAP_LEVEL3_COLLAPSE); 116 + } 117 + direct_pages_count[level - 1] -= PTRS_PER_PTE; 118 + } 119 + 109 120 void arch_report_meminfo(struct seq_file *m) 110 121 { 111 122 seq_printf(m, "DirectMap4k: %8lu kB\n", ··· 135 122 } 136 123 #else 137 124 static inline void split_page_count(int level) { } 125 + static inline void collapse_page_count(int level) { } 138 126 #endif 139 127 140 128 #ifdef CONFIG_X86_CPA_STATISTICS ··· 408 394 flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i))); 409 395 } 410 396 411 - static void cpa_flush(struct cpa_data *data, int cache) 397 + static int collapse_large_pages(unsigned long addr, struct list_head *pgtables); 398 + 399 + static void cpa_collapse_large_pages(struct cpa_data *cpa) 412 400 { 413 - struct cpa_data *cpa = data; 401 + unsigned long start, addr, end; 402 + struct ptdesc *ptdesc, *tmp; 403 + LIST_HEAD(pgtables); 404 + int collapsed = 0; 405 + int i; 406 + 407 + if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { 408 + for (i = 0; i < cpa->numpages; i++) 409 + collapsed += collapse_large_pages(__cpa_addr(cpa, i), 410 + &pgtables); 411 + } else { 412 + addr = __cpa_addr(cpa, 0); 413 + start = addr & PMD_MASK; 414 + end = addr + PAGE_SIZE * cpa->numpages; 415 + 416 + for (addr = start; within(addr, start, end); addr += PMD_SIZE) 417 + collapsed += collapse_large_pages(addr, &pgtables); 418 + } 419 + 420 + if (!collapsed) 421 + return; 422 + 423 + flush_tlb_all(); 424 + 425 + list_for_each_entry_safe(ptdesc, tmp, &pgtables, pt_list) { 426 + list_del(&ptdesc->pt_list); 427 + __free_page(ptdesc_page(ptdesc)); 428 + } 429 + } 430 + 431 + static void cpa_flush(struct cpa_data *cpa, int cache) 432 + { 414 433 unsigned int i; 415 434 416 435 BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); 417 436 418 437 if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { 419 438 cpa_flush_all(cache); 420 - return; 439 + goto collapse_large_pages; 421 440 } 422 441 423 442 if (cpa->force_flush_all || cpa->numpages > tlb_single_page_flush_ceiling) ··· 459 412 on_each_cpu(__cpa_flush_tlb, cpa, 1); 460 413 461 414 if (!cache) 462 - return; 415 + goto collapse_large_pages; 463 416 464 417 mb(); 465 418 for (i = 0; i < cpa->numpages; i++) { ··· 475 428 clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE); 476 429 } 477 430 mb(); 431 + 432 + collapse_large_pages: 433 + if (cpa->flags & CPA_COLLAPSE) 434 + cpa_collapse_large_pages(cpa); 478 435 } 479 436 480 437 static bool overlaps(unsigned long r1_start, unsigned long r1_end, ··· 1246 1195 __free_page(base); 1247 1196 1248 1197 return 0; 1198 + } 1199 + 1200 + static int collapse_pmd_page(pmd_t *pmd, unsigned long addr, 1201 + struct list_head *pgtables) 1202 + { 1203 + pmd_t _pmd, old_pmd; 1204 + pte_t *pte, first; 1205 + unsigned long pfn; 1206 + pgprot_t pgprot; 1207 + int i = 0; 1208 + 1209 + addr &= PMD_MASK; 1210 + pte = pte_offset_kernel(pmd, addr); 1211 + first = *pte; 1212 + pfn = pte_pfn(first); 1213 + 1214 + /* Make sure alignment is suitable */ 1215 + if (PFN_PHYS(pfn) & ~PMD_MASK) 1216 + return 0; 1217 + 1218 + /* The page is 4k intentionally */ 1219 + if (pte_flags(first) & _PAGE_KERNEL_4K) 1220 + return 0; 1221 + 1222 + /* Check that the rest of PTEs are compatible with the first one */ 1223 + for (i = 1, pte++; i < PTRS_PER_PTE; i++, pte++) { 1224 + pte_t entry = *pte; 1225 + 1226 + if (!pte_present(entry)) 1227 + return 0; 1228 + if (pte_flags(entry) != pte_flags(first)) 1229 + return 0; 1230 + if (pte_pfn(entry) != pte_pfn(first) + i) 1231 + return 0; 1232 + } 1233 + 1234 + old_pmd = *pmd; 1235 + 1236 + /* Success: set up a large page */ 1237 + pgprot = pgprot_4k_2_large(pte_pgprot(first)); 1238 + pgprot_val(pgprot) |= _PAGE_PSE; 1239 + _pmd = pfn_pmd(pfn, pgprot); 1240 + set_pmd(pmd, _pmd); 1241 + 1242 + /* Queue the page table to be freed after TLB flush */ 1243 + list_add(&page_ptdesc(pmd_page(old_pmd))->pt_list, pgtables); 1244 + 1245 + if (IS_ENABLED(CONFIG_X86_32) && !SHARED_KERNEL_PMD) { 1246 + struct page *page; 1247 + 1248 + /* Update all PGD tables to use the same large page */ 1249 + list_for_each_entry(page, &pgd_list, lru) { 1250 + pgd_t *pgd = (pgd_t *)page_address(page) + pgd_index(addr); 1251 + p4d_t *p4d = p4d_offset(pgd, addr); 1252 + pud_t *pud = pud_offset(p4d, addr); 1253 + pmd_t *pmd = pmd_offset(pud, addr); 1254 + /* Something is wrong if entries doesn't match */ 1255 + if (WARN_ON(pmd_val(old_pmd) != pmd_val(*pmd))) 1256 + continue; 1257 + set_pmd(pmd, _pmd); 1258 + } 1259 + } 1260 + 1261 + if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1)) 1262 + collapse_page_count(PG_LEVEL_2M); 1263 + 1264 + return 1; 1265 + } 1266 + 1267 + static int collapse_pud_page(pud_t *pud, unsigned long addr, 1268 + struct list_head *pgtables) 1269 + { 1270 + unsigned long pfn; 1271 + pmd_t *pmd, first; 1272 + int i; 1273 + 1274 + if (!direct_gbpages) 1275 + return 0; 1276 + 1277 + addr &= PUD_MASK; 1278 + pmd = pmd_offset(pud, addr); 1279 + first = *pmd; 1280 + 1281 + /* 1282 + * To restore PUD page all PMD entries must be large and 1283 + * have suitable alignment 1284 + */ 1285 + pfn = pmd_pfn(first); 1286 + if (!pmd_leaf(first) || (PFN_PHYS(pfn) & ~PUD_MASK)) 1287 + return 0; 1288 + 1289 + /* 1290 + * To restore PUD page, all following PMDs must be compatible with the 1291 + * first one. 1292 + */ 1293 + for (i = 1, pmd++; i < PTRS_PER_PMD; i++, pmd++) { 1294 + pmd_t entry = *pmd; 1295 + 1296 + if (!pmd_present(entry) || !pmd_leaf(entry)) 1297 + return 0; 1298 + if (pmd_flags(entry) != pmd_flags(first)) 1299 + return 0; 1300 + if (pmd_pfn(entry) != pmd_pfn(first) + i * PTRS_PER_PTE) 1301 + return 0; 1302 + } 1303 + 1304 + /* Restore PUD page and queue page table to be freed after TLB flush */ 1305 + list_add(&page_ptdesc(pud_page(*pud))->pt_list, pgtables); 1306 + set_pud(pud, pfn_pud(pfn, pmd_pgprot(first))); 1307 + 1308 + if (virt_addr_valid(addr) && pfn_range_is_mapped(pfn, pfn + 1)) 1309 + collapse_page_count(PG_LEVEL_1G); 1310 + 1311 + return 1; 1312 + } 1313 + 1314 + /* 1315 + * Collapse PMD and PUD pages in the kernel mapping around the address where 1316 + * possible. 1317 + * 1318 + * Caller must flush TLB and free page tables queued on the list before 1319 + * touching the new entries. CPU must not see TLB entries of different size 1320 + * with different attributes. 1321 + */ 1322 + static int collapse_large_pages(unsigned long addr, struct list_head *pgtables) 1323 + { 1324 + int collapsed = 0; 1325 + pgd_t *pgd; 1326 + p4d_t *p4d; 1327 + pud_t *pud; 1328 + pmd_t *pmd; 1329 + 1330 + addr &= PMD_MASK; 1331 + 1332 + spin_lock(&pgd_lock); 1333 + pgd = pgd_offset_k(addr); 1334 + if (pgd_none(*pgd)) 1335 + goto out; 1336 + p4d = p4d_offset(pgd, addr); 1337 + if (p4d_none(*p4d)) 1338 + goto out; 1339 + pud = pud_offset(p4d, addr); 1340 + if (!pud_present(*pud) || pud_leaf(*pud)) 1341 + goto out; 1342 + pmd = pmd_offset(pud, addr); 1343 + if (!pmd_present(*pmd) || pmd_leaf(*pmd)) 1344 + goto out; 1345 + 1346 + collapsed = collapse_pmd_page(pmd, addr, pgtables); 1347 + if (collapsed) 1348 + collapsed += collapse_pud_page(pud, addr, pgtables); 1349 + 1350 + out: 1351 + spin_unlock(&pgd_lock); 1352 + return collapsed; 1249 1353 } 1250 1354 1251 1355 static bool try_to_free_pte_page(pte_t *pte) ··· 2326 2120 if (__supported_pte_mask & _PAGE_NX) 2327 2121 clr.pgprot |= _PAGE_NX; 2328 2122 2329 - return change_page_attr_clear(&addr, numpages, clr, 0); 2123 + return change_page_attr_set_clr(&addr, numpages, __pgprot(0), clr, 0, 2124 + CPA_COLLAPSE, NULL); 2330 2125 } 2331 2126 2332 2127 int set_memory_rw(unsigned long addr, int numpages) ··· 2354 2147 2355 2148 int set_memory_4k(unsigned long addr, int numpages) 2356 2149 { 2357 - return change_page_attr_set_clr(&addr, numpages, __pgprot(0), 2150 + return change_page_attr_set_clr(&addr, numpages, 2151 + __pgprot(_PAGE_KERNEL_4K), 2358 2152 __pgprot(0), 1, 0, NULL); 2359 2153 } 2360 2154
+1 -1
arch/x86/mm/tlb.c
··· 1325 1325 if (loaded_mm != current_mm) 1326 1326 return false; 1327 1327 1328 - VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa())); 1328 + VM_WARN_ON_ONCE(__pa(current_mm->pgd) != read_cr3_pa()); 1329 1329 1330 1330 return true; 1331 1331 }
+31
include/linux/execmem.h
··· 65 65 * Architectures that use EXECMEM_ROX_CACHE must implement this. 66 66 */ 67 67 void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable); 68 + 69 + /** 70 + * execmem_make_temp_rw - temporarily remap region with read-write 71 + * permissions 72 + * @ptr: address of the region to remap 73 + * @size: size of the region to remap 74 + * 75 + * Remaps a part of the cached large page in the ROX cache in the range 76 + * [@ptr, @ptr + @size) as writable and not executable. The caller must 77 + * have exclusive ownership of this range and ensure nothing will try to 78 + * execute code in this range. 79 + * 80 + * Return: 0 on success or negative error code on failure. 81 + */ 82 + int execmem_make_temp_rw(void *ptr, size_t size); 83 + 84 + /** 85 + * execmem_restore_rox - restore read-only-execute permissions 86 + * @ptr: address of the region to remap 87 + * @size: size of the region to remap 88 + * 89 + * Restores read-only-execute permissions on a range [@ptr, @ptr + @size) 90 + * after it was temporarily remapped as writable. Relies on architecture 91 + * implementation of set_memory_rox() to restore mapping using large pages. 92 + * 93 + * Return: 0 on success or negative error code on failure. 94 + */ 95 + int execmem_restore_rox(void *ptr, size_t size); 96 + #else 97 + static inline int execmem_make_temp_rw(void *ptr, size_t size) { return 0; } 98 + static inline int execmem_restore_rox(void *ptr, size_t size) { return 0; } 68 99 #endif 69 100 70 101 /**
-16
include/linux/module.h
··· 370 370 371 371 struct module_memory { 372 372 void *base; 373 - void *rw_copy; 374 373 bool is_rox; 375 374 unsigned int size; 376 375 ··· 771 772 772 773 void set_module_sig_enforced(void); 773 774 774 - void *__module_writable_address(struct module *mod, void *loc); 775 - 776 - static inline void *module_writable_address(struct module *mod, void *loc) 777 - { 778 - if (!IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX) || !mod || 779 - mod->state != MODULE_STATE_UNFORMED) 780 - return loc; 781 - return __module_writable_address(mod, loc); 782 - } 783 - 784 775 #else /* !CONFIG_MODULES... */ 785 776 786 777 static inline struct module *__module_address(unsigned long addr) ··· 877 888 static inline bool module_is_coming(struct module *mod) 878 889 { 879 890 return false; 880 - } 881 - 882 - static inline void *module_writable_address(struct module *mod, void *loc) 883 - { 884 - return loc; 885 891 } 886 892 #endif /* CONFIG_MODULES */ 887 893
-4
include/linux/moduleloader.h
··· 108 108 const Elf_Shdr *sechdrs, 109 109 struct module *mod); 110 110 111 - int module_post_finalize(const Elf_Ehdr *hdr, 112 - const Elf_Shdr *sechdrs, 113 - struct module *mod); 114 - 115 111 #ifdef CONFIG_MODULES 116 112 void flush_module_init_free_work(void); 117 113 #else
+2
include/linux/vm_event_item.h
··· 151 151 #ifdef CONFIG_X86 152 152 DIRECT_MAP_LEVEL2_SPLIT, 153 153 DIRECT_MAP_LEVEL3_SPLIT, 154 + DIRECT_MAP_LEVEL2_COLLAPSE, 155 + DIRECT_MAP_LEVEL3_COLLAPSE, 154 156 #endif 155 157 #ifdef CONFIG_PER_VMA_LOCK_STATS 156 158 VMA_LOCK_SUCCESS,
+23 -58
kernel/module/main.c
··· 1221 1221 { 1222 1222 } 1223 1223 1224 - void *__module_writable_address(struct module *mod, void *loc) 1225 - { 1226 - for_class_mod_mem_type(type, text) { 1227 - struct module_memory *mem = &mod->mem[type]; 1228 - 1229 - if (loc >= mem->base && loc < mem->base + mem->size) 1230 - return loc + (mem->rw_copy - mem->base); 1231 - } 1232 - 1233 - return loc; 1234 - } 1235 - 1236 1224 static int module_memory_alloc(struct module *mod, enum mod_mem_type type) 1237 1225 { 1238 1226 unsigned int size = PAGE_ALIGN(mod->mem[type].size); ··· 1238 1250 if (!ptr) 1239 1251 return -ENOMEM; 1240 1252 1241 - mod->mem[type].base = ptr; 1242 - 1243 1253 if (execmem_is_rox(execmem_type)) { 1244 - ptr = vzalloc(size); 1254 + int err = execmem_make_temp_rw(ptr, size); 1245 1255 1246 - if (!ptr) { 1247 - execmem_free(mod->mem[type].base); 1256 + if (err) { 1257 + execmem_free(ptr); 1248 1258 return -ENOMEM; 1249 1259 } 1250 1260 1251 - mod->mem[type].rw_copy = ptr; 1252 1261 mod->mem[type].is_rox = true; 1253 - } else { 1254 - mod->mem[type].rw_copy = mod->mem[type].base; 1255 - memset(mod->mem[type].base, 0, size); 1256 1262 } 1257 1263 1258 1264 /* ··· 1260 1278 * *do* eventually get freed, but let's just keep things simple 1261 1279 * and avoid *any* false positives. 1262 1280 */ 1263 - kmemleak_not_leak(ptr); 1281 + if (!mod->mem[type].is_rox) 1282 + kmemleak_not_leak(ptr); 1283 + 1284 + memset(ptr, 0, size); 1285 + mod->mem[type].base = ptr; 1264 1286 1265 1287 return 0; 1288 + } 1289 + 1290 + static void module_memory_restore_rox(struct module *mod) 1291 + { 1292 + for_class_mod_mem_type(type, text) { 1293 + struct module_memory *mem = &mod->mem[type]; 1294 + 1295 + if (mem->is_rox) 1296 + execmem_restore_rox(mem->base, mem->size); 1297 + } 1266 1298 } 1267 1299 1268 1300 static void module_memory_free(struct module *mod, enum mod_mem_type type) 1269 1301 { 1270 1302 struct module_memory *mem = &mod->mem[type]; 1271 - 1272 - if (mem->is_rox) 1273 - vfree(mem->rw_copy); 1274 1303 1275 1304 execmem_free(mem->base); 1276 1305 } ··· 2635 2642 for_each_mod_mem_type(type) { 2636 2643 if (!mod->mem[type].size) { 2637 2644 mod->mem[type].base = NULL; 2638 - mod->mem[type].rw_copy = NULL; 2639 2645 continue; 2640 2646 } 2641 2647 ··· 2651 2659 void *dest; 2652 2660 Elf_Shdr *shdr = &info->sechdrs[i]; 2653 2661 const char *sname; 2654 - unsigned long addr; 2655 2662 2656 2663 if (!(shdr->sh_flags & SHF_ALLOC)) 2657 2664 continue; ··· 2671 2680 ret = PTR_ERR(dest); 2672 2681 goto out_err; 2673 2682 } 2674 - addr = (unsigned long)dest; 2675 2683 codetag_section_found = true; 2676 2684 } else { 2677 2685 enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT; 2678 2686 unsigned long offset = shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK; 2679 2687 2680 - addr = (unsigned long)mod->mem[type].base + offset; 2681 - dest = mod->mem[type].rw_copy + offset; 2688 + dest = mod->mem[type].base + offset; 2682 2689 } 2683 2690 2684 2691 if (shdr->sh_type != SHT_NOBITS) { ··· 2699 2710 * users of info can keep taking advantage and using the newly 2700 2711 * minted official memory area. 2701 2712 */ 2702 - shdr->sh_addr = addr; 2713 + shdr->sh_addr = (unsigned long)dest; 2703 2714 pr_debug("\t0x%lx 0x%.8lx %s\n", (long)shdr->sh_addr, 2704 2715 (long)shdr->sh_size, info->secstrings + shdr->sh_name); 2705 2716 } 2706 2717 2707 2718 return 0; 2708 2719 out_err: 2720 + module_memory_restore_rox(mod); 2709 2721 for (t--; t >= 0; t--) 2710 2722 module_memory_free(mod, t); 2711 2723 if (codetag_section_found) ··· 2853 2863 return 0; 2854 2864 } 2855 2865 2856 - int __weak module_post_finalize(const Elf_Ehdr *hdr, 2857 - const Elf_Shdr *sechdrs, 2858 - struct module *me) 2859 - { 2860 - return 0; 2861 - } 2862 - 2863 2866 static int post_relocation(struct module *mod, const struct load_info *info) 2864 2867 { 2865 - int ret; 2866 - 2867 2868 /* Sort exception table now relocations are done. */ 2868 2869 sort_extable(mod->extable, mod->extable + mod->num_exentries); 2869 2870 ··· 2866 2885 add_kallsyms(mod, info); 2867 2886 2868 2887 /* Arch-specific module finalizing. */ 2869 - ret = module_finalize(info->hdr, info->sechdrs, mod); 2870 - if (ret) 2871 - return ret; 2872 - 2873 - for_each_mod_mem_type(type) { 2874 - struct module_memory *mem = &mod->mem[type]; 2875 - 2876 - if (mem->is_rox) { 2877 - if (!execmem_update_copy(mem->base, mem->rw_copy, 2878 - mem->size)) 2879 - return -ENOMEM; 2880 - 2881 - vfree(mem->rw_copy); 2882 - mem->rw_copy = NULL; 2883 - } 2884 - } 2885 - 2886 - return module_post_finalize(info->hdr, info->sechdrs, mod); 2888 + return module_finalize(info->hdr, info->sechdrs, mod); 2887 2889 } 2888 2890 2889 2891 /* Call module constructors. */ ··· 3463 3499 mod->mem[type].size); 3464 3500 } 3465 3501 3502 + module_memory_restore_rox(mod); 3466 3503 module_deallocate(mod, info); 3467 3504 free_copy: 3468 3505 /*
+5 -4
kernel/module/strict_rwx.c
··· 9 9 #include <linux/mm.h> 10 10 #include <linux/vmalloc.h> 11 11 #include <linux/set_memory.h> 12 + #include <linux/execmem.h> 12 13 #include "internal.h" 13 14 14 15 static int module_set_memory(const struct module *mod, enum mod_mem_type type, ··· 33 32 int module_enable_text_rox(const struct module *mod) 34 33 { 35 34 for_class_mod_mem_type(type, text) { 35 + const struct module_memory *mem = &mod->mem[type]; 36 36 int ret; 37 37 38 - if (mod->mem[type].is_rox) 39 - continue; 40 - 41 - if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) 38 + if (mem->is_rox) 39 + ret = execmem_restore_rox(mem->base, mem->size); 40 + else if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) 42 41 ret = module_set_memory(mod, type, set_memory_rox); 43 42 else 44 43 ret = module_set_memory(mod, type, set_memory_x);
+26 -13
mm/execmem.c
··· 257 257 static int execmem_cache_populate(struct execmem_range *range, size_t size) 258 258 { 259 259 unsigned long vm_flags = VM_ALLOW_HUGE_VMAP; 260 - unsigned long start, end; 261 260 struct vm_struct *vm; 262 261 size_t alloc_size; 263 262 int err = -ENOMEM; ··· 274 275 /* fill memory with instructions that will trap */ 275 276 execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true); 276 277 277 - start = (unsigned long)p; 278 - end = start + alloc_size; 279 - 280 - vunmap_range(start, end); 281 - 282 - err = execmem_set_direct_map_valid(vm, false); 283 - if (err) 284 - goto err_free_mem; 285 - 286 - err = vmap_pages_range_noflush(start, end, range->pgprot, vm->pages, 287 - PMD_SHIFT); 278 + err = set_memory_rox((unsigned long)p, vm->nr_pages); 288 279 if (err) 289 280 goto err_free_mem; 290 281 291 282 err = execmem_cache_add(p, alloc_size); 292 283 if (err) 293 - goto err_free_mem; 284 + goto err_reset_direct_map; 294 285 295 286 return 0; 296 287 288 + err_reset_direct_map: 289 + execmem_set_direct_map_valid(vm, true); 297 290 err_free_mem: 298 291 vfree(p); 299 292 return err; ··· 335 344 336 345 return true; 337 346 } 347 + 348 + int execmem_make_temp_rw(void *ptr, size_t size) 349 + { 350 + unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; 351 + unsigned long addr = (unsigned long)ptr; 352 + int ret; 353 + 354 + ret = set_memory_nx(addr, nr); 355 + if (ret) 356 + return ret; 357 + 358 + return set_memory_rw(addr, nr); 359 + } 360 + 361 + int execmem_restore_rox(void *ptr, size_t size) 362 + { 363 + unsigned int nr = PAGE_ALIGN(size) >> PAGE_SHIFT; 364 + unsigned long addr = (unsigned long)ptr; 365 + 366 + return set_memory_rox(addr, nr); 367 + } 368 + 338 369 #else /* CONFIG_ARCH_HAS_EXECMEM_ROX */ 339 370 static void *execmem_cache_alloc(struct execmem_range *range, size_t size) 340 371 {
+2
mm/vmstat.c
··· 1435 1435 #ifdef CONFIG_X86 1436 1436 "direct_map_level2_splits", 1437 1437 "direct_map_level3_splits", 1438 + "direct_map_level2_collapses", 1439 + "direct_map_level3_collapses", 1438 1440 #endif 1439 1441 #ifdef CONFIG_PER_VMA_LOCK_STATS 1440 1442 "vma_lock_success",