Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Ingo Molnar:
"This is bigger than usual - the reason is partly a pent-up stream of
fixes after the merge window and partly accidental. The fixes are:

- five patches to fix a boot failure on Andy Lutomirski's laptop
- four SGI UV platform fixes
- KASAN fix
- warning fix
- documentation update
- swap entry definition fix
- pkeys fix
- irq stats fix"

* 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/apic/x2apic, smp/hotplug: Don't use before alloc in x2apic_cluster_probe()
x86/efi: Allocate a trampoline if needed in efi_free_boot_services()
x86/boot: Rework reserve_real_mode() to allow multiple tries
x86/boot: Defer setup_real_mode() to early_initcall time
x86/boot: Synchronize trampoline_cr4_features and mmu_cr4_features directly
x86/boot: Run reserve_bios_regions() after we initialize the memory map
x86/irq: Do not substract irq_tlb_count from irq_call_count
x86/mm: Fix swap entry comment and macro
x86/mm/kaslr: Fix -Wformat-security warning
x86/mm/pkeys: Fix compact mode by removing protection keys' XSAVE buffer manipulation
x86/build: Reduce the W=1 warnings noise when compiling x86 syscall tables
x86/platform/UV: Fix kernel panic running RHEL kdump kernel on UV systems
x86/platform/UV: Fix problem with UV4 BIOS providing incorrect PXM values
x86/platform/UV: Fix bug with iounmap() of the UV4 EFI System Table causing a crash
x86/platform/UV: Fix problem with UV4 Socket IDs not being contiguous
x86/entry: Clarify the RF saving/restoring situation with SYSCALL/SYSRET
x86/mm: Disable preemption during CR3 read+write
x86/mm/KASLR: Increase BRK pages for KASLR memory randomization
x86/mm/KASLR: Fix physical memory calculation on KASLR memory randomization
x86, kasan, ftrace: Put APIC interrupt handlers into .irqentry.text

+185 -198
+2
arch/x86/entry/Makefile
··· 5 5 OBJECT_FILES_NON_STANDARD_entry_$(BITS).o := y 6 6 OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y 7 7 8 + CFLAGS_syscall_64.o += -Wno-override-init 9 + CFLAGS_syscall_32.o += -Wno-override-init 8 10 obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o 9 11 obj-y += common.o 10 12
+20 -5
arch/x86/entry/entry_64.S
··· 288 288 jne opportunistic_sysret_failed 289 289 290 290 /* 291 - * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, 292 - * restoring TF results in a trap from userspace immediately after 293 - * SYSRET. This would cause an infinite loop whenever #DB happens 294 - * with register state that satisfies the opportunistic SYSRET 295 - * conditions. For example, single-stepping this user code: 291 + * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot 292 + * restore RF properly. If the slowpath sets it for whatever reason, we 293 + * need to restore it correctly. 294 + * 295 + * SYSRET can restore TF, but unlike IRET, restoring TF results in a 296 + * trap from userspace immediately after SYSRET. This would cause an 297 + * infinite loop whenever #DB happens with register state that satisfies 298 + * the opportunistic SYSRET conditions. For example, single-stepping 299 + * this user code: 296 300 * 297 301 * movq $stuck_here, %rcx 298 302 * pushfq ··· 605 601 .endm 606 602 #endif 607 603 604 + /* Make sure APIC interrupt handlers end up in the irqentry section: */ 605 + #if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) 606 + # define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" 607 + # define POP_SECTION_IRQENTRY .popsection 608 + #else 609 + # define PUSH_SECTION_IRQENTRY 610 + # define POP_SECTION_IRQENTRY 611 + #endif 612 + 608 613 .macro apicinterrupt num sym do_sym 614 + PUSH_SECTION_IRQENTRY 609 615 apicinterrupt3 \num \sym \do_sym 610 616 trace_apicinterrupt \num \sym 617 + POP_SECTION_IRQENTRY 611 618 .endm 612 619 613 620 #ifdef CONFIG_SMP
-4
arch/x86/include/asm/hardirq.h
··· 22 22 #ifdef CONFIG_SMP 23 23 unsigned int irq_resched_count; 24 24 unsigned int irq_call_count; 25 - /* 26 - * irq_tlb_count is double-counted in irq_call_count, so it must be 27 - * subtracted from irq_call_count when displaying irq_call_count 28 - */ 29 25 unsigned int irq_tlb_count; 30 26 #endif 31 27 #ifdef CONFIG_X86_THERMAL_VECTOR
+2 -2
arch/x86/include/asm/pgtable_64.h
··· 145 145 * 146 146 * | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0| <- bit number 147 147 * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| <- bit names 148 - * | OFFSET (14->63) | TYPE (10-13) |0|X|X|X| X| X|X|X|0| <- swp entry 148 + * | OFFSET (14->63) | TYPE (9-13) |0|X|X|X| X| X|X|X|0| <- swp entry 149 149 * 150 150 * G (8) is aliased and used as a PROT_NONE indicator for 151 151 * !present ptes. We need to start storing swap entries above ··· 156 156 #define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) 157 157 #define SWP_TYPE_BITS 5 158 158 /* Place the offset above the type: */ 159 - #define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS + 1) 159 + #define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS) 160 160 161 161 #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) 162 162
+9 -1
arch/x86/include/asm/realmode.h
··· 58 58 extern unsigned char secondary_startup_64[]; 59 59 #endif 60 60 61 + static inline size_t real_mode_size_needed(void) 62 + { 63 + if (real_mode_header) 64 + return 0; /* already allocated. */ 65 + 66 + return ALIGN(real_mode_blob_end - real_mode_blob, PAGE_SIZE); 67 + } 68 + 69 + void set_real_mode_mem(phys_addr_t mem, size_t size); 61 70 void reserve_real_mode(void); 62 - void setup_real_mode(void); 63 71 64 72 #endif /* _ARCH_X86_REALMODE_H */
+7
arch/x86/include/asm/tlbflush.h
··· 135 135 136 136 static inline void __native_flush_tlb(void) 137 137 { 138 + /* 139 + * If current->mm == NULL then we borrow a mm which may change during a 140 + * task switch and therefore we must not be preempted while we write CR3 141 + * back: 142 + */ 143 + preempt_disable(); 138 144 native_write_cr3(native_read_cr3()); 145 + preempt_enable(); 139 146 } 140 147 141 148 static inline void __native_flush_tlb_global_irq_disabled(void)
+3 -2
arch/x86/include/asm/uv/bios.h
··· 79 79 u16 nasid; /* HNasid */ 80 80 u16 sockid; /* Socket ID, high bits of APIC ID */ 81 81 u16 pnode; /* Index to MMR and GRU spaces */ 82 - u32 pxm; /* ACPI proximity domain number */ 82 + u32 unused2; 83 83 u32 limit; /* PA bits 56:26 (UV_GAM_RANGE_SHFT) */ 84 84 }; 85 85 ··· 88 88 #define UV_SYSTAB_VERSION_UV4 0x400 /* UV4 BIOS base version */ 89 89 #define UV_SYSTAB_VERSION_UV4_1 0x401 /* + gpa_shift */ 90 90 #define UV_SYSTAB_VERSION_UV4_2 0x402 /* + TYPE_NVRAM/WINDOW/MBOX */ 91 - #define UV_SYSTAB_VERSION_UV4_LATEST UV_SYSTAB_VERSION_UV4_2 91 + #define UV_SYSTAB_VERSION_UV4_3 0x403 /* - GAM Range PXM Value */ 92 + #define UV_SYSTAB_VERSION_UV4_LATEST UV_SYSTAB_VERSION_UV4_3 92 93 93 94 #define UV_SYSTAB_TYPE_UNUSED 0 /* End of table (offset == 0) */ 94 95 #define UV_SYSTAB_TYPE_GAM_PARAMS 1 /* GAM PARAM conversions */
+9 -4
arch/x86/kernel/apic/x2apic_cluster.c
··· 155 155 /* 156 156 * At CPU state changes, update the x2apic cluster sibling info. 157 157 */ 158 - int x2apic_prepare_cpu(unsigned int cpu) 158 + static int x2apic_prepare_cpu(unsigned int cpu) 159 159 { 160 160 if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL)) 161 161 return -ENOMEM; ··· 168 168 return 0; 169 169 } 170 170 171 - int x2apic_dead_cpu(unsigned int this_cpu) 171 + static int x2apic_dead_cpu(unsigned int this_cpu) 172 172 { 173 173 int cpu; 174 174 ··· 186 186 static int x2apic_cluster_probe(void) 187 187 { 188 188 int cpu = smp_processor_id(); 189 + int ret; 189 190 190 191 if (!x2apic_mode) 191 192 return 0; 192 193 194 + ret = cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "X2APIC_PREPARE", 195 + x2apic_prepare_cpu, x2apic_dead_cpu); 196 + if (ret < 0) { 197 + pr_err("Failed to register X2APIC_PREPARE\n"); 198 + return 0; 199 + } 193 200 cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu)); 194 - cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "X2APIC_PREPARE", 195 - x2apic_prepare_cpu, x2apic_dead_cpu); 196 201 return 1; 197 202 } 198 203
+20 -22
arch/x86/kernel/apic/x2apic_uv_x.c
··· 223 223 if (strncmp(oem_id, "SGI", 3) != 0) 224 224 return 0; 225 225 226 + if (numa_off) { 227 + pr_err("UV: NUMA is off, disabling UV support\n"); 228 + return 0; 229 + } 230 + 226 231 /* Setup early hub type field in uv_hub_info for Node 0 */ 227 232 uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0; 228 233 ··· 330 325 struct uv_gam_range_entry *gre = uv_gre_table; 331 326 struct uv_gam_range_s *grt; 332 327 unsigned long last_limit = 0, ram_limit = 0; 333 - int bytes, i, sid, lsid = -1; 328 + int bytes, i, sid, lsid = -1, indx = 0, lindx = -1; 334 329 335 330 if (!gre) 336 331 return; ··· 361 356 } 362 357 sid = gre->sockid - _min_socket; 363 358 if (lsid < sid) { /* new range */ 364 - grt = &_gr_table[sid]; 365 - grt->base = lsid; 359 + grt = &_gr_table[indx]; 360 + grt->base = lindx; 366 361 grt->nasid = gre->nasid; 367 362 grt->limit = last_limit = gre->limit; 368 363 lsid = sid; 364 + lindx = indx++; 369 365 continue; 370 366 } 371 367 if (lsid == sid && !ram_limit) { /* update range */ ··· 377 371 } 378 372 if (!ram_limit) { /* non-contiguous ram range */ 379 373 grt++; 380 - grt->base = sid - 1; 374 + grt->base = lindx; 381 375 grt->nasid = gre->nasid; 382 376 grt->limit = last_limit = gre->limit; 383 377 continue; ··· 1161 1155 for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { 1162 1156 if (!index) { 1163 1157 pr_info("UV: GAM Range Table...\n"); 1164 - pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s %3s\n", 1158 + pr_info("UV: # %20s %14s %5s %4s %5s %3s %2s\n", 1165 1159 "Range", "", "Size", "Type", "NASID", 1166 - "SID", "PN", "PXM"); 1160 + "SID", "PN"); 1167 1161 } 1168 1162 pr_info( 1169 - "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x %3d\n", 1163 + "UV: %2d: 0x%014lx-0x%014lx %5luG %3d %04x %02x %02x\n", 1170 1164 index++, 1171 1165 (unsigned long)lgre << UV_GAM_RANGE_SHFT, 1172 1166 (unsigned long)gre->limit << UV_GAM_RANGE_SHFT, 1173 1167 ((unsigned long)(gre->limit - lgre)) >> 1174 1168 (30 - UV_GAM_RANGE_SHFT), /* 64M -> 1G */ 1175 
- gre->type, gre->nasid, gre->sockid, 1176 - gre->pnode, gre->pxm); 1169 + gre->type, gre->nasid, gre->sockid, gre->pnode); 1177 1170 1178 1171 lgre = gre->limit; 1179 1172 if (sock_min > gre->sockid) ··· 1291 1286 _pnode_to_socket[i] = SOCK_EMPTY; 1292 1287 1293 1288 /* fill in pnode/node/addr conversion list values */ 1294 - pr_info("UV: GAM Building socket/pnode/pxm conversion tables\n"); 1289 + pr_info("UV: GAM Building socket/pnode conversion tables\n"); 1295 1290 for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) { 1296 1291 if (gre->type == UV_GAM_RANGE_TYPE_HOLE) 1297 1292 continue; ··· 1299 1294 if (_socket_to_pnode[i] != SOCK_EMPTY) 1300 1295 continue; /* duplicate */ 1301 1296 _socket_to_pnode[i] = gre->pnode; 1302 - _socket_to_node[i] = gre->pxm; 1303 1297 1304 1298 i = gre->pnode - minpnode; 1305 1299 _pnode_to_socket[i] = gre->sockid; 1306 1300 1307 1301 pr_info( 1308 - "UV: sid:%02x type:%d nasid:%04x pn:%02x pxm:%2d pn2s:%2x\n", 1302 + "UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n", 1309 1303 gre->sockid, gre->type, gre->nasid, 1310 1304 _socket_to_pnode[gre->sockid - minsock], 1311 - _socket_to_node[gre->sockid - minsock], 1312 1305 _pnode_to_socket[gre->pnode - minpnode]); 1313 1306 } 1314 1307 1315 - /* check socket -> node values */ 1308 + /* Set socket -> node values */ 1316 1309 lnid = -1; 1317 1310 for_each_present_cpu(cpu) { 1318 1311 int nid = cpu_to_node(cpu); ··· 1321 1318 lnid = nid; 1322 1319 apicid = per_cpu(x86_cpu_to_apicid, cpu); 1323 1320 sockid = apicid >> uv_cpuid.socketid_shift; 1324 - i = sockid - minsock; 1325 - 1326 - if (nid != _socket_to_node[i]) { 1327 - pr_warn( 1328 - "UV: %02x: type:%d socket:%02x PXM:%02x != node:%2d\n", 1329 - i, sockid, gre->type, _socket_to_node[i], nid); 1330 - _socket_to_node[i] = nid; 1331 - } 1321 + _socket_to_node[sockid - minsock] = nid; 1322 + pr_info("UV: sid:%02x: apicid:%04x node:%2d\n", 1323 + sockid, apicid, nid); 1332 1324 } 1333 1325 1334 1326 /* Setup physical blade to pnode 
translation from GAM Range Table */
+17 -121
arch/x86/kernel/fpu/xstate.c
··· 866 866 return get_xsave_addr(&fpu->state.xsave, xsave_state); 867 867 } 868 868 869 - 870 - /* 871 - * Set xfeatures (aka XSTATE_BV) bit for a feature that we want 872 - * to take out of its "init state". This will ensure that an 873 - * XRSTOR actually restores the state. 874 - */ 875 - static void fpu__xfeature_set_non_init(struct xregs_state *xsave, 876 - int xstate_feature_mask) 877 - { 878 - xsave->header.xfeatures |= xstate_feature_mask; 879 - } 880 - 881 - /* 882 - * This function is safe to call whether the FPU is in use or not. 883 - * 884 - * Note that this only works on the current task. 885 - * 886 - * Inputs: 887 - * @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP, 888 - * XFEATURE_MASK_SSE, etc...) 889 - * @xsave_state_ptr: a pointer to a copy of the state that you would 890 - * like written in to the current task's FPU xsave state. This pointer 891 - * must not be located in the current tasks's xsave area. 892 - * Output: 893 - * address of the state in the xsave area or NULL if the state 894 - * is not present or is in its 'init state'. 895 - */ 896 - static void fpu__xfeature_set_state(int xstate_feature_mask, 897 - void *xstate_feature_src, size_t len) 898 - { 899 - struct xregs_state *xsave = &current->thread.fpu.state.xsave; 900 - struct fpu *fpu = &current->thread.fpu; 901 - void *dst; 902 - 903 - if (!boot_cpu_has(X86_FEATURE_XSAVE)) { 904 - WARN_ONCE(1, "%s() attempted with no xsave support", __func__); 905 - return; 906 - } 907 - 908 - /* 909 - * Tell the FPU code that we need the FPU state to be in 910 - * 'fpu' (not in the registers), and that we need it to 911 - * be stable while we write to it. 912 - */ 913 - fpu__current_fpstate_write_begin(); 914 - 915 - /* 916 - * This method *WILL* *NOT* work for compact-format 917 - * buffers. If the 'xstate_feature_mask' is unset in 918 - * xcomp_bv then we may need to move other feature state 919 - * "up" in the buffer. 
920 - */ 921 - if (xsave->header.xcomp_bv & xstate_feature_mask) { 922 - WARN_ON_ONCE(1); 923 - goto out; 924 - } 925 - 926 - /* find the location in the xsave buffer of the desired state */ 927 - dst = __raw_xsave_addr(&fpu->state.xsave, xstate_feature_mask); 928 - 929 - /* 930 - * Make sure that the pointer being passed in did not 931 - * come from the xsave buffer itself. 932 - */ 933 - WARN_ONCE(xstate_feature_src == dst, "set from xsave buffer itself"); 934 - 935 - /* put the caller-provided data in the location */ 936 - memcpy(dst, xstate_feature_src, len); 937 - 938 - /* 939 - * Mark the xfeature so that the CPU knows there is state 940 - * in the buffer now. 941 - */ 942 - fpu__xfeature_set_non_init(xsave, xstate_feature_mask); 943 - out: 944 - /* 945 - * We are done writing to the 'fpu'. Reenable preeption 946 - * and (possibly) move the fpstate back in to the fpregs. 947 - */ 948 - fpu__current_fpstate_write_end(); 949 - } 950 - 951 869 #define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2) 952 870 #define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1) 953 871 954 872 /* 955 - * This will go out and modify the XSAVE buffer so that PKRU is 956 - * set to a particular state for access to 'pkey'. 957 - * 958 - * PKRU state does affect kernel access to user memory. We do 959 - * not modfiy PKRU *itself* here, only the XSAVE state that will 960 - * be restored in to PKRU when we return back to userspace. 873 + * This will go out and modify PKRU register to set the access 874 + * rights for @pkey to @init_val. 
961 875 */ 962 876 int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, 963 877 unsigned long init_val) 964 878 { 965 - struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; 966 - struct pkru_state *old_pkru_state; 967 - struct pkru_state new_pkru_state; 879 + u32 old_pkru; 968 880 int pkey_shift = (pkey * PKRU_BITS_PER_PKEY); 969 881 u32 new_pkru_bits = 0; 970 882 ··· 886 974 */ 887 975 if (!boot_cpu_has(X86_FEATURE_OSPKE)) 888 976 return -EINVAL; 977 + /* 978 + * For most XSAVE components, this would be an arduous task: 979 + * brining fpstate up to date with fpregs, updating fpstate, 980 + * then re-populating fpregs. But, for components that are 981 + * never lazily managed, we can just access the fpregs 982 + * directly. PKRU is never managed lazily, so we can just 983 + * manipulate it directly. Make sure it stays that way. 984 + */ 985 + WARN_ON_ONCE(!use_eager_fpu()); 889 986 890 987 /* Set the bits we need in PKRU: */ 891 988 if (init_val & PKEY_DISABLE_ACCESS) ··· 905 984 /* Shift the bits in to the correct place in PKRU for pkey: */ 906 985 new_pkru_bits <<= pkey_shift; 907 986 908 - /* Locate old copy of the state in the xsave buffer: */ 909 - old_pkru_state = get_xsave_addr(xsave, XFEATURE_MASK_PKRU); 987 + /* Get old PKRU and mask off any old bits in place: */ 988 + old_pkru = read_pkru(); 989 + old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); 910 990 911 - /* 912 - * When state is not in the buffer, it is in the init 913 - * state, set it manually. Otherwise, copy out the old 914 - * state. 915 - */ 916 - if (!old_pkru_state) 917 - new_pkru_state.pkru = 0; 918 - else 919 - new_pkru_state.pkru = old_pkru_state->pkru; 920 - 921 - /* Mask off any old bits in place: */ 922 - new_pkru_state.pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift); 923 - 924 - /* Set the newly-requested bits: */ 925 - new_pkru_state.pkru |= new_pkru_bits; 926 - 927 - /* 928 - * We could theoretically live without zeroing pkru.pad. 
929 - * The current XSAVE feature state definition says that 930 - * only bytes 0->3 are used. But we do not want to 931 - * chance leaking kernel stack out to userspace in case a 932 - * memcpy() of the whole xsave buffer was done. 933 - * 934 - * They're in the same cacheline anyway. 935 - */ 936 - new_pkru_state.pad = 0; 937 - 938 - fpu__xfeature_set_state(XFEATURE_MASK_PKRU, &new_pkru_state, sizeof(new_pkru_state)); 991 + /* Write old part along with new part: */ 992 + write_pkru(old_pkru | new_pkru_bits); 939 993 940 994 return 0; 941 995 }
-2
arch/x86/kernel/head32.c
··· 25 25 /* Initialize 32bit specific setup functions */ 26 26 x86_init.resources.reserve_resources = i386_reserve_resources; 27 27 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; 28 - 29 - reserve_bios_regions(); 30 28 } 31 29 32 30 asmlinkage __visible void __init i386_start_kernel(void)
-1
arch/x86/kernel/head64.c
··· 183 183 copy_bootdata(__va(real_mode_data)); 184 184 185 185 x86_early_init_platform_quirks(); 186 - reserve_bios_regions(); 187 186 188 187 switch (boot_params.hdr.hardware_subarch) { 189 188 case X86_SUBARCH_INTEL_MID:
+1 -2
arch/x86/kernel/irq.c
··· 102 102 seq_puts(p, " Rescheduling interrupts\n"); 103 103 seq_printf(p, "%*s: ", prec, "CAL"); 104 104 for_each_online_cpu(j) 105 - seq_printf(p, "%10u ", irq_stats(j)->irq_call_count - 106 - irq_stats(j)->irq_tlb_count); 105 + seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); 107 106 seq_puts(p, " Function call interrupts\n"); 108 107 seq_printf(p, "%*s: ", prec, "TLB"); 109 108 for_each_online_cpu(j)
+17 -10
arch/x86/kernel/setup.c
··· 936 936 937 937 x86_init.oem.arch_setup(); 938 938 939 - kernel_randomize_memory(); 940 - 941 939 iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; 942 940 setup_memory_map(); 943 941 parse_setup_data(); ··· 1053 1055 1054 1056 max_possible_pfn = max_pfn; 1055 1057 1058 + /* 1059 + * Define random base addresses for memory sections after max_pfn is 1060 + * defined and before each memory section base is used. 1061 + */ 1062 + kernel_randomize_memory(); 1063 + 1056 1064 #ifdef CONFIG_X86_32 1057 1065 /* max_low_pfn get updated here */ 1058 1066 find_low_pfn_range(); ··· 1101 1097 efi_find_mirror(); 1102 1098 } 1103 1099 1100 + reserve_bios_regions(); 1101 + 1104 1102 /* 1105 1103 * The EFI specification says that boot service code won't be called 1106 1104 * after ExitBootServices(). This is, in fact, a lie. ··· 1131 1125 1132 1126 early_trap_pf_init(); 1133 1127 1134 - setup_real_mode(); 1128 + /* 1129 + * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features) 1130 + * with the current CR4 value. This may not be necessary, but 1131 + * auditing all the early-boot CR4 manipulation would be needed to 1132 + * rule it out. 1133 + */ 1134 + if (boot_cpu_data.cpuid_level >= 0) 1135 + /* A CPU has %cr4 if and only if it has CPUID. */ 1136 + mmu_cr4_features = __read_cr4(); 1135 1137 1136 1138 memblock_set_current_limit(get_max_mapped()); 1137 1139 ··· 1187 1173 x86_init.paging.pagetable_init(); 1188 1174 1189 1175 kasan_init(); 1190 - 1191 - if (boot_cpu_data.cpuid_level >= 0) { 1192 - /* A CPU has %cr4 if and only if it has CPUID */ 1193 - mmu_cr4_features = __read_cr4(); 1194 - if (trampoline_cr4_features) 1195 - *trampoline_cr4_features = mmu_cr4_features; 1196 - } 1197 1176 1198 1177 #ifdef CONFIG_X86_32 1199 1178 /* sync back kernel address range */
+1 -1
arch/x86/lib/kaslr.c
··· 19 19 #include <asm/cpufeature.h> 20 20 #include <asm/setup.h> 21 21 22 - #define debug_putstr(v) early_printk(v) 22 + #define debug_putstr(v) early_printk("%s", v) 23 23 #define has_cpuflag(f) boot_cpu_has(f) 24 24 #define get_boot_seed() kaslr_offset() 25 25 #endif
+12 -2
arch/x86/mm/init.c
··· 122 122 return __va(pfn << PAGE_SHIFT); 123 123 } 124 124 125 - /* need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS */ 126 - #define INIT_PGT_BUF_SIZE (6 * PAGE_SIZE) 125 + /* 126 + * By default need 3 4k for initial PMD_SIZE, 3 4k for 0-ISA_END_ADDRESS. 127 + * With KASLR memory randomization, depending on the machine e820 memory 128 + * and the PUD alignment. We may need twice more pages when KASLR memory 129 + * randomization is enabled. 130 + */ 131 + #ifndef CONFIG_RANDOMIZE_MEMORY 132 + #define INIT_PGD_PAGE_COUNT 6 133 + #else 134 + #define INIT_PGD_PAGE_COUNT 12 135 + #endif 136 + #define INIT_PGT_BUF_SIZE (INIT_PGD_PAGE_COUNT * PAGE_SIZE) 127 137 RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE); 128 138 void __init early_alloc_pgt_buf(void) 129 139 {
+1 -1
arch/x86/mm/kaslr.c
··· 97 97 * add padding if needed (especially for memory hotplug support). 98 98 */ 99 99 BUG_ON(kaslr_regions[0].base != &page_offset_base); 100 - memory_tb = ((max_pfn << PAGE_SHIFT) >> TB_SHIFT) + 100 + memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) + 101 101 CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; 102 102 103 103 /* Adapt phyiscal memory region size based on available memory */
+21
arch/x86/platform/efi/quirks.c
··· 254 254 for_each_efi_memory_desc(md) { 255 255 unsigned long long start = md->phys_addr; 256 256 unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; 257 + size_t rm_size; 257 258 258 259 if (md->type != EFI_BOOT_SERVICES_CODE && 259 260 md->type != EFI_BOOT_SERVICES_DATA) ··· 263 262 /* Do not free, someone else owns it: */ 264 263 if (md->attribute & EFI_MEMORY_RUNTIME) 265 264 continue; 265 + 266 + /* 267 + * Nasty quirk: if all sub-1MB memory is used for boot 268 + * services, we can get here without having allocated the 269 + * real mode trampoline. It's too late to hand boot services 270 + * memory back to the memblock allocator, so instead 271 + * try to manually allocate the trampoline if needed. 272 + * 273 + * I've seen this on a Dell XPS 13 9350 with firmware 274 + * 1.4.4 with SGX enabled booting Linux via Fedora 24's 275 + * grub2-efi on a hard disk. (And no, I don't know why 276 + * this happened, but Linux should still try to boot rather 277 + * panicing early.) 278 + */ 279 + rm_size = real_mode_size_needed(); 280 + if (rm_size && (start + rm_size) < (1<<20) && size >= rm_size) { 281 + set_real_mode_mem(start, rm_size); 282 + start += rm_size; 283 + size -= rm_size; 284 + } 266 285 267 286 free_bootmem_late(start, size); 268 287 }
+5 -3
arch/x86/platform/uv/bios_uv.c
··· 200 200 return; 201 201 } 202 202 203 + /* Starting with UV4 the UV systab size is variable */ 203 204 if (uv_systab->revision >= UV_SYSTAB_VERSION_UV4) { 205 + int size = uv_systab->size; 206 + 204 207 iounmap(uv_systab); 205 - uv_systab = ioremap(efi.uv_systab, uv_systab->size); 208 + uv_systab = ioremap(efi.uv_systab, size); 206 209 if (!uv_systab) { 207 - pr_err("UV: UVsystab: ioremap(%d) failed!\n", 208 - uv_systab->size); 210 + pr_err("UV: UVsystab: ioremap(%d) failed!\n", size); 209 211 return; 210 212 } 211 213 }
+38 -15
arch/x86/realmode/init.c
··· 1 1 #include <linux/io.h> 2 + #include <linux/slab.h> 2 3 #include <linux/memblock.h> 3 4 4 5 #include <asm/cacheflush.h> 5 6 #include <asm/pgtable.h> 6 7 #include <asm/realmode.h> 8 + #include <asm/tlbflush.h> 7 9 8 10 struct real_mode_header *real_mode_header; 9 11 u32 *trampoline_cr4_features; ··· 13 11 /* Hold the pgd entry used on booting additional CPUs */ 14 12 pgd_t trampoline_pgd_entry; 15 13 16 - void __init reserve_real_mode(void) 14 + void __init set_real_mode_mem(phys_addr_t mem, size_t size) 17 15 { 18 - phys_addr_t mem; 19 - unsigned char *base; 20 - size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); 16 + void *base = __va(mem); 21 17 22 - /* Has to be under 1M so we can execute real-mode AP code. */ 23 - mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); 24 - if (!mem) 25 - panic("Cannot allocate trampoline\n"); 26 - 27 - base = __va(mem); 28 - memblock_reserve(mem, size); 29 18 real_mode_header = (struct real_mode_header *) base; 30 19 printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", 31 20 base, (unsigned long long)mem, size); 32 21 } 33 22 34 - void __init setup_real_mode(void) 23 + void __init reserve_real_mode(void) 24 + { 25 + phys_addr_t mem; 26 + size_t size = real_mode_size_needed(); 27 + 28 + if (!size) 29 + return; 30 + 31 + WARN_ON(slab_is_available()); 32 + 33 + /* Has to be under 1M so we can execute real-mode AP code. 
*/ 34 + mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); 35 + if (!mem) { 36 + pr_info("No sub-1M memory is available for the trampoline\n"); 37 + return; 38 + } 39 + 40 + memblock_reserve(mem, size); 41 + set_real_mode_mem(mem, size); 42 + } 43 + 44 + static void __init setup_real_mode(void) 35 45 { 36 46 u16 real_mode_seg; 37 47 const u32 *rel; ··· 98 84 99 85 trampoline_header->start = (u64) secondary_startup_64; 100 86 trampoline_cr4_features = &trampoline_header->cr4; 101 - *trampoline_cr4_features = __read_cr4(); 87 + *trampoline_cr4_features = mmu_cr4_features; 102 88 103 89 trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); 104 90 trampoline_pgd[0] = trampoline_pgd_entry.pgd; ··· 114 100 * need to mark it executable at do_pre_smp_initcalls() at least, 115 101 * thus run it as a early_initcall(). 116 102 */ 117 - static int __init set_real_mode_permissions(void) 103 + static void __init set_real_mode_permissions(void) 118 104 { 119 105 unsigned char *base = (unsigned char *) real_mode_header; 120 106 size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); ··· 133 119 set_memory_nx((unsigned long) base, size >> PAGE_SHIFT); 134 120 set_memory_ro((unsigned long) base, ro_size >> PAGE_SHIFT); 135 121 set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT); 122 + } 123 + 124 + static int __init init_real_mode(void) 125 + { 126 + if (!real_mode_header) 127 + panic("Real mode trampoline was not allocated"); 128 + 129 + setup_real_mode(); 130 + set_real_mode_permissions(); 136 131 137 132 return 0; 138 133 } 139 - early_initcall(set_real_mode_permissions); 134 + early_initcall(init_real_mode);