Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm changes from Ingo Molnar:
"PCID support, 5-level paging support, Secure Memory Encryption support

The main changes in this cycle are support for three new, complex
hardware features of x86 CPUs:

- Add 5-level paging support, which is a new hardware feature on
upcoming Intel CPUs allowing up to 128 PB of virtual address space
and 4 PB of physical RAM space - a 512-fold increase over the old
limits. (Supercomputers of the future forecasting hurricanes on an
ever warming planet can certainly make good use of more RAM.)

Many of the necessary changes went upstream in previous cycles;
v4.14 is the first kernel that can enable 5-level paging.

This feature is activated via CONFIG_X86_5LEVEL=y - disabled by
default.

(By Kirill A. Shutemov)

- Add 'encrypted memory' support, which is a new hardware feature on
upcoming AMD CPUs ('Secure Memory Encryption', SME) allowing system
RAM to be encrypted and decrypted (mostly) transparently by the
CPU, with a little help from the kernel to transition to/from
encrypted RAM. Such RAM should be more secure against various
attacks like RAM access via the memory bus and should make the
radio signature of memory bus traffic harder to intercept (and
decrypt) as well.

This feature is activated via CONFIG_AMD_MEM_ENCRYPT=y - disabled
by default.

(By Tom Lendacky)

- Enable PCID optimized TLB flushing on newer Intel CPUs: PCID is a
hardware feature that attaches an address space tag to TLB entries
and thus allows skipping TLB flushing in many cases, even if we
switch mm's.

(By Andy Lutomirski)

All three of these features were in the works for a long time, and
it's a coincidence of the three independent development paths that they
are all enabled in v4.14 at once"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (65 commits)
x86/mm: Enable RCU based page table freeing (CONFIG_HAVE_RCU_TABLE_FREE=y)
x86/mm: Use pr_cont() in dump_pagetable()
x86/mm: Fix SME encryption stack ptr handling
kvm/x86: Avoid clearing the C-bit in rsvd_bits()
x86/CPU: Align CR3 defines
x86/mm, mm/hwpoison: Clear PRESENT bit for kernel 1:1 mappings of poison pages
acpi, x86/mm: Remove encryption mask from ACPI page protection type
x86/mm, kexec: Fix memory corruption with SME on successive kexecs
x86/mm/pkeys: Fix typo in Documentation/x86/protection-keys.txt
x86/mm/dump_pagetables: Speed up page tables dump for CONFIG_KASAN=y
x86/mm: Implement PCID based optimization: try to preserve old TLB entries using PCID
x86: Enable 5-level paging support via CONFIG_X86_5LEVEL=y
x86/mm: Allow userspace have mappings above 47-bit
x86/mm: Prepare to expose larger address space to userspace
x86/mpx: Do not allow MPX if we have mappings above 47-bit
x86/mm: Rename tasksize_32bit/64bit to task_size_32bit/64bit()
x86/xen: Redefine XEN_ELFNOTE_INIT_P2M using PUD_SIZE * PTRS_PER_PUD
x86/mm/dump_pagetables: Fix printout of p4d level
x86/mm/dump_pagetables: Generalize address normalization
x86/boot: Fix memremap() related build failure
...

+3139 -475
+13
Documentation/admin-guide/kernel-parameters.txt
··· 2233 2233 memory contents and reserves bad memory 2234 2234 regions that are detected. 2235 2235 2236 + mem_encrypt= [X86-64] AMD Secure Memory Encryption (SME) control 2237 + Valid arguments: on, off 2238 + Default (depends on kernel configuration option): 2239 + on (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y) 2240 + off (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=n) 2241 + mem_encrypt=on: Activate SME 2242 + mem_encrypt=off: Do not activate SME 2243 + 2244 + Refer to Documentation/x86/amd-memory-encryption.txt 2245 + for details on when memory encryption can be activated. 2246 + 2236 2247 mem_sleep_default= [SUSPEND] Default system suspend mode: 2237 2248 s2idle - Suspend-To-Idle 2238 2249 shallow - Power-On Suspend or equivalent (if supported) ··· 2707 2696 2708 2697 nopat [X86] Disable PAT (page attribute table extension of 2709 2698 pagetables) support. 2699 + 2700 + nopcid [X86-64] Disable the PCID cpu feature. 2710 2701 2711 2702 norandmaps Don't use address space randomization. Equivalent to 2712 2703 echo 0 > /proc/sys/kernel/randomize_va_space
+68
Documentation/x86/amd-memory-encryption.txt
··· 1 + Secure Memory Encryption (SME) is a feature found on AMD processors. 2 + 3 + SME provides the ability to mark individual pages of memory as encrypted using 4 + the standard x86 page tables. A page that is marked encrypted will be 5 + automatically decrypted when read from DRAM and encrypted when written to 6 + DRAM. SME can therefore be used to protect the contents of DRAM from physical 7 + attacks on the system. 8 + 9 + A page is encrypted when a page table entry has the encryption bit set (see 10 + below on how to determine its position). The encryption bit can also be 11 + specified in the cr3 register, allowing the PGD table to be encrypted. Each 12 + successive level of page tables can also be encrypted by setting the encryption 13 + bit in the page table entry that points to the next table. This allows the full 14 + page table hierarchy to be encrypted. Note, this means that just because the 15 + encryption bit is set in cr3, doesn't imply the full hierarchy is encyrpted. 16 + Each page table entry in the hierarchy needs to have the encryption bit set to 17 + achieve that. So, theoretically, you could have the encryption bit set in cr3 18 + so that the PGD is encrypted, but not set the encryption bit in the PGD entry 19 + for a PUD which results in the PUD pointed to by that entry to not be 20 + encrypted. 21 + 22 + Support for SME can be determined through the CPUID instruction. The CPUID 23 + function 0x8000001f reports information related to SME: 24 + 25 + 0x8000001f[eax]: 26 + Bit[0] indicates support for SME 27 + 0x8000001f[ebx]: 28 + Bits[5:0] pagetable bit number used to activate memory 29 + encryption 30 + Bits[11:6] reduction in physical address space, in bits, when 31 + memory encryption is enabled (this only affects 32 + system physical addresses, not guest physical 33 + addresses) 34 + 35 + If support for SME is present, MSR 0xc00100010 (MSR_K8_SYSCFG) can be used to 36 + determine if SME is enabled and/or to enable memory encryption: 37 + 38 + 0xc0010010: 39 + Bit[23] 0 = memory encryption features are disabled 40 + 1 = memory encryption features are enabled 41 + 42 + Linux relies on BIOS to set this bit if BIOS has determined that the reduction 43 + in the physical address space as a result of enabling memory encryption (see 44 + CPUID information above) will not conflict with the address space resource 45 + requirements for the system. If this bit is not set upon Linux startup then 46 + Linux itself will not set it and memory encryption will not be possible. 47 + 48 + The state of SME in the Linux kernel can be documented as follows: 49 + - Supported: 50 + The CPU supports SME (determined through CPUID instruction). 51 + 52 + - Enabled: 53 + Supported and bit 23 of MSR_K8_SYSCFG is set. 54 + 55 + - Active: 56 + Supported, Enabled and the Linux kernel is actively applying 57 + the encryption bit to page table entries (the SME mask in the 58 + kernel is non-zero). 59 + 60 + SME can also be enabled and activated in the BIOS. If SME is enabled and 61 + activated in the BIOS, then all memory accesses will be encrypted and it will 62 + not be necessary to activate the Linux memory encryption support. If the BIOS 63 + merely enables SME (sets bit 23 of the MSR_K8_SYSCFG), then Linux can activate 64 + memory encryption by default (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y) or 65 + by supplying mem_encrypt=on on the kernel command line. 
However, if BIOS does 66 + not enable SME, then Linux will not be able to activate memory encryption, even 67 + if configured to do so by default or the mem_encrypt=on command line parameter 68 + is specified.
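
As an aside, the CPUID interface documented above is easy to exercise from user
space. The sketch below is not part of the patch set and assumes a GCC/Clang
toolchain that provides <cpuid.h>; it checks the SME support bit and prints the
encryption bit position and the physical address space reduction. Checking the
"Enabled" state would additionally require reading MSR_K8_SYSCFG (e.g. through
the msr driver), which is omitted here.

    /* Hypothetical user-space check of CPUID 0x8000001f (not kernel code). */
    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        if (!__get_cpuid(0x8000001f, &eax, &ebx, &ecx, &edx)) {
            printf("CPUID leaf 0x8000001f not available\n");
            return 1;
        }

        if (!(eax & 1)) {
            printf("SME not supported\n");
            return 0;
        }

        printf("SME supported, encryption bit is pagetable bit %u\n", ebx & 0x3f);
        printf("physical address space reduction: %u bits\n", (ebx >> 6) & 0x3f);
        return 0;
    }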
+3 -3
Documentation/x86/protection-keys.txt
··· 34 34 called pkey_set(). 35 35 36 36 int real_prot = PROT_READ|PROT_WRITE; 37 - pkey = pkey_alloc(0, PKEY_DENY_WRITE); 37 + pkey = pkey_alloc(0, PKEY_DISABLE_WRITE); 38 38 ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); 39 39 ret = pkey_mprotect(ptr, PAGE_SIZE, real_prot, pkey); 40 40 ... application runs here ··· 42 42 Now, if the application needs to update the data at 'ptr', it can 43 43 gain access, do the update, then remove its write access: 44 44 45 - pkey_set(pkey, 0); // clear PKEY_DENY_WRITE 45 + pkey_set(pkey, 0); // clear PKEY_DISABLE_WRITE 46 46 *ptr = foo; // assign something 47 - pkey_set(pkey, PKEY_DENY_WRITE); // set PKEY_DENY_WRITE again 47 + pkey_set(pkey, PKEY_DISABLE_WRITE); // set PKEY_DISABLE_WRITE again 48 48 49 49 Now when it frees the memory, it will also free the pkey since it 50 50 is no longer in use:
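
For reference, the fixed snippet above can be turned into a small self-contained
program. The sketch below is illustrative only and assumes the glibc 2.27+
wrappers pkey_alloc(), pkey_mprotect(), pkey_set() and pkey_free() exposed by
<sys/mman.h> under _GNU_SOURCE; older systems would need raw syscall()
equivalents.

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/mman.h>

    int main(void)
    {
        /* Allocate a key that denies writes by default. */
        int pkey = pkey_alloc(0, PKEY_DISABLE_WRITE);
        if (pkey < 0) {
            perror("pkey_alloc");
            return 1;
        }

        int *ptr = mmap(NULL, 4096, PROT_NONE,
                        MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
        if (ptr == MAP_FAILED ||
            pkey_mprotect(ptr, 4096, PROT_READ | PROT_WRITE, pkey)) {
            perror("mmap/pkey_mprotect");
            return 1;
        }

        pkey_set(pkey, 0);                  /* clear PKEY_DISABLE_WRITE   */
        *ptr = 42;                          /* write is now permitted     */
        pkey_set(pkey, PKEY_DISABLE_WRITE); /* write-protect again        */

        printf("value: %d\n", *ptr);        /* reads were never disabled  */
        pkey_free(pkey);
        return 0;
    }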
+64
Documentation/x86/x86_64/5level-paging.txt
··· 1 + == Overview == 2 + 3 + Original x86-64 was limited by 4-level paing to 256 TiB of virtual address 4 + space and 64 TiB of physical address space. We are already bumping into 5 + this limit: some vendors offers servers with 64 TiB of memory today. 6 + 7 + To overcome the limitation upcoming hardware will introduce support for 8 + 5-level paging. It is a straight-forward extension of the current page 9 + table structure adding one more layer of translation. 10 + 11 + It bumps the limits to 128 PiB of virtual address space and 4 PiB of 12 + physical address space. This "ought to be enough for anybody" ©. 13 + 14 + QEMU 2.9 and later support 5-level paging. 15 + 16 + Virtual memory layout for 5-level paging is described in 17 + Documentation/x86/x86_64/mm.txt 18 + 19 + == Enabling 5-level paging == 20 + 21 + CONFIG_X86_5LEVEL=y enables the feature. 22 + 23 + So far, a kernel compiled with the option enabled will be able to boot 24 + only on machines that supports the feature -- see for 'la57' flag in 25 + /proc/cpuinfo. 26 + 27 + The plan is to implement boot-time switching between 4- and 5-level paging 28 + in the future. 29 + 30 + == User-space and large virtual address space == 31 + 32 + On x86, 5-level paging enables 56-bit userspace virtual address space. 33 + Not all user space is ready to handle wide addresses. It's known that 34 + at least some JIT compilers use higher bits in pointers to encode their 35 + information. It collides with valid pointers with 5-level paging and 36 + leads to crashes. 37 + 38 + To mitigate this, we are not going to allocate virtual address space 39 + above 47-bit by default. 40 + 41 + But userspace can ask for allocation from full address space by 42 + specifying hint address (with or without MAP_FIXED) above 47-bits. 43 + 44 + If hint address set above 47-bit, but MAP_FIXED is not specified, we try 45 + to look for unmapped area by specified address. If it's already 46 + occupied, we look for unmapped area in *full* address space, rather than 47 + from 47-bit window. 48 + 49 + A high hint address would only affect the allocation in question, but not 50 + any future mmap()s. 51 + 52 + Specifying high hint address on older kernel or on machine without 5-level 53 + paging support is safe. The hint will be ignored and kernel will fall back 54 + to allocation from 47-bit address space. 55 + 56 + This approach helps to easily make application's memory allocator aware 57 + about large address space without manually tracking allocated virtual 58 + address space. 59 + 60 + One important case we need to handle here is interaction with MPX. 61 + MPX (without MAWA extension) cannot handle addresses above 47-bit, so we 62 + need to make sure that MPX cannot be enabled we already have VMA above 63 + the boundary and forbid creating such VMAs once MPX is enabled. 64 +
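
To make the hint-address rule above concrete, the user-space sketch below
(illustrative only; 1UL << 48 is just an arbitrary example hint) asks for a
mapping above the default 47-bit window. On kernels or CPUs without 5-level
paging the hint is simply ignored and the mapping comes from the 47-bit window,
as described in the new document.

    #include <stdio.h>
    #include <sys/mman.h>

    int main(void)
    {
        void *hint = (void *)(1UL << 48);   /* above the 47-bit boundary */
        void *p = mmap(hint, 4096, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
            perror("mmap");
            return 1;
        }

        /* With CONFIG_X86_5LEVEL and la57 hardware this may land above
         * 47 bits; elsewhere a low address is returned. */
        printf("mapped at %p\n", p);
        return 0;
    }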
-2
arch/ia64/include/asm/acpi.h
··· 112 112 buf[2] |= ACPI_PDC_EST_CAPABILITY_SMP; 113 113 } 114 114 115 - #define acpi_unlazy_tlb(x) 116 - 117 115 #ifdef CONFIG_ACPI_NUMA 118 116 extern cpumask_t early_cpu_possible_map; 119 117 #define for_each_possible_early_cpu(cpu) \
+2 -2
arch/ia64/kernel/efi.c
··· 757 757 return 0; 758 758 } 759 759 760 - u32 760 + int 761 761 efi_mem_type (unsigned long phys_addr) 762 762 { 763 763 efi_memory_desc_t *md = efi_memory_descriptor(phys_addr); 764 764 765 765 if (md) 766 766 return md->type; 767 - return 0; 767 + return -EINVAL; 768 768 } 769 769 770 770 u64
+49
arch/x86/Kconfig
··· 169 169 select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI 170 170 select HAVE_PERF_REGS 171 171 select HAVE_PERF_USER_STACK_DUMP 172 + select HAVE_RCU_TABLE_FREE 172 173 select HAVE_REGS_AND_STACK_ACCESS_API 173 174 select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION 174 175 select HAVE_STACK_VALIDATION if X86_64 ··· 330 329 331 330 config PGTABLE_LEVELS 332 331 int 332 + default 5 if X86_5LEVEL 333 333 default 4 if X86_64 334 334 default 3 if X86_PAE 335 335 default 2 ··· 1401 1399 has the cost of more pagetable lookup overhead, and also 1402 1400 consumes more pagetable space per process. 1403 1401 1402 + config X86_5LEVEL 1403 + bool "Enable 5-level page tables support" 1404 + depends on X86_64 1405 + ---help--- 1406 + 5-level paging enables access to larger address space: 1407 + upto 128 PiB of virtual address space and 4 PiB of 1408 + physical address space. 1409 + 1410 + It will be supported by future Intel CPUs. 1411 + 1412 + Note: a kernel with this option enabled can only be booted 1413 + on machines that support the feature. 1414 + 1415 + See Documentation/x86/x86_64/5level-paging.txt for more 1416 + information. 1417 + 1418 + Say N if unsure. 1419 + 1404 1420 config ARCH_PHYS_ADDR_T_64BIT 1405 1421 def_bool y 1406 1422 depends on X86_64 || X86_PAE ··· 1435 1415 linear 1 GB mappings (even if the CPU otherwise 1436 1416 supports them), so don't confuse the user by printing 1437 1417 that we have them enabled. 1418 + 1419 + config ARCH_HAS_MEM_ENCRYPT 1420 + def_bool y 1421 + 1422 + config AMD_MEM_ENCRYPT 1423 + bool "AMD Secure Memory Encryption (SME) support" 1424 + depends on X86_64 && CPU_SUP_AMD 1425 + ---help--- 1426 + Say yes to enable support for the encryption of system memory. 1427 + This requires an AMD processor that supports Secure Memory 1428 + Encryption (SME). 1429 + 1430 + config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT 1431 + bool "Activate AMD Secure Memory Encryption (SME) by default" 1432 + default y 1433 + depends on AMD_MEM_ENCRYPT 1434 + ---help--- 1435 + Say yes to have system memory encrypted by default if running on 1436 + an AMD processor that supports Secure Memory Encryption (SME). 1437 + 1438 + If set to Y, then the encryption of system memory can be 1439 + deactivated with the mem_encrypt=off command line option. 1440 + 1441 + If set to N, then the encryption of system memory can be 1442 + activated with the mem_encrypt=on command line option. 1443 + 1444 + config ARCH_USE_MEMREMAP_PROT 1445 + def_bool y 1446 + depends on AMD_MEM_ENCRYPT 1438 1447 1439 1448 # Common NUMA Features 1440 1449 config NUMA
+7
arch/x86/boot/compressed/pagetable.c
··· 15 15 #define __pa(x) ((unsigned long)(x)) 16 16 #define __va(x) ((void *)((unsigned long)(x))) 17 17 18 + /* 19 + * The pgtable.h and mm/ident_map.c includes make use of the SME related 20 + * information which is not used in the compressed image support. Un-define 21 + * the SME support to avoid any compile and link errors. 22 + */ 23 + #undef CONFIG_AMD_MEM_ENCRYPT 24 + 18 25 #include "misc.h" 19 26 20 27 /* These actually do the work of building the kernel identity maps. */
+6 -7
arch/x86/include/asm/acpi.h
··· 150 150 extern int x86_acpi_numa_init(void); 151 151 #endif /* CONFIG_ACPI_NUMA */ 152 152 153 - #define acpi_unlazy_tlb(x) leave_mm(x) 154 - 155 153 #ifdef CONFIG_ACPI_APEI 156 154 static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) 157 155 { ··· 160 162 * you call efi_mem_attributes() during boot and at runtime, 161 163 * you could theoretically see different attributes. 162 164 * 163 - * Since we are yet to see any x86 platforms that require 164 - * anything other than PAGE_KERNEL (some arm64 platforms 165 - * require the equivalent of PAGE_KERNEL_NOCACHE), return that 166 - * until we know differently. 165 + * We are yet to see any x86 platforms that require anything 166 + * other than PAGE_KERNEL (some ARM64 platforms require the 167 + * equivalent of PAGE_KERNEL_NOCACHE). Additionally, if SME 168 + * is active, the ACPI information will not be encrypted, 169 + * so return PAGE_KERNEL_NOENC until we know differently. 167 170 */ 168 - return PAGE_KERNEL; 171 + return PAGE_KERNEL_NOENC; 169 172 } 170 173 #endif 171 174
+2
arch/x86/include/asm/cmdline.h
··· 2 2 #define _ASM_X86_CMDLINE_H 3 3 4 4 int cmdline_find_option_bool(const char *cmdline_ptr, const char *option); 5 + int cmdline_find_option(const char *cmdline_ptr, const char *option, 6 + char *buffer, int bufsize); 5 7 6 8 #endif /* _ASM_X86_CMDLINE_H */
+1
arch/x86/include/asm/cpufeatures.h
··· 196 196 197 197 #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ 198 198 #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ 199 + #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ 199 200 200 201 #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ 201 202 #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
+3 -1
arch/x86/include/asm/disabled-features.h
··· 21 21 # define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) 22 22 # define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31)) 23 23 # define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31)) 24 + # define DISABLE_PCID 0 24 25 #else 25 26 # define DISABLE_VME 0 26 27 # define DISABLE_K6_MTRR 0 27 28 # define DISABLE_CYRIX_ARR 0 28 29 # define DISABLE_CENTAUR_MCR 0 30 + # define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31)) 29 31 #endif /* CONFIG_X86_64 */ 30 32 31 33 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS ··· 51 49 #define DISABLED_MASK1 0 52 50 #define DISABLED_MASK2 0 53 51 #define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR) 54 - #define DISABLED_MASK4 0 52 + #define DISABLED_MASK4 (DISABLE_PCID) 55 53 #define DISABLED_MASK5 0 56 54 #define DISABLED_MASK6 0 57 55 #define DISABLED_MASK7 0
+3 -2
arch/x86/include/asm/dma-mapping.h
··· 12 12 #include <asm/io.h> 13 13 #include <asm/swiotlb.h> 14 14 #include <linux/dma-contiguous.h> 15 + #include <linux/mem_encrypt.h> 15 16 16 17 #ifdef CONFIG_ISA 17 18 # define ISA_DMA_BIT_MASK DMA_BIT_MASK(24) ··· 58 57 59 58 static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) 60 59 { 61 - return paddr; 60 + return __sme_set(paddr); 62 61 } 63 62 64 63 static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) 65 64 { 66 - return daddr; 65 + return __sme_clr(daddr); 67 66 } 68 67 #endif /* CONFIG_X86_DMA_REMAP */ 69 68
+4 -4
arch/x86/include/asm/dmi.h
··· 13 13 } 14 14 15 15 /* Use early IO mappings for DMI because it's initialized early */ 16 - #define dmi_early_remap early_ioremap 17 - #define dmi_early_unmap early_iounmap 18 - #define dmi_remap ioremap_cache 19 - #define dmi_unmap iounmap 16 + #define dmi_early_remap early_memremap 17 + #define dmi_early_unmap early_memunmap 18 + #define dmi_remap(_x, _l) memremap(_x, _l, MEMREMAP_WB) 19 + #define dmi_unmap(_x) memunmap(_x) 20 20 21 21 #endif /* _ASM_X86_DMI_H */
+2
arch/x86/include/asm/e820/api.h
··· 39 39 extern void e820__reallocate_tables(void); 40 40 extern void e820__register_nosave_regions(unsigned long limit_pfn); 41 41 42 + extern int e820__get_entry_type(u64 start, u64 end); 43 + 42 44 /* 43 45 * Returns true iff the specified range [start,end) is completely contained inside 44 46 * the ISA region.
+2 -2
arch/x86/include/asm/elf.h
··· 305 305 test_thread_flag(TIF_ADDR32)); 306 306 } 307 307 308 - extern unsigned long tasksize_32bit(void); 309 - extern unsigned long tasksize_64bit(void); 308 + extern unsigned long task_size_32bit(void); 309 + extern unsigned long task_size_64bit(int full_addr_space); 310 310 extern unsigned long get_mmap_base(int is_legacy); 311 311 312 312 #ifdef CONFIG_X86_32
+20
arch/x86/include/asm/fixmap.h
··· 157 157 } 158 158 #endif 159 159 160 + /* 161 + * FIXMAP_PAGE_NOCACHE is used for MMIO. Memory encryption is not 162 + * supported for MMIO addresses, so make sure that the memory encryption 163 + * mask is not part of the page attributes. 164 + */ 165 + #define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_IO_NOCACHE 166 + 167 + /* 168 + * Early memremap routines used for in-place encryption. The mappings created 169 + * by these routines are intended to be used as temporary mappings. 170 + */ 171 + void __init *early_memremap_encrypted(resource_size_t phys_addr, 172 + unsigned long size); 173 + void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, 174 + unsigned long size); 175 + void __init *early_memremap_decrypted(resource_size_t phys_addr, 176 + unsigned long size); 177 + void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, 178 + unsigned long size); 179 + 160 180 #include <asm-generic/fixmap.h> 161 181 162 182 #define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags)
+1
arch/x86/include/asm/init.h
··· 7 7 unsigned long page_flag; /* page flag for PMD or PUD entry */ 8 8 unsigned long offset; /* ident mapping offset */ 9 9 bool direct_gbpages; /* PUD level 1GB page support */ 10 + unsigned long kernpg_flag; /* kernel pagetable flag override */ 10 11 }; 11 12 12 13 int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
+8
arch/x86/include/asm/io.h
··· 377 377 #define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc 378 378 #endif 379 379 380 + extern bool arch_memremap_can_ram_remap(resource_size_t offset, 381 + unsigned long size, 382 + unsigned long flags); 383 + #define arch_memremap_can_ram_remap arch_memremap_can_ram_remap 384 + 385 + extern bool phys_mem_access_encrypted(unsigned long phys_addr, 386 + unsigned long size); 387 + 380 388 #endif /* _ASM_X86_IO_H */
+10 -1
arch/x86/include/asm/kexec.h
··· 147 147 relocate_kernel(unsigned long indirection_page, 148 148 unsigned long page_list, 149 149 unsigned long start_address, 150 - unsigned int preserve_context); 150 + unsigned int preserve_context, 151 + unsigned int sme_active); 151 152 #endif 152 153 153 154 #define ARCH_HAS_KIMAGE_ARCH ··· 208 207 uint64_t r15; 209 208 uint64_t rip; 210 209 }; 210 + 211 + extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, 212 + gfp_t gfp); 213 + #define arch_kexec_post_alloc_pages arch_kexec_post_alloc_pages 214 + 215 + extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages); 216 + #define arch_kexec_pre_free_pages arch_kexec_pre_free_pages 217 + 211 218 #endif 212 219 213 220 typedef void crash_vmclear_fn(void);
+1 -1
arch/x86/include/asm/kvm_host.h
··· 1079 1079 void kvm_mmu_uninit_vm(struct kvm *kvm); 1080 1080 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 1081 1081 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, 1082 - u64 acc_track_mask); 1082 + u64 acc_track_mask, u64 me_mask); 1083 1083 1084 1084 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); 1085 1085 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+80
arch/x86/include/asm/mem_encrypt.h
··· 1 + /* 2 + * AMD Memory Encryption Support 3 + * 4 + * Copyright (C) 2016 Advanced Micro Devices, Inc. 5 + * 6 + * Author: Tom Lendacky <thomas.lendacky@amd.com> 7 + * 8 + * This program is free software; you can redistribute it and/or modify 9 + * it under the terms of the GNU General Public License version 2 as 10 + * published by the Free Software Foundation. 11 + */ 12 + 13 + #ifndef __X86_MEM_ENCRYPT_H__ 14 + #define __X86_MEM_ENCRYPT_H__ 15 + 16 + #ifndef __ASSEMBLY__ 17 + 18 + #include <linux/init.h> 19 + 20 + #include <asm/bootparam.h> 21 + 22 + #ifdef CONFIG_AMD_MEM_ENCRYPT 23 + 24 + extern unsigned long sme_me_mask; 25 + 26 + void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr, 27 + unsigned long decrypted_kernel_vaddr, 28 + unsigned long kernel_len, 29 + unsigned long encryption_wa, 30 + unsigned long encryption_pgd); 31 + 32 + void __init sme_early_encrypt(resource_size_t paddr, 33 + unsigned long size); 34 + void __init sme_early_decrypt(resource_size_t paddr, 35 + unsigned long size); 36 + 37 + void __init sme_map_bootdata(char *real_mode_data); 38 + void __init sme_unmap_bootdata(char *real_mode_data); 39 + 40 + void __init sme_early_init(void); 41 + 42 + void __init sme_encrypt_kernel(void); 43 + void __init sme_enable(struct boot_params *bp); 44 + 45 + /* Architecture __weak replacement functions */ 46 + void __init mem_encrypt_init(void); 47 + 48 + void swiotlb_set_mem_attributes(void *vaddr, unsigned long size); 49 + 50 + #else /* !CONFIG_AMD_MEM_ENCRYPT */ 51 + 52 + #define sme_me_mask 0UL 53 + 54 + static inline void __init sme_early_encrypt(resource_size_t paddr, 55 + unsigned long size) { } 56 + static inline void __init sme_early_decrypt(resource_size_t paddr, 57 + unsigned long size) { } 58 + 59 + static inline void __init sme_map_bootdata(char *real_mode_data) { } 60 + static inline void __init sme_unmap_bootdata(char *real_mode_data) { } 61 + 62 + static inline void __init sme_early_init(void) { } 63 + 64 + static inline void __init sme_encrypt_kernel(void) { } 65 + static inline void __init sme_enable(struct boot_params *bp) { } 66 + 67 + #endif /* CONFIG_AMD_MEM_ENCRYPT */ 68 + 69 + /* 70 + * The __sme_pa() and __sme_pa_nodebug() macros are meant for use when 71 + * writing to or comparing values from the cr3 register. Having the 72 + * encryption mask set in cr3 enables the PGD entry to be encrypted and 73 + * avoid special case handling of PGD allocations. 74 + */ 75 + #define __sme_pa(x) (__pa(x) | sme_me_mask) 76 + #define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask) 77 + 78 + #endif /* __ASSEMBLY__ */ 79 + 80 + #endif /* __X86_MEM_ENCRYPT_H__ */
+23 -2
arch/x86/include/asm/mmu.h
··· 3 3 4 4 #include <linux/spinlock.h> 5 5 #include <linux/mutex.h> 6 + #include <linux/atomic.h> 6 7 7 8 /* 8 - * The x86 doesn't have a mmu context, but 9 - * we put the segment information here. 9 + * x86 has arch-specific MMU state beyond what lives in mm_struct. 10 10 */ 11 11 typedef struct { 12 + /* 13 + * ctx_id uniquely identifies this mm_struct. A ctx_id will never 14 + * be reused, and zero is not a valid ctx_id. 15 + */ 16 + u64 ctx_id; 17 + 18 + /* 19 + * Any code that needs to do any sort of TLB flushing for this 20 + * mm will first make its changes to the page tables, then 21 + * increment tlb_gen, then flush. This lets the low-level 22 + * flushing code keep track of what needs flushing. 23 + * 24 + * This is not used on Xen PV. 25 + */ 26 + atomic64_t tlb_gen; 27 + 12 28 #ifdef CONFIG_MODIFY_LDT_SYSCALL 13 29 struct ldt_struct *ldt; 14 30 #endif ··· 52 36 void __user *bd_addr; 53 37 #endif 54 38 } mm_context_t; 39 + 40 + #define INIT_MM_CONTEXT(mm) \ 41 + .context = { \ 42 + .ctx_id = 1, \ 43 + } 55 44 56 45 void leave_mm(int cpu); 57 46
+13 -2
arch/x86/include/asm/mmu_context.h
··· 12 12 #include <asm/tlbflush.h> 13 13 #include <asm/paravirt.h> 14 14 #include <asm/mpx.h> 15 + 16 + extern atomic64_t last_mm_ctx_id; 17 + 15 18 #ifndef CONFIG_PARAVIRT 16 19 static inline void paravirt_activate_mm(struct mm_struct *prev, 17 20 struct mm_struct *next) ··· 128 125 129 126 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) 130 127 { 131 - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) 132 - this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); 128 + int cpu = smp_processor_id(); 129 + 130 + if (cpumask_test_cpu(cpu, mm_cpumask(mm))) 131 + cpumask_clear_cpu(cpu, mm_cpumask(mm)); 133 132 } 134 133 135 134 static inline int init_new_context(struct task_struct *tsk, 136 135 struct mm_struct *mm) 137 136 { 137 + mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); 138 + atomic64_set(&mm->context.tlb_gen, 0); 139 + 138 140 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS 139 141 if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { 140 142 /* pkey 0 is the default and always allocated */ ··· 297 289 static inline unsigned long __get_current_cr3_fast(void) 298 290 { 299 291 unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd); 292 + 293 + if (static_cpu_has(X86_FEATURE_PCID)) 294 + cr3 |= this_cpu_read(cpu_tlbstate.loaded_mm_asid); 300 295 301 296 /* For now, be very restrictive about when this can be called. */ 302 297 VM_WARN_ON(in_nmi() || preemptible());
+9
arch/x86/include/asm/mpx.h
··· 73 73 } 74 74 void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, 75 75 unsigned long start, unsigned long end); 76 + 77 + unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len, 78 + unsigned long flags); 76 79 #else 77 80 static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) 78 81 { ··· 96 93 struct vm_area_struct *vma, 97 94 unsigned long start, unsigned long end) 98 95 { 96 + } 97 + 98 + static inline unsigned long mpx_unmapped_area_check(unsigned long addr, 99 + unsigned long len, unsigned long flags) 100 + { 101 + return addr; 99 102 } 100 103 #endif /* CONFIG_X86_INTEL_MPX */ 101 104
+2
arch/x86/include/asm/msr-index.h
··· 356 356 #define MSR_K8_TOP_MEM1 0xc001001a 357 357 #define MSR_K8_TOP_MEM2 0xc001001d 358 358 #define MSR_K8_SYSCFG 0xc0010010 359 + #define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT 23 360 + #define MSR_K8_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT) 359 361 #define MSR_K8_INT_PENDING_MSG 0xc0010055 360 362 /* C1E active bits in int pending message */ 361 363 #define K8_INTP_C1E_ACTIVE_MASK 0x18000000
+4
arch/x86/include/asm/page_64.h
··· 51 51 52 52 void copy_page(void *to, void *from); 53 53 54 + #ifdef CONFIG_X86_MCE 55 + #define arch_unmap_kpfn arch_unmap_kpfn 56 + #endif 57 + 54 58 #endif /* !__ASSEMBLY__ */ 55 59 56 60 #ifdef CONFIG_X86_VSYSCALL_EMULATION
+2 -1
arch/x86/include/asm/page_types.h
··· 3 3 4 4 #include <linux/const.h> 5 5 #include <linux/types.h> 6 + #include <linux/mem_encrypt.h> 6 7 7 8 /* PAGE_SHIFT determines the page size */ 8 9 #define PAGE_SHIFT 12 ··· 16 15 #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) 17 16 #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) 18 17 19 - #define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1)) 18 + #define __PHYSICAL_MASK ((phys_addr_t)(__sme_clr((1ULL << __PHYSICAL_MASK_SHIFT) - 1))) 20 19 #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) 21 20 22 21 /* Cast *PAGE_MASK to a signed type so that it is sign-extended if
+21 -7
arch/x86/include/asm/pgtable.h
··· 1 1 #ifndef _ASM_X86_PGTABLE_H 2 2 #define _ASM_X86_PGTABLE_H 3 3 4 + #include <linux/mem_encrypt.h> 4 5 #include <asm/page.h> 5 6 #include <asm/pgtable_types.h> 6 7 ··· 14 13 cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \ 15 14 : (prot)) 16 15 16 + /* 17 + * Macros to add or remove encryption attribute 18 + */ 19 + #define pgprot_encrypted(prot) __pgprot(__sme_set(pgprot_val(prot))) 20 + #define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot))) 21 + 17 22 #ifndef __ASSEMBLY__ 18 23 #include <asm/x86_init.h> 24 + 25 + extern pgd_t early_top_pgt[PTRS_PER_PGD]; 26 + int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); 19 27 20 28 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); 21 29 void ptdump_walk_pgd_level_checkwx(void); ··· 47 37 extern struct list_head pgd_list; 48 38 49 39 extern struct mm_struct *pgd_page_get_mm(struct page *page); 40 + 41 + extern pmdval_t early_pmd_flags; 50 42 51 43 #ifdef CONFIG_PARAVIRT 52 44 #include <asm/paravirt.h> ··· 205 193 static inline unsigned long p4d_pfn(p4d_t p4d) 206 194 { 207 195 return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT; 196 + } 197 + 198 + static inline unsigned long pgd_pfn(pgd_t pgd) 199 + { 200 + return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT; 208 201 } 209 202 210 203 static inline int p4d_large(p4d_t p4d) ··· 721 704 * Currently stuck as a macro due to indirect forward reference to 722 705 * linux/mmzone.h's __section_mem_map_addr() definition: 723 706 */ 724 - #define pmd_page(pmd) \ 725 - pfn_to_page((pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT) 707 + #define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) 726 708 727 709 /* 728 710 * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] ··· 789 773 * Currently stuck as a macro due to indirect forward reference to 790 774 * linux/mmzone.h's __section_mem_map_addr() definition: 791 775 */ 792 - #define pud_page(pud) \ 793 - pfn_to_page((pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT) 776 + #define pud_page(pud) pfn_to_page(pud_pfn(pud)) 794 777 795 778 /* Find an entry in the second-level page table.. */ 796 779 static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) ··· 839 824 * Currently stuck as a macro due to indirect forward reference to 840 825 * linux/mmzone.h's __section_mem_map_addr() definition: 841 826 */ 842 - #define p4d_page(p4d) \ 843 - pfn_to_page((p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT) 827 + #define p4d_page(p4d) pfn_to_page(p4d_pfn(p4d)) 844 828 845 829 /* Find an entry in the third-level page table.. */ 846 830 static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) ··· 873 859 * Currently stuck as a macro due to indirect forward reference to 874 860 * linux/mmzone.h's __section_mem_map_addr() definition: 875 861 */ 876 - #define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) 862 + #define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd)) 877 863 878 864 /* to find an entry in a page-table-directory. */ 879 865 static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
+43 -15
arch/x86/include/asm/pgtable_types.h
··· 2 2 #define _ASM_X86_PGTABLE_DEFS_H 3 3 4 4 #include <linux/const.h> 5 + #include <linux/mem_encrypt.h> 6 + 5 7 #include <asm/page_types.h> 6 8 7 9 #define FIRST_USER_ADDRESS 0UL ··· 123 121 124 122 #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) 125 123 126 - #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ 127 - _PAGE_ACCESSED | _PAGE_DIRTY) 128 - #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ 129 - _PAGE_DIRTY) 124 + #define _PAGE_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\ 125 + _PAGE_ACCESSED | _PAGE_DIRTY) 126 + #define _KERNPG_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | \ 127 + _PAGE_ACCESSED | _PAGE_DIRTY) 130 128 131 129 /* 132 130 * Set of bits not changed in pte_modify. The pte's ··· 161 159 162 160 #define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT) 163 161 #define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC)) 162 + #define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP)) 164 163 165 164 #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) 166 165 #define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ ··· 190 187 #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) 191 188 #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) 192 189 #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) 190 + #define __PAGE_KERNEL_WP (__PAGE_KERNEL | _PAGE_CACHE_WP) 193 191 194 192 #define __PAGE_KERNEL_IO (__PAGE_KERNEL) 195 193 #define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) 196 194 197 - #define PAGE_KERNEL __pgprot(__PAGE_KERNEL) 198 - #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) 199 - #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) 200 - #define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) 201 - #define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) 202 - #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) 203 - #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) 204 - #define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) 205 - #define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR) 195 + #ifndef __ASSEMBLY__ 206 196 207 - #define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) 208 - #define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) 197 + #define _PAGE_ENC (_AT(pteval_t, sme_me_mask)) 198 + 199 + #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ 200 + _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_ENC) 201 + #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ 202 + _PAGE_DIRTY | _PAGE_ENC) 203 + 204 + #define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _PAGE_ENC) 205 + #define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _PAGE_ENC) 206 + 207 + #define __PAGE_KERNEL_NOENC (__PAGE_KERNEL) 208 + #define __PAGE_KERNEL_NOENC_WP (__PAGE_KERNEL_WP) 209 + 210 + #define PAGE_KERNEL __pgprot(__PAGE_KERNEL | _PAGE_ENC) 211 + #define PAGE_KERNEL_NOENC __pgprot(__PAGE_KERNEL) 212 + #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC) 213 + #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC) 214 + #define PAGE_KERNEL_EXEC_NOENC __pgprot(__PAGE_KERNEL_EXEC) 215 + #define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX | _PAGE_ENC) 216 + #define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC) 217 + #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC) 218 + #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC) 219 + #define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL | _PAGE_ENC) 220 + #define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR 
| _PAGE_ENC) 221 + 222 + #define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) 223 + #define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) 224 + 225 + #endif /* __ASSEMBLY__ */ 209 226 210 227 /* xwr */ 211 228 #define __P000 PAGE_NONE ··· 309 286 } 310 287 #else 311 288 #include <asm-generic/pgtable-nop4d.h> 289 + 290 + static inline p4d_t native_make_p4d(pudval_t val) 291 + { 292 + return (p4d_t) { .pgd = native_make_pgd((pgdval_t)val) }; 293 + } 312 294 313 295 static inline p4dval_t native_p4d_val(p4d_t p4d) 314 296 {
+8 -5
arch/x86/include/asm/processor-flags.h
··· 2 2 #define _ASM_X86_PROCESSOR_FLAGS_H 3 3 4 4 #include <uapi/asm/processor-flags.h> 5 + #include <linux/mem_encrypt.h> 5 6 6 7 #ifdef CONFIG_VM86 7 8 #define X86_VM_MASK X86_EFLAGS_VM ··· 33 32 * CR3_ADDR_MASK is the mask used by read_cr3_pa(). 34 33 */ 35 34 #ifdef CONFIG_X86_64 36 - /* Mask off the address space ID bits. */ 37 - #define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull 38 - #define CR3_PCID_MASK 0xFFFull 35 + /* Mask off the address space ID and SME encryption bits. */ 36 + #define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull) 37 + #define CR3_PCID_MASK 0xFFFull 38 + #define CR3_NOFLUSH BIT_ULL(63) 39 39 #else 40 40 /* 41 41 * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save 42 42 * a tiny bit of code size by setting all the bits. 43 43 */ 44 - #define CR3_ADDR_MASK 0xFFFFFFFFull 45 - #define CR3_PCID_MASK 0ull 44 + #define CR3_ADDR_MASK 0xFFFFFFFFull 45 + #define CR3_PCID_MASK 0ull 46 + #define CR3_NOFLUSH 0 46 47 #endif 47 48 48 49 #endif /* _ASM_X86_PROCESSOR_FLAGS_H */
+16 -4
arch/x86/include/asm/processor.h
··· 30 30 #include <linux/math64.h> 31 31 #include <linux/err.h> 32 32 #include <linux/irqflags.h> 33 + #include <linux/mem_encrypt.h> 33 34 34 35 /* 35 36 * We handle most unaligned accesses in hardware. On the other hand ··· 241 240 return __read_cr3() & CR3_ADDR_MASK; 242 241 } 243 242 243 + static inline unsigned long native_read_cr3_pa(void) 244 + { 245 + return __native_read_cr3() & CR3_ADDR_MASK; 246 + } 247 + 244 248 static inline void load_cr3(pgd_t *pgdir) 245 249 { 246 - write_cr3(__pa(pgdir)); 250 + write_cr3(__sme_pa(pgdir)); 247 251 } 248 252 249 253 #ifdef CONFIG_X86_32 ··· 811 805 */ 812 806 #define IA32_PAGE_OFFSET PAGE_OFFSET 813 807 #define TASK_SIZE PAGE_OFFSET 808 + #define TASK_SIZE_LOW TASK_SIZE 814 809 #define TASK_SIZE_MAX TASK_SIZE 810 + #define DEFAULT_MAP_WINDOW TASK_SIZE 815 811 #define STACK_TOP TASK_SIZE 816 812 #define STACK_TOP_MAX STACK_TOP 817 813 ··· 853 845 * particular problem by preventing anything from being mapped 854 846 * at the maximum canonical address. 855 847 */ 856 - #define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE) 848 + #define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) 849 + 850 + #define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) 857 851 858 852 /* This decides where the kernel will search for a free chunk of vm 859 853 * space during mmap's. ··· 863 853 #define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ 864 854 0xc0000000 : 0xFFFFe000) 865 855 856 + #define TASK_SIZE_LOW (test_thread_flag(TIF_ADDR32) ? \ 857 + IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW) 866 858 #define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \ 867 859 IA32_PAGE_OFFSET : TASK_SIZE_MAX) 868 860 #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \ 869 861 IA32_PAGE_OFFSET : TASK_SIZE_MAX) 870 862 871 - #define STACK_TOP TASK_SIZE 863 + #define STACK_TOP TASK_SIZE_LOW 872 864 #define STACK_TOP_MAX TASK_SIZE_MAX 873 865 874 866 #define INIT_THREAD { \ ··· 891 879 * space during mmap's. 892 880 */ 893 881 #define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3)) 894 - #define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE) 882 + #define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE_LOW) 895 883 896 884 #define KSTK_EIP(task) (task_pt_regs(task)->ip) 897 885
+12
arch/x86/include/asm/realmode.h
··· 1 1 #ifndef _ARCH_X86_REALMODE_H 2 2 #define _ARCH_X86_REALMODE_H 3 3 4 + /* 5 + * Flag bit definitions for use with the flags field of the trampoline header 6 + * in the CONFIG_X86_64 variant. 7 + */ 8 + #define TH_FLAGS_SME_ACTIVE_BIT 0 9 + #define TH_FLAGS_SME_ACTIVE BIT(TH_FLAGS_SME_ACTIVE_BIT) 10 + 11 + #ifndef __ASSEMBLY__ 12 + 4 13 #include <linux/types.h> 5 14 #include <asm/io.h> 6 15 ··· 47 38 u64 start; 48 39 u64 efer; 49 40 u32 cr4; 41 + u32 flags; 50 42 #endif 51 43 }; 52 44 ··· 78 68 79 69 void set_real_mode_mem(phys_addr_t mem, size_t size); 80 70 void reserve_real_mode(void); 71 + 72 + #endif /* __ASSEMBLY__ */ 81 73 82 74 #endif /* _ARCH_X86_REALMODE_H */
+3
arch/x86/include/asm/set_memory.h
··· 11 11 * Executability : eXeutable, NoteXecutable 12 12 * Read/Write : ReadOnly, ReadWrite 13 13 * Presence : NotPresent 14 + * Encryption : Encrypted, Decrypted 14 15 * 15 16 * Within a category, the attributes are mutually exclusive. 16 17 * ··· 43 42 int set_memory_wb(unsigned long addr, int numpages); 44 43 int set_memory_np(unsigned long addr, int numpages); 45 44 int set_memory_4k(unsigned long addr, int numpages); 45 + int set_memory_encrypted(unsigned long addr, int numpages); 46 + int set_memory_decrypted(unsigned long addr, int numpages); 46 47 47 48 int set_memory_array_uc(unsigned long *addr, int addrinarray); 48 49 int set_memory_array_wc(unsigned long *addr, int addrinarray);
+14
arch/x86/include/asm/tlb.h
··· 15 15 16 16 #include <asm-generic/tlb.h> 17 17 18 + /* 19 + * While x86 architecture in general requires an IPI to perform TLB 20 + * shootdown, enablement code for several hypervisors overrides 21 + * .flush_tlb_others hook in pv_mmu_ops and implements it by issuing 22 + * a hypercall. To keep software pagetable walkers safe in this case we 23 + * switch to RCU based table free (HAVE_RCU_TABLE_FREE). See the comment 24 + * below 'ifdef CONFIG_HAVE_RCU_TABLE_FREE' in include/asm-generic/tlb.h 25 + * for more details. 26 + */ 27 + static inline void __tlb_remove_table(void *table) 28 + { 29 + free_page_and_swap_cache(table); 30 + } 31 + 18 32 #endif /* _ASM_X86_TLB_H */
+80 -7
arch/x86/include/asm/tlbflush.h
··· 57 57 __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); 58 58 } 59 59 60 + static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) 61 + { 62 + u64 new_tlb_gen; 63 + 64 + /* 65 + * Bump the generation count. This also serves as a full barrier 66 + * that synchronizes with switch_mm(): callers are required to order 67 + * their read of mm_cpumask after their writes to the paging 68 + * structures. 69 + */ 70 + smp_mb__before_atomic(); 71 + new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen); 72 + smp_mb__after_atomic(); 73 + 74 + return new_tlb_gen; 75 + } 76 + 60 77 #ifdef CONFIG_PARAVIRT 61 78 #include <asm/paravirt.h> 62 79 #else ··· 81 64 #define __flush_tlb_global() __native_flush_tlb_global() 82 65 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) 83 66 #endif 67 + 68 + /* 69 + * 6 because 6 should be plenty and struct tlb_state will fit in 70 + * two cache lines. 71 + */ 72 + #define TLB_NR_DYN_ASIDS 6 73 + 74 + struct tlb_context { 75 + u64 ctx_id; 76 + u64 tlb_gen; 77 + }; 84 78 85 79 struct tlb_state { 86 80 /* ··· 101 73 * mode even if we've already switched back to swapper_pg_dir. 102 74 */ 103 75 struct mm_struct *loaded_mm; 104 - int state; 76 + u16 loaded_mm_asid; 77 + u16 next_asid; 105 78 106 79 /* 107 80 * Access to this CR4 shadow and to H/W CR4 is protected by 108 81 * disabling interrupts when modifying either one. 109 82 */ 110 83 unsigned long cr4; 84 + 85 + /* 86 + * This is a list of all contexts that might exist in the TLB. 87 + * There is one per ASID that we use, and the ASID (what the 88 + * CPU calls PCID) is the index into ctxts. 89 + * 90 + * For each context, ctx_id indicates which mm the TLB's user 91 + * entries came from. As an invariant, the TLB will never 92 + * contain entries that are out-of-date as when that mm reached 93 + * the tlb_gen in the list. 94 + * 95 + * To be clear, this means that it's legal for the TLB code to 96 + * flush the TLB without updating tlb_gen. This can happen 97 + * (for now, at least) due to paravirt remote flushes. 98 + * 99 + * NB: context 0 is a bit special, since it's also used by 100 + * various bits of init code. This is fine -- code that 101 + * isn't aware of PCID will end up harmlessly flushing 102 + * context 0. 103 + */ 104 + struct tlb_context ctxs[TLB_NR_DYN_ASIDS]; 111 105 }; 112 106 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); 113 107 ··· 257 207 __flush_tlb_global(); 258 208 else 259 209 __flush_tlb(); 210 + 211 + /* 212 + * Note: if we somehow had PCID but not PGE, then this wouldn't work -- 213 + * we'd end up flushing kernel translations for the current ASID but 214 + * we might fail to flush kernel translations for other cached ASIDs. 215 + * 216 + * To avoid this issue, we force PCID off if PGE is off. 217 + */ 260 218 } 261 219 262 220 static inline void __flush_tlb_one(unsigned long addr) ··· 289 231 * and page-granular flushes are available only on i486 and up. 290 232 */ 291 233 struct flush_tlb_info { 292 - struct mm_struct *mm; 293 - unsigned long start; 294 - unsigned long end; 234 + /* 235 + * We support several kinds of flushes. 236 + * 237 + * - Fully flush a single mm. .mm will be set, .end will be 238 + * TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to 239 + * which the IPI sender is trying to catch us up. 240 + * 241 + * - Partially flush a single mm. 
.mm will be set, .start and 242 + * .end will indicate the range, and .new_tlb_gen will be set 243 + * such that the changes between generation .new_tlb_gen-1 and 244 + * .new_tlb_gen are entirely contained in the indicated range. 245 + * 246 + * - Fully flush all mms whose tlb_gens have been updated. .mm 247 + * will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen 248 + * will be zero. 249 + */ 250 + struct mm_struct *mm; 251 + unsigned long start; 252 + unsigned long end; 253 + u64 new_tlb_gen; 295 254 }; 296 255 297 256 #define local_flush_tlb() __flush_tlb() ··· 331 256 void native_flush_tlb_others(const struct cpumask *cpumask, 332 257 const struct flush_tlb_info *info); 333 258 334 - #define TLBSTATE_OK 1 335 - #define TLBSTATE_LAZY 2 336 - 337 259 static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, 338 260 struct mm_struct *mm) 339 261 { 262 + inc_mm_tlb_gen(mm); 340 263 cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); 341 264 } 342 265
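
The ctx_id/tlb_gen machinery above is easier to see in isolation. The toy
user-space model below (purely illustrative, not kernel code) captures the idea
behind inc_mm_tlb_gen() and the per-ASID tlb_gen tracking: every page-table
change bumps a per-mm generation, and a CPU only flushes when the generation it
last caught up to is behind.

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_ulong mm_tlb_gen = ATOMIC_VAR_INIT(1); /* per-mm generation  */
    static unsigned long cpu_seen_gen[2];                /* per-"CPU" catch-up */

    static void change_page_tables(void)
    {
        /* Analogous to inc_mm_tlb_gen(): bump after modifying page tables. */
        atomic_fetch_add(&mm_tlb_gen, 1);
    }

    static void maybe_flush(int cpu)
    {
        unsigned long gen = atomic_load(&mm_tlb_gen);

        if (cpu_seen_gen[cpu] < gen) {
            printf("cpu%d: flush, catching up to gen %lu\n", cpu, gen);
            cpu_seen_gen[cpu] = gen;
        } else {
            printf("cpu%d: TLB already up to date, skip flush\n", cpu);
        }
    }

    int main(void)
    {
        maybe_flush(0);         /* initial catch-up: flushes      */
        change_page_tables();
        maybe_flush(0);         /* stale generation: flushes      */
        maybe_flush(0);         /* nothing changed since: skipped */
        return 0;
    }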
+13 -1
arch/x86/include/asm/vga.h
··· 7 7 #ifndef _ASM_X86_VGA_H 8 8 #define _ASM_X86_VGA_H 9 9 10 + #include <asm/set_memory.h> 11 + 10 12 /* 11 13 * On the PC, we can just recalculate addresses and then 12 14 * access the videoram directly without any black magic. 15 + * To support memory encryption however, we need to access 16 + * the videoram as decrypted memory. 13 17 */ 14 18 15 - #define VGA_MAP_MEM(x, s) (unsigned long)phys_to_virt(x) 19 + #define VGA_MAP_MEM(x, s) \ 20 + ({ \ 21 + unsigned long start = (unsigned long)phys_to_virt(x); \ 22 + \ 23 + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) \ 24 + set_memory_decrypted(start, (s) >> PAGE_SHIFT); \ 25 + \ 26 + start; \ 27 + }) 16 28 17 29 #define vga_readb(x) (*(x)) 18 30 #define vga_writeb(x, y) (*(y) = (x))
+3 -3
arch/x86/kernel/acpi/boot.c
··· 115 115 #define ACPI_INVALID_GSI INT_MIN 116 116 117 117 /* 118 - * This is just a simple wrapper around early_ioremap(), 118 + * This is just a simple wrapper around early_memremap(), 119 119 * with sanity checks for phys == 0 and size == 0. 120 120 */ 121 121 char *__init __acpi_map_table(unsigned long phys, unsigned long size) ··· 124 124 if (!phys || !size) 125 125 return NULL; 126 126 127 - return early_ioremap(phys, size); 127 + return early_memremap(phys, size); 128 128 } 129 129 130 130 void __init __acpi_unmap_table(char *map, unsigned long size) ··· 132 132 if (!map || !size) 133 133 return; 134 134 135 - early_iounmap(map, size); 135 + early_memunmap(map, size); 136 136 } 137 137 138 138 #ifdef CONFIG_X86_LOCAL_APIC
+25 -4
arch/x86/kernel/cpu/amd.c
··· 558 558 559 559 static void early_init_amd(struct cpuinfo_x86 *c) 560 560 { 561 + u32 dummy; 562 + 561 563 early_init_amd_mc(c); 564 + 565 + rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); 562 566 563 567 /* 564 568 * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate ··· 626 622 */ 627 623 if (cpu_has_amd_erratum(c, amd_erratum_400)) 628 624 set_cpu_bug(c, X86_BUG_AMD_E400); 625 + 626 + /* 627 + * BIOS support is required for SME. If BIOS has enabled SME then 628 + * adjust x86_phys_bits by the SME physical address space reduction 629 + * value. If BIOS has not enabled SME then don't advertise the 630 + * feature (set in scattered.c). Also, since the SME support requires 631 + * long mode, don't advertise the feature under CONFIG_X86_32. 632 + */ 633 + if (cpu_has(c, X86_FEATURE_SME)) { 634 + u64 msr; 635 + 636 + /* Check if SME is enabled */ 637 + rdmsrl(MSR_K8_SYSCFG, msr); 638 + if (msr & MSR_K8_SYSCFG_MEM_ENCRYPT) { 639 + c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f; 640 + if (IS_ENABLED(CONFIG_X86_32)) 641 + clear_cpu_cap(c, X86_FEATURE_SME); 642 + } else { 643 + clear_cpu_cap(c, X86_FEATURE_SME); 644 + } 645 + } 629 646 } 630 647 631 648 static void init_amd_k8(struct cpuinfo_x86 *c) ··· 765 740 766 741 static void init_amd(struct cpuinfo_x86 *c) 767 742 { 768 - u32 dummy; 769 - 770 743 early_init_amd(c); 771 744 772 745 /* ··· 825 802 */ 826 803 if (c->x86 > 0x11) 827 804 set_cpu_cap(c, X86_FEATURE_ARAT); 828 - 829 - rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); 830 805 831 806 /* 3DNow or LM implies PREFETCHW */ 832 807 if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
+8
arch/x86/kernel/cpu/bugs.c
··· 21 21 22 22 void __init check_bugs(void) 23 23 { 24 + #ifdef CONFIG_X86_32 25 + /* 26 + * Regardless of whether PCID is enumerated, the SDM says 27 + * that it can't be enabled in 32-bit mode. 28 + */ 29 + setup_clear_cpu_cap(X86_FEATURE_PCID); 30 + #endif 31 + 24 32 identify_boot_cpu(); 25 33 26 34 if (!IS_ENABLED(CONFIG_SMP)) {
+40
arch/x86/kernel/cpu/common.c
··· 168 168 } 169 169 __setup("nompx", x86_mpx_setup); 170 170 171 + #ifdef CONFIG_X86_64 172 + static int __init x86_pcid_setup(char *s) 173 + { 174 + /* require an exact match without trailing characters */ 175 + if (strlen(s)) 176 + return 0; 177 + 178 + /* do not emit a message if the feature is not present */ 179 + if (!boot_cpu_has(X86_FEATURE_PCID)) 180 + return 1; 181 + 182 + setup_clear_cpu_cap(X86_FEATURE_PCID); 183 + pr_info("nopcid: PCID feature disabled\n"); 184 + return 1; 185 + } 186 + __setup("nopcid", x86_pcid_setup); 187 + #endif 188 + 171 189 static int __init x86_noinvpcid_setup(char *s) 172 190 { 173 191 /* noinvpcid doesn't accept parameters */ ··· 326 308 #else 327 309 cr4_clear_bits(X86_CR4_SMAP); 328 310 #endif 311 + } 312 + } 313 + 314 + static void setup_pcid(struct cpuinfo_x86 *c) 315 + { 316 + if (cpu_has(c, X86_FEATURE_PCID)) { 317 + if (cpu_has(c, X86_FEATURE_PGE)) { 318 + cr4_set_bits(X86_CR4_PCIDE); 319 + } else { 320 + /* 321 + * flush_tlb_all(), as currently implemented, won't 322 + * work if PCID is on but PGE is not. Since that 323 + * combination doesn't exist on real hardware, there's 324 + * no reason to try to fully support it, but it's 325 + * polite to avoid corrupting data if we're on 326 + * an improperly configured VM. 327 + */ 328 + clear_cpu_cap(c, X86_FEATURE_PCID); 329 + } 329 330 } 330 331 } 331 332 ··· 1161 1124 /* Set up SMEP/SMAP */ 1162 1125 setup_smep(c); 1163 1126 setup_smap(c); 1127 + 1128 + /* Set up PCID */ 1129 + setup_pcid(c); 1164 1130 1165 1131 /* 1166 1132 * The vendor-specific functions might have changed features.
+43
arch/x86/kernel/cpu/mcheck/mce.c
··· 51 51 #include <asm/mce.h> 52 52 #include <asm/msr.h> 53 53 #include <asm/reboot.h> 54 + #include <asm/set_memory.h> 54 55 55 56 #include "mce-internal.h" 56 57 ··· 1051 1050 pr_err("Memory error not recovered"); 1052 1051 return ret; 1053 1052 } 1053 + 1054 + #if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE) 1055 + 1056 + void arch_unmap_kpfn(unsigned long pfn) 1057 + { 1058 + unsigned long decoy_addr; 1059 + 1060 + /* 1061 + * Unmap this page from the kernel 1:1 mappings to make sure 1062 + * we don't log more errors because of speculative access to 1063 + * the page. 1064 + * We would like to just call: 1065 + * set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1); 1066 + * but doing that would radically increase the odds of a 1067 + * speculative access to the posion page because we'd have 1068 + * the virtual address of the kernel 1:1 mapping sitting 1069 + * around in registers. 1070 + * Instead we get tricky. We create a non-canonical address 1071 + * that looks just like the one we want, but has bit 63 flipped. 1072 + * This relies on set_memory_np() not checking whether we passed 1073 + * a legal address. 1074 + */ 1075 + 1076 + /* 1077 + * Build time check to see if we have a spare virtual bit. Don't want 1078 + * to leave this until run time because most developers don't have a 1079 + * system that can exercise this code path. This will only become a 1080 + * problem if/when we move beyond 5-level page tables. 1081 + * 1082 + * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD) 1083 + */ 1084 + #if PGDIR_SHIFT + 9 < 63 1085 + decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); 1086 + #else 1087 + #error "no unused virtual bit available" 1088 + #endif 1089 + 1090 + if (set_memory_np(decoy_addr, 1)) 1091 + pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); 1092 + 1093 + } 1094 + #endif 1054 1095 1055 1096 /* 1056 1097 * The actual machine check handler. This only handles real
+1
arch/x86/kernel/cpu/scattered.c
··· 31 31 { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, 32 32 { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, 33 33 { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, 34 + { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 }, 34 35 { 0, 0, 0, 0, 0 } 35 36 }; 36 37
+23 -3
arch/x86/kernel/e820.c
··· 96 96 * Note: this function only works correctly once the E820 table is sorted and 97 97 * not-overlapping (at least for the range specified), which is the case normally. 98 98 */ 99 - bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type) 99 + static struct e820_entry *__e820__mapped_all(u64 start, u64 end, 100 + enum e820_type type) 100 101 { 101 102 int i; 102 103 ··· 123 122 * coverage of the desired range exists: 124 123 */ 125 124 if (start >= end) 126 - return 1; 125 + return entry; 127 126 } 128 - return 0; 127 + 128 + return NULL; 129 + } 130 + 131 + /* 132 + * This function checks if the entire range <start,end> is mapped with type. 133 + */ 134 + bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type) 135 + { 136 + return __e820__mapped_all(start, end, type); 137 + } 138 + 139 + /* 140 + * This function returns the type associated with the range <start,end>. 141 + */ 142 + int e820__get_entry_type(u64 start, u64 end) 143 + { 144 + struct e820_entry *entry = __e820__mapped_all(start, end, 0); 145 + 146 + return entry ? entry->type : -EINVAL; 129 147 } 130 148 131 149 /*
+1 -1
arch/x86/kernel/espfix_64.c
··· 195 195 196 196 pte_p = pte_offset_kernel(&pmd, addr); 197 197 stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0)); 198 - pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); 198 + pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask)); 199 199 for (n = 0; n < ESPFIX_PTE_CLONES; n++) 200 200 set_pte(&pte_p[n*PTE_STRIDE], pte); 201 201
+79 -16
arch/x86/kernel/head64.c
··· 14 14 #include <linux/start_kernel.h> 15 15 #include <linux/io.h> 16 16 #include <linux/memblock.h> 17 + #include <linux/mem_encrypt.h> 17 18 18 19 #include <asm/processor.h> 19 20 #include <asm/proto.h> ··· 34 33 /* 35 34 * Manage page tables very early on. 36 35 */ 37 - extern pgd_t early_top_pgt[PTRS_PER_PGD]; 38 36 extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; 39 37 static unsigned int __initdata next_early_pgt; 40 38 pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); ··· 45 45 return ptr - (void *)_text + (void *)physaddr; 46 46 } 47 47 48 - void __head __startup_64(unsigned long physaddr) 48 + unsigned long __head __startup_64(unsigned long physaddr, 49 + struct boot_params *bp) 49 50 { 50 51 unsigned long load_delta, *p; 52 + unsigned long pgtable_flags; 51 53 pgdval_t *pgd; 52 54 p4dval_t *p4d; 53 55 pudval_t *pud; ··· 70 68 /* Is the address not 2M aligned? */ 71 69 if (load_delta & ~PMD_PAGE_MASK) 72 70 for (;;); 71 + 72 + /* Activate Secure Memory Encryption (SME) if supported and enabled */ 73 + sme_enable(bp); 74 + 75 + /* Include the SME encryption mask in the fixup value */ 76 + load_delta += sme_get_me_mask(); 73 77 74 78 /* Fixup the physical addresses in the page table */ 75 79 ··· 100 92 * creates a bunch of nonsense entries but that is fine -- 101 93 * it avoids problems around wraparound. 102 94 */ 95 + 103 96 next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr); 104 97 pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); 105 98 pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); 99 + 100 + pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); 106 101 107 102 if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 108 103 p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); 109 104 110 105 i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; 111 - pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE; 112 - pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE; 106 + pgd[i + 0] = (pgdval_t)p4d + pgtable_flags; 107 + pgd[i + 1] = (pgdval_t)p4d + pgtable_flags; 113 108 114 109 i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D; 115 - p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE; 116 - p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE; 110 + p4d[i + 0] = (pgdval_t)pud + pgtable_flags; 111 + p4d[i + 1] = (pgdval_t)pud + pgtable_flags; 117 112 } else { 118 113 i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; 119 - pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE; 120 - pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE; 114 + pgd[i + 0] = (pgdval_t)pud + pgtable_flags; 115 + pgd[i + 1] = (pgdval_t)pud + pgtable_flags; 121 116 } 122 117 123 118 i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD; 124 - pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE; 125 - pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE; 119 + pud[i + 0] = (pudval_t)pmd + pgtable_flags; 120 + pud[i + 1] = (pudval_t)pmd + pgtable_flags; 126 121 127 122 pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; 123 + pmd_entry += sme_get_me_mask(); 128 124 pmd_entry += physaddr; 129 125 130 126 for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) { ··· 149 137 pmd[i] += load_delta; 150 138 } 151 139 152 - /* Fixup phys_base */ 140 + /* 141 + * Fixup phys_base - remove the memory encryption mask to obtain 142 + * the true physical address. 
143 + */ 153 144 p = fixup_pointer(&phys_base, physaddr); 154 - *p += load_delta; 145 + *p += load_delta - sme_get_me_mask(); 146 + 147 + /* Encrypt the kernel (if SME is active) */ 148 + sme_encrypt_kernel(); 149 + 150 + /* 151 + * Return the SME encryption mask (if SME is active) to be used as a 152 + * modifier for the initial pgdir entry programmed into CR3. 153 + */ 154 + return sme_get_me_mask(); 155 + } 156 + 157 + unsigned long __startup_secondary_64(void) 158 + { 159 + /* 160 + * Return the SME encryption mask (if SME is active) to be used as a 161 + * modifier for the initial pgdir entry programmed into CR3. 162 + */ 163 + return sme_get_me_mask(); 155 164 } 156 165 157 166 /* Wipe all early page tables except for the kernel symbol map */ ··· 180 147 { 181 148 memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); 182 149 next_early_pgt = 0; 183 - write_cr3(__pa_nodebug(early_top_pgt)); 150 + write_cr3(__sme_pa_nodebug(early_top_pgt)); 184 151 } 185 152 186 153 /* Create a new PMD entry */ 187 - int __init early_make_pgtable(unsigned long address) 154 + int __init __early_make_pgtable(unsigned long address, pmdval_t pmd) 188 155 { 189 156 unsigned long physaddr = address - __PAGE_OFFSET; 190 157 pgdval_t pgd, *pgd_p; 191 158 p4dval_t p4d, *p4d_p; 192 159 pudval_t pud, *pud_p; 193 - pmdval_t pmd, *pmd_p; 160 + pmdval_t *pmd_p; 194 161 195 162 /* Invalid address or early pgt is done ? */ 196 163 if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt)) ··· 249 216 memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); 250 217 *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; 251 218 } 252 - pmd = (physaddr & PMD_MASK) + early_pmd_flags; 253 219 pmd_p[pmd_index(address)] = pmd; 254 220 255 221 return 0; 222 + } 223 + 224 + int __init early_make_pgtable(unsigned long address) 225 + { 226 + unsigned long physaddr = address - __PAGE_OFFSET; 227 + pmdval_t pmd; 228 + 229 + pmd = (physaddr & PMD_MASK) + early_pmd_flags; 230 + 231 + return __early_make_pgtable(address, pmd); 256 232 } 257 233 258 234 /* Don't add a printk in there. printk relies on the PDA which is not initialized ··· 286 244 char * command_line; 287 245 unsigned long cmd_line_ptr; 288 246 247 + /* 248 + * If SME is active, this will create decrypted mappings of the 249 + * boot data in advance of the copy operations. 250 + */ 251 + sme_map_bootdata(real_mode_data); 252 + 289 253 memcpy(&boot_params, real_mode_data, sizeof boot_params); 290 254 sanitize_boot_params(&boot_params); 291 255 cmd_line_ptr = get_cmd_line_ptr(); ··· 299 251 command_line = __va(cmd_line_ptr); 300 252 memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); 301 253 } 254 + 255 + /* 256 + * The old boot data is no longer needed and won't be reserved, 257 + * freeing up that memory for use by the system. If SME is active, 258 + * we need to remove the mappings that were created so that the 259 + * memory doesn't remain mapped as decrypted. 260 + */ 261 + sme_unmap_bootdata(real_mode_data); 302 262 } 303 263 304 264 asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) ··· 335 279 clear_bss(); 336 280 337 281 clear_page(init_top_pgt); 282 + 283 + /* 284 + * SME support may update early_pmd_flags to include the memory 285 + * encryption mask, so it needs to be called before anything 286 + * that may generate a page fault. 287 + */ 288 + sme_early_init(); 338 289 339 290 kasan_early_init(); 340 291
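__startup_64() runs before the final page tables are in place, so every global it touches is reached through fixup_pointer(), which rebases a link-time address onto the physical address the kernel was actually loaded at; the SME changes piggy-back on that environment and return sme_get_me_mask() so the assembly caller can fold the encryption bit into CR3. The rebasing arithmetic is illustrated below with made-up addresses (this is not kernel code):

    #include <stdio.h>
    #include <stdint.h>

    #define LINK_TIME_TEXT  0xffffffff81000000ULL  /* _text as linked (example) */
    #define PHYS_LOAD_ADDR  0x0000000004000000ULL  /* where the image landed */

    /* Equivalent of fixup_pointer(): ptr - _text + physaddr */
    static uint64_t fixup(uint64_t link_addr)
    {
            return link_addr - LINK_TIME_TEXT + PHYS_LOAD_ADDR;
    }

    int main(void)
    {
            uint64_t some_global = 0xffffffff81234568ULL;   /* e.g. &next_early_pgt */

            printf("usable early-boot address: %#llx\n",
                   (unsigned long long)fixup(some_global));
            return 0;
    }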
+28 -12
arch/x86/kernel/head_64.S
··· 73 73 /* Sanitize CPU configuration */ 74 74 call verify_cpu 75 75 76 + /* 77 + * Perform pagetable fixups. Additionally, if SME is active, encrypt 78 + * the kernel and retrieve the modifier (SME encryption mask if SME 79 + * is active) to be added to the initial pgdir entry that will be 80 + * programmed into CR3. 81 + */ 76 82 leaq _text(%rip), %rdi 77 83 pushq %rsi 78 84 call __startup_64 79 85 popq %rsi 80 86 81 - movq $(early_top_pgt - __START_KERNEL_map), %rax 87 + /* Form the CR3 value being sure to include the CR3 modifier */ 88 + addq $(early_top_pgt - __START_KERNEL_map), %rax 82 89 jmp 1f 83 90 ENTRY(secondary_startup_64) 84 91 /* ··· 105 98 /* Sanitize CPU configuration */ 106 99 call verify_cpu 107 100 108 - movq $(init_top_pgt - __START_KERNEL_map), %rax 101 + /* 102 + * Retrieve the modifier (SME encryption mask if SME is active) to be 103 + * added to the initial pgdir entry that will be programmed into CR3. 104 + */ 105 + pushq %rsi 106 + call __startup_secondary_64 107 + popq %rsi 108 + 109 + /* Form the CR3 value being sure to include the CR3 modifier */ 110 + addq $(init_top_pgt - __START_KERNEL_map), %rax 109 111 1: 110 112 111 113 /* Enable PAE mode, PGE and LA57 */ ··· 351 335 NEXT_PAGE(early_top_pgt) 352 336 .fill 511,8,0 353 337 #ifdef CONFIG_X86_5LEVEL 354 - .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE 338 + .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC 355 339 #else 356 - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE 340 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC 357 341 #endif 358 342 359 343 NEXT_PAGE(early_dynamic_pgts) ··· 366 350 .fill 512,8,0 367 351 #else 368 352 NEXT_PAGE(init_top_pgt) 369 - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 353 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC 370 354 .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 371 - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 355 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC 372 356 .org init_top_pgt + PGD_START_KERNEL*8, 0 373 357 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ 374 - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE 358 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC 375 359 376 360 NEXT_PAGE(level3_ident_pgt) 377 - .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 361 + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC 378 362 .fill 511, 8, 0 379 363 NEXT_PAGE(level2_ident_pgt) 380 364 /* Since I easily can, map the first 1G. 
··· 386 370 #ifdef CONFIG_X86_5LEVEL 387 371 NEXT_PAGE(level4_kernel_pgt) 388 372 .fill 511,8,0 389 - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE 373 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC 390 374 #endif 391 375 392 376 NEXT_PAGE(level3_kernel_pgt) 393 377 .fill L3_START_KERNEL,8,0 394 378 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ 395 - .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE 396 - .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE 379 + .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC 380 + .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC 397 381 398 382 NEXT_PAGE(level2_kernel_pgt) 399 383 /* ··· 411 395 412 396 NEXT_PAGE(level2_fixmap_pgt) 413 397 .fill 506,8,0 414 - .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE 398 + .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC 415 399 /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ 416 400 .fill 5,8,0 417 401
+11 -23
arch/x86/kernel/kdebugfs.c
··· 33 33 struct setup_data_node *node = file->private_data; 34 34 unsigned long remain; 35 35 loff_t pos = *ppos; 36 - struct page *pg; 37 36 void *p; 38 37 u64 pa; 39 38 ··· 46 47 count = node->len - pos; 47 48 48 49 pa = node->paddr + sizeof(struct setup_data) + pos; 49 - pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT); 50 - if (PageHighMem(pg)) { 51 - p = ioremap_cache(pa, count); 52 - if (!p) 53 - return -ENXIO; 54 - } else 55 - p = __va(pa); 50 + p = memremap(pa, count, MEMREMAP_WB); 51 + if (!p) 52 + return -ENOMEM; 56 53 57 54 remain = copy_to_user(user_buf, p, count); 58 55 59 - if (PageHighMem(pg)) 60 - iounmap(p); 56 + memunmap(p); 61 57 62 58 if (remain) 63 59 return -EFAULT; ··· 103 109 struct setup_data *data; 104 110 int error; 105 111 struct dentry *d; 106 - struct page *pg; 107 112 u64 pa_data; 108 113 int no = 0; 109 114 ··· 119 126 goto err_dir; 120 127 } 121 128 122 - pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT); 123 - if (PageHighMem(pg)) { 124 - data = ioremap_cache(pa_data, sizeof(*data)); 125 - if (!data) { 126 - kfree(node); 127 - error = -ENXIO; 128 - goto err_dir; 129 - } 130 - } else 131 - data = __va(pa_data); 129 + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); 130 + if (!data) { 131 + kfree(node); 132 + error = -ENOMEM; 133 + goto err_dir; 134 + } 132 135 133 136 node->paddr = pa_data; 134 137 node->type = data->type; ··· 132 143 error = create_setup_data_node(d, no, node); 133 144 pa_data = data->next; 134 145 135 - if (PageHighMem(pg)) 136 - iounmap(data); 146 + memunmap(data); 137 147 if (error) 138 148 goto err_dir; 139 149 no++;
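The conversion above replaces the open-coded PageHighMem()/ioremap_cache() special case with memremap(), which handles RAM and non-RAM uniformly and, with the SME changes later in this series, applies the correct encryption attribute for boot data. A minimal sketch of the resulting access pattern (the helper is hypothetical, not part of the patch):

    #include <linux/io.h>            /* memremap(), memunmap() */
    #include <asm/setup.h>           /* struct setup_data */

    /* Sketch: read the type field of one setup_data blob at @paddr. */
    static int peek_setup_data_type(u64 paddr, u32 *type)
    {
            struct setup_data *data;

            data = memremap(paddr, sizeof(*data), MEMREMAP_WB);
            if (!data)
                    return -ENOMEM;

            *type = data->type;
            memunmap(data);
            return 0;
    }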
+14 -14
arch/x86/kernel/ksysfs.c
··· 16 16 #include <linux/stat.h> 17 17 #include <linux/slab.h> 18 18 #include <linux/mm.h> 19 + #include <linux/io.h> 19 20 20 - #include <asm/io.h> 21 21 #include <asm/setup.h> 22 22 23 23 static ssize_t version_show(struct kobject *kobj, ··· 79 79 *paddr = pa_data; 80 80 return 0; 81 81 } 82 - data = ioremap_cache(pa_data, sizeof(*data)); 82 + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); 83 83 if (!data) 84 84 return -ENOMEM; 85 85 86 86 pa_data = data->next; 87 - iounmap(data); 87 + memunmap(data); 88 88 i++; 89 89 } 90 90 return -EINVAL; ··· 97 97 u64 pa_data = boot_params.hdr.setup_data; 98 98 99 99 while (pa_data) { 100 - data = ioremap_cache(pa_data, sizeof(*data)); 100 + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); 101 101 if (!data) 102 102 return -ENOMEM; 103 103 if (nr == i) { 104 104 *size = data->len; 105 - iounmap(data); 105 + memunmap(data); 106 106 return 0; 107 107 } 108 108 109 109 pa_data = data->next; 110 - iounmap(data); 110 + memunmap(data); 111 111 i++; 112 112 } 113 113 return -EINVAL; ··· 127 127 ret = get_setup_data_paddr(nr, &paddr); 128 128 if (ret) 129 129 return ret; 130 - data = ioremap_cache(paddr, sizeof(*data)); 130 + data = memremap(paddr, sizeof(*data), MEMREMAP_WB); 131 131 if (!data) 132 132 return -ENOMEM; 133 133 134 134 ret = sprintf(buf, "0x%x\n", data->type); 135 - iounmap(data); 135 + memunmap(data); 136 136 return ret; 137 137 } 138 138 ··· 154 154 ret = get_setup_data_paddr(nr, &paddr); 155 155 if (ret) 156 156 return ret; 157 - data = ioremap_cache(paddr, sizeof(*data)); 157 + data = memremap(paddr, sizeof(*data), MEMREMAP_WB); 158 158 if (!data) 159 159 return -ENOMEM; 160 160 ··· 170 170 goto out; 171 171 172 172 ret = count; 173 - p = ioremap_cache(paddr + sizeof(*data), data->len); 173 + p = memremap(paddr + sizeof(*data), data->len, MEMREMAP_WB); 174 174 if (!p) { 175 175 ret = -ENOMEM; 176 176 goto out; 177 177 } 178 178 memcpy(buf, p + off, count); 179 - iounmap(p); 179 + memunmap(p); 180 180 out: 181 - iounmap(data); 181 + memunmap(data); 182 182 return ret; 183 183 } 184 184 ··· 250 250 *nr = 0; 251 251 while (pa_data) { 252 252 *nr += 1; 253 - data = ioremap_cache(pa_data, sizeof(*data)); 253 + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); 254 254 if (!data) { 255 255 ret = -ENOMEM; 256 256 goto out; 257 257 } 258 258 pa_data = data->next; 259 - iounmap(data); 259 + memunmap(data); 260 260 } 261 261 262 262 out:
+23 -2
arch/x86/kernel/machine_kexec_64.c
··· 87 87 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); 88 88 } 89 89 pte = pte_offset_kernel(pmd, vaddr); 90 - set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); 90 + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC)); 91 91 return 0; 92 92 err: 93 93 free_transition_pgtable(image); ··· 115 115 .alloc_pgt_page = alloc_pgt_page, 116 116 .context = image, 117 117 .page_flag = __PAGE_KERNEL_LARGE_EXEC, 118 + .kernpg_flag = _KERNPG_TABLE_NOENC, 118 119 }; 119 120 unsigned long mstart, mend; 120 121 pgd_t *level4p; ··· 335 334 image->start = relocate_kernel((unsigned long)image->head, 336 335 (unsigned long)page_list, 337 336 image->start, 338 - image->preserve_context); 337 + image->preserve_context, 338 + sme_active()); 339 339 340 340 #ifdef CONFIG_KEXEC_JUMP 341 341 if (image->preserve_context) ··· 603 601 void arch_kexec_unprotect_crashkres(void) 604 602 { 605 603 kexec_mark_crashkres(false); 604 + } 605 + 606 + int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) 607 + { 608 + /* 609 + * If SME is active we need to be sure that kexec pages are 610 + * not encrypted because when we boot to the new kernel the 611 + * pages won't be accessed encrypted (initially). 612 + */ 613 + return set_memory_decrypted((unsigned long)vaddr, pages); 614 + } 615 + 616 + void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) 617 + { 618 + /* 619 + * If SME is active we need to reset the pages back to being 620 + * an encrypted mapping before freeing them. 621 + */ 622 + set_memory_encrypted((unsigned long)vaddr, pages); 606 623 }
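The new arch_kexec_post_alloc_pages()/arch_kexec_pre_free_pages() hooks show the general pattern for memory that must be readable without the encryption bit while SME is active: clear the C bit in the kernel mapping for the page's lifetime, and restore it before the page goes back to the allocator. A hedged sketch of that pairing for an ordinary page (helper names are hypothetical; the real users are the kexec hooks above):

    #include <linux/gfp.h>
    #include <asm/set_memory.h>      /* set_memory_decrypted()/set_memory_encrypted() */

    /* Sketch: one page that another agent must read unencrypted. */
    static void *alloc_shared_page(void)
    {
            unsigned long addr = get_zeroed_page(GFP_KERNEL);

            if (!addr)
                    return NULL;

            if (set_memory_decrypted(addr, 1)) {
                    free_page(addr);
                    return NULL;
            }
            return (void *)addr;
    }

    static void free_shared_page(void *vaddr)
    {
            /* Re-encrypt before the allocator can reuse the page. */
            set_memory_encrypted((unsigned long)vaddr, 1);
            free_page((unsigned long)vaddr);
    }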
+75 -33
arch/x86/kernel/mpparse.c
··· 429 429 } 430 430 } 431 431 432 - static struct mpf_intel *mpf_found; 432 + static unsigned long mpf_base; 433 433 434 434 static unsigned long __init get_mpc_size(unsigned long physptr) 435 435 { 436 436 struct mpc_table *mpc; 437 437 unsigned long size; 438 438 439 - mpc = early_ioremap(physptr, PAGE_SIZE); 439 + mpc = early_memremap(physptr, PAGE_SIZE); 440 440 size = mpc->length; 441 - early_iounmap(mpc, PAGE_SIZE); 441 + early_memunmap(mpc, PAGE_SIZE); 442 442 apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); 443 443 444 444 return size; ··· 450 450 unsigned long size; 451 451 452 452 size = get_mpc_size(mpf->physptr); 453 - mpc = early_ioremap(mpf->physptr, size); 453 + mpc = early_memremap(mpf->physptr, size); 454 + 454 455 /* 455 456 * Read the physical hardware table. Anything here will 456 457 * override the defaults. ··· 462 461 #endif 463 462 pr_err("BIOS bug, MP table errors detected!...\n"); 464 463 pr_cont("... disabling SMP support. (tell your hw vendor)\n"); 465 - early_iounmap(mpc, size); 464 + early_memunmap(mpc, size); 466 465 return -1; 467 466 } 468 - early_iounmap(mpc, size); 467 + early_memunmap(mpc, size); 469 468 470 469 if (early) 471 470 return -1; ··· 498 497 */ 499 498 void __init default_get_smp_config(unsigned int early) 500 499 { 501 - struct mpf_intel *mpf = mpf_found; 500 + struct mpf_intel *mpf; 502 501 503 502 if (!smp_found_config) 504 503 return; 505 504 506 - if (!mpf) 505 + if (!mpf_base) 507 506 return; 508 507 509 508 if (acpi_lapic && early) ··· 515 514 */ 516 515 if (acpi_lapic && acpi_ioapic) 517 516 return; 517 + 518 + mpf = early_memremap(mpf_base, sizeof(*mpf)); 519 + if (!mpf) { 520 + pr_err("MPTABLE: error mapping MP table\n"); 521 + return; 522 + } 518 523 519 524 pr_info("Intel MultiProcessor Specification v1.%d\n", 520 525 mpf->specification); ··· 536 529 /* 537 530 * Now see if we need to read further. 538 531 */ 539 - if (mpf->feature1 != 0) { 532 + if (mpf->feature1) { 540 533 if (early) { 541 534 /* 542 535 * local APIC has default address ··· 549 542 construct_default_ISA_mptable(mpf->feature1); 550 543 551 544 } else if (mpf->physptr) { 552 - if (check_physptr(mpf, early)) 545 + if (check_physptr(mpf, early)) { 546 + early_memunmap(mpf, sizeof(*mpf)); 553 547 return; 548 + } 554 549 } else 555 550 BUG(); 556 551 ··· 561 552 /* 562 553 * Only use the first configuration found. 
563 554 */ 555 + 556 + early_memunmap(mpf, sizeof(*mpf)); 564 557 } 565 558 566 559 static void __init smp_reserve_memory(struct mpf_intel *mpf) ··· 572 561 573 562 static int __init smp_scan_config(unsigned long base, unsigned long length) 574 563 { 575 - unsigned int *bp = phys_to_virt(base); 564 + unsigned int *bp; 576 565 struct mpf_intel *mpf; 577 - unsigned long mem; 566 + int ret = 0; 578 567 579 568 apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n", 580 569 base, base + length - 1); 581 570 BUILD_BUG_ON(sizeof(*mpf) != 16); 582 571 583 572 while (length > 0) { 573 + bp = early_memremap(base, length); 584 574 mpf = (struct mpf_intel *)bp; 585 575 if ((*bp == SMP_MAGIC_IDENT) && 586 576 (mpf->length == 1) && ··· 591 579 #ifdef CONFIG_X86_LOCAL_APIC 592 580 smp_found_config = 1; 593 581 #endif 594 - mpf_found = mpf; 582 + mpf_base = base; 595 583 596 - pr_info("found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n", 597 - (unsigned long long) virt_to_phys(mpf), 598 - (unsigned long long) virt_to_phys(mpf) + 599 - sizeof(*mpf) - 1, mpf); 584 + pr_info("found SMP MP-table at [mem %#010lx-%#010lx] mapped at [%p]\n", 585 + base, base + sizeof(*mpf) - 1, mpf); 600 586 601 - mem = virt_to_phys(mpf); 602 - memblock_reserve(mem, sizeof(*mpf)); 587 + memblock_reserve(base, sizeof(*mpf)); 603 588 if (mpf->physptr) 604 589 smp_reserve_memory(mpf); 605 590 606 - return 1; 591 + ret = 1; 607 592 } 608 - bp += 4; 593 + early_memunmap(bp, length); 594 + 595 + if (ret) 596 + break; 597 + 598 + base += 16; 609 599 length -= 16; 610 600 } 611 - return 0; 601 + return ret; 612 602 } 613 603 614 604 void __init default_find_smp_config(void) ··· 852 838 char oem[10]; 853 839 struct mpf_intel *mpf; 854 840 struct mpc_table *mpc, *mpc_new; 841 + unsigned long size; 855 842 856 843 if (!enable_update_mptable) 857 844 return 0; 858 845 859 - mpf = mpf_found; 860 - if (!mpf) 846 + if (!mpf_base) 861 847 return 0; 848 + 849 + mpf = early_memremap(mpf_base, sizeof(*mpf)); 850 + if (!mpf) { 851 + pr_err("MPTABLE: mpf early_memremap() failed\n"); 852 + return 0; 853 + } 862 854 863 855 /* 864 856 * Now see if we need to go further. 
865 857 */ 866 - if (mpf->feature1 != 0) 867 - return 0; 858 + if (mpf->feature1) 859 + goto do_unmap_mpf; 868 860 869 861 if (!mpf->physptr) 870 - return 0; 862 + goto do_unmap_mpf; 871 863 872 - mpc = phys_to_virt(mpf->physptr); 864 + size = get_mpc_size(mpf->physptr); 865 + mpc = early_memremap(mpf->physptr, size); 866 + if (!mpc) { 867 + pr_err("MPTABLE: mpc early_memremap() failed\n"); 868 + goto do_unmap_mpf; 869 + } 873 870 874 871 if (!smp_check_mpc(mpc, oem, str)) 875 - return 0; 872 + goto do_unmap_mpc; 876 873 877 - pr_info("mpf: %llx\n", (u64)virt_to_phys(mpf)); 874 + pr_info("mpf: %llx\n", (u64)mpf_base); 878 875 pr_info("physptr: %x\n", mpf->physptr); 879 876 880 877 if (mpc_new_phys && mpc->length > mpc_new_length) { ··· 903 878 new = mpf_checksum((unsigned char *)mpc, mpc->length); 904 879 if (old == new) { 905 880 pr_info("mpc is readonly, please try alloc_mptable instead\n"); 906 - return 0; 881 + goto do_unmap_mpc; 907 882 } 908 883 pr_info("use in-position replacing\n"); 909 884 } else { 885 + mpc_new = early_memremap(mpc_new_phys, mpc_new_length); 886 + if (!mpc_new) { 887 + pr_err("MPTABLE: new mpc early_memremap() failed\n"); 888 + goto do_unmap_mpc; 889 + } 910 890 mpf->physptr = mpc_new_phys; 911 - mpc_new = phys_to_virt(mpc_new_phys); 912 891 memcpy(mpc_new, mpc, mpc->length); 892 + early_memunmap(mpc, size); 913 893 mpc = mpc_new; 894 + size = mpc_new_length; 914 895 /* check if we can modify that */ 915 896 if (mpc_new_phys - mpf->physptr) { 916 897 struct mpf_intel *mpf_new; 917 898 /* steal 16 bytes from [0, 1k) */ 899 + mpf_new = early_memremap(0x400 - 16, sizeof(*mpf_new)); 900 + if (!mpf_new) { 901 + pr_err("MPTABLE: new mpf early_memremap() failed\n"); 902 + goto do_unmap_mpc; 903 + } 918 904 pr_info("mpf new: %x\n", 0x400 - 16); 919 - mpf_new = phys_to_virt(0x400 - 16); 920 905 memcpy(mpf_new, mpf, 16); 906 + early_memunmap(mpf, sizeof(*mpf)); 921 907 mpf = mpf_new; 922 908 mpf->physptr = mpc_new_phys; 923 909 } ··· 944 908 * may need pci=routeirq for all coverage 945 909 */ 946 910 replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length); 911 + 912 + do_unmap_mpc: 913 + early_memunmap(mpc, size); 914 + 915 + do_unmap_mpf: 916 + early_memunmap(mpf, sizeof(*mpf)); 947 917 948 918 return 0; 949 919 }
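The MP-table rework follows the same theme: firmware-owned structures are no longer touched through phys_to_virt(), but through short-lived early_memremap()/early_memunmap() pairs, which work before the normal ioremap machinery is up and, with SME, give the mapping the correct encryption attribute. The pattern reduced to a sketch (helper name invented; the early_memremap declarations are assumed to come in via the generic early-ioremap support):

    #include <linux/init.h>
    #include <linux/string.h>

    /* Sketch: copy @len bytes of a firmware structure out of @phys. */
    static void __init copy_firmware_blob(unsigned long phys, void *dst, size_t len)
    {
            void *p = early_memremap(phys, len);

            if (!p)
                    return;

            memcpy(dst, p, len);
            early_memunmap(p, len);
    }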
+7 -4
arch/x86/kernel/pci-dma.c
··· 93 93 if (gfpflags_allow_blocking(flag)) { 94 94 page = dma_alloc_from_contiguous(dev, count, get_order(size), 95 95 flag); 96 - if (page && page_to_phys(page) + size > dma_mask) { 97 - dma_release_from_contiguous(dev, page, count); 98 - page = NULL; 96 + if (page) { 97 + addr = phys_to_dma(dev, page_to_phys(page)); 98 + if (addr + size > dma_mask) { 99 + dma_release_from_contiguous(dev, page, count); 100 + page = NULL; 101 + } 99 102 } 100 103 } 101 104 /* fallback */ ··· 107 104 if (!page) 108 105 return NULL; 109 106 110 - addr = page_to_phys(page); 107 + addr = phys_to_dma(dev, page_to_phys(page)); 111 108 if (addr + size > dma_mask) { 112 109 __free_pages(page, get_order(size)); 113 110
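With SME, the address a device must use to reach encrypted system RAM carries the C bit, so the generic DMA paths now translate with phys_to_dma() before comparing against the device's dma_mask. The ordering matters: a physical address can fit the mask while the encrypted DMA address does not, which is also why SME forces swiotlb on (see the pci-swiotlb.c hunk below). A stand-alone illustration with invented values:

    #include <stdio.h>
    #include <stdint.h>

    #define SME_ME_MASK  (1ULL << 47)        /* example C-bit position */
    #define DMA_MASK_32  0xffffffffULL       /* a 32-bit-only device */

    /* Conceptually what phys_to_dma() adds under SME. */
    static uint64_t phys_to_dma_sketch(uint64_t paddr)
    {
            return paddr | SME_ME_MASK;
    }

    int main(void)
    {
            uint64_t paddr = 0x12345000ULL;              /* fits in 32 bits... */
            uint64_t daddr = phys_to_dma_sketch(paddr);  /* ...but this does not */

            printf("paddr within device mask: %s\n", paddr <= DMA_MASK_32 ? "yes" : "no");
            printf("daddr within device mask: %s\n", daddr <= DMA_MASK_32 ? "yes" : "no");
            return 0;
    }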
+1 -1
arch/x86/kernel/pci-nommu.c
··· 32 32 enum dma_data_direction dir, 33 33 unsigned long attrs) 34 34 { 35 - dma_addr_t bus = page_to_phys(page) + offset; 35 + dma_addr_t bus = phys_to_dma(dev, page_to_phys(page)) + offset; 36 36 WARN_ON(size == 0); 37 37 if (!check_addr("map_single", dev, bus, size)) 38 38 return NOMMU_MAPPING_ERROR;
+13 -2
arch/x86/kernel/pci-swiotlb.c
··· 6 6 #include <linux/swiotlb.h> 7 7 #include <linux/bootmem.h> 8 8 #include <linux/dma-mapping.h> 9 + #include <linux/mem_encrypt.h> 9 10 10 11 #include <asm/iommu.h> 11 12 #include <asm/swiotlb.h> 12 13 #include <asm/dma.h> 13 14 #include <asm/xen/swiotlb-xen.h> 14 15 #include <asm/iommu_table.h> 16 + 15 17 int swiotlb __read_mostly; 16 18 17 19 void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, ··· 81 79 pci_swiotlb_late_init); 82 80 83 81 /* 84 - * if 4GB or more detected (and iommu=off not set) return 1 85 - * and set swiotlb to 1. 82 + * If 4GB or more detected (and iommu=off not set) or if SME is active 83 + * then set swiotlb to 1 and return 1. 86 84 */ 87 85 int __init pci_swiotlb_detect_4gb(void) 88 86 { ··· 91 89 if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN) 92 90 swiotlb = 1; 93 91 #endif 92 + 93 + /* 94 + * If SME is active then swiotlb will be set to 1 so that bounce 95 + * buffers are allocated and used for devices that do not support 96 + * the addressing range required for the encryption mask. 97 + */ 98 + if (sme_active()) 99 + swiotlb = 1; 100 + 94 101 return swiotlb; 95 102 } 96 103 IOMMU_INIT(pci_swiotlb_detect_4gb,
+15 -2
arch/x86/kernel/process.c
··· 355 355 return ret; 356 356 } 357 357 #endif 358 + 358 359 void stop_this_cpu(void *dummy) 359 360 { 360 361 local_irq_disable(); ··· 366 365 disable_local_APIC(); 367 366 mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); 368 367 369 - for (;;) 370 - halt(); 368 + for (;;) { 369 + /* 370 + * Use wbinvd followed by hlt to stop the processor. This 371 + * provides support for kexec on a processor that supports 372 + * SME. With kexec, going from SME inactive to SME active 373 + * requires clearing cache entries so that addresses without 374 + * the encryption bit set don't corrupt the same physical 375 + * address that has the encryption bit set when caches are 376 + * flushed. To achieve this a wbinvd is performed followed by 377 + * a hlt. Even if the processor is not in the kexec/SME 378 + * scenario this only adds a wbinvd to a halting processor. 379 + */ 380 + asm volatile("wbinvd; hlt" : : : "memory"); 381 + } 371 382 } 372 383 373 384 /*
+14
arch/x86/kernel/relocate_kernel_64.S
··· 47 47 * %rsi page_list 48 48 * %rdx start address 49 49 * %rcx preserve_context 50 + * %r8 sme_active 50 51 */ 51 52 52 53 /* Save the CPU context, used for jumping back */ ··· 71 70 /* zero out flags, and disable interrupts */ 72 71 pushq $0 73 72 popfq 73 + 74 + /* Save SME active flag */ 75 + movq %r8, %r12 74 76 75 77 /* 76 78 * get physical address of control page now ··· 135 131 136 132 /* Flush the TLB (needed?) */ 137 133 movq %r9, %cr3 134 + 135 + /* 136 + * If SME is active, there could be old encrypted cache line 137 + * entries that will conflict with the now unencrypted memory 138 + * used by kexec. Flush the caches before copying the kernel. 139 + */ 140 + testq %r12, %r12 141 + jz 1f 142 + wbinvd 143 + 1: 138 144 139 145 movq %rcx, %r11 140 146 call swap_pages
+9
arch/x86/kernel/setup.c
··· 69 69 #include <linux/crash_dump.h> 70 70 #include <linux/tboot.h> 71 71 #include <linux/jiffies.h> 72 + #include <linux/mem_encrypt.h> 72 73 73 74 #include <linux/usb/xhci-dbgp.h> 74 75 #include <video/edid.h> ··· 375 374 if (!boot_params.hdr.type_of_loader || 376 375 !ramdisk_image || !ramdisk_size) 377 376 return; /* No initrd provided by bootloader */ 377 + 378 + /* 379 + * If SME is active, this memory will be marked encrypted by the 380 + * kernel when it is accessed (including relocation). However, the 381 + * ramdisk image was loaded decrypted by the bootloader, so make 382 + * sure that it is encrypted before accessing it. 383 + */ 384 + sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image); 378 385 379 386 initrd_start = 0; 380 387
+26 -4
arch/x86/kernel/sys_x86_64.c
··· 21 21 #include <asm/compat.h> 22 22 #include <asm/ia32.h> 23 23 #include <asm/syscalls.h> 24 + #include <asm/mpx.h> 24 25 25 26 /* 26 27 * Align a virtual address to avoid aliasing in the I$ on AMD F15h. ··· 101 100 return error; 102 101 } 103 102 104 - static void find_start_end(unsigned long flags, unsigned long *begin, 105 - unsigned long *end) 103 + static void find_start_end(unsigned long addr, unsigned long flags, 104 + unsigned long *begin, unsigned long *end) 106 105 { 107 106 if (!in_compat_syscall() && (flags & MAP_32BIT)) { 108 107 /* This is usually used needed to map code in small ··· 121 120 } 122 121 123 122 *begin = get_mmap_base(1); 124 - *end = in_compat_syscall() ? tasksize_32bit() : tasksize_64bit(); 123 + if (in_compat_syscall()) 124 + *end = task_size_32bit(); 125 + else 126 + *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW); 125 127 } 126 128 127 129 unsigned long ··· 136 132 struct vm_unmapped_area_info info; 137 133 unsigned long begin, end; 138 134 135 + addr = mpx_unmapped_area_check(addr, len, flags); 136 + if (IS_ERR_VALUE(addr)) 137 + return addr; 138 + 139 139 if (flags & MAP_FIXED) 140 140 return addr; 141 141 142 - find_start_end(flags, &begin, &end); 142 + find_start_end(addr, flags, &begin, &end); 143 143 144 144 if (len > end) 145 145 return -ENOMEM; ··· 179 171 unsigned long addr = addr0; 180 172 struct vm_unmapped_area_info info; 181 173 174 + addr = mpx_unmapped_area_check(addr, len, flags); 175 + if (IS_ERR_VALUE(addr)) 176 + return addr; 177 + 182 178 /* requested length too big for entire address space */ 183 179 if (len > TASK_SIZE) 184 180 return -ENOMEM; ··· 207 195 info.length = len; 208 196 info.low_limit = PAGE_SIZE; 209 197 info.high_limit = get_mmap_base(0); 198 + 199 + /* 200 + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area 201 + * in the full address space. 202 + * 203 + * !in_compat_syscall() check to avoid high addresses for x32. 204 + */ 205 + if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) 206 + info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; 207 + 210 208 info.align_mask = 0; 211 209 info.align_offset = pgoff << PAGE_SHIFT; 212 210 if (filp) {
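The addr > DEFAULT_MAP_WINDOW checks implement the userspace opt-in for the larger 56-bit address space: by default mmap() keeps returning addresses below the old 47-bit limit so software that stores tags in upper pointer bits keeps working, and only a hint above the window widens the search. A small user-space sketch of opting in (it only has an effect on a CONFIG_X86_5LEVEL kernel running on LA57-capable hardware, and since this is only a hint the kernel may still place the mapping lower):

    #include <stdio.h>
    #include <sys/mman.h>

    int main(void)
    {
            void *hint = (void *)(1UL << 48);    /* above the 47-bit window */
            void *p = mmap(hint, 4096, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (p == MAP_FAILED) {
                    perror("mmap");
                    return 1;
            }
            printf("mapped at %p\n", p);
            return 0;
    }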
+34 -7
arch/x86/kvm/mmu.c
··· 108 108 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) 109 109 110 110 111 - #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) 111 + #define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))) 112 112 #define PT64_DIR_BASE_ADDR_MASK \ 113 113 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) 114 114 #define PT64_LVL_ADDR_MASK(level) \ ··· 126 126 * PT32_LEVEL_BITS))) - 1)) 127 127 128 128 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ 129 - | shadow_x_mask | shadow_nx_mask) 129 + | shadow_x_mask | shadow_nx_mask | shadow_me_mask) 130 130 131 131 #define ACC_EXEC_MASK 1 132 132 #define ACC_WRITE_MASK PT_WRITABLE_MASK ··· 186 186 static u64 __read_mostly shadow_mmio_mask; 187 187 static u64 __read_mostly shadow_mmio_value; 188 188 static u64 __read_mostly shadow_present_mask; 189 + static u64 __read_mostly shadow_me_mask; 189 190 190 191 /* 191 192 * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value. ··· 350 349 */ 351 350 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 352 351 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, 353 - u64 acc_track_mask) 352 + u64 acc_track_mask, u64 me_mask) 354 353 { 355 354 BUG_ON(!dirty_mask != !accessed_mask); 356 355 BUG_ON(!accessed_mask && !acc_track_mask); ··· 363 362 shadow_x_mask = x_mask; 364 363 shadow_present_mask = p_mask; 365 364 shadow_acc_track_mask = acc_track_mask; 365 + shadow_me_mask = me_mask; 366 366 } 367 367 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); 368 368 ··· 2435 2433 BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); 2436 2434 2437 2435 spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK | 2438 - shadow_user_mask | shadow_x_mask; 2436 + shadow_user_mask | shadow_x_mask | shadow_me_mask; 2439 2437 2440 2438 if (sp_ad_disabled(sp)) 2441 2439 spte |= shadow_acc_track_value; ··· 2747 2745 pte_access &= ~ACC_WRITE_MASK; 2748 2746 2749 2747 spte |= (u64)pfn << PAGE_SHIFT; 2748 + spte |= shadow_me_mask; 2750 2749 2751 2750 if (pte_access & ACC_WRITE_MASK) { 2752 2751 ··· 4109 4106 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) 4110 4107 { 4111 4108 bool uses_nx = context->nx || context->base_role.smep_andnot_wp; 4109 + struct rsvd_bits_validate *shadow_zero_check; 4110 + int i; 4112 4111 4113 4112 /* 4114 4113 * Passing "true" to the last argument is okay; it adds a check 4115 4114 * on bit 8 of the SPTEs which KVM doesn't use anyway. 
4116 4115 */ 4117 - __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, 4116 + shadow_zero_check = &context->shadow_zero_check; 4117 + __reset_rsvds_bits_mask(vcpu, shadow_zero_check, 4118 4118 boot_cpu_data.x86_phys_bits, 4119 4119 context->shadow_root_level, uses_nx, 4120 4120 guest_cpuid_has_gbpages(vcpu), is_pse(vcpu), 4121 4121 true); 4122 + 4123 + if (!shadow_me_mask) 4124 + return; 4125 + 4126 + for (i = context->shadow_root_level; --i >= 0;) { 4127 + shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; 4128 + shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; 4129 + } 4130 + 4122 4131 } 4123 4132 EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask); 4124 4133 ··· 4148 4133 reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, 4149 4134 struct kvm_mmu *context) 4150 4135 { 4136 + struct rsvd_bits_validate *shadow_zero_check; 4137 + int i; 4138 + 4139 + shadow_zero_check = &context->shadow_zero_check; 4140 + 4151 4141 if (boot_cpu_is_amd()) 4152 - __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, 4142 + __reset_rsvds_bits_mask(vcpu, shadow_zero_check, 4153 4143 boot_cpu_data.x86_phys_bits, 4154 4144 context->shadow_root_level, false, 4155 4145 boot_cpu_has(X86_FEATURE_GBPAGES), 4156 4146 true, true); 4157 4147 else 4158 - __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, 4148 + __reset_rsvds_bits_mask_ept(shadow_zero_check, 4159 4149 boot_cpu_data.x86_phys_bits, 4160 4150 false); 4161 4151 4152 + if (!shadow_me_mask) 4153 + return; 4154 + 4155 + for (i = context->shadow_root_level; --i >= 0;) { 4156 + shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; 4157 + shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; 4158 + } 4162 4159 } 4163 4160 4164 4161 /*
+18 -17
arch/x86/kvm/svm.c
··· 1167 1167 { 1168 1168 struct vmcb *vmcb = svm->vmcb; 1169 1169 struct kvm_arch *vm_data = &svm->vcpu.kvm->arch; 1170 - phys_addr_t bpa = page_to_phys(svm->avic_backing_page); 1171 - phys_addr_t lpa = page_to_phys(vm_data->avic_logical_id_table_page); 1172 - phys_addr_t ppa = page_to_phys(vm_data->avic_physical_id_table_page); 1170 + phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page)); 1171 + phys_addr_t lpa = __sme_set(page_to_phys(vm_data->avic_logical_id_table_page)); 1172 + phys_addr_t ppa = __sme_set(page_to_phys(vm_data->avic_physical_id_table_page)); 1173 1173 1174 1174 vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; 1175 1175 vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; ··· 1232 1232 set_intercept(svm, INTERCEPT_MWAIT); 1233 1233 } 1234 1234 1235 - control->iopm_base_pa = iopm_base; 1236 - control->msrpm_base_pa = __pa(svm->msrpm); 1235 + control->iopm_base_pa = __sme_set(iopm_base); 1236 + control->msrpm_base_pa = __sme_set(__pa(svm->msrpm)); 1237 1237 control->int_ctl = V_INTR_MASKING_MASK; 1238 1238 1239 1239 init_seg(&save->es); ··· 1377 1377 return -EINVAL; 1378 1378 1379 1379 new_entry = READ_ONCE(*entry); 1380 - new_entry = (page_to_phys(svm->avic_backing_page) & 1381 - AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | 1382 - AVIC_PHYSICAL_ID_ENTRY_VALID_MASK; 1380 + new_entry = __sme_set((page_to_phys(svm->avic_backing_page) & 1381 + AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | 1382 + AVIC_PHYSICAL_ID_ENTRY_VALID_MASK); 1383 1383 WRITE_ONCE(*entry, new_entry); 1384 1384 1385 1385 svm->avic_physical_id_cache = entry; ··· 1647 1647 1648 1648 svm->vmcb = page_address(page); 1649 1649 clear_page(svm->vmcb); 1650 - svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; 1650 + svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT); 1651 1651 svm->asid_generation = 0; 1652 1652 init_vmcb(svm); 1653 1653 ··· 1675 1675 { 1676 1676 struct vcpu_svm *svm = to_svm(vcpu); 1677 1677 1678 - __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); 1678 + __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT)); 1679 1679 __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); 1680 1680 __free_page(virt_to_page(svm->nested.hsave)); 1681 1681 __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); ··· 2330 2330 u64 pdpte; 2331 2331 int ret; 2332 2332 2333 - ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte, 2333 + ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte, 2334 2334 offset_in_page(cr3) + index * 8, 8); 2335 2335 if (ret) 2336 2336 return 0; ··· 2342 2342 { 2343 2343 struct vcpu_svm *svm = to_svm(vcpu); 2344 2344 2345 - svm->vmcb->control.nested_cr3 = root; 2345 + svm->vmcb->control.nested_cr3 = __sme_set(root); 2346 2346 mark_dirty(svm->vmcb, VMCB_NPT); 2347 2347 svm_flush_tlb(vcpu); 2348 2348 } ··· 2873 2873 svm->nested.msrpm[p] = svm->msrpm[p] | value; 2874 2874 } 2875 2875 2876 - svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm); 2876 + svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm)); 2877 2877 2878 2878 return true; 2879 2879 } ··· 4506 4506 pr_debug("SVM: %s: use GA mode for irq %u\n", __func__, 4507 4507 irq.vector); 4508 4508 *svm = to_svm(vcpu); 4509 - vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page); 4509 + vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page)); 4510 4510 vcpu_info->vector = irq.vector; 4511 4511 4512 4512 return 0; ··· 4557 4557 struct amd_iommu_pi_data pi; 4558 4558 4559 4559 /* Try to enable guest_mode in IRTE */ 4560 
- pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK; 4560 + pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & 4561 + AVIC_HPA_MASK); 4561 4562 pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id, 4562 4563 svm->vcpu.vcpu_id); 4563 4564 pi.is_guest_mode = true; ··· 5007 5006 { 5008 5007 struct vcpu_svm *svm = to_svm(vcpu); 5009 5008 5010 - svm->vmcb->save.cr3 = root; 5009 + svm->vmcb->save.cr3 = __sme_set(root); 5011 5010 mark_dirty(svm->vmcb, VMCB_CR); 5012 5011 svm_flush_tlb(vcpu); 5013 5012 } ··· 5016 5015 { 5017 5016 struct vcpu_svm *svm = to_svm(vcpu); 5018 5017 5019 - svm->vmcb->control.nested_cr3 = root; 5018 + svm->vmcb->control.nested_cr3 = __sme_set(root); 5020 5019 mark_dirty(svm->vmcb, VMCB_NPT); 5021 5020 5022 5021 /* Also sync guest cr3 here in case we live migrate */
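All of the KVM/SVM changes above are applications of two small helpers introduced with the new mem_encrypt headers in this series: physical addresses that hardware will dereference (VMCB, MSR/IO permission maps, AVIC pages, nested CR3) get the encryption bit added with __sme_set(), and addresses read back from such fields are stripped with __sme_clr() before ordinary pfn arithmetic. Both collapse to no-ops when SME is inactive, since sme_me_mask is then zero. A hedged sketch of the round trip (function names invented for illustration):

    #include <linux/mm.h>
    #include <linux/mem_encrypt.h>   /* __sme_set(), __sme_clr(), sme_me_mask */
    #include <asm/io.h>              /* page_to_phys() */

    /* __sme_set(x) amounts to (x) | sme_me_mask; __sme_clr(x) to (x) & ~sme_me_mask. */
    static u64 vmcb_pa_for_hw(struct page *page)
    {
            return __sme_set(page_to_phys(page));        /* as seen by hardware */
    }

    static struct page *vmcb_page_from_hw(u64 vmcb_pa)
    {
            return pfn_to_page(__sme_clr(vmcb_pa) >> PAGE_SHIFT);
    }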
+1 -1
arch/x86/kvm/vmx.c
··· 6556 6556 enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, 6557 6557 0ull, VMX_EPT_EXECUTABLE_MASK, 6558 6558 cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, 6559 - VMX_EPT_RWX_MASK); 6559 + VMX_EPT_RWX_MASK, 0ull); 6560 6560 6561 6561 ept_set_mmio_spte_mask(); 6562 6562 kvm_enable_tdp();
+2 -1
arch/x86/kvm/x86.c
··· 54 54 #include <linux/kvm_irqfd.h> 55 55 #include <linux/irqbypass.h> 56 56 #include <linux/sched/stat.h> 57 + #include <linux/mem_encrypt.h> 57 58 58 59 #include <trace/events/kvm.h> 59 60 ··· 6126 6125 6127 6126 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 6128 6127 PT_DIRTY_MASK, PT64_NX_MASK, 0, 6129 - PT_PRESENT_MASK, 0); 6128 + PT_PRESENT_MASK, 0, sme_me_mask); 6130 6129 kvm_timer_init(); 6131 6130 6132 6131 perf_register_guest_info_callbacks(&kvm_guest_cbs);
+105
arch/x86/lib/cmdline.c
··· 104 104 return 0; /* Buffer overrun */ 105 105 } 106 106 107 + /* 108 + * Find a non-boolean option (i.e. option=argument). In accordance with 109 + * standard Linux practice, if this option is repeated, this returns the 110 + * last instance on the command line. 111 + * 112 + * @cmdline: the cmdline string 113 + * @max_cmdline_size: the maximum size of cmdline 114 + * @option: option string to look for 115 + * @buffer: memory buffer to return the option argument 116 + * @bufsize: size of the supplied memory buffer 117 + * 118 + * Returns the length of the argument (regardless of if it was 119 + * truncated to fit in the buffer), or -1 on not found. 120 + */ 121 + static int 122 + __cmdline_find_option(const char *cmdline, int max_cmdline_size, 123 + const char *option, char *buffer, int bufsize) 124 + { 125 + char c; 126 + int pos = 0, len = -1; 127 + const char *opptr = NULL; 128 + char *bufptr = buffer; 129 + enum { 130 + st_wordstart = 0, /* Start of word/after whitespace */ 131 + st_wordcmp, /* Comparing this word */ 132 + st_wordskip, /* Miscompare, skip */ 133 + st_bufcpy, /* Copying this to buffer */ 134 + } state = st_wordstart; 135 + 136 + if (!cmdline) 137 + return -1; /* No command line */ 138 + 139 + /* 140 + * This 'pos' check ensures we do not overrun 141 + * a non-NULL-terminated 'cmdline' 142 + */ 143 + while (pos++ < max_cmdline_size) { 144 + c = *(char *)cmdline++; 145 + if (!c) 146 + break; 147 + 148 + switch (state) { 149 + case st_wordstart: 150 + if (myisspace(c)) 151 + break; 152 + 153 + state = st_wordcmp; 154 + opptr = option; 155 + /* fall through */ 156 + 157 + case st_wordcmp: 158 + if ((c == '=') && !*opptr) { 159 + /* 160 + * We matched all the way to the end of the 161 + * option we were looking for, prepare to 162 + * copy the argument. 163 + */ 164 + len = 0; 165 + bufptr = buffer; 166 + state = st_bufcpy; 167 + break; 168 + } else if (c == *opptr++) { 169 + /* 170 + * We are currently matching, so continue 171 + * to the next character on the cmdline. 172 + */ 173 + break; 174 + } 175 + state = st_wordskip; 176 + /* fall through */ 177 + 178 + case st_wordskip: 179 + if (myisspace(c)) 180 + state = st_wordstart; 181 + break; 182 + 183 + case st_bufcpy: 184 + if (myisspace(c)) { 185 + state = st_wordstart; 186 + } else { 187 + /* 188 + * Increment len, but don't overrun the 189 + * supplied buffer and leave room for the 190 + * NULL terminator. 191 + */ 192 + if (++len < bufsize) 193 + *bufptr++ = c; 194 + } 195 + break; 196 + } 197 + } 198 + 199 + if (bufsize) 200 + *bufptr = '\0'; 201 + 202 + return len; 203 + } 204 + 107 205 int cmdline_find_option_bool(const char *cmdline, const char *option) 108 206 { 109 207 return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option); 208 + } 209 + 210 + int cmdline_find_option(const char *cmdline, const char *option, char *buffer, 211 + int bufsize) 212 + { 213 + return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option, 214 + buffer, bufsize); 110 215 }
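__cmdline_find_option() gives early boot code a way to pull an option=value argument straight out of the raw command line, before the normal kernel parameter parsing runs and even before the command line has been copied out of the boot_params area; its user in this series is sme_enable(), which honours the new mem_encrypt= parameter. A hedged sketch of the call (simplified, since sme_enable() actually works on the pointer supplied via boot_params very early, but the shape is the same):

    #include <linux/init.h>          /* boot_command_line */
    #include <linux/string.h>
    #include <asm/cmdline.h>         /* cmdline_find_option() */

    static bool __init mem_encrypt_requested_off(void)
    {
            char buf[16];
            int len;

            len = cmdline_find_option(boot_command_line, "mem_encrypt",
                                      buf, sizeof(buf));

            return len == 3 && !strncmp(buf, "off", 3);
    }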
+2
arch/x86/mm/Makefile
··· 39 39 obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o 40 40 obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o 41 41 42 + obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o 43 + obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
+56 -37
arch/x86/mm/dump_pagetables.c
··· 13 13 */ 14 14 15 15 #include <linux/debugfs.h> 16 + #include <linux/kasan.h> 16 17 #include <linux/mm.h> 17 18 #include <linux/init.h> 18 19 #include <linux/sched.h> 19 20 #include <linux/seq_file.h> 20 21 21 - #include <asm/kasan.h> 22 22 #include <asm/pgtable.h> 23 23 24 24 /* ··· 138 138 { 139 139 pgprotval_t pr = pgprot_val(prot); 140 140 static const char * const level_name[] = 141 - { "cr3", "pgd", "pud", "pmd", "pte" }; 141 + { "cr3", "pgd", "p4d", "pud", "pmd", "pte" }; 142 142 143 143 if (!pgprot_val(prot)) { 144 144 /* Not present */ ··· 162 162 pt_dump_cont_printf(m, dmsg, " "); 163 163 164 164 /* Bit 7 has a different meaning on level 3 vs 4 */ 165 - if (level <= 3 && pr & _PAGE_PSE) 165 + if (level <= 4 && pr & _PAGE_PSE) 166 166 pt_dump_cont_printf(m, dmsg, "PSE "); 167 167 else 168 168 pt_dump_cont_printf(m, dmsg, " "); 169 - if ((level == 4 && pr & _PAGE_PAT) || 170 - ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE)) 169 + if ((level == 5 && pr & _PAGE_PAT) || 170 + ((level == 4 || level == 3) && pr & _PAGE_PAT_LARGE)) 171 171 pt_dump_cont_printf(m, dmsg, "PAT "); 172 172 else 173 173 pt_dump_cont_printf(m, dmsg, " "); ··· 188 188 */ 189 189 static unsigned long normalize_addr(unsigned long u) 190 190 { 191 - #ifdef CONFIG_X86_64 192 - return (signed long)(u << 16) >> 16; 193 - #else 194 - return u; 195 - #endif 191 + int shift; 192 + if (!IS_ENABLED(CONFIG_X86_64)) 193 + return u; 194 + 195 + shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); 196 + return (signed long)(u << shift) >> shift; 196 197 } 197 198 198 199 /* ··· 298 297 for (i = 0; i < PTRS_PER_PTE; i++) { 299 298 prot = pte_flags(*start); 300 299 st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); 301 - note_page(m, st, __pgprot(prot), 4); 300 + note_page(m, st, __pgprot(prot), 5); 302 301 start++; 303 302 } 304 303 } 304 + #ifdef CONFIG_KASAN 305 + 306 + /* 307 + * This is an optimization for KASAN=y case. Since all kasan page tables 308 + * eventually point to the kasan_zero_page we could call note_page() 309 + * right away without walking through lower level page tables. This saves 310 + * us dozens of seconds (minutes for 5-level config) while checking for 311 + * W+X mapping or reading kernel_page_tables debugfs file. 
312 + */ 313 + static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, 314 + void *pt) 315 + { 316 + if (__pa(pt) == __pa(kasan_zero_pmd) || 317 + #ifdef CONFIG_X86_5LEVEL 318 + __pa(pt) == __pa(kasan_zero_p4d) || 319 + #endif 320 + __pa(pt) == __pa(kasan_zero_pud)) { 321 + pgprotval_t prot = pte_flags(kasan_zero_pte[0]); 322 + note_page(m, st, __pgprot(prot), 5); 323 + return true; 324 + } 325 + return false; 326 + } 327 + #else 328 + static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, 329 + void *pt) 330 + { 331 + return false; 332 + } 333 + #endif 305 334 306 335 #if PTRS_PER_PMD > 1 307 336 308 337 static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P) 309 338 { 310 339 int i; 311 - pmd_t *start; 340 + pmd_t *start, *pmd_start; 312 341 pgprotval_t prot; 313 342 314 - start = (pmd_t *)pud_page_vaddr(addr); 343 + pmd_start = start = (pmd_t *)pud_page_vaddr(addr); 315 344 for (i = 0; i < PTRS_PER_PMD; i++) { 316 345 st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); 317 346 if (!pmd_none(*start)) { 318 347 if (pmd_large(*start) || !pmd_present(*start)) { 319 348 prot = pmd_flags(*start); 320 - note_page(m, st, __pgprot(prot), 3); 321 - } else { 349 + note_page(m, st, __pgprot(prot), 4); 350 + } else if (!kasan_page_table(m, st, pmd_start)) { 322 351 walk_pte_level(m, st, *start, 323 352 P + i * PMD_LEVEL_MULT); 324 353 } 325 354 } else 326 - note_page(m, st, __pgprot(0), 3); 355 + note_page(m, st, __pgprot(0), 4); 327 356 start++; 328 357 } 329 358 } ··· 366 335 367 336 #if PTRS_PER_PUD > 1 368 337 369 - /* 370 - * This is an optimization for CONFIG_DEBUG_WX=y + CONFIG_KASAN=y 371 - * KASAN fills page tables with the same values. Since there is no 372 - * point in checking page table more than once we just skip repeated 373 - * entries. This saves us dozens of seconds during boot. 
374 - */ 375 - static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx) 376 - { 377 - return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud)); 378 - } 379 - 380 338 static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P) 381 339 { 382 340 int i; 383 - pud_t *start; 341 + pud_t *start, *pud_start; 384 342 pgprotval_t prot; 385 343 pud_t *prev_pud = NULL; 386 344 387 - start = (pud_t *)p4d_page_vaddr(addr); 345 + pud_start = start = (pud_t *)p4d_page_vaddr(addr); 388 346 389 347 for (i = 0; i < PTRS_PER_PUD; i++) { 390 348 st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); 391 - if (!pud_none(*start) && 392 - !pud_already_checked(prev_pud, start, st->check_wx)) { 349 + if (!pud_none(*start)) { 393 350 if (pud_large(*start) || !pud_present(*start)) { 394 351 prot = pud_flags(*start); 395 - note_page(m, st, __pgprot(prot), 2); 396 - } else { 352 + note_page(m, st, __pgprot(prot), 3); 353 + } else if (!kasan_page_table(m, st, pud_start)) { 397 354 walk_pmd_level(m, st, *start, 398 355 P + i * PUD_LEVEL_MULT); 399 356 } 400 357 } else 401 - note_page(m, st, __pgprot(0), 2); 358 + note_page(m, st, __pgprot(0), 3); 402 359 403 360 prev_pud = start; 404 361 start++; ··· 404 385 static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P) 405 386 { 406 387 int i; 407 - p4d_t *start; 388 + p4d_t *start, *p4d_start; 408 389 pgprotval_t prot; 409 390 410 - start = (p4d_t *)pgd_page_vaddr(addr); 391 + p4d_start = start = (p4d_t *)pgd_page_vaddr(addr); 411 392 412 393 for (i = 0; i < PTRS_PER_P4D; i++) { 413 394 st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT); ··· 415 396 if (p4d_large(*start) || !p4d_present(*start)) { 416 397 prot = p4d_flags(*start); 417 398 note_page(m, st, __pgprot(prot), 2); 418 - } else { 399 + } else if (!kasan_page_table(m, st, p4d_start)) { 419 400 walk_pud_level(m, st, *start, 420 401 P + i * P4D_LEVEL_MULT); 421 402 }
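normalize_addr() now derives its sign-extension width from __VIRTUAL_MASK_SHIFT (47 with 4-level paging, 56 with 5-level) instead of hard-coding the 48-bit layout; the walker builds addresses from table indices, so the top bits have to be filled in before canonical kernel addresses can be printed. A stand-alone illustration of the arithmetic:

    #include <stdio.h>
    #include <stdint.h>

    /* Sign-extend @u from (virtual_mask_shift + 1) significant bits to 64. */
    static uint64_t normalize(uint64_t u, int virtual_mask_shift)
    {
            int shift = 64 - (virtual_mask_shift + 1);

            return (uint64_t)(((int64_t)(u << shift)) >> shift);
    }

    int main(void)
    {
            /* First address of the upper (kernel) half in each layout: */
            printf("4-level: %#llx\n",
                   (unsigned long long)normalize(1ULL << 47, 47)); /* 0xffff800000000000 */
            printf("5-level: %#llx\n",
                   (unsigned long long)normalize(1ULL << 56, 56)); /* 0xff00000000000000 */
            return 0;
    }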
+15 -11
arch/x86/mm/fault.c
··· 396 396 pte_t *pte; 397 397 398 398 #ifdef CONFIG_X86_PAE 399 - printk("*pdpt = %016Lx ", pgd_val(*pgd)); 399 + pr_info("*pdpt = %016Lx ", pgd_val(*pgd)); 400 400 if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) 401 401 goto out; 402 + #define pr_pde pr_cont 403 + #else 404 + #define pr_pde pr_info 402 405 #endif 403 406 p4d = p4d_offset(pgd, address); 404 407 pud = pud_offset(p4d, address); 405 408 pmd = pmd_offset(pud, address); 406 - printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); 409 + pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); 410 + #undef pr_pde 407 411 408 412 /* 409 413 * We must not directly access the pte in the highpte ··· 419 415 goto out; 420 416 421 417 pte = pte_offset_kernel(pmd, address); 422 - printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); 418 + pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); 423 419 out: 424 - printk("\n"); 420 + pr_cont("\n"); 425 421 } 426 422 427 423 #else /* CONFIG_X86_64: */ ··· 569 565 if (bad_address(pgd)) 570 566 goto bad; 571 567 572 - printk("PGD %lx ", pgd_val(*pgd)); 568 + pr_info("PGD %lx ", pgd_val(*pgd)); 573 569 574 570 if (!pgd_present(*pgd)) 575 571 goto out; ··· 578 574 if (bad_address(p4d)) 579 575 goto bad; 580 576 581 - printk("P4D %lx ", p4d_val(*p4d)); 577 + pr_cont("P4D %lx ", p4d_val(*p4d)); 582 578 if (!p4d_present(*p4d) || p4d_large(*p4d)) 583 579 goto out; 584 580 ··· 586 582 if (bad_address(pud)) 587 583 goto bad; 588 584 589 - printk("PUD %lx ", pud_val(*pud)); 585 + pr_cont("PUD %lx ", pud_val(*pud)); 590 586 if (!pud_present(*pud) || pud_large(*pud)) 591 587 goto out; 592 588 ··· 594 590 if (bad_address(pmd)) 595 591 goto bad; 596 592 597 - printk("PMD %lx ", pmd_val(*pmd)); 593 + pr_cont("PMD %lx ", pmd_val(*pmd)); 598 594 if (!pmd_present(*pmd) || pmd_large(*pmd)) 599 595 goto out; 600 596 ··· 602 598 if (bad_address(pte)) 603 599 goto bad; 604 600 605 - printk("PTE %lx", pte_val(*pte)); 601 + pr_cont("PTE %lx", pte_val(*pte)); 606 602 out: 607 - printk("\n"); 603 + pr_cont("\n"); 608 604 return; 609 605 bad: 610 - printk("BAD\n"); 606 + pr_info("BAD\n"); 611 607 } 612 608 613 609 #endif /* CONFIG_X86_64 */
+23 -4
arch/x86/mm/hugetlbpage.c
··· 18 18 #include <asm/tlbflush.h> 19 19 #include <asm/pgalloc.h> 20 20 #include <asm/elf.h> 21 + #include <asm/mpx.h> 21 22 22 23 #if 0 /* This is just for testing */ 23 24 struct page * ··· 86 85 info.flags = 0; 87 86 info.length = len; 88 87 info.low_limit = get_mmap_base(1); 88 + 89 + /* 90 + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area 91 + * in the full address space. 92 + */ 89 93 info.high_limit = in_compat_syscall() ? 90 - tasksize_32bit() : tasksize_64bit(); 94 + task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW); 95 + 91 96 info.align_mask = PAGE_MASK & ~huge_page_mask(h); 92 97 info.align_offset = 0; 93 98 return vm_unmapped_area(&info); 94 99 } 95 100 96 101 static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, 97 - unsigned long addr0, unsigned long len, 102 + unsigned long addr, unsigned long len, 98 103 unsigned long pgoff, unsigned long flags) 99 104 { 100 105 struct hstate *h = hstate_file(file); 101 106 struct vm_unmapped_area_info info; 102 - unsigned long addr; 103 107 104 108 info.flags = VM_UNMAPPED_AREA_TOPDOWN; 105 109 info.length = len; 106 110 info.low_limit = PAGE_SIZE; 107 111 info.high_limit = get_mmap_base(0); 112 + 113 + /* 114 + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area 115 + * in the full address space. 116 + */ 117 + if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) 118 + info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; 119 + 108 120 info.align_mask = PAGE_MASK & ~huge_page_mask(h); 109 121 info.align_offset = 0; 110 122 addr = vm_unmapped_area(&info); ··· 132 118 VM_BUG_ON(addr != -ENOMEM); 133 119 info.flags = 0; 134 120 info.low_limit = TASK_UNMAPPED_BASE; 135 - info.high_limit = TASK_SIZE; 121 + info.high_limit = TASK_SIZE_LOW; 136 122 addr = vm_unmapped_area(&info); 137 123 } 138 124 ··· 149 135 150 136 if (len & ~huge_page_mask(h)) 151 137 return -EINVAL; 138 + 139 + addr = mpx_unmapped_area_check(addr, len, flags); 140 + if (IS_ERR_VALUE(addr)) 141 + return addr; 142 + 152 143 if (len > TASK_SIZE) 153 144 return -ENOMEM; 154 145
+8 -4
arch/x86/mm/ident_map.c
··· 51 51 if (!pmd) 52 52 return -ENOMEM; 53 53 ident_pmd_init(info, pmd, addr, next); 54 - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); 54 + set_pud(pud, __pud(__pa(pmd) | info->kernpg_flag)); 55 55 } 56 56 57 57 return 0; ··· 79 79 if (!pud) 80 80 return -ENOMEM; 81 81 ident_pud_init(info, pud, addr, next); 82 - set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); 82 + set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag)); 83 83 } 84 84 85 85 return 0; ··· 92 92 unsigned long end = pend + info->offset; 93 93 unsigned long next; 94 94 int result; 95 + 96 + /* Set the default pagetable flags if not supplied */ 97 + if (!info->kernpg_flag) 98 + info->kernpg_flag = _KERNPG_TABLE; 95 99 96 100 for (; addr < end; addr = next) { 97 101 pgd_t *pgd = pgd_page + pgd_index(addr); ··· 120 116 if (result) 121 117 return result; 122 118 if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 123 - set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); 119 + set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag)); 124 120 } else { 125 121 /* 126 122 * With p4d folded, pgd is equal to p4d. 127 123 * The pgd entry has to point to the pud page table in this case. 128 124 */ 129 125 pud_t *pud = pud_offset(p4d, 0); 130 - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); 126 + set_pgd(pgd, __pgd(__pa(pud) | info->kernpg_flag)); 131 127 } 132 128 } 133 129
+1 -1
arch/x86/mm/init.c
··· 815 815 816 816 DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { 817 817 .loaded_mm = &init_mm, 818 - .state = 0, 818 + .next_asid = 1, 819 819 .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ 820 820 }; 821 821 EXPORT_SYMBOL_GPL(cpu_tlbstate);
+270 -19
arch/x86/mm/ioremap.c
··· 13 13 #include <linux/slab.h> 14 14 #include <linux/vmalloc.h> 15 15 #include <linux/mmiotrace.h> 16 + #include <linux/mem_encrypt.h> 17 + #include <linux/efi.h> 16 18 17 19 #include <asm/set_memory.h> 18 20 #include <asm/e820/api.h> ··· 23 21 #include <asm/tlbflush.h> 24 22 #include <asm/pgalloc.h> 25 23 #include <asm/pat.h> 24 + #include <asm/setup.h> 26 25 27 26 #include "physaddr.h" 28 27 ··· 107 104 WARN_ON_ONCE(1); 108 105 return NULL; 109 106 } 110 - 111 - /* 112 - * Don't remap the low PCI/ISA area, it's always mapped.. 113 - */ 114 - if (is_ISA_range(phys_addr, last_addr)) 115 - return (__force void __iomem *)phys_to_virt(phys_addr); 116 107 117 108 /* 118 109 * Don't allow anybody to remap normal RAM that we're using.. ··· 337 340 return; 338 341 339 342 /* 340 - * __ioremap special-cases the PCI/ISA range by not instantiating a 341 - * vm_area and by simply returning an address into the kernel mapping 342 - * of ISA space. So handle that here. 343 + * The PCI/ISA range special-casing was removed from __ioremap() 344 + * so this check, in theory, can be removed. However, there are 345 + * cases where iounmap() is called for addresses not obtained via 346 + * ioremap() (vga16fb for example). Add a warning so that these 347 + * cases can be caught and fixed. 343 348 */ 344 349 if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) && 345 - (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) 350 + (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) { 351 + WARN(1, "iounmap() called for ISA range not obtained using ioremap()\n"); 346 352 return; 353 + } 347 354 348 355 addr = (volatile void __iomem *) 349 356 (PAGE_MASK & (unsigned long __force)addr); ··· 400 399 unsigned long offset = phys & ~PAGE_MASK; 401 400 void *vaddr; 402 401 403 - /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */ 404 - if (page_is_ram(start >> PAGE_SHIFT)) 405 - return __va(phys); 402 + /* memremap() maps if RAM, otherwise falls back to ioremap() */ 403 + vaddr = memremap(start, PAGE_SIZE, MEMREMAP_WB); 406 404 407 - vaddr = ioremap_cache(start, PAGE_SIZE); 408 - /* Only add the offset on success and return NULL if the ioremap() failed: */ 405 + /* Only add the offset on success and return NULL if memremap() failed */ 409 406 if (vaddr) 410 407 vaddr += offset; 411 408 ··· 412 413 413 414 void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) 414 415 { 415 - if (page_is_ram(phys >> PAGE_SHIFT)) 416 - return; 417 - 418 - iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); 416 + memunmap((void *)((unsigned long)addr & PAGE_MASK)); 419 417 } 418 + 419 + /* 420 + * Examine the physical address to determine if it is an area of memory 421 + * that should be mapped decrypted. If the memory is not part of the 422 + * kernel usable area it was accessed and created decrypted, so these 423 + * areas should be mapped decrypted. And since the encryption key can 424 + * change across reboots, persistent memory should also be mapped 425 + * decrypted. 426 + */ 427 + static bool memremap_should_map_decrypted(resource_size_t phys_addr, 428 + unsigned long size) 429 + { 430 + int is_pmem; 431 + 432 + /* 433 + * Check if the address is part of a persistent memory region. 434 + * This check covers areas added by E820, EFI and ACPI. 
435 + */ 436 + is_pmem = region_intersects(phys_addr, size, IORESOURCE_MEM, 437 + IORES_DESC_PERSISTENT_MEMORY); 438 + if (is_pmem != REGION_DISJOINT) 439 + return true; 440 + 441 + /* 442 + * Check if the non-volatile attribute is set for an EFI 443 + * reserved area. 444 + */ 445 + if (efi_enabled(EFI_BOOT)) { 446 + switch (efi_mem_type(phys_addr)) { 447 + case EFI_RESERVED_TYPE: 448 + if (efi_mem_attributes(phys_addr) & EFI_MEMORY_NV) 449 + return true; 450 + break; 451 + default: 452 + break; 453 + } 454 + } 455 + 456 + /* Check if the address is outside kernel usable area */ 457 + switch (e820__get_entry_type(phys_addr, phys_addr + size - 1)) { 458 + case E820_TYPE_RESERVED: 459 + case E820_TYPE_ACPI: 460 + case E820_TYPE_NVS: 461 + case E820_TYPE_UNUSABLE: 462 + case E820_TYPE_PRAM: 463 + return true; 464 + default: 465 + break; 466 + } 467 + 468 + return false; 469 + } 470 + 471 + /* 472 + * Examine the physical address to determine if it is EFI data. Check 473 + * it against the boot params structure and EFI tables and memory types. 474 + */ 475 + static bool memremap_is_efi_data(resource_size_t phys_addr, 476 + unsigned long size) 477 + { 478 + u64 paddr; 479 + 480 + /* Check if the address is part of EFI boot/runtime data */ 481 + if (!efi_enabled(EFI_BOOT)) 482 + return false; 483 + 484 + paddr = boot_params.efi_info.efi_memmap_hi; 485 + paddr <<= 32; 486 + paddr |= boot_params.efi_info.efi_memmap; 487 + if (phys_addr == paddr) 488 + return true; 489 + 490 + paddr = boot_params.efi_info.efi_systab_hi; 491 + paddr <<= 32; 492 + paddr |= boot_params.efi_info.efi_systab; 493 + if (phys_addr == paddr) 494 + return true; 495 + 496 + if (efi_is_table_address(phys_addr)) 497 + return true; 498 + 499 + switch (efi_mem_type(phys_addr)) { 500 + case EFI_BOOT_SERVICES_DATA: 501 + case EFI_RUNTIME_SERVICES_DATA: 502 + return true; 503 + default: 504 + break; 505 + } 506 + 507 + return false; 508 + } 509 + 510 + /* 511 + * Examine the physical address to determine if it is boot data by checking 512 + * it against the boot params setup_data chain. 513 + */ 514 + static bool memremap_is_setup_data(resource_size_t phys_addr, 515 + unsigned long size) 516 + { 517 + struct setup_data *data; 518 + u64 paddr, paddr_next; 519 + 520 + paddr = boot_params.hdr.setup_data; 521 + while (paddr) { 522 + unsigned int len; 523 + 524 + if (phys_addr == paddr) 525 + return true; 526 + 527 + data = memremap(paddr, sizeof(*data), 528 + MEMREMAP_WB | MEMREMAP_DEC); 529 + 530 + paddr_next = data->next; 531 + len = data->len; 532 + 533 + memunmap(data); 534 + 535 + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) 536 + return true; 537 + 538 + paddr = paddr_next; 539 + } 540 + 541 + return false; 542 + } 543 + 544 + /* 545 + * Examine the physical address to determine if it is boot data by checking 546 + * it against the boot params setup_data chain (early boot version). 
547 + */ 548 + static bool __init early_memremap_is_setup_data(resource_size_t phys_addr, 549 + unsigned long size) 550 + { 551 + struct setup_data *data; 552 + u64 paddr, paddr_next; 553 + 554 + paddr = boot_params.hdr.setup_data; 555 + while (paddr) { 556 + unsigned int len; 557 + 558 + if (phys_addr == paddr) 559 + return true; 560 + 561 + data = early_memremap_decrypted(paddr, sizeof(*data)); 562 + 563 + paddr_next = data->next; 564 + len = data->len; 565 + 566 + early_memunmap(data, sizeof(*data)); 567 + 568 + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) 569 + return true; 570 + 571 + paddr = paddr_next; 572 + } 573 + 574 + return false; 575 + } 576 + 577 + /* 578 + * Architecture function to determine if RAM remap is allowed. By default, a 579 + * RAM remap will map the data as encrypted. Determine if a RAM remap should 580 + * not be done so that the data will be mapped decrypted. 581 + */ 582 + bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size, 583 + unsigned long flags) 584 + { 585 + if (!sme_active()) 586 + return true; 587 + 588 + if (flags & MEMREMAP_ENC) 589 + return true; 590 + 591 + if (flags & MEMREMAP_DEC) 592 + return false; 593 + 594 + if (memremap_is_setup_data(phys_addr, size) || 595 + memremap_is_efi_data(phys_addr, size) || 596 + memremap_should_map_decrypted(phys_addr, size)) 597 + return false; 598 + 599 + return true; 600 + } 601 + 602 + /* 603 + * Architecture override of __weak function to adjust the protection attributes 604 + * used when remapping memory. By default, early_memremap() will map the data 605 + * as encrypted. Determine if an encrypted mapping should not be done and set 606 + * the appropriate protection attributes. 607 + */ 608 + pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, 609 + unsigned long size, 610 + pgprot_t prot) 611 + { 612 + if (!sme_active()) 613 + return prot; 614 + 615 + if (early_memremap_is_setup_data(phys_addr, size) || 616 + memremap_is_efi_data(phys_addr, size) || 617 + memremap_should_map_decrypted(phys_addr, size)) 618 + prot = pgprot_decrypted(prot); 619 + else 620 + prot = pgprot_encrypted(prot); 621 + 622 + return prot; 623 + } 624 + 625 + bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size) 626 + { 627 + return arch_memremap_can_ram_remap(phys_addr, size, 0); 628 + } 629 + 630 + #ifdef CONFIG_ARCH_USE_MEMREMAP_PROT 631 + /* Remap memory with encryption */ 632 + void __init *early_memremap_encrypted(resource_size_t phys_addr, 633 + unsigned long size) 634 + { 635 + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC); 636 + } 637 + 638 + /* 639 + * Remap memory with encryption and write-protected - cannot be called 640 + * before pat_init() is called 641 + */ 642 + void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, 643 + unsigned long size) 644 + { 645 + /* Be sure the write-protect PAT entry is set for write-protect */ 646 + if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) 647 + return NULL; 648 + 649 + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP); 650 + } 651 + 652 + /* Remap memory without encryption */ 653 + void __init *early_memremap_decrypted(resource_size_t phys_addr, 654 + unsigned long size) 655 + { 656 + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC); 657 + } 658 + 659 + /* 660 + * Remap memory without encryption and write-protected - cannot be called 661 + * before pat_init() is called 662 + */ 663 + void __init 
*early_memremap_decrypted_wp(resource_size_t phys_addr, 664 + unsigned long size) 665 + { 666 + /* Be sure the write-protect PAT entry is set for write-protect */ 667 + if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) 668 + return NULL; 669 + 670 + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP); 671 + } 672 + #endif /* CONFIG_ARCH_USE_MEMREMAP_PROT */ 420 673 421 674 static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; 422 675
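A note on the decrypted-mapping plumbing added above: a driver that needs to read data written by a device or by firmware while the region was not encrypted can now request that explicitly, since arch_memremap_can_ram_remap() honors the MEMREMAP_DEC flag. A minimal usage sketch (the function name and parameters are illustrative, not part of this series):

#include <linux/io.h>

/* Illustrative only: map firmware-written RAM decrypted under SME */
static void *map_firmware_blob(resource_size_t fw_paddr, size_t fw_size)
{
	/*
	 * MEMREMAP_DEC forces a decrypted mapping; without it,
	 * arch_memremap_can_ram_remap() decides based on the
	 * setup_data/EFI/E820 checks shown above.
	 */
	return memremap(fw_paddr, fw_size, MEMREMAP_WB | MEMREMAP_DEC);
}

The mapping is released with memunmap(), exactly as the setup_data walker above does.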
+3 -3
arch/x86/mm/kasan_init_64.c
··· 11 11 #include <asm/e820/types.h> 12 12 #include <asm/tlbflush.h> 13 13 #include <asm/sections.h> 14 + #include <asm/pgtable.h> 14 15 15 - extern pgd_t early_top_pgt[PTRS_PER_PGD]; 16 16 extern struct range pfn_mapped[E820_MAX_ENTRIES]; 17 17 18 18 static int __init map_range(struct range *range) ··· 87 87 void __init kasan_early_init(void) 88 88 { 89 89 int i; 90 - pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL; 90 + pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL | _PAGE_ENC; 91 91 pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE; 92 92 pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE; 93 93 p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE; ··· 153 153 */ 154 154 memset(kasan_zero_page, 0, PAGE_SIZE); 155 155 for (i = 0; i < PTRS_PER_PTE; i++) { 156 - pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO); 156 + pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO | _PAGE_ENC); 157 157 set_pte(&kasan_zero_pte[i], pte); 158 158 } 159 159 /* Flush TLBs again to be sure that write protection applied. */
+593
arch/x86/mm/mem_encrypt.c
··· 1 + /* 2 + * AMD Memory Encryption Support 3 + * 4 + * Copyright (C) 2016 Advanced Micro Devices, Inc. 5 + * 6 + * Author: Tom Lendacky <thomas.lendacky@amd.com> 7 + * 8 + * This program is free software; you can redistribute it and/or modify 9 + * it under the terms of the GNU General Public License version 2 as 10 + * published by the Free Software Foundation. 11 + */ 12 + 13 + #include <linux/linkage.h> 14 + #include <linux/init.h> 15 + #include <linux/mm.h> 16 + #include <linux/dma-mapping.h> 17 + #include <linux/swiotlb.h> 18 + #include <linux/mem_encrypt.h> 19 + 20 + #include <asm/tlbflush.h> 21 + #include <asm/fixmap.h> 22 + #include <asm/setup.h> 23 + #include <asm/bootparam.h> 24 + #include <asm/set_memory.h> 25 + #include <asm/cacheflush.h> 26 + #include <asm/sections.h> 27 + #include <asm/processor-flags.h> 28 + #include <asm/msr.h> 29 + #include <asm/cmdline.h> 30 + 31 + static char sme_cmdline_arg[] __initdata = "mem_encrypt"; 32 + static char sme_cmdline_on[] __initdata = "on"; 33 + static char sme_cmdline_off[] __initdata = "off"; 34 + 35 + /* 36 + * Since SME related variables are set early in the boot process they must 37 + * reside in the .data section so as not to be zeroed out when the .bss 38 + * section is later cleared. 39 + */ 40 + unsigned long sme_me_mask __section(.data) = 0; 41 + EXPORT_SYMBOL_GPL(sme_me_mask); 42 + 43 + /* Buffer used for early in-place encryption by BSP, no locking needed */ 44 + static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE); 45 + 46 + /* 47 + * This routine does not change the underlying encryption setting of the 48 + * page(s) that map this memory. It assumes that eventually the memory is 49 + * meant to be accessed as either encrypted or decrypted but the contents 50 + * are currently not in the desired state. 51 + * 52 + * This routine follows the steps outlined in the AMD64 Architecture 53 + * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place. 54 + */ 55 + static void __init __sme_early_enc_dec(resource_size_t paddr, 56 + unsigned long size, bool enc) 57 + { 58 + void *src, *dst; 59 + size_t len; 60 + 61 + if (!sme_me_mask) 62 + return; 63 + 64 + local_flush_tlb(); 65 + wbinvd(); 66 + 67 + /* 68 + * There are limited number of early mapping slots, so map (at most) 69 + * one page at time. 70 + */ 71 + while (size) { 72 + len = min_t(size_t, sizeof(sme_early_buffer), size); 73 + 74 + /* 75 + * Create mappings for the current and desired format of 76 + * the memory. Use a write-protected mapping for the source. 77 + */ 78 + src = enc ? early_memremap_decrypted_wp(paddr, len) : 79 + early_memremap_encrypted_wp(paddr, len); 80 + 81 + dst = enc ? early_memremap_encrypted(paddr, len) : 82 + early_memremap_decrypted(paddr, len); 83 + 84 + /* 85 + * If a mapping can't be obtained to perform the operation, 86 + * then eventual access of that area in the desired mode 87 + * will cause a crash. 88 + */ 89 + BUG_ON(!src || !dst); 90 + 91 + /* 92 + * Use a temporary buffer, of cache-line multiple size, to 93 + * avoid data corruption as documented in the APM. 
94 + */ 95 + memcpy(sme_early_buffer, src, len); 96 + memcpy(dst, sme_early_buffer, len); 97 + 98 + early_memunmap(dst, len); 99 + early_memunmap(src, len); 100 + 101 + paddr += len; 102 + size -= len; 103 + } 104 + } 105 + 106 + void __init sme_early_encrypt(resource_size_t paddr, unsigned long size) 107 + { 108 + __sme_early_enc_dec(paddr, size, true); 109 + } 110 + 111 + void __init sme_early_decrypt(resource_size_t paddr, unsigned long size) 112 + { 113 + __sme_early_enc_dec(paddr, size, false); 114 + } 115 + 116 + static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size, 117 + bool map) 118 + { 119 + unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET; 120 + pmdval_t pmd_flags, pmd; 121 + 122 + /* Use early_pmd_flags but remove the encryption mask */ 123 + pmd_flags = __sme_clr(early_pmd_flags); 124 + 125 + do { 126 + pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0; 127 + __early_make_pgtable((unsigned long)vaddr, pmd); 128 + 129 + vaddr += PMD_SIZE; 130 + paddr += PMD_SIZE; 131 + size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE; 132 + } while (size); 133 + 134 + __native_flush_tlb(); 135 + } 136 + 137 + void __init sme_unmap_bootdata(char *real_mode_data) 138 + { 139 + struct boot_params *boot_data; 140 + unsigned long cmdline_paddr; 141 + 142 + if (!sme_active()) 143 + return; 144 + 145 + /* Get the command line address before unmapping the real_mode_data */ 146 + boot_data = (struct boot_params *)real_mode_data; 147 + cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); 148 + 149 + __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false); 150 + 151 + if (!cmdline_paddr) 152 + return; 153 + 154 + __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false); 155 + } 156 + 157 + void __init sme_map_bootdata(char *real_mode_data) 158 + { 159 + struct boot_params *boot_data; 160 + unsigned long cmdline_paddr; 161 + 162 + if (!sme_active()) 163 + return; 164 + 165 + __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true); 166 + 167 + /* Get the command line address after mapping the real_mode_data */ 168 + boot_data = (struct boot_params *)real_mode_data; 169 + cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); 170 + 171 + if (!cmdline_paddr) 172 + return; 173 + 174 + __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true); 175 + } 176 + 177 + void __init sme_early_init(void) 178 + { 179 + unsigned int i; 180 + 181 + if (!sme_me_mask) 182 + return; 183 + 184 + early_pmd_flags = __sme_set(early_pmd_flags); 185 + 186 + __supported_pte_mask = __sme_set(__supported_pte_mask); 187 + 188 + /* Update the protection map with memory encryption mask */ 189 + for (i = 0; i < ARRAY_SIZE(protection_map); i++) 190 + protection_map[i] = pgprot_encrypted(protection_map[i]); 191 + } 192 + 193 + /* Architecture __weak replacement functions */ 194 + void __init mem_encrypt_init(void) 195 + { 196 + if (!sme_me_mask) 197 + return; 198 + 199 + /* Call into SWIOTLB to update the SWIOTLB DMA buffers */ 200 + swiotlb_update_mem_attributes(); 201 + 202 + pr_info("AMD Secure Memory Encryption (SME) active\n"); 203 + } 204 + 205 + void swiotlb_set_mem_attributes(void *vaddr, unsigned long size) 206 + { 207 + WARN(PAGE_ALIGN(size) != size, 208 + "size is not page-aligned (%#lx)\n", size); 209 + 210 + /* Make the SWIOTLB buffer area decrypted */ 211 + set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT); 212 + } 213 + 214 + static void __init 
sme_clear_pgd(pgd_t *pgd_base, unsigned long start, 215 + unsigned long end) 216 + { 217 + unsigned long pgd_start, pgd_end, pgd_size; 218 + pgd_t *pgd_p; 219 + 220 + pgd_start = start & PGDIR_MASK; 221 + pgd_end = end & PGDIR_MASK; 222 + 223 + pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1); 224 + pgd_size *= sizeof(pgd_t); 225 + 226 + pgd_p = pgd_base + pgd_index(start); 227 + 228 + memset(pgd_p, 0, pgd_size); 229 + } 230 + 231 + #define PGD_FLAGS _KERNPG_TABLE_NOENC 232 + #define P4D_FLAGS _KERNPG_TABLE_NOENC 233 + #define PUD_FLAGS _KERNPG_TABLE_NOENC 234 + #define PMD_FLAGS (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) 235 + 236 + static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area, 237 + unsigned long vaddr, pmdval_t pmd_val) 238 + { 239 + pgd_t *pgd_p; 240 + p4d_t *p4d_p; 241 + pud_t *pud_p; 242 + pmd_t *pmd_p; 243 + 244 + pgd_p = pgd_base + pgd_index(vaddr); 245 + if (native_pgd_val(*pgd_p)) { 246 + if (IS_ENABLED(CONFIG_X86_5LEVEL)) 247 + p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); 248 + else 249 + pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); 250 + } else { 251 + pgd_t pgd; 252 + 253 + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 254 + p4d_p = pgtable_area; 255 + memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D); 256 + pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D; 257 + 258 + pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS); 259 + } else { 260 + pud_p = pgtable_area; 261 + memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); 262 + pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; 263 + 264 + pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS); 265 + } 266 + native_set_pgd(pgd_p, pgd); 267 + } 268 + 269 + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 270 + p4d_p += p4d_index(vaddr); 271 + if (native_p4d_val(*p4d_p)) { 272 + pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK); 273 + } else { 274 + p4d_t p4d; 275 + 276 + pud_p = pgtable_area; 277 + memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); 278 + pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; 279 + 280 + p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS); 281 + native_set_p4d(p4d_p, p4d); 282 + } 283 + } 284 + 285 + pud_p += pud_index(vaddr); 286 + if (native_pud_val(*pud_p)) { 287 + if (native_pud_val(*pud_p) & _PAGE_PSE) 288 + goto out; 289 + 290 + pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK); 291 + } else { 292 + pud_t pud; 293 + 294 + pmd_p = pgtable_area; 295 + memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); 296 + pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD; 297 + 298 + pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS); 299 + native_set_pud(pud_p, pud); 300 + } 301 + 302 + pmd_p += pmd_index(vaddr); 303 + if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE)) 304 + native_set_pmd(pmd_p, native_make_pmd(pmd_val)); 305 + 306 + out: 307 + return pgtable_area; 308 + } 309 + 310 + static unsigned long __init sme_pgtable_calc(unsigned long len) 311 + { 312 + unsigned long p4d_size, pud_size, pmd_size; 313 + unsigned long total; 314 + 315 + /* 316 + * Perform a relatively simplistic calculation of the pagetable 317 + * entries that are needed. That mappings will be covered by 2MB 318 + * PMD entries so we can conservatively calculate the required 319 + * number of P4D, PUD and PMD structures needed to perform the 320 + * mappings. Incrementing the count for each covers the case where 321 + * the addresses cross entries. 
322 + */ 323 + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 324 + p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; 325 + p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D; 326 + pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1; 327 + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; 328 + } else { 329 + p4d_size = 0; 330 + pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; 331 + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; 332 + } 333 + pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1; 334 + pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; 335 + 336 + total = p4d_size + pud_size + pmd_size; 337 + 338 + /* 339 + * Now calculate the added pagetable structures needed to populate 340 + * the new pagetables. 341 + */ 342 + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 343 + p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE; 344 + p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D; 345 + pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE; 346 + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; 347 + } else { 348 + p4d_size = 0; 349 + pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE; 350 + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; 351 + } 352 + pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE; 353 + pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; 354 + 355 + total += p4d_size + pud_size + pmd_size; 356 + 357 + return total; 358 + } 359 + 360 + void __init sme_encrypt_kernel(void) 361 + { 362 + unsigned long workarea_start, workarea_end, workarea_len; 363 + unsigned long execute_start, execute_end, execute_len; 364 + unsigned long kernel_start, kernel_end, kernel_len; 365 + unsigned long pgtable_area_len; 366 + unsigned long paddr, pmd_flags; 367 + unsigned long decrypted_base; 368 + void *pgtable_area; 369 + pgd_t *pgd; 370 + 371 + if (!sme_active()) 372 + return; 373 + 374 + /* 375 + * Prepare for encrypting the kernel by building new pagetables with 376 + * the necessary attributes needed to encrypt the kernel in place. 377 + * 378 + * One range of virtual addresses will map the memory occupied 379 + * by the kernel as encrypted. 380 + * 381 + * Another range of virtual addresses will map the memory occupied 382 + * by the kernel as decrypted and write-protected. 383 + * 384 + * The use of write-protect attribute will prevent any of the 385 + * memory from being cached. 386 + */ 387 + 388 + /* Physical addresses gives us the identity mapped virtual addresses */ 389 + kernel_start = __pa_symbol(_text); 390 + kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE); 391 + kernel_len = kernel_end - kernel_start; 392 + 393 + /* Set the encryption workarea to be immediately after the kernel */ 394 + workarea_start = kernel_end; 395 + 396 + /* 397 + * Calculate required number of workarea bytes needed: 398 + * executable encryption area size: 399 + * stack page (PAGE_SIZE) 400 + * encryption routine page (PAGE_SIZE) 401 + * intermediate copy buffer (PMD_PAGE_SIZE) 402 + * pagetable structures for the encryption of the kernel 403 + * pagetable structures for workarea (in case not currently mapped) 404 + */ 405 + execute_start = workarea_start; 406 + execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE; 407 + execute_len = execute_end - execute_start; 408 + 409 + /* 410 + * One PGD for both encrypted and decrypted mappings and a set of 411 + * PUDs and PMDs for each of the encrypted and decrypted mappings. 
412 + */ 413 + pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD; 414 + pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2; 415 + 416 + /* PUDs and PMDs needed in the current pagetables for the workarea */ 417 + pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len); 418 + 419 + /* 420 + * The total workarea includes the executable encryption area and 421 + * the pagetable area. 422 + */ 423 + workarea_len = execute_len + pgtable_area_len; 424 + workarea_end = workarea_start + workarea_len; 425 + 426 + /* 427 + * Set the address to the start of where newly created pagetable 428 + * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable 429 + * structures are created when the workarea is added to the current 430 + * pagetables and when the new encrypted and decrypted kernel 431 + * mappings are populated. 432 + */ 433 + pgtable_area = (void *)execute_end; 434 + 435 + /* 436 + * Make sure the current pagetable structure has entries for 437 + * addressing the workarea. 438 + */ 439 + pgd = (pgd_t *)native_read_cr3_pa(); 440 + paddr = workarea_start; 441 + while (paddr < workarea_end) { 442 + pgtable_area = sme_populate_pgd(pgd, pgtable_area, 443 + paddr, 444 + paddr + PMD_FLAGS); 445 + 446 + paddr += PMD_PAGE_SIZE; 447 + } 448 + 449 + /* Flush the TLB - no globals so cr3 is enough */ 450 + native_write_cr3(__native_read_cr3()); 451 + 452 + /* 453 + * A new pagetable structure is being built to allow for the kernel 454 + * to be encrypted. It starts with an empty PGD that will then be 455 + * populated with new PUDs and PMDs as the encrypted and decrypted 456 + * kernel mappings are created. 457 + */ 458 + pgd = pgtable_area; 459 + memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD); 460 + pgtable_area += sizeof(*pgd) * PTRS_PER_PGD; 461 + 462 + /* Add encrypted kernel (identity) mappings */ 463 + pmd_flags = PMD_FLAGS | _PAGE_ENC; 464 + paddr = kernel_start; 465 + while (paddr < kernel_end) { 466 + pgtable_area = sme_populate_pgd(pgd, pgtable_area, 467 + paddr, 468 + paddr + pmd_flags); 469 + 470 + paddr += PMD_PAGE_SIZE; 471 + } 472 + 473 + /* 474 + * A different PGD index/entry must be used to get different 475 + * pagetable entries for the decrypted mapping. Choose the next 476 + * PGD index and convert it to a virtual address to be used as 477 + * the base of the mapping. 478 + */ 479 + decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1); 480 + decrypted_base <<= PGDIR_SHIFT; 481 + 482 + /* Add decrypted, write-protected kernel (non-identity) mappings */ 483 + pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT); 484 + paddr = kernel_start; 485 + while (paddr < kernel_end) { 486 + pgtable_area = sme_populate_pgd(pgd, pgtable_area, 487 + paddr + decrypted_base, 488 + paddr + pmd_flags); 489 + 490 + paddr += PMD_PAGE_SIZE; 491 + } 492 + 493 + /* Add decrypted workarea mappings to both kernel mappings */ 494 + paddr = workarea_start; 495 + while (paddr < workarea_end) { 496 + pgtable_area = sme_populate_pgd(pgd, pgtable_area, 497 + paddr, 498 + paddr + PMD_FLAGS); 499 + 500 + pgtable_area = sme_populate_pgd(pgd, pgtable_area, 501 + paddr + decrypted_base, 502 + paddr + PMD_FLAGS); 503 + 504 + paddr += PMD_PAGE_SIZE; 505 + } 506 + 507 + /* Perform the encryption */ 508 + sme_encrypt_execute(kernel_start, kernel_start + decrypted_base, 509 + kernel_len, workarea_start, (unsigned long)pgd); 510 + 511 + /* 512 + * At this point we are running encrypted. 
Remove the mappings for 513 + * the decrypted areas - all that is needed for this is to remove 514 + * the PGD entry/entries. 515 + */ 516 + sme_clear_pgd(pgd, kernel_start + decrypted_base, 517 + kernel_end + decrypted_base); 518 + 519 + sme_clear_pgd(pgd, workarea_start + decrypted_base, 520 + workarea_end + decrypted_base); 521 + 522 + /* Flush the TLB - no globals so cr3 is enough */ 523 + native_write_cr3(__native_read_cr3()); 524 + } 525 + 526 + void __init __nostackprotector sme_enable(struct boot_params *bp) 527 + { 528 + const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off; 529 + unsigned int eax, ebx, ecx, edx; 530 + bool active_by_default; 531 + unsigned long me_mask; 532 + char buffer[16]; 533 + u64 msr; 534 + 535 + /* Check for the SME support leaf */ 536 + eax = 0x80000000; 537 + ecx = 0; 538 + native_cpuid(&eax, &ebx, &ecx, &edx); 539 + if (eax < 0x8000001f) 540 + return; 541 + 542 + /* 543 + * Check for the SME feature: 544 + * CPUID Fn8000_001F[EAX] - Bit 0 545 + * Secure Memory Encryption support 546 + * CPUID Fn8000_001F[EBX] - Bits 5:0 547 + * Pagetable bit position used to indicate encryption 548 + */ 549 + eax = 0x8000001f; 550 + ecx = 0; 551 + native_cpuid(&eax, &ebx, &ecx, &edx); 552 + if (!(eax & 1)) 553 + return; 554 + 555 + me_mask = 1UL << (ebx & 0x3f); 556 + 557 + /* Check if SME is enabled */ 558 + msr = __rdmsr(MSR_K8_SYSCFG); 559 + if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) 560 + return; 561 + 562 + /* 563 + * Fixups have not been applied to phys_base yet and we're running 564 + * identity mapped, so we must obtain the address to the SME command 565 + * line argument data using rip-relative addressing. 566 + */ 567 + asm ("lea sme_cmdline_arg(%%rip), %0" 568 + : "=r" (cmdline_arg) 569 + : "p" (sme_cmdline_arg)); 570 + asm ("lea sme_cmdline_on(%%rip), %0" 571 + : "=r" (cmdline_on) 572 + : "p" (sme_cmdline_on)); 573 + asm ("lea sme_cmdline_off(%%rip), %0" 574 + : "=r" (cmdline_off) 575 + : "p" (sme_cmdline_off)); 576 + 577 + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT)) 578 + active_by_default = true; 579 + else 580 + active_by_default = false; 581 + 582 + cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr | 583 + ((u64)bp->ext_cmd_line_ptr << 32)); 584 + 585 + cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)); 586 + 587 + if (!strncmp(buffer, cmdline_on, sizeof(buffer))) 588 + sme_me_mask = me_mask; 589 + else if (!strncmp(buffer, cmdline_off, sizeof(buffer))) 590 + sme_me_mask = 0; 591 + else 592 + sme_me_mask = active_by_default ? me_mask : 0; 593 + }
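For readers following the mask handling in this file: __sme_set(), __sme_clr(), __sme_pa() and sme_active() are defined in the mem_encrypt headers, outside this hunk. As a sketch of the assumed semantics, they reduce to simple operations on sme_me_mask:

/* Sketch of the assumed helper semantics; the actual definitions live in
 * the linux/ and asm/ mem_encrypt headers, not in this diff.
 */
#define __sme_set(x)	((x) | sme_me_mask)		/* tag as encrypted */
#define __sme_clr(x)	((x) & ~sme_me_mask)		/* strip the C-bit */
#define __sme_pa(x)	(__pa(x) | sme_me_mask)

static inline bool sme_active(void)
{
	return !!sme_me_mask;	/* a zero mask means SME is off */
}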
+149
arch/x86/mm/mem_encrypt_boot.S
··· 1 + /*
2 + * AMD Memory Encryption Support
3 + *
4 + * Copyright (C) 2016 Advanced Micro Devices, Inc.
5 + *
6 + * Author: Tom Lendacky <thomas.lendacky@amd.com>
7 + *
8 + * This program is free software; you can redistribute it and/or modify
9 + * it under the terms of the GNU General Public License version 2 as
10 + * published by the Free Software Foundation.
11 + */
12 +
13 + #include <linux/linkage.h>
14 + #include <asm/pgtable.h>
15 + #include <asm/page.h>
16 + #include <asm/processor-flags.h>
17 + #include <asm/msr-index.h>
18 +
19 + .text
20 + .code64
21 + ENTRY(sme_encrypt_execute)
22 +
23 + /*
24 + * Entry parameters:
25 + * RDI - virtual address for the encrypted kernel mapping
26 + * RSI - virtual address for the decrypted kernel mapping
27 + * RDX - length of kernel
28 + * RCX - virtual address of the encryption workarea, including:
29 + * - stack page (PAGE_SIZE)
30 + * - encryption routine page (PAGE_SIZE)
31 + * - intermediate copy buffer (PMD_PAGE_SIZE)
32 + * R8 - physical address of the pagetables to use for encryption
33 + */
34 +
35 + push %rbp
36 + movq %rsp, %rbp /* RBP now has original stack pointer */
37 +
38 + /* Set up a one page stack in the non-encrypted memory area */
39 + movq %rcx, %rax /* Workarea stack page */
40 + leaq PAGE_SIZE(%rax), %rsp /* Set new stack pointer */
41 + addq $PAGE_SIZE, %rax /* Workarea encryption routine */
42 +
43 + push %r12
44 + movq %rdi, %r10 /* Encrypted kernel */
45 + movq %rsi, %r11 /* Decrypted kernel */
46 + movq %rdx, %r12 /* Kernel length */
47 +
48 + /* Copy encryption routine into the workarea */
49 + movq %rax, %rdi /* Workarea encryption routine */
50 + leaq __enc_copy(%rip), %rsi /* Encryption routine */
51 + movq $(.L__enc_copy_end - __enc_copy), %rcx /* Encryption routine length */
52 + rep movsb
53 +
54 + /* Setup registers for call */
55 + movq %r10, %rdi /* Encrypted kernel */
56 + movq %r11, %rsi /* Decrypted kernel */
57 + movq %r8, %rdx /* Pagetables used for encryption */
58 + movq %r12, %rcx /* Kernel length */
59 + movq %rax, %r8 /* Workarea encryption routine */
60 + addq $PAGE_SIZE, %r8 /* Workarea intermediate copy buffer */
61 +
62 + call *%rax /* Call the encryption routine */
63 +
64 + pop %r12
65 +
66 + movq %rbp, %rsp /* Restore original stack pointer */
67 + pop %rbp
68 +
69 + ret
70 + ENDPROC(sme_encrypt_execute)
71 +
72 + ENTRY(__enc_copy)
73 + /*
74 + * Routine used to encrypt kernel.
75 + * This routine must be run outside of the kernel proper since
76 + * the kernel will be encrypted during the process. So this
77 + * routine is defined here and then copied to an area outside
78 + * of the kernel where it will remain and run decrypted
79 + * during execution.
80 + *
81 + * On entry the registers must be:
82 + * RDI - virtual address for the encrypted kernel mapping
83 + * RSI - virtual address for the decrypted kernel mapping
84 + * RDX - address of the pagetables to use for encryption
85 + * RCX - length of kernel
86 + * R8 - intermediate copy buffer
87 + *
88 + * RAX - points to this routine
89 + *
90 + * The kernel will be encrypted by copying from the non-encrypted
91 + * kernel space to an intermediate buffer and then copying from the
92 + * intermediate buffer back to the encrypted kernel space. The physical
93 + * addresses of the two kernel space mappings are the same which
94 + * results in the kernel being encrypted "in place". 

95 + */ 96 + /* Enable the new page tables */ 97 + mov %rdx, %cr3 98 + 99 + /* Flush any global TLBs */ 100 + mov %cr4, %rdx 101 + andq $~X86_CR4_PGE, %rdx 102 + mov %rdx, %cr4 103 + orq $X86_CR4_PGE, %rdx 104 + mov %rdx, %cr4 105 + 106 + /* Set the PAT register PA5 entry to write-protect */ 107 + push %rcx 108 + movl $MSR_IA32_CR_PAT, %ecx 109 + rdmsr 110 + push %rdx /* Save original PAT value */ 111 + andl $0xffff00ff, %edx /* Clear PA5 */ 112 + orl $0x00000500, %edx /* Set PA5 to WP */ 113 + wrmsr 114 + pop %rdx /* RDX contains original PAT value */ 115 + pop %rcx 116 + 117 + movq %rcx, %r9 /* Save kernel length */ 118 + movq %rdi, %r10 /* Save encrypted kernel address */ 119 + movq %rsi, %r11 /* Save decrypted kernel address */ 120 + 121 + wbinvd /* Invalidate any cache entries */ 122 + 123 + /* Copy/encrypt 2MB at a time */ 124 + 1: 125 + movq %r11, %rsi /* Source - decrypted kernel */ 126 + movq %r8, %rdi /* Dest - intermediate copy buffer */ 127 + movq $PMD_PAGE_SIZE, %rcx /* 2MB length */ 128 + rep movsb 129 + 130 + movq %r8, %rsi /* Source - intermediate copy buffer */ 131 + movq %r10, %rdi /* Dest - encrypted kernel */ 132 + movq $PMD_PAGE_SIZE, %rcx /* 2MB length */ 133 + rep movsb 134 + 135 + addq $PMD_PAGE_SIZE, %r11 136 + addq $PMD_PAGE_SIZE, %r10 137 + subq $PMD_PAGE_SIZE, %r9 /* Kernel length decrement */ 138 + jnz 1b /* Kernel length not zero? */ 139 + 140 + /* Restore PAT register */ 141 + push %rdx /* Save original PAT value */ 142 + movl $MSR_IA32_CR_PAT, %ecx 143 + rdmsr 144 + pop %rdx /* Restore original PAT value */ 145 + wrmsr 146 + 147 + ret 148 + .L__enc_copy_end: 149 + ENDPROC(__enc_copy)
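The copy loop in __enc_copy() above is easier to follow in C. This is a readability sketch only; it cannot replace the assembly, which has to run from the workarea copy because the kernel image underneath it is being rewritten:

#include <linux/string.h>
#include <asm/page_types.h>

/* Readability sketch of the 2MB-at-a-time encrypt-in-place loop; len is a
 * multiple of PMD_PAGE_SIZE because sme_encrypt_kernel() aligns kernel_end.
 */
static void enc_copy_sketch(void *enc_map, void *dec_map,
			    unsigned long len, void *copy_buf)
{
	while (len) {
		memcpy(copy_buf, dec_map, PMD_PAGE_SIZE); /* read via decrypted/WP mapping */
		memcpy(enc_map, copy_buf, PMD_PAGE_SIZE); /* write via encrypted mapping */

		dec_map += PMD_PAGE_SIZE;
		enc_map += PMD_PAGE_SIZE;
		len -= PMD_PAGE_SIZE;
	}
}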
+6 -6
arch/x86/mm/mmap.c
··· 37 37 .flags = -1, 38 38 }; 39 39 40 - unsigned long tasksize_32bit(void) 40 + unsigned long task_size_32bit(void) 41 41 { 42 42 return IA32_PAGE_OFFSET; 43 43 } 44 44 45 - unsigned long tasksize_64bit(void) 45 + unsigned long task_size_64bit(int full_addr_space) 46 46 { 47 - return TASK_SIZE_MAX; 47 + return full_addr_space ? TASK_SIZE_MAX : DEFAULT_MAP_WINDOW; 48 48 } 49 49 50 50 static unsigned long stack_maxrandom_size(unsigned long task_size) 51 51 { 52 52 unsigned long max = 0; 53 53 if (current->flags & PF_RANDOMIZE) { 54 - max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit()); 54 + max = (-1UL) & __STACK_RND_MASK(task_size == task_size_32bit()); 55 55 max <<= PAGE_SHIFT; 56 56 } 57 57 ··· 141 141 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 142 142 143 143 arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, 144 - arch_rnd(mmap64_rnd_bits), tasksize_64bit()); 144 + arch_rnd(mmap64_rnd_bits), task_size_64bit(0)); 145 145 146 146 #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES 147 147 /* ··· 151 151 * mmap_base, the compat syscall uses mmap_compat_base. 152 152 */ 153 153 arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base, 154 - arch_rnd(mmap32_rnd_bits), tasksize_32bit()); 154 + arch_rnd(mmap32_rnd_bits), task_size_32bit()); 155 155 #endif 156 156 } 157 157
+32 -1
arch/x86/mm/mpx.c
··· 355 355 */ 356 356 bd_base = mpx_get_bounds_dir(); 357 357 down_write(&mm->mmap_sem); 358 + 359 + /* MPX doesn't support addresses above 47 bits yet. */ 360 + if (find_vma(mm, DEFAULT_MAP_WINDOW)) { 361 + pr_warn_once("%s (%d): MPX cannot handle addresses " 362 + "above 47-bits. Disabling.", 363 + current->comm, current->pid); 364 + ret = -ENXIO; 365 + goto out; 366 + } 358 367 mm->context.bd_addr = bd_base; 359 368 if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR) 360 369 ret = -ENXIO; 361 - 370 + out: 362 371 up_write(&mm->mmap_sem); 363 372 return ret; 364 373 } ··· 1038 1029 ret = mpx_unmap_tables(mm, start, end); 1039 1030 if (ret) 1040 1031 force_sig(SIGSEGV, current); 1032 + } 1033 + 1034 + /* MPX cannot handle addresses above 47 bits yet. */ 1035 + unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len, 1036 + unsigned long flags) 1037 + { 1038 + if (!kernel_managing_mpx_tables(current->mm)) 1039 + return addr; 1040 + if (addr + len <= DEFAULT_MAP_WINDOW) 1041 + return addr; 1042 + if (flags & MAP_FIXED) 1043 + return -ENOMEM; 1044 + 1045 + /* 1046 + * Requested len is larger than the whole area we're allowed to map in. 1047 + * Resetting hinting address wouldn't do much good -- fail early. 1048 + */ 1049 + if (len > DEFAULT_MAP_WINDOW) 1050 + return -ENOMEM; 1051 + 1052 + /* Look for unmap area within DEFAULT_MAP_WINDOW */ 1053 + return 0; 1041 1054 }
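The new mpx_unmapped_area_check() helper is consumed by the x86 get_unmapped_area() paths; roughly as below (the exact call sites live in arch/x86/kernel/sys_x86_64.c, outside this hunk):

	/* Inside the x86 arch_get_unmapped_area() paths (sketch): */
	addr = mpx_unmapped_area_check(addr, len, flags);
	if (IS_ERR_VALUE(addr))
		return addr;
	/* addr == 0 here means "search below DEFAULT_MAP_WINDOW as usual" */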
+67
arch/x86/mm/pageattr.c
··· 1775 1775 __pgprot(0), 1, 0, NULL); 1776 1776 } 1777 1777 1778 + static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) 1779 + { 1780 + struct cpa_data cpa; 1781 + unsigned long start; 1782 + int ret; 1783 + 1784 + /* Nothing to do if the SME is not active */ 1785 + if (!sme_active()) 1786 + return 0; 1787 + 1788 + /* Should not be working on unaligned addresses */ 1789 + if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr)) 1790 + addr &= PAGE_MASK; 1791 + 1792 + start = addr; 1793 + 1794 + memset(&cpa, 0, sizeof(cpa)); 1795 + cpa.vaddr = &addr; 1796 + cpa.numpages = numpages; 1797 + cpa.mask_set = enc ? __pgprot(_PAGE_ENC) : __pgprot(0); 1798 + cpa.mask_clr = enc ? __pgprot(0) : __pgprot(_PAGE_ENC); 1799 + cpa.pgd = init_mm.pgd; 1800 + 1801 + /* Must avoid aliasing mappings in the highmem code */ 1802 + kmap_flush_unused(); 1803 + vm_unmap_aliases(); 1804 + 1805 + /* 1806 + * Before changing the encryption attribute, we need to flush caches. 1807 + */ 1808 + if (static_cpu_has(X86_FEATURE_CLFLUSH)) 1809 + cpa_flush_range(start, numpages, 1); 1810 + else 1811 + cpa_flush_all(1); 1812 + 1813 + ret = __change_page_attr_set_clr(&cpa, 1); 1814 + 1815 + /* 1816 + * After changing the encryption attribute, we need to flush TLBs 1817 + * again in case any speculative TLB caching occurred (but no need 1818 + * to flush caches again). We could just use cpa_flush_all(), but 1819 + * in case TLB flushing gets optimized in the cpa_flush_range() 1820 + * path use the same logic as above. 1821 + */ 1822 + if (static_cpu_has(X86_FEATURE_CLFLUSH)) 1823 + cpa_flush_range(start, numpages, 0); 1824 + else 1825 + cpa_flush_all(0); 1826 + 1827 + return ret; 1828 + } 1829 + 1830 + int set_memory_encrypted(unsigned long addr, int numpages) 1831 + { 1832 + return __set_memory_enc_dec(addr, numpages, true); 1833 + } 1834 + EXPORT_SYMBOL_GPL(set_memory_encrypted); 1835 + 1836 + int set_memory_decrypted(unsigned long addr, int numpages) 1837 + { 1838 + return __set_memory_enc_dec(addr, numpages, false); 1839 + } 1840 + EXPORT_SYMBOL_GPL(set_memory_decrypted); 1841 + 1778 1842 int set_pages_uc(struct page *page, int numpages) 1779 1843 { 1780 1844 unsigned long addr = (unsigned long)page_address(page); ··· 2083 2019 2084 2020 if (!(page_flags & _PAGE_RW)) 2085 2021 cpa.mask_clr = __pgprot(_PAGE_RW); 2022 + 2023 + if (!(page_flags & _PAGE_ENC)) 2024 + cpa.mask_clr = pgprot_encrypted(cpa.mask_clr); 2086 2025 2087 2026 cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); 2088 2027
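Usage sketch for the new set_memory_encrypted()/set_memory_decrypted() helpers: a driver that must share a page-aligned buffer with a device that cannot DMA to encrypted memory clears the C-bit on that buffer, much like swiotlb_set_mem_attributes() does for the bounce buffers. The function and its caller here are illustrative:

#include <linux/mm.h>
#include <asm/set_memory.h>

/* Illustrative: make a page-aligned kernel buffer shareable with a device */
static int share_buffer_with_device(void *vaddr, size_t size)
{
	unsigned long addr = (unsigned long)vaddr;
	int numpages = PAGE_ALIGN(size) >> PAGE_SHIFT;

	return set_memory_decrypted(addr, numpages);	/* 0 on success */
}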
+6 -3
arch/x86/mm/pat.c
··· 293 293 * pat_init - Initialize PAT MSR and PAT table 294 294 * 295 295 * This function initializes PAT MSR and PAT table with an OS-defined value 296 - * to enable additional cache attributes, WC and WT. 296 + * to enable additional cache attributes, WC, WT and WP. 297 297 * 298 298 * This function must be called on all CPUs using the specific sequence of 299 299 * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this ··· 352 352 * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS 353 353 * 011 3 UC : _PAGE_CACHE_MODE_UC 354 354 * 100 4 WB : Reserved 355 - * 101 5 WC : Reserved 355 + * 101 5 WP : _PAGE_CACHE_MODE_WP 356 356 * 110 6 UC-: Reserved 357 357 * 111 7 WT : _PAGE_CACHE_MODE_WT 358 358 * ··· 360 360 * corresponding types in the presence of PAT errata. 361 361 */ 362 362 pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | 363 - PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT); 363 + PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT); 364 364 } 365 365 366 366 if (!boot_cpu_done) { ··· 744 744 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, 745 745 unsigned long size, pgprot_t vma_prot) 746 746 { 747 + if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size)) 748 + vma_prot = pgprot_decrypted(vma_prot); 749 + 747 750 return vma_prot; 748 751 } 749 752
+4 -4
arch/x86/mm/pgtable.c
··· 56 56 { 57 57 pgtable_page_dtor(pte); 58 58 paravirt_release_pte(page_to_pfn(pte)); 59 - tlb_remove_page(tlb, pte); 59 + tlb_remove_table(tlb, pte); 60 60 } 61 61 62 62 #if CONFIG_PGTABLE_LEVELS > 2 ··· 72 72 tlb->need_flush_all = 1; 73 73 #endif 74 74 pgtable_pmd_page_dtor(page); 75 - tlb_remove_page(tlb, page); 75 + tlb_remove_table(tlb, page); 76 76 } 77 77 78 78 #if CONFIG_PGTABLE_LEVELS > 3 79 79 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) 80 80 { 81 81 paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); 82 - tlb_remove_page(tlb, virt_to_page(pud)); 82 + tlb_remove_table(tlb, virt_to_page(pud)); 83 83 } 84 84 85 85 #if CONFIG_PGTABLE_LEVELS > 4 86 86 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) 87 87 { 88 88 paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); 89 - tlb_remove_page(tlb, virt_to_page(p4d)); 89 + tlb_remove_table(tlb, virt_to_page(p4d)); 90 90 } 91 91 #endif /* CONFIG_PGTABLE_LEVELS > 4 */ 92 92 #endif /* CONFIG_PGTABLE_LEVELS > 3 */
+251 -88
arch/x86/mm/tlb.c
··· 28 28 * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi 29 29 */ 30 30 31 + atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); 32 + 33 + static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, 34 + u16 *new_asid, bool *need_flush) 35 + { 36 + u16 asid; 37 + 38 + if (!static_cpu_has(X86_FEATURE_PCID)) { 39 + *new_asid = 0; 40 + *need_flush = true; 41 + return; 42 + } 43 + 44 + for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { 45 + if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) != 46 + next->context.ctx_id) 47 + continue; 48 + 49 + *new_asid = asid; 50 + *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) < 51 + next_tlb_gen); 52 + return; 53 + } 54 + 55 + /* 56 + * We don't currently own an ASID slot on this CPU. 57 + * Allocate a slot. 58 + */ 59 + *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1; 60 + if (*new_asid >= TLB_NR_DYN_ASIDS) { 61 + *new_asid = 0; 62 + this_cpu_write(cpu_tlbstate.next_asid, 1); 63 + } 64 + *need_flush = true; 65 + } 66 + 31 67 void leave_mm(int cpu) 32 68 { 33 69 struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); ··· 79 43 if (loaded_mm == &init_mm) 80 44 return; 81 45 82 - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) 83 - BUG(); 46 + /* Warn if we're not lazy. */ 47 + WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))); 84 48 85 49 switch_mm(NULL, &init_mm, NULL); 86 50 } 87 - EXPORT_SYMBOL_GPL(leave_mm); 88 51 89 52 void switch_mm(struct mm_struct *prev, struct mm_struct *next, 90 53 struct task_struct *tsk) ··· 98 63 void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, 99 64 struct task_struct *tsk) 100 65 { 101 - unsigned cpu = smp_processor_id(); 102 66 struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); 67 + u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); 68 + unsigned cpu = smp_processor_id(); 69 + u64 next_tlb_gen; 103 70 104 71 /* 105 - * NB: The scheduler will call us with prev == next when 106 - * switching from lazy TLB mode to normal mode if active_mm 107 - * isn't changing. When this happens, there is no guarantee 108 - * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next. 72 + * NB: The scheduler will call us with prev == next when switching 73 + * from lazy TLB mode to normal mode if active_mm isn't changing. 74 + * When this happens, we don't assume that CR3 (and hence 75 + * cpu_tlbstate.loaded_mm) matches next. 109 76 * 110 77 * NB: leave_mm() calls us with prev == NULL and tsk == NULL. 111 78 */ 112 79 113 - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); 80 + /* We don't want flush_tlb_func_* to run concurrently with us. */ 81 + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) 82 + WARN_ON_ONCE(!irqs_disabled()); 83 + 84 + /* 85 + * Verify that CR3 is what we think it is. This will catch 86 + * hypothetical buggy code that directly switches to swapper_pg_dir 87 + * without going through leave_mm() / switch_mm_irqs_off() or that 88 + * does something like write_cr3(read_cr3_pa()). 89 + */ 90 + VM_BUG_ON(__read_cr3() != (__sme_pa(real_prev->pgd) | prev_asid)); 114 91 115 92 if (real_prev == next) { 93 + VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != 94 + next->context.ctx_id); 95 + 96 + if (cpumask_test_cpu(cpu, mm_cpumask(next))) { 97 + /* 98 + * There's nothing to do: we weren't lazy, and we 99 + * aren't changing our mm. We don't need to flush 100 + * anything, nor do we need to update CR3, CR4, or 101 + * LDTR. 
102 + */ 103 + return; 104 + } 105 + 106 + /* Resume remote flushes and then read tlb_gen. */ 107 + cpumask_set_cpu(cpu, mm_cpumask(next)); 108 + next_tlb_gen = atomic64_read(&next->context.tlb_gen); 109 + 110 + if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) < 111 + next_tlb_gen) { 112 + /* 113 + * Ideally, we'd have a flush_tlb() variant that 114 + * takes the known CR3 value as input. This would 115 + * be faster on Xen PV and on hypothetical CPUs 116 + * on which INVPCID is fast. 117 + */ 118 + this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen, 119 + next_tlb_gen); 120 + write_cr3(__sme_pa(next->pgd) | prev_asid); 121 + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 122 + TLB_FLUSH_ALL); 123 + } 124 + 116 125 /* 117 - * There's nothing to do: we always keep the per-mm control 118 - * regs in sync with cpu_tlbstate.loaded_mm. Just 119 - * sanity-check mm_cpumask. 126 + * We just exited lazy mode, which means that CR4 and/or LDTR 127 + * may be stale. (Changes to the required CR4 and LDTR states 128 + * are not reflected in tlb_gen.) 120 129 */ 121 - if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next)))) 122 - cpumask_set_cpu(cpu, mm_cpumask(next)); 123 - return; 130 + } else { 131 + u16 new_asid; 132 + bool need_flush; 133 + 134 + if (IS_ENABLED(CONFIG_VMAP_STACK)) { 135 + /* 136 + * If our current stack is in vmalloc space and isn't 137 + * mapped in the new pgd, we'll double-fault. Forcibly 138 + * map it. 139 + */ 140 + unsigned int index = pgd_index(current_stack_pointer()); 141 + pgd_t *pgd = next->pgd + index; 142 + 143 + if (unlikely(pgd_none(*pgd))) 144 + set_pgd(pgd, init_mm.pgd[index]); 145 + } 146 + 147 + /* Stop remote flushes for the previous mm */ 148 + if (cpumask_test_cpu(cpu, mm_cpumask(real_prev))) 149 + cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); 150 + 151 + VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); 152 + 153 + /* 154 + * Start remote flushes and then read tlb_gen. 155 + */ 156 + cpumask_set_cpu(cpu, mm_cpumask(next)); 157 + next_tlb_gen = atomic64_read(&next->context.tlb_gen); 158 + 159 + choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); 160 + 161 + if (need_flush) { 162 + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); 163 + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); 164 + write_cr3(__sme_pa(next->pgd) | new_asid); 165 + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 166 + TLB_FLUSH_ALL); 167 + } else { 168 + /* The new ASID is already up to date. */ 169 + write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH); 170 + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); 171 + } 172 + 173 + this_cpu_write(cpu_tlbstate.loaded_mm, next); 174 + this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); 124 175 } 125 176 126 - if (IS_ENABLED(CONFIG_VMAP_STACK)) { 127 - /* 128 - * If our current stack is in vmalloc space and isn't 129 - * mapped in the new pgd, we'll double-fault. Forcibly 130 - * map it. 131 - */ 132 - unsigned int stack_pgd_index = pgd_index(current_stack_pointer()); 133 - 134 - pgd_t *pgd = next->pgd + stack_pgd_index; 135 - 136 - if (unlikely(pgd_none(*pgd))) 137 - set_pgd(pgd, init_mm.pgd[stack_pgd_index]); 138 - } 139 - 140 - this_cpu_write(cpu_tlbstate.loaded_mm, next); 141 - 142 - WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); 143 - cpumask_set_cpu(cpu, mm_cpumask(next)); 144 - 145 - /* 146 - * Re-load page tables. 147 - * 148 - * This logic has an ordering constraint: 149 - * 150 - * CPU 0: Write to a PTE for 'next' 151 - * CPU 0: load bit 1 in mm_cpumask. 
if nonzero, send IPI. 152 - * CPU 1: set bit 1 in next's mm_cpumask 153 - * CPU 1: load from the PTE that CPU 0 writes (implicit) 154 - * 155 - * We need to prevent an outcome in which CPU 1 observes 156 - * the new PTE value and CPU 0 observes bit 1 clear in 157 - * mm_cpumask. (If that occurs, then the IPI will never 158 - * be sent, and CPU 0's TLB will contain a stale entry.) 159 - * 160 - * The bad outcome can occur if either CPU's load is 161 - * reordered before that CPU's store, so both CPUs must 162 - * execute full barriers to prevent this from happening. 163 - * 164 - * Thus, switch_mm needs a full barrier between the 165 - * store to mm_cpumask and any operation that could load 166 - * from next->pgd. TLB fills are special and can happen 167 - * due to instruction fetches or for no reason at all, 168 - * and neither LOCK nor MFENCE orders them. 169 - * Fortunately, load_cr3() is serializing and gives the 170 - * ordering guarantee we need. 171 - */ 172 - load_cr3(next->pgd); 173 - 174 - /* 175 - * This gets called via leave_mm() in the idle path where RCU 176 - * functions differently. Tracing normally uses RCU, so we have to 177 - * call the tracepoint specially here. 178 - */ 179 - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); 180 - 181 - /* Stop flush ipis for the previous mm */ 182 - WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) && 183 - real_prev != &init_mm); 184 - cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); 185 - 186 - /* Load per-mm CR4 and LDTR state */ 187 177 load_mm_cr4(next); 188 178 switch_ldt(real_prev, next); 189 179 } 190 180 181 + /* 182 + * flush_tlb_func_common()'s memory ordering requirement is that any 183 + * TLB fills that happen after we flush the TLB are ordered after we 184 + * read active_mm's tlb_gen. We don't need any explicit barriers 185 + * because all x86 flush operations are serializing and the 186 + * atomic64_read operation won't be reordered by the compiler. 187 + */ 191 188 static void flush_tlb_func_common(const struct flush_tlb_info *f, 192 189 bool local, enum tlb_flush_reason reason) 193 190 { 191 + /* 192 + * We have three different tlb_gen values in here. They are: 193 + * 194 + * - mm_tlb_gen: the latest generation. 195 + * - local_tlb_gen: the generation that this CPU has already caught 196 + * up to. 197 + * - f->new_tlb_gen: the generation that the requester of the flush 198 + * wants us to catch up to. 199 + */ 200 + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); 201 + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); 202 + u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); 203 + u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); 204 + 194 205 /* This code cannot presently handle being reentered. */ 195 206 VM_WARN_ON(!irqs_disabled()); 196 207 197 - if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) { 198 - leave_mm(smp_processor_id()); 208 + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != 209 + loaded_mm->context.ctx_id); 210 + 211 + if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) { 212 + /* 213 + * We're in lazy mode -- don't flush. We can get here on 214 + * remote flushes due to races and on local flushes if a 215 + * kernel thread coincidentally flushes the mm it's lazily 216 + * still using. 
217 + */ 199 218 return; 200 219 } 201 220 202 - if (f->end == TLB_FLUSH_ALL) { 203 - local_flush_tlb(); 204 - if (local) 205 - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); 206 - trace_tlb_flush(reason, TLB_FLUSH_ALL); 207 - } else { 221 + if (unlikely(local_tlb_gen == mm_tlb_gen)) { 222 + /* 223 + * There's nothing to do: we're already up to date. This can 224 + * happen if two concurrent flushes happen -- the first flush to 225 + * be handled can catch us all the way up, leaving no work for 226 + * the second flush. 227 + */ 228 + trace_tlb_flush(reason, 0); 229 + return; 230 + } 231 + 232 + WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen); 233 + WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen); 234 + 235 + /* 236 + * If we get to this point, we know that our TLB is out of date. 237 + * This does not strictly imply that we need to flush (it's 238 + * possible that f->new_tlb_gen <= local_tlb_gen), but we're 239 + * going to need to flush in the very near future, so we might 240 + * as well get it over with. 241 + * 242 + * The only question is whether to do a full or partial flush. 243 + * 244 + * We do a partial flush if requested and two extra conditions 245 + * are met: 246 + * 247 + * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that 248 + * we've always done all needed flushes to catch up to 249 + * local_tlb_gen. If, for example, local_tlb_gen == 2 and 250 + * f->new_tlb_gen == 3, then we know that the flush needed to bring 251 + * us up to date for tlb_gen 3 is the partial flush we're 252 + * processing. 253 + * 254 + * As an example of why this check is needed, suppose that there 255 + * are two concurrent flushes. The first is a full flush that 256 + * changes context.tlb_gen from 1 to 2. The second is a partial 257 + * flush that changes context.tlb_gen from 2 to 3. If they get 258 + * processed on this CPU in reverse order, we'll see 259 + * local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL. 260 + * If we were to use __flush_tlb_single() and set local_tlb_gen to 261 + * 3, we'd be break the invariant: we'd update local_tlb_gen above 262 + * 1 without the full flush that's needed for tlb_gen 2. 263 + * 264 + * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimiation. 265 + * Partial TLB flushes are not all that much cheaper than full TLB 266 + * flushes, so it seems unlikely that it would be a performance win 267 + * to do a partial flush if that won't bring our TLB fully up to 268 + * date. By doing a full flush instead, we can increase 269 + * local_tlb_gen all the way to mm_tlb_gen and we can probably 270 + * avoid another flush in the very near future. 271 + */ 272 + if (f->end != TLB_FLUSH_ALL && 273 + f->new_tlb_gen == local_tlb_gen + 1 && 274 + f->new_tlb_gen == mm_tlb_gen) { 275 + /* Partial flush */ 208 276 unsigned long addr; 209 277 unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT; 278 + 210 279 addr = f->start; 211 280 while (addr < f->end) { 212 281 __flush_tlb_single(addr); ··· 319 180 if (local) 320 181 count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages); 321 182 trace_tlb_flush(reason, nr_pages); 183 + } else { 184 + /* Full flush. */ 185 + local_flush_tlb(); 186 + if (local) 187 + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); 188 + trace_tlb_flush(reason, TLB_FLUSH_ALL); 322 189 } 190 + 191 + /* Both paths above update our state to mm_tlb_gen. 
*/ 192 + this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); 323 193 } 324 194 325 195 static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) ··· 362 214 (info->end - info->start) >> PAGE_SHIFT); 363 215 364 216 if (is_uv_system()) { 217 + /* 218 + * This whole special case is confused. UV has a "Broadcast 219 + * Assist Unit", which seems to be a fancy way to send IPIs. 220 + * Back when x86 used an explicit TLB flush IPI, UV was 221 + * optimized to use its own mechanism. These days, x86 uses 222 + * smp_call_function_many(), but UV still uses a manual IPI, 223 + * and that IPI's action is out of date -- it does a manual 224 + * flush instead of calling flush_tlb_func_remote(). This 225 + * means that the percpu tlb_gen variables won't be updated 226 + * and we'll do pointless flushes on future context switches. 227 + * 228 + * Rather than hooking native_flush_tlb_others() here, I think 229 + * that UV should be updated so that smp_call_function_many(), 230 + * etc, are optimal on UV. 231 + */ 365 232 unsigned int cpu; 366 233 367 234 cpu = smp_processor_id(); ··· 413 250 414 251 cpu = get_cpu(); 415 252 416 - /* Synchronize with switch_mm. */ 417 - smp_mb(); 253 + /* This is also a barrier that synchronizes with switch_mm(). */ 254 + info.new_tlb_gen = inc_mm_tlb_gen(mm); 418 255 419 256 /* Should we flush just the requested range? */ 420 257 if ((end != TLB_FLUSH_ALL) && ··· 436 273 437 274 if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) 438 275 flush_tlb_others(mm_cpumask(mm), &info); 276 + 439 277 put_cpu(); 440 278 } 441 279 ··· 445 281 { 446 282 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); 447 283 __flush_tlb_all(); 448 - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) 449 - leave_mm(smp_processor_id()); 450 284 } 451 285 452 286 void flush_tlb_all(void) ··· 497 335 498 336 if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) 499 337 flush_tlb_others(&batch->cpumask, &info); 338 + 500 339 cpumask_clear(&batch->cpumask); 501 340 502 341 put_cpu();
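For reference, the CR3 values written in switch_mm_irqs_off() above combine three pieces. This is not a helper that exists in this series, just a sketch of the layout being assembled inline:

/* Sketch of the CR3 layout used above: the low 12 bits carry the PCID
 * (here the dynamic ASID, 0..TLB_NR_DYN_ASIDS-1), and bit 63 (CR3_NOFLUSH)
 * tells the CPU to keep the TLB entries tagged with that PCID.
 */
static inline unsigned long make_cr3(pgd_t *pgd, u16 asid, bool noflush)
{
	unsigned long cr3 = __sme_pa(pgd) | asid;

	return noflush ? cr3 | CR3_NOFLUSH : cr3;
}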
+2 -2
arch/x86/pci/common.c
··· 674 674 675 675 pa_data = boot_params.hdr.setup_data; 676 676 while (pa_data) { 677 - data = ioremap(pa_data, sizeof(*rom)); 677 + data = memremap(pa_data, sizeof(*rom), MEMREMAP_WB); 678 678 if (!data) 679 679 return -ENOMEM; 680 680 ··· 693 693 } 694 694 } 695 695 pa_data = data->next; 696 - iounmap(data); 696 + memunmap(data); 697 697 } 698 698 set_dma_domain_ops(dev); 699 699 set_dev_domain_options(dev);
+3 -3
arch/x86/platform/efi/efi.c
··· 1035 1035 /* 1036 1036 * Convenience functions to obtain memory types and attributes 1037 1037 */ 1038 - u32 efi_mem_type(unsigned long phys_addr) 1038 + int efi_mem_type(unsigned long phys_addr) 1039 1039 { 1040 1040 efi_memory_desc_t *md; 1041 1041 1042 1042 if (!efi_enabled(EFI_MEMMAP)) 1043 - return 0; 1043 + return -ENOTSUPP; 1044 1044 1045 1045 for_each_efi_memory_desc(md) { 1046 1046 if ((md->phys_addr <= phys_addr) && ··· 1048 1048 (md->num_pages << EFI_PAGE_SHIFT)))) 1049 1049 return md->type; 1050 1050 } 1051 - return 0; 1051 + return -EINVAL; 1052 1052 } 1053 1053 1054 1054 static int __init arch_parse_efi_cmdline(char *str)
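With efi_mem_type() now returning an errno on failure, callers should check for a negative value instead of treating type 0 as "not found". A minimal sketch:

#include <linux/efi.h>

static bool phys_addr_is_efi_runtime_data(unsigned long phys_addr)
{
	int type = efi_mem_type(phys_addr);

	if (type < 0)	/* -ENOTSUPP or -EINVAL: not described by the EFI memmap */
		return false;

	return type == EFI_RUNTIME_SERVICES_DATA;
}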
+11 -4
arch/x86/platform/efi/efi_64.c
··· 327 327 328 328 int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) 329 329 { 330 - unsigned long pfn, text; 330 + unsigned long pfn, text, pf; 331 331 struct page *page; 332 332 unsigned npages; 333 333 pgd_t *pgd; ··· 335 335 if (efi_enabled(EFI_OLD_MEMMAP)) 336 336 return 0; 337 337 338 - efi_scratch.efi_pgt = (pgd_t *)__pa(efi_pgd); 338 + /* 339 + * Since the PGD is encrypted, set the encryption mask so that when 340 + * this value is loaded into cr3 the PGD will be decrypted during 341 + * the pagetable walk. 342 + */ 343 + efi_scratch.efi_pgt = (pgd_t *)__sme_pa(efi_pgd); 339 344 pgd = efi_pgd; 340 345 341 346 /* ··· 350 345 * phys_efi_set_virtual_address_map(). 351 346 */ 352 347 pfn = pa_memmap >> PAGE_SHIFT; 353 - if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, _PAGE_NX | _PAGE_RW)) { 348 + pf = _PAGE_NX | _PAGE_RW | _PAGE_ENC; 349 + if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, pf)) { 354 350 pr_err("Error ident-mapping new memmap (0x%lx)!\n", pa_memmap); 355 351 return 1; 356 352 } ··· 394 388 text = __pa(_text); 395 389 pfn = text >> PAGE_SHIFT; 396 390 397 - if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, _PAGE_RW)) { 391 + pf = _PAGE_RW | _PAGE_ENC; 392 + if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, pf)) { 398 393 pr_err("Failed to map kernel text 1:1\n"); 399 394 return 1; 400 395 }
+12
arch/x86/realmode/init.c
··· 1 1 #include <linux/io.h> 2 2 #include <linux/slab.h> 3 3 #include <linux/memblock.h> 4 + #include <linux/mem_encrypt.h> 4 5 5 6 #include <asm/set_memory.h> 6 7 #include <asm/pgtable.h> ··· 60 59 61 60 base = (unsigned char *)real_mode_header; 62 61 62 + /* 63 + * If SME is active, the trampoline area will need to be in 64 + * decrypted memory in order to bring up other processors 65 + * successfully. 66 + */ 67 + set_memory_decrypted((unsigned long)base, size >> PAGE_SHIFT); 68 + 63 69 memcpy(base, real_mode_blob, size); 64 70 65 71 phys_base = __pa(base); ··· 107 99 trampoline_header->start = (u64) secondary_startup_64; 108 100 trampoline_cr4_features = &trampoline_header->cr4; 109 101 *trampoline_cr4_features = mmu_cr4_features; 102 + 103 + trampoline_header->flags = 0; 104 + if (sme_active()) 105 + trampoline_header->flags |= TH_FLAGS_SME_ACTIVE; 110 106 111 107 trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); 112 108 trampoline_pgd[0] = trampoline_pgd_entry.pgd;
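
The trampoline fix-up is one instance of a general SME pattern: any buffer that must be read by an agent running without the kernel's encrypted mappings has to have its encryption attribute cleared before it is populated, and restored before the memory is reused. A hedged sketch of that pattern with a made-up buffer:

    #include <linux/gfp.h>
    #include <asm/set_memory.h>

    /*
     * Illustrative only: share one page with an agent that cannot use the
     * kernel's encrypted mappings (firmware, a just-started AP, ...).
     */
    static void *alloc_shared_page(void)
    {
        void *buf = (void *)__get_free_page(GFP_KERNEL);

        if (!buf)
            return NULL;

        /* Clear the encryption attribute before anything is written. */
        if (set_memory_decrypted((unsigned long)buf, 1)) {
            free_page((unsigned long)buf);
            return NULL;
        }
        return buf;
    }

    static void free_shared_page(void *buf)
    {
        /* Restore the encryption attribute before reusing the page. */
        set_memory_encrypted((unsigned long)buf, 1);
        free_page((unsigned long)buf);
    }
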
+24
arch/x86/realmode/rm/trampoline_64.S
··· 30 30 #include <asm/msr.h> 31 31 #include <asm/segment.h> 32 32 #include <asm/processor-flags.h> 33 + #include <asm/realmode.h> 33 34 #include "realmode.h" 34 35 35 36 .text ··· 93 92 movl %edx, %fs 94 93 movl %edx, %gs 95 94 95 + /* 96 + * Check for memory encryption support. This is a safety net in 97 + * case BIOS hasn't done the necessary step of setting the bit in 98 + * the MSR for this AP. If SME is active and we've gotten this far 99 + * then it is safe for us to set the MSR bit and continue. If we 100 + * don't, we'll eventually crash trying to execute encrypted 101 + * instructions. 102 + */ 103 + bt $TH_FLAGS_SME_ACTIVE_BIT, pa_tr_flags 104 + jnc .Ldone 105 + movl $MSR_K8_SYSCFG, %ecx 106 + rdmsr 107 + bts $MSR_K8_SYSCFG_MEM_ENCRYPT_BIT, %eax 108 + jc .Ldone 109 + 110 + /* 111 + * Memory encryption is enabled but the SME enable bit for this 112 + * CPU has not been set. It is safe to set it, so do so. 113 + */ 114 + wrmsr 115 + .Ldone: 116 + 96 117 movl pa_tr_cr4, %eax 97 118 movl %eax, %cr4 # Enable PAE mode 98 119 ··· 170 147 tr_start: .space 8 171 148 GLOBAL(tr_efer) .space 8 172 149 GLOBAL(tr_cr4) .space 4 150 + GLOBAL(tr_flags) .space 4 173 151 END(trampoline_header) 174 152 175 153 #include "trampoline_common.S"
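
For readers who do not want to follow the assembly, the safety net is roughly equivalent to the C below. This is an illustration only: the MSR constants are the ones referenced above, but the function name is invented and the real check runs in the AP trampoline, not in ordinary kernel C.

    #include <asm/msr.h>
    #include <asm/msr-index.h>

    /* Rough C equivalent of the trampoline's safety net (illustration only). */
    static void ensure_sme_enable_bit(void)
    {
        u64 syscfg;

        rdmsrl(MSR_K8_SYSCFG, syscfg);
        if (syscfg & (1ULL << MSR_K8_SYSCFG_MEM_ENCRYPT_BIT))
            return;         /* BIOS already did its job */

        /* SME is active system-wide, so setting the bit here is safe. */
        wrmsrl(MSR_K8_SYSCFG, syscfg | (1ULL << MSR_K8_SYSCFG_MEM_ENCRYPT_BIT));
    }
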
+5
arch/x86/xen/Kconfig
··· 17 17 bool "Xen PV guest support" 18 18 default y 19 19 depends on XEN 20 + # XEN_PV is not ready to work with 5-level paging. 21 + # Changes to hypervisor are also required. 22 + depends on !X86_5LEVEL 20 23 select XEN_HAVE_PVMMU 21 24 select XEN_HAVE_VPMU 22 25 help ··· 78 75 config XEN_PVH 79 76 bool "Support for running as a PVH guest" 80 77 depends on XEN && XEN_PVHVM && ACPI 78 + # Pre-built page tables are not ready to handle 5-level paging. 79 + depends on !X86_5LEVEL 81 80 def_bool n
+7
arch/x86/xen/enlighten_pv.c
··· 263 263 setup_clear_cpu_cap(X86_FEATURE_MTRR); 264 264 setup_clear_cpu_cap(X86_FEATURE_ACC); 265 265 setup_clear_cpu_cap(X86_FEATURE_X2APIC); 266 + setup_clear_cpu_cap(X86_FEATURE_SME); 267 + 268 + /* 269 + * Xen PV would need some work to support PCID: CR3 handling as well 270 + * as xen_flush_tlb_others() would need updating. 271 + */ 272 + setup_clear_cpu_cap(X86_FEATURE_PCID); 266 273 267 274 if (!xen_initial_domain()) 268 275 setup_clear_cpu_cap(X86_FEATURE_ACPI);
+2 -3
arch/x86/xen/mmu_pv.c
··· 1005 1005 /* Get the "official" set of cpus referring to our pagetable. */ 1006 1006 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { 1007 1007 for_each_online_cpu(cpu) { 1008 - if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) 1009 - && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) 1008 + if (per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) 1010 1009 continue; 1011 1010 smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1); 1012 1011 } 1013 1012 return; 1014 1013 } 1015 - cpumask_copy(mask, mm_cpumask(mm)); 1016 1014 1017 1015 /* 1018 1016 * It's possible that a vcpu may have a stale reference to our ··· 1019 1021 * look at its actual current cr3 value, and force it to flush 1020 1022 * if needed. 1021 1023 */ 1024 + cpumask_clear(mask); 1022 1025 for_each_online_cpu(cpu) { 1023 1026 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) 1024 1027 cpumask_set_cpu(cpu, mask);
+1 -1
arch/x86/xen/xen-head.S
··· 58 58 #else 59 59 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map) 60 60 /* Map the p2m table to a 512GB-aligned user address. */ 61 - ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad PGDIR_SIZE) 61 + ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad (PUD_SIZE * PTRS_PER_PUD)) 62 62 #endif 63 63 #ifdef CONFIG_XEN_PV 64 64 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
-2
drivers/acpi/processor_idle.c
··· 708 708 static void acpi_idle_enter_bm(struct acpi_processor *pr, 709 709 struct acpi_processor_cx *cx, bool timer_bc) 710 710 { 711 - acpi_unlazy_tlb(smp_processor_id()); 712 - 713 711 /* 714 712 * Must be done before busmaster disable as we might need to 715 713 * access HPET !
+3 -2
drivers/firmware/dmi-sysfs.c
··· 25 25 #include <linux/slab.h> 26 26 #include <linux/list.h> 27 27 #include <linux/io.h> 28 + #include <asm/dmi.h> 28 29 29 30 #define MAX_ENTRY_TYPE 255 /* Most of these aren't used, but we consider 30 31 the top entry type is only 8 bits */ ··· 381 380 u8 __iomem *mapped; 382 381 ssize_t wrote = 0; 383 382 384 - mapped = ioremap(sel->access_method_address, sel->area_length); 383 + mapped = dmi_remap(sel->access_method_address, sel->area_length); 385 384 if (!mapped) 386 385 return -EIO; 387 386 ··· 391 390 wrote++; 392 391 } 393 392 394 - iounmap(mapped); 393 + dmi_unmap(mapped); 395 394 return wrote; 396 395 } 397 396
+33
drivers/firmware/efi/efi.c
··· 55 55 }; 56 56 EXPORT_SYMBOL(efi); 57 57 58 + static unsigned long *efi_tables[] = { 59 + &efi.mps, 60 + &efi.acpi, 61 + &efi.acpi20, 62 + &efi.smbios, 63 + &efi.smbios3, 64 + &efi.sal_systab, 65 + &efi.boot_info, 66 + &efi.hcdp, 67 + &efi.uga, 68 + &efi.uv_systab, 69 + &efi.fw_vendor, 70 + &efi.runtime, 71 + &efi.config_table, 72 + &efi.esrt, 73 + &efi.properties_table, 74 + &efi.mem_attr_table, 75 + }; 76 + 58 77 static bool disable_runtime; 59 78 static int __init setup_noefi(char *arg) 60 79 { ··· 872 853 } 873 854 874 855 return err; 856 + } 857 + 858 + bool efi_is_table_address(unsigned long phys_addr) 859 + { 860 + unsigned int i; 861 + 862 + if (phys_addr == EFI_INVALID_TABLE_ADDR) 863 + return false; 864 + 865 + for (i = 0; i < ARRAY_SIZE(efi_tables); i++) 866 + if (*(efi_tables[i]) == phys_addr) 867 + return true; 868 + 869 + return false; 875 870 } 876 871 877 872 #ifdef CONFIG_KEXEC
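
efi_is_table_address() lets early mapping code ask whether a physical address belongs to one of the firmware-provided tables, which were written before the kernel could encrypt anything. A hypothetical caller might wrap it like this (the wrapper is illustrative only):

    #include <linux/efi.h>

    /*
     * Illustrative wrapper: firmware-owned tables were written before the
     * kernel enabled memory encryption, so they must be mapped decrypted.
     */
    static bool range_needs_decrypted_mapping(unsigned long phys_addr)
    {
        return efi_enabled(EFI_BOOT) && efi_is_table_address(phys_addr);
    }
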
+2 -2
drivers/firmware/pcdp.c
··· 95 95 if (efi.hcdp == EFI_INVALID_TABLE_ADDR) 96 96 return -ENODEV; 97 97 98 - pcdp = early_ioremap(efi.hcdp, 4096); 98 + pcdp = early_memremap(efi.hcdp, 4096); 99 99 printk(KERN_INFO "PCDP: v%d at 0x%lx\n", pcdp->rev, efi.hcdp); 100 100 101 101 if (strstr(cmdline, "console=hcdp")) { ··· 131 131 } 132 132 133 133 out: 134 - early_iounmap(pcdp, 4096); 134 + early_memunmap(pcdp, 4096); 135 135 return rc; 136 136 }
+2
drivers/gpu/drm/drm_gem.c
··· 36 36 #include <linux/pagemap.h> 37 37 #include <linux/shmem_fs.h> 38 38 #include <linux/dma-buf.h> 39 + #include <linux/mem_encrypt.h> 39 40 #include <drm/drmP.h> 40 41 #include <drm/drm_vma_manager.h> 41 42 #include <drm/drm_gem.h> ··· 966 965 vma->vm_ops = dev->driver->gem_vm_ops; 967 966 vma->vm_private_data = obj; 968 967 vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); 968 + vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); 969 969 970 970 /* Take a ref for this mapping of the object, so that the fault 971 971 * handler can dereference the mmap offset's pointer to the object.
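
The same one-line adjustment recurs in every driver that maps device or graphics memory into userspace. A generic, hedged sketch of such an mmap handler follows; the device pfn and the function names are placeholders, not taken from any driver in the tree.

    #include <linux/fs.h>
    #include <linux/mm.h>
    #include <linux/mem_encrypt.h>

    /* Placeholder pfn of the device memory to be mapped (driver specific). */
    static unsigned long example_dev_pfn;

    /* Hypothetical mmap handler: map device memory write-combined, not encrypted. */
    static int example_dev_mmap(struct file *file, struct vm_area_struct *vma)
    {
        unsigned long size = vma->vm_end - vma->vm_start;

        vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags));
        /* Device memory is not encrypted system RAM: clear the C-bit. */
        vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);

        return remap_pfn_range(vma, vma->vm_start, example_dev_pfn, size,
                               vma->vm_page_prot);
    }
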
+4
drivers/gpu/drm/drm_vm.c
··· 40 40 #include <linux/efi.h> 41 41 #include <linux/slab.h> 42 42 #endif 43 + #include <linux/mem_encrypt.h> 43 44 #include <asm/pgtable.h> 44 45 #include "drm_internal.h" 45 46 #include "drm_legacy.h" ··· 58 57 struct vm_area_struct *vma) 59 58 { 60 59 pgprot_t tmp = vm_get_page_prot(vma->vm_flags); 60 + 61 + /* We don't want graphics memory to be mapped encrypted */ 62 + tmp = pgprot_decrypted(tmp); 61 63 62 64 #if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) 63 65 if (map->type == _DRM_REGISTERS && !(map->flags & _DRM_WRITE_COMBINING))
+5 -2
drivers/gpu/drm/ttm/ttm_bo_vm.c
··· 39 39 #include <linux/rbtree.h> 40 40 #include <linux/module.h> 41 41 #include <linux/uaccess.h> 42 + #include <linux/mem_encrypt.h> 42 43 43 44 #define TTM_BO_VM_NUM_PREFAULT 16 44 45 ··· 231 230 * first page. 232 231 */ 233 232 for (i = 0; i < TTM_BO_VM_NUM_PREFAULT; ++i) { 234 - if (bo->mem.bus.is_iomem) 233 + if (bo->mem.bus.is_iomem) { 234 + /* Iomem should not be marked encrypted */ 235 + cvma.vm_page_prot = pgprot_decrypted(cvma.vm_page_prot); 235 236 pfn = bdev->driver->io_mem_pfn(bo, page_offset); 236 - else { 237 + } else { 237 238 page = ttm->pages[page_offset]; 238 239 if (unlikely(!page && i == 0)) { 239 240 retval = VM_FAULT_OOM;
+4
drivers/gpu/drm/udl/udl_fb.c
··· 14 14 #include <linux/slab.h> 15 15 #include <linux/fb.h> 16 16 #include <linux/dma-buf.h> 17 + #include <linux/mem_encrypt.h> 17 18 18 19 #include <drm/drmP.h> 19 20 #include <drm/drm_crtc.h> ··· 169 168 170 169 pr_notice("mmap() framebuffer addr:%lu size:%lu\n", 171 170 pos, size); 171 + 172 + /* We don't want the framebuffer to be mapped encrypted */ 173 + vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); 172 174 173 175 while (size > 0) { 174 176 page = vmalloc_to_pfn((void *)pos);
+4 -5
drivers/idle/intel_idle.c
··· 913 913 struct cpuidle_state *state = &drv->states[index]; 914 914 unsigned long eax = flg2MWAIT(state->flags); 915 915 unsigned int cstate; 916 - int cpu = smp_processor_id(); 917 916 918 917 cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1; 919 918 920 919 /* 921 - * leave_mm() to avoid costly and often unnecessary wakeups 922 - * for flushing the user TLB's associated with the active mm. 920 + * NB: if CPUIDLE_FLAG_TLB_FLUSHED is set, this idle transition 921 + * will probably flush the TLB. It's not guaranteed to flush 922 + * the TLB, though, so it's not clear that we can do anything 923 + * useful with this knowledge. 923 924 */ 924 - if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED) 925 - leave_mm(cpu); 926 925 927 926 if (!(lapic_timer_reliable_states & (1 << (cstate)))) 928 927 tick_broadcast_enter();
+16 -14
drivers/iommu/amd_iommu.c
··· 575 575 576 576 static void dump_command(unsigned long phys_addr) 577 577 { 578 - struct iommu_cmd *cmd = phys_to_virt(phys_addr); 578 + struct iommu_cmd *cmd = iommu_phys_to_virt(phys_addr); 579 579 int i; 580 580 581 581 for (i = 0; i < 4; ++i) ··· 919 919 920 920 static void build_completion_wait(struct iommu_cmd *cmd, u64 address) 921 921 { 922 + u64 paddr = iommu_virt_to_phys((void *)address); 923 + 922 924 WARN_ON(address & 0x7ULL); 923 925 924 926 memset(cmd, 0, sizeof(*cmd)); 925 - cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK; 926 - cmd->data[1] = upper_32_bits(__pa(address)); 927 + cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK; 928 + cmd->data[1] = upper_32_bits(paddr); 927 929 cmd->data[2] = 1; 928 930 CMD_SET_TYPE(cmd, CMD_COMPL_WAIT); 929 931 } ··· 1385 1383 return false; 1386 1384 1387 1385 *pte = PM_LEVEL_PDE(domain->mode, 1388 - virt_to_phys(domain->pt_root)); 1386 + iommu_virt_to_phys(domain->pt_root)); 1389 1387 domain->pt_root = pte; 1390 1388 domain->mode += 1; 1391 1389 domain->updated = true; ··· 1422 1420 if (!page) 1423 1421 return NULL; 1424 1422 1425 - __npte = PM_LEVEL_PDE(level, virt_to_phys(page)); 1423 + __npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page)); 1426 1424 1427 1425 /* pte could have been changed somewhere. */ 1428 1426 if (cmpxchg64(pte, __pte, __npte) != __pte) { ··· 1538 1536 return -EBUSY; 1539 1537 1540 1538 if (count > 1) { 1541 - __pte = PAGE_SIZE_PTE(phys_addr, page_size); 1539 + __pte = PAGE_SIZE_PTE(__sme_set(phys_addr), page_size); 1542 1540 __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC; 1543 1541 } else 1544 - __pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC; 1542 + __pte = __sme_set(phys_addr) | IOMMU_PTE_P | IOMMU_PTE_FC; 1545 1543 1546 1544 if (prot & IOMMU_PROT_IR) 1547 1545 __pte |= IOMMU_PTE_IR; ··· 1757 1755 if (!(tbl[i] & GCR3_VALID)) 1758 1756 continue; 1759 1757 1760 - ptr = __va(tbl[i] & PAGE_MASK); 1758 + ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK); 1761 1759 1762 1760 free_page((unsigned long)ptr); 1763 1761 } ··· 1772 1770 if (!(tbl[i] & GCR3_VALID)) 1773 1771 continue; 1774 1772 1775 - ptr = __va(tbl[i] & PAGE_MASK); 1773 + ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK); 1776 1774 1777 1775 free_gcr3_tbl_level1(ptr); 1778 1776 } ··· 2051 2049 u64 flags = 0; 2052 2050 2053 2051 if (domain->mode != PAGE_MODE_NONE) 2054 - pte_root = virt_to_phys(domain->pt_root); 2052 + pte_root = iommu_virt_to_phys(domain->pt_root); 2055 2053 2056 2054 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) 2057 2055 << DEV_ENTRY_MODE_SHIFT; ··· 2063 2061 flags |= DTE_FLAG_IOTLB; 2064 2062 2065 2063 if (domain->flags & PD_IOMMUV2_MASK) { 2066 - u64 gcr3 = __pa(domain->gcr3_tbl); 2064 + u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl); 2067 2065 u64 glx = domain->glx; 2068 2066 u64 tmp; 2069 2067 ··· 3608 3606 if (root == NULL) 3609 3607 return NULL; 3610 3608 3611 - *pte = __pa(root) | GCR3_VALID; 3609 + *pte = iommu_virt_to_phys(root) | GCR3_VALID; 3612 3610 } 3613 3611 3614 - root = __va(*pte & PAGE_MASK); 3612 + root = iommu_phys_to_virt(*pte & PAGE_MASK); 3615 3613 3616 3614 level -= 1; 3617 3615 } ··· 3790 3788 3791 3789 dte = amd_iommu_dev_table[devid].data[2]; 3792 3790 dte &= ~DTE_IRQ_PHYS_ADDR_MASK; 3793 - dte |= virt_to_phys(table->table); 3791 + dte |= iommu_virt_to_phys(table->table); 3794 3792 dte |= DTE_IRQ_REMAP_INTCTL; 3795 3793 dte |= DTE_IRQ_TABLE_LEN; 3796 3794 dte |= DTE_IRQ_REMAP_ENABLE;
+28 -6
drivers/iommu/amd_iommu_init.c
··· 30 30 #include <linux/iommu.h> 31 31 #include <linux/kmemleak.h> 32 32 #include <linux/crash_dump.h> 33 + #include <linux/mem_encrypt.h> 33 34 #include <asm/pci-direct.h> 34 35 #include <asm/iommu.h> 35 36 #include <asm/gart.h> ··· 349 348 350 349 BUG_ON(iommu->mmio_base == NULL); 351 350 352 - entry = virt_to_phys(amd_iommu_dev_table); 351 + entry = iommu_virt_to_phys(amd_iommu_dev_table); 353 352 entry |= (dev_table_size >> 12) - 1; 354 353 memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET, 355 354 &entry, sizeof(entry)); ··· 607 606 608 607 BUG_ON(iommu->cmd_buf == NULL); 609 608 610 - entry = (u64)virt_to_phys(iommu->cmd_buf); 609 + entry = iommu_virt_to_phys(iommu->cmd_buf); 611 610 entry |= MMIO_CMD_SIZE_512; 612 611 613 612 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, ··· 636 635 637 636 BUG_ON(iommu->evt_buf == NULL); 638 637 639 - entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; 638 + entry = iommu_virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; 640 639 641 640 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, 642 641 &entry, sizeof(entry)); ··· 669 668 if (iommu->ppr_log == NULL) 670 669 return; 671 670 672 - entry = (u64)virt_to_phys(iommu->ppr_log) | PPR_LOG_SIZE_512; 671 + entry = iommu_virt_to_phys(iommu->ppr_log) | PPR_LOG_SIZE_512; 673 672 674 673 memcpy_toio(iommu->mmio_base + MMIO_PPR_LOG_OFFSET, 675 674 &entry, sizeof(entry)); ··· 749 748 if (!iommu->ga_log_tail) 750 749 goto err_out; 751 750 752 - entry = (u64)virt_to_phys(iommu->ga_log) | GA_LOG_SIZE_512; 751 + entry = iommu_virt_to_phys(iommu->ga_log) | GA_LOG_SIZE_512; 753 752 memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_BASE_OFFSET, 754 753 &entry, sizeof(entry)); 755 - entry = ((u64)virt_to_phys(iommu->ga_log) & 0xFFFFFFFFFFFFFULL) & ~7ULL; 754 + entry = (iommu_virt_to_phys(iommu->ga_log) & 0xFFFFFFFFFFFFFULL) & ~7ULL; 756 755 memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_TAIL_OFFSET, 757 756 &entry, sizeof(entry)); 758 757 writel(0x00, iommu->mmio_base + MMIO_GA_HEAD_OFFSET); ··· 2565 2564 return ret; 2566 2565 } 2567 2566 2567 + static bool amd_iommu_sme_check(void) 2568 + { 2569 + if (!sme_active() || (boot_cpu_data.x86 != 0x17)) 2570 + return true; 2571 + 2572 + /* For Fam17h, a specific level of support is required */ 2573 + if (boot_cpu_data.microcode >= 0x08001205) 2574 + return true; 2575 + 2576 + if ((boot_cpu_data.microcode >= 0x08001126) && 2577 + (boot_cpu_data.microcode <= 0x080011ff)) 2578 + return true; 2579 + 2580 + pr_notice("AMD-Vi: IOMMU not currently supported when SME is active\n"); 2581 + 2582 + return false; 2583 + } 2584 + 2568 2585 /**************************************************************************** 2569 2586 * 2570 2587 * Early detect code. This code runs at IOMMU detection time in the DMA ··· 2595 2576 int ret; 2596 2577 2597 2578 if (no_iommu || (iommu_detected && !gart_iommu_aperture)) 2579 + return -ENODEV; 2580 + 2581 + if (!amd_iommu_sme_check()) 2598 2582 return -ENODEV; 2599 2583 2600 2584 ret = iommu_go_to_state(IOMMU_IVRS_DETECTED);
+10
drivers/iommu/amd_iommu_proto.h
··· 87 87 return !!(iommu->features & f); 88 88 } 89 89 90 + static inline u64 iommu_virt_to_phys(void *vaddr) 91 + { 92 + return (u64)__sme_set(virt_to_phys(vaddr)); 93 + } 94 + 95 + static inline void *iommu_phys_to_virt(unsigned long paddr) 96 + { 97 + return phys_to_virt(__sme_clr(paddr)); 98 + } 99 + 90 100 #endif /* _ASM_X86_AMD_IOMMU_PROTO_H */
+1 -1
drivers/iommu/amd_iommu_types.h
··· 344 344 345 345 #define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) 346 346 #define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P) 347 - #define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK)) 347 + #define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK)) 348 348 #define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07) 349 349 350 350 #define IOMMU_PROT_MASK 0x03
+12 -11
drivers/sfi/sfi_core.c
··· 68 68 #include <linux/init.h> 69 69 #include <linux/sfi.h> 70 70 #include <linux/slab.h> 71 + #include <linux/io.h> 71 72 72 73 #include "sfi_core.h" 73 74 ··· 87 86 /* 88 87 * FW creates and saves the SFI tables in memory. When these tables get 89 88 * used, they may need to be mapped to virtual address space, and the mapping 90 - * can happen before or after the ioremap() is ready, so a flag is needed 89 + * can happen before or after the memremap() is ready, so a flag is needed 91 90 * to indicating this 92 91 */ 93 - static u32 sfi_use_ioremap __read_mostly; 92 + static u32 sfi_use_memremap __read_mostly; 94 93 95 94 /* 96 - * sfi_un/map_memory calls early_ioremap/iounmap which is a __init function 95 + * sfi_un/map_memory calls early_memremap/memunmap which is a __init function 97 96 * and introduces section mismatch. So use __ref to make it calm. 98 97 */ 99 98 static void __iomem * __ref sfi_map_memory(u64 phys, u32 size) ··· 101 100 if (!phys || !size) 102 101 return NULL; 103 102 104 - if (sfi_use_ioremap) 105 - return ioremap_cache(phys, size); 103 + if (sfi_use_memremap) 104 + return memremap(phys, size, MEMREMAP_WB); 106 105 else 107 - return early_ioremap(phys, size); 106 + return early_memremap(phys, size); 108 107 } 109 108 110 109 static void __ref sfi_unmap_memory(void __iomem *virt, u32 size) ··· 112 111 if (!virt || !size) 113 112 return; 114 113 115 - if (sfi_use_ioremap) 116 - iounmap(virt); 114 + if (sfi_use_memremap) 115 + memunmap(virt); 117 116 else 118 - early_iounmap(virt, size); 117 + early_memunmap(virt, size); 119 118 } 120 119 121 120 static void sfi_print_table_header(unsigned long long pa, ··· 508 507 length = syst_va->header.len; 509 508 sfi_unmap_memory(syst_va, sizeof(struct sfi_table_simple)); 510 509 511 - /* Use ioremap now after it is ready */ 512 - sfi_use_ioremap = 1; 510 + /* Use memremap now after it is ready */ 511 + sfi_use_memremap = 1; 513 512 syst_va = sfi_map_memory(syst_pa, length); 514 513 515 514 sfi_acpi_init();
+12
drivers/video/fbdev/core/fbmem.c
··· 32 32 #include <linux/device.h> 33 33 #include <linux/efi.h> 34 34 #include <linux/fb.h> 35 + #include <linux/mem_encrypt.h> 35 36 36 37 #include <asm/fb.h> 37 38 ··· 1397 1396 mutex_lock(&info->mm_lock); 1398 1397 if (fb->fb_mmap) { 1399 1398 int res; 1399 + 1400 + /* 1401 + * The framebuffer needs to be accessed decrypted, be sure 1402 + * SME protection is removed ahead of the call 1403 + */ 1404 + vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); 1400 1405 res = fb->fb_mmap(info, vma); 1401 1406 mutex_unlock(&info->mm_lock); 1402 1407 return res; ··· 1428 1421 mutex_unlock(&info->mm_lock); 1429 1422 1430 1423 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 1424 + /* 1425 + * The framebuffer needs to be accessed decrypted, be sure 1426 + * SME protection is removed 1427 + */ 1428 + vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); 1431 1429 fb_pgprotect(file, vma, start); 1432 1430 1433 1431 return vm_iomap_memory(vma, start, len);
+2
include/asm-generic/early_ioremap.h
··· 13 13 unsigned long size); 14 14 extern void *early_memremap_ro(resource_size_t phys_addr, 15 15 unsigned long size); 16 + extern void *early_memremap_prot(resource_size_t phys_addr, 17 + unsigned long size, unsigned long prot_val); 16 18 extern void early_iounmap(void __iomem *addr, unsigned long size); 17 19 extern void early_memunmap(void *addr, unsigned long size); 18 20
+12
include/asm-generic/pgtable.h
··· 583 583 #endif /* CONFIG_MMU */ 584 584 585 585 /* 586 + * No-op macros that just return the current protection value. Defined here 587 + * because these macros can be used even if CONFIG_MMU is not defined. 588 + */ 589 + #ifndef pgprot_encrypted 590 + #define pgprot_encrypted(prot) (prot) 591 + #endif 592 + 593 + #ifndef pgprot_decrypted 594 + #define pgprot_decrypted(prot) (prot) 595 + #endif 596 + 597 + /* 586 598 * A facility to provide lazy MMU batching. This allows PTE updates and 587 599 * page invalidations to be delayed until a call to leave lazy MMU mode 588 600 * is issued. Some architectures may benefit from doing this, and it is
+2
include/linux/compiler-gcc.h
··· 166 166 167 167 #if GCC_VERSION >= 40100 168 168 # define __compiletime_object_size(obj) __builtin_object_size(obj, 0) 169 + 170 + #define __nostackprotector __attribute__((__optimize__("no-stack-protector"))) 169 171 #endif 170 172 171 173 #if GCC_VERSION >= 40300
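
__nostackprotector is meant for the handful of functions that run before the stack-protector canary is usable (for example very early SME setup code). Usage is just an annotation; the function below is a contrived example, not something from the tree.

    #include <linux/compiler.h>

    /*
     * Contrived example: code that runs before the stack canary exists
     * must not receive stack-protector instrumentation.
     */
    static void __nostackprotector early_patch_value(unsigned long *val)
    {
        *val |= 0x1UL;
    }
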
+4
include/linux/compiler.h
··· 501 501 #define __visible 502 502 #endif 503 503 504 + #ifndef __nostackprotector 505 + # define __nostackprotector 506 + #endif 507 + 504 508 /* 505 509 * Assume alignment of return value. 506 510 */
+13
include/linux/dma-mapping.h
··· 10 10 #include <linux/scatterlist.h> 11 11 #include <linux/kmemcheck.h> 12 12 #include <linux/bug.h> 13 + #include <linux/mem_encrypt.h> 13 14 14 15 /** 15 16 * List of possible attributes associated with a DMA mapping. The semantics ··· 573 572 return 0; 574 573 } 575 574 575 + static inline void dma_check_mask(struct device *dev, u64 mask) 576 + { 577 + if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1))) 578 + dev_warn(dev, "SME is active, device will require DMA bounce buffers\n"); 579 + } 580 + 576 581 static inline int dma_supported(struct device *dev, u64 mask) 577 582 { 578 583 const struct dma_map_ops *ops = get_dma_ops(dev); ··· 595 588 { 596 589 if (!dev->dma_mask || !dma_supported(dev, mask)) 597 590 return -EIO; 591 + 592 + dma_check_mask(dev, mask); 593 + 598 594 *dev->dma_mask = mask; 599 595 return 0; 600 596 } ··· 617 607 { 618 608 if (!dma_supported(dev, mask)) 619 609 return -EIO; 610 + 611 + dma_check_mask(dev, mask); 612 + 620 613 dev->coherent_dma_mask = mask; 621 614 return 0; 622 615 }
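
Drivers do not need to change anything here: they keep negotiating a DMA mask as usual, and dma_check_mask() merely warns when the mask cannot cover the encryption bit, meaning SWIOTLB will bounce the traffic. A typical, illustrative probe-time sequence:

    #include <linux/device.h>
    #include <linux/dma-mapping.h>

    /*
     * Illustrative probe-time DMA setup: under SME, a device that cannot
     * address the encryption bit triggers the new warning and silently
     * falls back to SWIOTLB bounce buffering.
     */
    static int example_setup_dma(struct device *dev)
    {
        if (!dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)))
            return 0;

        return dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32));
    }
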
+8 -1
include/linux/efi.h
··· 985 985 extern int efi_config_parse_tables(void *config_tables, int count, int sz, 986 986 efi_config_table_type_t *arch_tables); 987 987 extern u64 efi_get_iobase (void); 988 - extern u32 efi_mem_type (unsigned long phys_addr); 988 + extern int efi_mem_type(unsigned long phys_addr); 989 989 extern u64 efi_mem_attributes (unsigned long phys_addr); 990 990 extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size); 991 991 extern int __init efi_uart_console_only (void); ··· 1113 1113 return test_bit(feature, &efi.flags) != 0; 1114 1114 } 1115 1115 extern void efi_reboot(enum reboot_mode reboot_mode, const char *__unused); 1116 + 1117 + extern bool efi_is_table_address(unsigned long phys_addr); 1116 1118 #else 1117 1119 static inline bool efi_enabled(int feature) 1118 1120 { ··· 1125 1123 1126 1124 static inline bool 1127 1125 efi_capsule_pending(int *reset_type) 1126 + { 1127 + return false; 1128 + } 1129 + 1130 + static inline bool efi_is_table_address(unsigned long phys_addr) 1128 1131 { 1129 1132 return false; 1130 1133 }
+2
include/linux/io.h
··· 157 157 MEMREMAP_WB = 1 << 0, 158 158 MEMREMAP_WT = 1 << 1, 159 159 MEMREMAP_WC = 1 << 2, 160 + MEMREMAP_ENC = 1 << 3, 161 + MEMREMAP_DEC = 1 << 4, 160 162 }; 161 163 162 164 void *memremap(resource_size_t offset, size_t size, unsigned long flags);
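
MEMREMAP_ENC and MEMREMAP_DEC let a caller override the encryption attribute the architecture would pick by default. As a hypothetical example, a driver that has to read a region written by firmware (and therefore not encrypted) could ask for a decrypted, cacheable mapping; the helper names below are invented.

    #include <linux/io.h>

    /* Illustrative: map a firmware-written (hence unencrypted) region cacheable. */
    static void *map_firmware_region(resource_size_t phys, size_t size)
    {
        return memremap(phys, size, MEMREMAP_WB | MEMREMAP_DEC);
    }

    static void unmap_firmware_region(void *addr)
    {
        memunmap(addr);
    }
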
+8
include/linux/kexec.h
··· 327 327 return phys_to_virt(boot_phys_to_phys(entry)); 328 328 } 329 329 330 + #ifndef arch_kexec_post_alloc_pages 331 + static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) { return 0; } 332 + #endif 333 + 334 + #ifndef arch_kexec_pre_free_pages 335 + static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { } 336 + #endif 337 + 330 338 #else /* !CONFIG_KEXEC_CORE */ 331 339 struct pt_regs; 332 340 struct task_struct;
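
Both hooks default to no-ops; an architecture that encrypts memory can override them (it also has to provide matching #defines in its own header so these fallbacks are not picked up) to flip the encryption attribute on kexec control pages. The sketch below shows plausible overrides under that assumption, not the exact x86 implementation.

    #include <linux/kexec.h>
    #include <asm/set_memory.h>

    /*
     * Plausible arch overrides: kexec control pages are consumed with the
     * kernel's encrypted mappings unavailable, so keep them decrypted
     * while in use and re-encrypt them before they go back to the allocator.
     */
    int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp)
    {
        return set_memory_decrypted((unsigned long)vaddr, pages);
    }

    void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages)
    {
        set_memory_encrypted((unsigned long)vaddr, pages);
    }
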
+48
include/linux/mem_encrypt.h
··· 1 + /* 2 + * AMD Memory Encryption Support 3 + * 4 + * Copyright (C) 2016 Advanced Micro Devices, Inc. 5 + * 6 + * Author: Tom Lendacky <thomas.lendacky@amd.com> 7 + * 8 + * This program is free software; you can redistribute it and/or modify 9 + * it under the terms of the GNU General Public License version 2 as 10 + * published by the Free Software Foundation. 11 + */ 12 + 13 + #ifndef __MEM_ENCRYPT_H__ 14 + #define __MEM_ENCRYPT_H__ 15 + 16 + #ifndef __ASSEMBLY__ 17 + 18 + #ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT 19 + 20 + #include <asm/mem_encrypt.h> 21 + 22 + #else /* !CONFIG_ARCH_HAS_MEM_ENCRYPT */ 23 + 24 + #define sme_me_mask 0UL 25 + 26 + #endif /* CONFIG_ARCH_HAS_MEM_ENCRYPT */ 27 + 28 + static inline bool sme_active(void) 29 + { 30 + return !!sme_me_mask; 31 + } 32 + 33 + static inline unsigned long sme_get_me_mask(void) 34 + { 35 + return sme_me_mask; 36 + } 37 + 38 + /* 39 + * The __sme_set() and __sme_clr() macros are useful for adding or removing 40 + * the encryption mask from a value (e.g. when dealing with pagetable 41 + * entries). 42 + */ 43 + #define __sme_set(x) ((unsigned long)(x) | sme_me_mask) 44 + #define __sme_clr(x) ((unsigned long)(x) & ~sme_me_mask) 45 + 46 + #endif /* __ASSEMBLY__ */ 47 + 48 + #endif /* __MEM_ENCRYPT_H__ */
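
The helpers are intentionally trivial so the mask can be applied in any context. The stand-alone program below only illustrates how __sme_set()/__sme_clr() behave; the mask value is made up and the demo_ macros merely mirror the real ones.

    #include <stdint.h>
    #include <stdio.h>

    /* Made-up stand-in for sme_me_mask: pretend the C-bit is address bit 47. */
    #define DEMO_SME_MASK   (1ULL << 47)

    #define demo_sme_set(x) ((uint64_t)(x) | DEMO_SME_MASK)
    #define demo_sme_clr(x) ((uint64_t)(x) & ~DEMO_SME_MASK)

    int main(void)
    {
        uint64_t phys = 0x12345000ULL;
        uint64_t pte_addr = demo_sme_set(phys);  /* as stored in a pagetable entry */

        printf("with encryption bit: 0x%llx\n", (unsigned long long)pte_addr);
        printf("plain physical addr: 0x%llx\n",
               (unsigned long long)demo_sme_clr(pte_addr));
        return 0;
    }
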
+6
include/linux/mm_inline.h
··· 126 126 127 127 #define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) 128 128 129 + #ifdef arch_unmap_kpfn 130 + extern void arch_unmap_kpfn(unsigned long pfn); 131 + #else 132 + static __always_inline void arch_unmap_kpfn(unsigned long pfn) { } 133 + #endif 134 + 129 135 #endif
+1
include/linux/swiotlb.h
··· 35 35 extern unsigned long swiotlb_nr_tbl(void); 36 36 unsigned long swiotlb_size_or_default(void); 37 37 extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); 38 + extern void __init swiotlb_update_mem_attributes(void); 38 39 39 40 /* 40 41 * Enumeration for sync targets
+10
init/main.c
··· 487 487 } 488 488 #endif 489 489 490 + void __init __weak mem_encrypt_init(void) { } 491 + 490 492 /* 491 493 * Set up kernel memory allocators 492 494 */ ··· 641 639 * too: 642 640 */ 643 641 locking_selftest(); 642 + 643 + /* 644 + * This needs to be called before any devices perform DMA 645 + * operations that might use the SWIOTLB bounce buffers. It will 646 + * mark the bounce buffers as decrypted so that their usage will 647 + * not cause "plain-text" data to be decrypted when accessed. 648 + */ 649 + mem_encrypt_init(); 644 650 645 651 #ifdef CONFIG_BLK_DEV_INITRD 646 652 if (initrd_start && !initrd_below_start_ok &&
+11 -1
kernel/kexec_core.c
··· 301 301 { 302 302 struct page *pages; 303 303 304 - pages = alloc_pages(gfp_mask, order); 304 + pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order); 305 305 if (pages) { 306 306 unsigned int count, i; 307 307 ··· 310 310 count = 1 << order; 311 311 for (i = 0; i < count; i++) 312 312 SetPageReserved(pages + i); 313 + 314 + arch_kexec_post_alloc_pages(page_address(pages), count, 315 + gfp_mask); 316 + 317 + if (gfp_mask & __GFP_ZERO) 318 + for (i = 0; i < count; i++) 319 + clear_highpage(pages + i); 313 320 } 314 321 315 322 return pages; ··· 328 321 329 322 order = page_private(page); 330 323 count = 1 << order; 324 + 325 + arch_kexec_pre_free_pages(page_address(page), count); 326 + 331 327 for (i = 0; i < count; i++) 332 328 ClearPageReserved(page + i); 333 329 __free_pages(page, order);
+16 -4
kernel/memremap.c
··· 34 34 } 35 35 #endif 36 36 37 - static void *try_ram_remap(resource_size_t offset, size_t size) 37 + #ifndef arch_memremap_can_ram_remap 38 + static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, 39 + unsigned long flags) 40 + { 41 + return true; 42 + } 43 + #endif 44 + 45 + static void *try_ram_remap(resource_size_t offset, size_t size, 46 + unsigned long flags) 38 47 { 39 48 unsigned long pfn = PHYS_PFN(offset); 40 49 41 50 /* In the simple case just return the existing linear address */ 42 - if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn))) 51 + if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) && 52 + arch_memremap_can_ram_remap(offset, size, flags)) 43 53 return __va(offset); 54 + 44 55 return NULL; /* fallback to arch_memremap_wb */ 45 56 } 46 57 ··· 59 48 * memremap() - remap an iomem_resource as cacheable memory 60 49 * @offset: iomem resource start address 61 50 * @size: size of remap 62 - * @flags: any of MEMREMAP_WB, MEMREMAP_WT and MEMREMAP_WC 51 + * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC, 52 + * MEMREMAP_ENC, MEMREMAP_DEC 63 53 * 64 54 * memremap() is "ioremap" for cases where it is known that the resource 65 55 * being mapped does not have i/o side effects and the __iomem ··· 107 95 * the requested range is potentially in System RAM. 108 96 */ 109 97 if (is_ram == REGION_INTERSECTS) 110 - addr = try_ram_remap(offset, size); 98 + addr = try_ram_remap(offset, size, flags); 111 99 if (!addr) 112 100 addr = arch_memremap_wb(offset, size); 113 101 }
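
An architecture that wants a say in whether the linear mapping may be reused overrides the new hook (again via a #define in its arch header so the static inline default is not used). The sketch below is one plausible shape for such an override; range_is_mapped_encrypted() is an invented stand-in for the architecture's own bookkeeping, and this is not the actual x86 logic.

    #include <linux/io.h>
    #include <linux/mem_encrypt.h>

    /* Invented stand-in for the architecture's own encryption bookkeeping. */
    static bool range_is_mapped_encrypted(resource_size_t offset, size_t size)
    {
        return sme_active();
    }

    /*
     * Refuse the linear-map shortcut when the caller asked for a decrypted
     * view of a range the kernel maps encrypted; memremap() then falls
     * back to building a fresh mapping via arch_memremap_wb().
     */
    bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size,
                                     unsigned long flags)
    {
        if ((flags & MEMREMAP_DEC) && range_is_mapped_encrypted(offset, size))
            return false;

        return true;
    }
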
+49 -8
lib/swiotlb.c
··· 30 30 #include <linux/highmem.h> 31 31 #include <linux/gfp.h> 32 32 #include <linux/scatterlist.h> 33 + #include <linux/mem_encrypt.h> 33 34 34 35 #include <asm/io.h> 35 36 #include <asm/dma.h> ··· 156 155 return size ? size : (IO_TLB_DEFAULT_SIZE); 157 156 } 158 157 158 + void __weak swiotlb_set_mem_attributes(void *vaddr, unsigned long size) { } 159 + 160 + /* For swiotlb, clear memory encryption mask from dma addresses */ 161 + static dma_addr_t swiotlb_phys_to_dma(struct device *hwdev, 162 + phys_addr_t address) 163 + { 164 + return __sme_clr(phys_to_dma(hwdev, address)); 165 + } 166 + 159 167 /* Note that this doesn't work with highmem page */ 160 168 static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, 161 169 volatile void *address) ··· 191 181 (unsigned long long)io_tlb_start, 192 182 (unsigned long long)io_tlb_end, 193 183 bytes >> 20, vstart, vend - 1); 184 + } 185 + 186 + /* 187 + * Early SWIOTLB allocation may be too early to allow an architecture to 188 + * perform the desired operations. This function allows the architecture to 189 + * call SWIOTLB when the operations are possible. It needs to be called 190 + * before the SWIOTLB memory is used. 191 + */ 192 + void __init swiotlb_update_mem_attributes(void) 193 + { 194 + void *vaddr; 195 + unsigned long bytes; 196 + 197 + if (no_iotlb_memory || late_alloc) 198 + return; 199 + 200 + vaddr = phys_to_virt(io_tlb_start); 201 + bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT); 202 + swiotlb_set_mem_attributes(vaddr, bytes); 203 + memset(vaddr, 0, bytes); 204 + 205 + vaddr = phys_to_virt(io_tlb_overflow_buffer); 206 + bytes = PAGE_ALIGN(io_tlb_overflow); 207 + swiotlb_set_mem_attributes(vaddr, bytes); 208 + memset(vaddr, 0, bytes); 194 209 } 195 210 196 211 int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) ··· 355 320 io_tlb_start = virt_to_phys(tlb); 356 321 io_tlb_end = io_tlb_start + bytes; 357 322 323 + swiotlb_set_mem_attributes(tlb, bytes); 358 324 memset(tlb, 0, bytes); 359 325 360 326 /* ··· 366 330 if (!v_overflow_buffer) 367 331 goto cleanup2; 368 332 333 + swiotlb_set_mem_attributes(v_overflow_buffer, io_tlb_overflow); 334 + memset(v_overflow_buffer, 0, io_tlb_overflow); 369 335 io_tlb_overflow_buffer = virt_to_phys(v_overflow_buffer); 370 336 371 337 /* ··· 507 469 if (no_iotlb_memory) 508 470 panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); 509 471 472 + if (sme_active()) 473 + pr_warn_once("SME is active and system is using DMA bounce buffers\n"); 474 + 510 475 mask = dma_get_seg_boundary(hwdev); 511 476 512 477 tbl_dma_addr &= mask; ··· 622 581 return SWIOTLB_MAP_ERROR; 623 582 } 624 583 625 - start_dma_addr = phys_to_dma(hwdev, io_tlb_start); 584 + start_dma_addr = swiotlb_phys_to_dma(hwdev, io_tlb_start); 626 585 return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size, 627 586 dir, attrs); 628 587 } ··· 743 702 goto err_warn; 744 703 745 704 ret = phys_to_virt(paddr); 746 - dev_addr = phys_to_dma(hwdev, paddr); 705 + dev_addr = swiotlb_phys_to_dma(hwdev, paddr); 747 706 748 707 /* Confirm address can be DMA'd by device */ 749 708 if (dev_addr + size - 1 > dma_mask) { ··· 853 812 map = map_single(dev, phys, size, dir, attrs); 854 813 if (map == SWIOTLB_MAP_ERROR) { 855 814 swiotlb_full(dev, size, dir, 1); 856 - return phys_to_dma(dev, io_tlb_overflow_buffer); 815 + return swiotlb_phys_to_dma(dev, io_tlb_overflow_buffer); 857 816 } 858 817 859 - dev_addr = phys_to_dma(dev, map); 818 + dev_addr = 
swiotlb_phys_to_dma(dev, map); 860 819 861 820 /* Ensure that the address returned is DMA'ble */ 862 821 if (dma_capable(dev, dev_addr, size)) ··· 865 824 attrs |= DMA_ATTR_SKIP_CPU_SYNC; 866 825 swiotlb_tbl_unmap_single(dev, map, size, dir, attrs); 867 826 868 - return phys_to_dma(dev, io_tlb_overflow_buffer); 827 + return swiotlb_phys_to_dma(dev, io_tlb_overflow_buffer); 869 828 } 870 829 EXPORT_SYMBOL_GPL(swiotlb_map_page); 871 830 ··· 999 958 sg_dma_len(sgl) = 0; 1000 959 return 0; 1001 960 } 1002 - sg->dma_address = phys_to_dma(hwdev, map); 961 + sg->dma_address = swiotlb_phys_to_dma(hwdev, map); 1003 962 } else 1004 963 sg->dma_address = dev_addr; 1005 964 sg_dma_len(sg) = sg->length; ··· 1067 1026 int 1068 1027 swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) 1069 1028 { 1070 - return (dma_addr == phys_to_dma(hwdev, io_tlb_overflow_buffer)); 1029 + return (dma_addr == swiotlb_phys_to_dma(hwdev, io_tlb_overflow_buffer)); 1071 1030 } 1072 1031 EXPORT_SYMBOL(swiotlb_dma_mapping_error); 1073 1032 ··· 1080 1039 int 1081 1040 swiotlb_dma_supported(struct device *hwdev, u64 mask) 1082 1041 { 1083 - return phys_to_dma(hwdev, io_tlb_end - 1) <= mask; 1042 + return swiotlb_phys_to_dma(hwdev, io_tlb_end - 1) <= mask; 1084 1043 } 1085 1044 EXPORT_SYMBOL(swiotlb_dma_supported);
+25 -3
mm/early_ioremap.c
··· 30 30 31 31 static int after_paging_init __initdata; 32 32 33 + pgprot_t __init __weak early_memremap_pgprot_adjust(resource_size_t phys_addr, 34 + unsigned long size, 35 + pgprot_t prot) 36 + { 37 + return prot; 38 + } 39 + 33 40 void __init __weak early_ioremap_shutdown(void) 34 41 { 35 42 } ··· 222 215 void __init * 223 216 early_memremap(resource_size_t phys_addr, unsigned long size) 224 217 { 225 - return (__force void *)__early_ioremap(phys_addr, size, 226 - FIXMAP_PAGE_NORMAL); 218 + pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size, 219 + FIXMAP_PAGE_NORMAL); 220 + 221 + return (__force void *)__early_ioremap(phys_addr, size, prot); 227 222 } 228 223 #ifdef FIXMAP_PAGE_RO 229 224 void __init * 230 225 early_memremap_ro(resource_size_t phys_addr, unsigned long size) 231 226 { 232 - return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO); 227 + pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size, 228 + FIXMAP_PAGE_RO); 229 + 230 + return (__force void *)__early_ioremap(phys_addr, size, prot); 231 + } 232 + #endif 233 + 234 + #ifdef CONFIG_ARCH_USE_MEMREMAP_PROT 235 + void __init * 236 + early_memremap_prot(resource_size_t phys_addr, unsigned long size, 237 + unsigned long prot_val) 238 + { 239 + return (__force void *)__early_ioremap(phys_addr, size, 240 + __pgprot(prot_val)); 233 241 } 234 242 #endif 235 243
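
early_memremap_pgprot_adjust() gives the architecture a chance to fix up early-boot mapping protections. The sketch below shows roughly what an SME-aware, x86-flavoured override could look like; the ownership test is a placeholder and this is not the actual x86 implementation.

    #include <linux/init.h>
    #include <linux/types.h>
    #include <linux/mem_encrypt.h>
    #include <asm/pgtable_types.h>

    /* Placeholder ownership test; the real logic is architecture specific. */
    static bool __init range_written_by_firmware(resource_size_t phys_addr,
                                                 unsigned long size)
    {
        return false;
    }

    pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
                                                 unsigned long size,
                                                 pgprot_t prot)
    {
        if (!sme_active())
            return prot;

        /* Firmware data is not encrypted; everything else should be. */
        if (range_written_by_firmware(phys_addr, size))
            return __pgprot(pgprot_val(prot) & ~_PAGE_ENC);

        return __pgprot(pgprot_val(prot) | _PAGE_ENC);
    }
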
+2
mm/memory-failure.c
··· 1146 1146 return 0; 1147 1147 } 1148 1148 1149 + arch_unmap_kpfn(pfn); 1150 + 1149 1151 orig_head = hpage = compound_head(p); 1150 1152 num_poisoned_pages_inc(); 1151 1153