Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm changes from Ingo Molnar:
"PCID support, 5-level paging support, Secure Memory Encryption support

The main changes in this cycle are support for three new, complex
hardware features of x86 CPUs:

- Add 5-level paging support, which is a new hardware feature on
upcoming Intel CPUs allowing up to 128 PB of virtual address space
and 4 PB of physical RAM space - a 512-fold increase over the old
limits. (Supercomputers of the future forecasting hurricanes on an
ever warming planet can certainly make good use of more RAM.)

Many of the necessary changes went upstream in previous cycles;
v4.14 is the first kernel that can enable 5-level paging.

This feature is activated via CONFIG_X86_5LEVEL=y - disabled by
default.

(By Kirill A. Shutemov)

- Add 'encrypted memory' support, which is a new hardware feature on
upcoming AMD CPUs ('Secure Memory Encryption', SME) allowing system
RAM to be encrypted and decrypted (mostly) transparently by the
CPU, with a little help from the kernel to transition to/from
encrypted RAM. Such RAM should be more secure against various
attacks like RAM access via the memory bus and should make the
radio signature of memory bus traffic harder to intercept (and
decrypt) as well.

This feature is activated via CONFIG_AMD_MEM_ENCRYPT=y - disabled
by default.

(By Tom Lendacky)

- Enable PCID optimized TLB flushing on newer Intel CPUs: PCID is a
hardware feature that attaches an address space tag to TLB entries
and thus makes it possible to skip TLB flushes in many cases, even
when we switch mm's.

(By Andy Lutomirski)

All three of these features were in the works for a long time, and
it's a coincidence of the three independent development paths that they
are all enabled in v4.14 at once"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (65 commits)
x86/mm: Enable RCU based page table freeing (CONFIG_HAVE_RCU_TABLE_FREE=y)
x86/mm: Use pr_cont() in dump_pagetable()
x86/mm: Fix SME encryption stack ptr handling
kvm/x86: Avoid clearing the C-bit in rsvd_bits()
x86/CPU: Align CR3 defines
x86/mm, mm/hwpoison: Clear PRESENT bit for kernel 1:1 mappings of poison pages
acpi, x86/mm: Remove encryption mask from ACPI page protection type
x86/mm, kexec: Fix memory corruption with SME on successive kexecs
x86/mm/pkeys: Fix typo in Documentation/x86/protection-keys.txt
x86/mm/dump_pagetables: Speed up page tables dump for CONFIG_KASAN=y
x86/mm: Implement PCID based optimization: try to preserve old TLB entries using PCID
x86: Enable 5-level paging support via CONFIG_X86_5LEVEL=y
x86/mm: Allow userspace have mappings above 47-bit
x86/mm: Prepare to expose larger address space to userspace
x86/mpx: Do not allow MPX if we have mappings above 47-bit
x86/mm: Rename tasksize_32bit/64bit to task_size_32bit/64bit()
x86/xen: Redefine XEN_ELFNOTE_INIT_P2M using PUD_SIZE * PTRS_PER_PUD
x86/mm/dump_pagetables: Fix printout of p4d level
x86/mm/dump_pagetables: Generalize address normalization
x86/boot: Fix memremap() related build failure
...

+3139 -475
+13
Documentation/admin-guide/kernel-parameters.txt
··· 2233 memory contents and reserves bad memory 2234 regions that are detected. 2235 2236 mem_sleep_default= [SUSPEND] Default system suspend mode: 2237 s2idle - Suspend-To-Idle 2238 shallow - Power-On Suspend or equivalent (if supported) ··· 2707 2708 nopat [X86] Disable PAT (page attribute table extension of 2709 pagetables) support. 2710 2711 norandmaps Don't use address space randomization. Equivalent to 2712 echo 0 > /proc/sys/kernel/randomize_va_space
··· 2233 memory contents and reserves bad memory 2234 regions that are detected. 2235 2236 + mem_encrypt= [X86-64] AMD Secure Memory Encryption (SME) control 2237 + Valid arguments: on, off 2238 + Default (depends on kernel configuration option): 2239 + on (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y) 2240 + off (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=n) 2241 + mem_encrypt=on: Activate SME 2242 + mem_encrypt=off: Do not activate SME 2243 + 2244 + Refer to Documentation/x86/amd-memory-encryption.txt 2245 + for details on when memory encryption can be activated. 2246 + 2247 mem_sleep_default= [SUSPEND] Default system suspend mode: 2248 s2idle - Suspend-To-Idle 2249 shallow - Power-On Suspend or equivalent (if supported) ··· 2696 2697 nopat [X86] Disable PAT (page attribute table extension of 2698 pagetables) support. 2699 + 2700 + nopcid [X86-64] Disable the PCID cpu feature. 2701 2702 norandmaps Don't use address space randomization. Equivalent to 2703 echo 0 > /proc/sys/kernel/randomize_va_space
+68
Documentation/x86/amd-memory-encryption.txt
···
··· 1 + Secure Memory Encryption (SME) is a feature found on AMD processors. 2 + 3 + SME provides the ability to mark individual pages of memory as encrypted using 4 + the standard x86 page tables. A page that is marked encrypted will be 5 + automatically decrypted when read from DRAM and encrypted when written to 6 + DRAM. SME can therefore be used to protect the contents of DRAM from physical 7 + attacks on the system. 8 + 9 + A page is encrypted when a page table entry has the encryption bit set (see 10 + below on how to determine its position). The encryption bit can also be 11 + specified in the cr3 register, allowing the PGD table to be encrypted. Each 12 + successive level of page tables can also be encrypted by setting the encryption 13 + bit in the page table entry that points to the next table. This allows the full 14 + page table hierarchy to be encrypted. Note, this means that just because the 15 + encryption bit is set in cr3, doesn't imply the full hierarchy is encyrpted. 16 + Each page table entry in the hierarchy needs to have the encryption bit set to 17 + achieve that. So, theoretically, you could have the encryption bit set in cr3 18 + so that the PGD is encrypted, but not set the encryption bit in the PGD entry 19 + for a PUD which results in the PUD pointed to by that entry to not be 20 + encrypted. 21 + 22 + Support for SME can be determined through the CPUID instruction. The CPUID 23 + function 0x8000001f reports information related to SME: 24 + 25 + 0x8000001f[eax]: 26 + Bit[0] indicates support for SME 27 + 0x8000001f[ebx]: 28 + Bits[5:0] pagetable bit number used to activate memory 29 + encryption 30 + Bits[11:6] reduction in physical address space, in bits, when 31 + memory encryption is enabled (this only affects 32 + system physical addresses, not guest physical 33 + addresses) 34 + 35 + If support for SME is present, MSR 0xc00100010 (MSR_K8_SYSCFG) can be used to 36 + determine if SME is enabled and/or to enable memory encryption: 37 + 38 + 0xc0010010: 39 + Bit[23] 0 = memory encryption features are disabled 40 + 1 = memory encryption features are enabled 41 + 42 + Linux relies on BIOS to set this bit if BIOS has determined that the reduction 43 + in the physical address space as a result of enabling memory encryption (see 44 + CPUID information above) will not conflict with the address space resource 45 + requirements for the system. If this bit is not set upon Linux startup then 46 + Linux itself will not set it and memory encryption will not be possible. 47 + 48 + The state of SME in the Linux kernel can be documented as follows: 49 + - Supported: 50 + The CPU supports SME (determined through CPUID instruction). 51 + 52 + - Enabled: 53 + Supported and bit 23 of MSR_K8_SYSCFG is set. 54 + 55 + - Active: 56 + Supported, Enabled and the Linux kernel is actively applying 57 + the encryption bit to page table entries (the SME mask in the 58 + kernel is non-zero). 59 + 60 + SME can also be enabled and activated in the BIOS. If SME is enabled and 61 + activated in the BIOS, then all memory accesses will be encrypted and it will 62 + not be necessary to activate the Linux memory encryption support. If the BIOS 63 + merely enables SME (sets bit 23 of the MSR_K8_SYSCFG), then Linux can activate 64 + memory encryption by default (CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y) or 65 + by supplying mem_encrypt=on on the kernel command line. 
However, if BIOS does 66 + not enable SME, then Linux will not be able to activate memory encryption, even 67 + if configured to do so by default or the mem_encrypt=on command line parameter 68 + is specified.
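
As a quick illustration of the CPUID interface described in the new document, a minimal user-space sketch (not part of this series; the __get_cpuid() helper comes from the compiler's cpuid.h) that reads leaf 0x8000001f could look like this:

    /* Hedged sketch: query the SME CPUID leaf described above. */
    #include <stdio.h>
    #include <cpuid.h>

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        /* __get_cpuid() returns 0 if the leaf is not implemented. */
        if (!__get_cpuid(0x8000001f, &eax, &ebx, &ecx, &edx)) {
            printf("CPUID leaf 0x8000001f not available\n");
            return 1;
        }

        printf("SME supported:             %s\n", (eax & 1) ? "yes" : "no");
        printf("Encryption (C) bit number: %u\n", ebx & 0x3f);
        printf("Phys. address reduction:   %u bits\n", (ebx >> 6) & 0x3f);
        return 0;
    }
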
+3 -3
Documentation/x86/protection-keys.txt
··· 34 called pkey_set(). 35 36 int real_prot = PROT_READ|PROT_WRITE; 37 - pkey = pkey_alloc(0, PKEY_DENY_WRITE); 38 ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); 39 ret = pkey_mprotect(ptr, PAGE_SIZE, real_prot, pkey); 40 ... application runs here ··· 42 Now, if the application needs to update the data at 'ptr', it can 43 gain access, do the update, then remove its write access: 44 45 - pkey_set(pkey, 0); // clear PKEY_DENY_WRITE 46 *ptr = foo; // assign something 47 - pkey_set(pkey, PKEY_DENY_WRITE); // set PKEY_DENY_WRITE again 48 49 Now when it frees the memory, it will also free the pkey since it 50 is no longer in use:
··· 34 called pkey_set(). 35 36 int real_prot = PROT_READ|PROT_WRITE; 37 + pkey = pkey_alloc(0, PKEY_DISABLE_WRITE); 38 ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); 39 ret = pkey_mprotect(ptr, PAGE_SIZE, real_prot, pkey); 40 ... application runs here ··· 42 Now, if the application needs to update the data at 'ptr', it can 43 gain access, do the update, then remove its write access: 44 45 + pkey_set(pkey, 0); // clear PKEY_DISABLE_WRITE 46 *ptr = foo; // assign something 47 + pkey_set(pkey, PKEY_DISABLE_WRITE); // set PKEY_DISABLE_WRITE again 48 49 Now when it frees the memory, it will also free the pkey since it 50 is no longer in use:
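
The pkey_set() calls in the snippet above are not system calls; they are typically a small user-space wrapper around the RDPKRU/WRPKRU instructions, which hold two bits (access-disable, write-disable) per key. A hedged sketch of such a wrapper, assuming the standard PKEY_DISABLE_* values, might look like this:

    /* Illustrative pkey_set() built on RDPKRU/WRPKRU (not from this patch set). */
    #include <stdint.h>

    #define PKEY_DISABLE_ACCESS 0x1
    #define PKEY_DISABLE_WRITE  0x2

    static inline uint32_t rdpkru(void)
    {
        uint32_t eax, edx;

        /* RDPKRU: requires ECX = 0, returns PKRU in EAX (EDX = 0) */
        asm volatile(".byte 0x0f,0x01,0xee"
                     : "=a" (eax), "=d" (edx) : "c" (0));
        return eax;
    }

    static inline void wrpkru(uint32_t pkru)
    {
        /* WRPKRU: requires ECX = 0 and EDX = 0, writes EAX to PKRU */
        asm volatile(".byte 0x0f,0x01,0xef"
                     : : "a" (pkru), "c" (0), "d" (0));
    }

    static int pkey_set(int pkey, unsigned int rights)
    {
        uint32_t mask = (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) << (2 * pkey);
        uint32_t pkru = rdpkru();

        /* Replace only the two PKRU bits belonging to this pkey. */
        pkru = (pkru & ~mask) | (rights << (2 * pkey));
        wrpkru(pkru);
        return 0;
    }
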
+64
Documentation/x86/x86_64/5level-paging.txt
···
··· 1 + == Overview == 2 + 3 + Original x86-64 was limited by 4-level paing to 256 TiB of virtual address 4 + space and 64 TiB of physical address space. We are already bumping into 5 + this limit: some vendors offers servers with 64 TiB of memory today. 6 + 7 + To overcome the limitation upcoming hardware will introduce support for 8 + 5-level paging. It is a straight-forward extension of the current page 9 + table structure adding one more layer of translation. 10 + 11 + It bumps the limits to 128 PiB of virtual address space and 4 PiB of 12 + physical address space. This "ought to be enough for anybody" ©. 13 + 14 + QEMU 2.9 and later support 5-level paging. 15 + 16 + Virtual memory layout for 5-level paging is described in 17 + Documentation/x86/x86_64/mm.txt 18 + 19 + == Enabling 5-level paging == 20 + 21 + CONFIG_X86_5LEVEL=y enables the feature. 22 + 23 + So far, a kernel compiled with the option enabled will be able to boot 24 + only on machines that supports the feature -- see for 'la57' flag in 25 + /proc/cpuinfo. 26 + 27 + The plan is to implement boot-time switching between 4- and 5-level paging 28 + in the future. 29 + 30 + == User-space and large virtual address space == 31 + 32 + On x86, 5-level paging enables 56-bit userspace virtual address space. 33 + Not all user space is ready to handle wide addresses. It's known that 34 + at least some JIT compilers use higher bits in pointers to encode their 35 + information. It collides with valid pointers with 5-level paging and 36 + leads to crashes. 37 + 38 + To mitigate this, we are not going to allocate virtual address space 39 + above 47-bit by default. 40 + 41 + But userspace can ask for allocation from full address space by 42 + specifying hint address (with or without MAP_FIXED) above 47-bits. 43 + 44 + If hint address set above 47-bit, but MAP_FIXED is not specified, we try 45 + to look for unmapped area by specified address. If it's already 46 + occupied, we look for unmapped area in *full* address space, rather than 47 + from 47-bit window. 48 + 49 + A high hint address would only affect the allocation in question, but not 50 + any future mmap()s. 51 + 52 + Specifying high hint address on older kernel or on machine without 5-level 53 + paging support is safe. The hint will be ignored and kernel will fall back 54 + to allocation from 47-bit address space. 55 + 56 + This approach helps to easily make application's memory allocator aware 57 + about large address space without manually tracking allocated virtual 58 + address space. 59 + 60 + One important case we need to handle here is interaction with MPX. 61 + MPX (without MAWA extension) cannot handle addresses above 47-bit, so we 62 + need to make sure that MPX cannot be enabled we already have VMA above 63 + the boundary and forbid creating such VMAs once MPX is enabled. 64 +
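
The opt-in described above can be exercised from user space simply by passing a hint address at or above the 47-bit boundary; a minimal sketch (illustrative only, and safe on kernels without 5-level paging, where the hint is ignored):

    #include <stdio.h>
    #include <stdint.h>
    #include <sys/mman.h>

    int main(void)
    {
        /* A hint above 47 bits opts only this mapping into the wide space. */
        void *hint = (void *)(1UL << 47);
        size_t len = 2UL << 20;                 /* 2 MiB */
        void *p;

        p = mmap(hint, len, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) {
            perror("mmap");
            return 1;
        }

        printf("mapped at %p (%s the 47-bit window)\n", p,
               (uintptr_t)p >= (1UL << 47) ? "above" : "within");
        munmap(p, len);
        return 0;
    }
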
-2
arch/ia64/include/asm/acpi.h
··· 112 buf[2] |= ACPI_PDC_EST_CAPABILITY_SMP; 113 } 114 115 - #define acpi_unlazy_tlb(x) 116 - 117 #ifdef CONFIG_ACPI_NUMA 118 extern cpumask_t early_cpu_possible_map; 119 #define for_each_possible_early_cpu(cpu) \
··· 112 buf[2] |= ACPI_PDC_EST_CAPABILITY_SMP; 113 } 114 115 #ifdef CONFIG_ACPI_NUMA 116 extern cpumask_t early_cpu_possible_map; 117 #define for_each_possible_early_cpu(cpu) \
+2 -2
arch/ia64/kernel/efi.c
··· 757 return 0; 758 } 759 760 - u32 761 efi_mem_type (unsigned long phys_addr) 762 { 763 efi_memory_desc_t *md = efi_memory_descriptor(phys_addr); 764 765 if (md) 766 return md->type; 767 - return 0; 768 } 769 770 u64
··· 757 return 0; 758 } 759 760 + int 761 efi_mem_type (unsigned long phys_addr) 762 { 763 efi_memory_desc_t *md = efi_memory_descriptor(phys_addr); 764 765 if (md) 766 return md->type; 767 + return -EINVAL; 768 } 769 770 u64
+49
arch/x86/Kconfig
··· 169 select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI 170 select HAVE_PERF_REGS 171 select HAVE_PERF_USER_STACK_DUMP 172 select HAVE_REGS_AND_STACK_ACCESS_API 173 select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION 174 select HAVE_STACK_VALIDATION if X86_64 ··· 330 331 config PGTABLE_LEVELS 332 int 333 default 4 if X86_64 334 default 3 if X86_PAE 335 default 2 ··· 1401 has the cost of more pagetable lookup overhead, and also 1402 consumes more pagetable space per process. 1403 1404 config ARCH_PHYS_ADDR_T_64BIT 1405 def_bool y 1406 depends on X86_64 || X86_PAE ··· 1435 linear 1 GB mappings (even if the CPU otherwise 1436 supports them), so don't confuse the user by printing 1437 that we have them enabled. 1438 1439 # Common NUMA Features 1440 config NUMA
··· 169 select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI 170 select HAVE_PERF_REGS 171 select HAVE_PERF_USER_STACK_DUMP 172 + select HAVE_RCU_TABLE_FREE 173 select HAVE_REGS_AND_STACK_ACCESS_API 174 select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION 175 select HAVE_STACK_VALIDATION if X86_64 ··· 329 330 config PGTABLE_LEVELS 331 int 332 + default 5 if X86_5LEVEL 333 default 4 if X86_64 334 default 3 if X86_PAE 335 default 2 ··· 1399 has the cost of more pagetable lookup overhead, and also 1400 consumes more pagetable space per process. 1401 1402 + config X86_5LEVEL 1403 + bool "Enable 5-level page tables support" 1404 + depends on X86_64 1405 + ---help--- 1406 + 5-level paging enables access to larger address space: 1407 + upto 128 PiB of virtual address space and 4 PiB of 1408 + physical address space. 1409 + 1410 + It will be supported by future Intel CPUs. 1411 + 1412 + Note: a kernel with this option enabled can only be booted 1413 + on machines that support the feature. 1414 + 1415 + See Documentation/x86/x86_64/5level-paging.txt for more 1416 + information. 1417 + 1418 + Say N if unsure. 1419 + 1420 config ARCH_PHYS_ADDR_T_64BIT 1421 def_bool y 1422 depends on X86_64 || X86_PAE ··· 1415 linear 1 GB mappings (even if the CPU otherwise 1416 supports them), so don't confuse the user by printing 1417 that we have them enabled. 1418 + 1419 + config ARCH_HAS_MEM_ENCRYPT 1420 + def_bool y 1421 + 1422 + config AMD_MEM_ENCRYPT 1423 + bool "AMD Secure Memory Encryption (SME) support" 1424 + depends on X86_64 && CPU_SUP_AMD 1425 + ---help--- 1426 + Say yes to enable support for the encryption of system memory. 1427 + This requires an AMD processor that supports Secure Memory 1428 + Encryption (SME). 1429 + 1430 + config AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT 1431 + bool "Activate AMD Secure Memory Encryption (SME) by default" 1432 + default y 1433 + depends on AMD_MEM_ENCRYPT 1434 + ---help--- 1435 + Say yes to have system memory encrypted by default if running on 1436 + an AMD processor that supports Secure Memory Encryption (SME). 1437 + 1438 + If set to Y, then the encryption of system memory can be 1439 + deactivated with the mem_encrypt=off command line option. 1440 + 1441 + If set to N, then the encryption of system memory can be 1442 + activated with the mem_encrypt=on command line option. 1443 + 1444 + config ARCH_USE_MEMREMAP_PROT 1445 + def_bool y 1446 + depends on AMD_MEM_ENCRYPT 1447 1448 # Common NUMA Features 1449 config NUMA
+7
arch/x86/boot/compressed/pagetable.c
··· 15 #define __pa(x) ((unsigned long)(x)) 16 #define __va(x) ((void *)((unsigned long)(x))) 17 18 #include "misc.h" 19 20 /* These actually do the work of building the kernel identity maps. */
··· 15 #define __pa(x) ((unsigned long)(x)) 16 #define __va(x) ((void *)((unsigned long)(x))) 17 18 + /* 19 + * The pgtable.h and mm/ident_map.c includes make use of the SME related 20 + * information which is not used in the compressed image support. Un-define 21 + * the SME support to avoid any compile and link errors. 22 + */ 23 + #undef CONFIG_AMD_MEM_ENCRYPT 24 + 25 #include "misc.h" 26 27 /* These actually do the work of building the kernel identity maps. */
+6 -7
arch/x86/include/asm/acpi.h
··· 150 extern int x86_acpi_numa_init(void); 151 #endif /* CONFIG_ACPI_NUMA */ 152 153 - #define acpi_unlazy_tlb(x) leave_mm(x) 154 - 155 #ifdef CONFIG_ACPI_APEI 156 static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) 157 { ··· 160 * you call efi_mem_attributes() during boot and at runtime, 161 * you could theoretically see different attributes. 162 * 163 - * Since we are yet to see any x86 platforms that require 164 - * anything other than PAGE_KERNEL (some arm64 platforms 165 - * require the equivalent of PAGE_KERNEL_NOCACHE), return that 166 - * until we know differently. 167 */ 168 - return PAGE_KERNEL; 169 } 170 #endif 171
··· 150 extern int x86_acpi_numa_init(void); 151 #endif /* CONFIG_ACPI_NUMA */ 152 153 #ifdef CONFIG_ACPI_APEI 154 static inline pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr) 155 { ··· 162 * you call efi_mem_attributes() during boot and at runtime, 163 * you could theoretically see different attributes. 164 * 165 + * We are yet to see any x86 platforms that require anything 166 + * other than PAGE_KERNEL (some ARM64 platforms require the 167 + * equivalent of PAGE_KERNEL_NOCACHE). Additionally, if SME 168 + * is active, the ACPI information will not be encrypted, 169 + * so return PAGE_KERNEL_NOENC until we know differently. 170 */ 171 + return PAGE_KERNEL_NOENC; 172 } 173 #endif 174
+2
arch/x86/include/asm/cmdline.h
··· 2 #define _ASM_X86_CMDLINE_H 3 4 int cmdline_find_option_bool(const char *cmdline_ptr, const char *option); 5 6 #endif /* _ASM_X86_CMDLINE_H */
··· 2 #define _ASM_X86_CMDLINE_H 3 4 int cmdline_find_option_bool(const char *cmdline_ptr, const char *option); 5 + int cmdline_find_option(const char *cmdline_ptr, const char *option, 6 + char *buffer, int bufsize); 7 8 #endif /* _ASM_X86_CMDLINE_H */
+1
arch/x86/include/asm/cpufeatures.h
··· 196 197 #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ 198 #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ 199 200 #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ 201 #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
··· 196 197 #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ 198 #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ 199 + #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ 200 201 #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ 202 #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */
+3 -1
arch/x86/include/asm/disabled-features.h
··· 21 # define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) 22 # define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31)) 23 # define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31)) 24 #else 25 # define DISABLE_VME 0 26 # define DISABLE_K6_MTRR 0 27 # define DISABLE_CYRIX_ARR 0 28 # define DISABLE_CENTAUR_MCR 0 29 #endif /* CONFIG_X86_64 */ 30 31 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS ··· 51 #define DISABLED_MASK1 0 52 #define DISABLED_MASK2 0 53 #define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR) 54 - #define DISABLED_MASK4 0 55 #define DISABLED_MASK5 0 56 #define DISABLED_MASK6 0 57 #define DISABLED_MASK7 0
··· 21 # define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) 22 # define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31)) 23 # define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31)) 24 + # define DISABLE_PCID 0 25 #else 26 # define DISABLE_VME 0 27 # define DISABLE_K6_MTRR 0 28 # define DISABLE_CYRIX_ARR 0 29 # define DISABLE_CENTAUR_MCR 0 30 + # define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31)) 31 #endif /* CONFIG_X86_64 */ 32 33 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS ··· 49 #define DISABLED_MASK1 0 50 #define DISABLED_MASK2 0 51 #define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR) 52 + #define DISABLED_MASK4 (DISABLE_PCID) 53 #define DISABLED_MASK5 0 54 #define DISABLED_MASK6 0 55 #define DISABLED_MASK7 0
+3 -2
arch/x86/include/asm/dma-mapping.h
··· 12 #include <asm/io.h> 13 #include <asm/swiotlb.h> 14 #include <linux/dma-contiguous.h> 15 16 #ifdef CONFIG_ISA 17 # define ISA_DMA_BIT_MASK DMA_BIT_MASK(24) ··· 58 59 static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) 60 { 61 - return paddr; 62 } 63 64 static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) 65 { 66 - return daddr; 67 } 68 #endif /* CONFIG_X86_DMA_REMAP */ 69
··· 12 #include <asm/io.h> 13 #include <asm/swiotlb.h> 14 #include <linux/dma-contiguous.h> 15 + #include <linux/mem_encrypt.h> 16 17 #ifdef CONFIG_ISA 18 # define ISA_DMA_BIT_MASK DMA_BIT_MASK(24) ··· 57 58 static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) 59 { 60 + return __sme_set(paddr); 61 } 62 63 static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t daddr) 64 { 65 + return __sme_clr(daddr); 66 } 67 #endif /* CONFIG_X86_DMA_REMAP */ 68
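
With this change phys_to_dma()/dma_to_phys() add and strip the SME encryption mask on the bus address. The __sme_set()/__sme_clr() helpers come from the new linux/mem_encrypt.h and are presumably plain mask operations on sme_me_mask, along the lines of:

    /* Sketch of the expected helpers (sme_me_mask is 0 when SME is not active): */
    #define __sme_set(x)    ((x) | sme_me_mask)     /* mark as encrypted */
    #define __sme_clr(x)    ((x) & ~sme_me_mask)    /* strip the C-bit   */
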
+4 -4
arch/x86/include/asm/dmi.h
··· 13 } 14 15 /* Use early IO mappings for DMI because it's initialized early */ 16 - #define dmi_early_remap early_ioremap 17 - #define dmi_early_unmap early_iounmap 18 - #define dmi_remap ioremap_cache 19 - #define dmi_unmap iounmap 20 21 #endif /* _ASM_X86_DMI_H */
··· 13 } 14 15 /* Use early IO mappings for DMI because it's initialized early */ 16 + #define dmi_early_remap early_memremap 17 + #define dmi_early_unmap early_memunmap 18 + #define dmi_remap(_x, _l) memremap(_x, _l, MEMREMAP_WB) 19 + #define dmi_unmap(_x) memunmap(_x) 20 21 #endif /* _ASM_X86_DMI_H */
+2
arch/x86/include/asm/e820/api.h
··· 39 extern void e820__reallocate_tables(void); 40 extern void e820__register_nosave_regions(unsigned long limit_pfn); 41 42 /* 43 * Returns true iff the specified range [start,end) is completely contained inside 44 * the ISA region.
··· 39 extern void e820__reallocate_tables(void); 40 extern void e820__register_nosave_regions(unsigned long limit_pfn); 41 42 + extern int e820__get_entry_type(u64 start, u64 end); 43 + 44 /* 45 * Returns true iff the specified range [start,end) is completely contained inside 46 * the ISA region.
+2 -2
arch/x86/include/asm/elf.h
··· 305 test_thread_flag(TIF_ADDR32)); 306 } 307 308 - extern unsigned long tasksize_32bit(void); 309 - extern unsigned long tasksize_64bit(void); 310 extern unsigned long get_mmap_base(int is_legacy); 311 312 #ifdef CONFIG_X86_32
··· 305 test_thread_flag(TIF_ADDR32)); 306 } 307 308 + extern unsigned long task_size_32bit(void); 309 + extern unsigned long task_size_64bit(int full_addr_space); 310 extern unsigned long get_mmap_base(int is_legacy); 311 312 #ifdef CONFIG_X86_32
+20
arch/x86/include/asm/fixmap.h
··· 157 } 158 #endif 159 160 #include <asm-generic/fixmap.h> 161 162 #define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags)
··· 157 } 158 #endif 159 160 + /* 161 + * FIXMAP_PAGE_NOCACHE is used for MMIO. Memory encryption is not 162 + * supported for MMIO addresses, so make sure that the memory encryption 163 + * mask is not part of the page attributes. 164 + */ 165 + #define FIXMAP_PAGE_NOCACHE PAGE_KERNEL_IO_NOCACHE 166 + 167 + /* 168 + * Early memremap routines used for in-place encryption. The mappings created 169 + * by these routines are intended to be used as temporary mappings. 170 + */ 171 + void __init *early_memremap_encrypted(resource_size_t phys_addr, 172 + unsigned long size); 173 + void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, 174 + unsigned long size); 175 + void __init *early_memremap_decrypted(resource_size_t phys_addr, 176 + unsigned long size); 177 + void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, 178 + unsigned long size); 179 + 180 #include <asm-generic/fixmap.h> 181 182 #define __late_set_fixmap(idx, phys, flags) __set_fixmap(idx, phys, flags)
+1
arch/x86/include/asm/init.h
··· 7 unsigned long page_flag; /* page flag for PMD or PUD entry */ 8 unsigned long offset; /* ident mapping offset */ 9 bool direct_gbpages; /* PUD level 1GB page support */ 10 }; 11 12 int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
··· 7 unsigned long page_flag; /* page flag for PMD or PUD entry */ 8 unsigned long offset; /* ident mapping offset */ 9 bool direct_gbpages; /* PUD level 1GB page support */ 10 + unsigned long kernpg_flag; /* kernel pagetable flag override */ 11 }; 12 13 int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
+8
arch/x86/include/asm/io.h
··· 377 #define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc 378 #endif 379 380 #endif /* _ASM_X86_IO_H */
··· 377 #define arch_io_reserve_memtype_wc arch_io_reserve_memtype_wc 378 #endif 379 380 + extern bool arch_memremap_can_ram_remap(resource_size_t offset, 381 + unsigned long size, 382 + unsigned long flags); 383 + #define arch_memremap_can_ram_remap arch_memremap_can_ram_remap 384 + 385 + extern bool phys_mem_access_encrypted(unsigned long phys_addr, 386 + unsigned long size); 387 + 388 #endif /* _ASM_X86_IO_H */
+10 -1
arch/x86/include/asm/kexec.h
··· 147 relocate_kernel(unsigned long indirection_page, 148 unsigned long page_list, 149 unsigned long start_address, 150 - unsigned int preserve_context); 151 #endif 152 153 #define ARCH_HAS_KIMAGE_ARCH ··· 208 uint64_t r15; 209 uint64_t rip; 210 }; 211 #endif 212 213 typedef void crash_vmclear_fn(void);
··· 147 relocate_kernel(unsigned long indirection_page, 148 unsigned long page_list, 149 unsigned long start_address, 150 + unsigned int preserve_context, 151 + unsigned int sme_active); 152 #endif 153 154 #define ARCH_HAS_KIMAGE_ARCH ··· 207 uint64_t r15; 208 uint64_t rip; 209 }; 210 + 211 + extern int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, 212 + gfp_t gfp); 213 + #define arch_kexec_post_alloc_pages arch_kexec_post_alloc_pages 214 + 215 + extern void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages); 216 + #define arch_kexec_pre_free_pages arch_kexec_pre_free_pages 217 + 218 #endif 219 220 typedef void crash_vmclear_fn(void);
+1 -1
arch/x86/include/asm/kvm_host.h
··· 1079 void kvm_mmu_uninit_vm(struct kvm *kvm); 1080 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 1081 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, 1082 - u64 acc_track_mask); 1083 1084 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); 1085 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
··· 1079 void kvm_mmu_uninit_vm(struct kvm *kvm); 1080 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 1081 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, 1082 + u64 acc_track_mask, u64 me_mask); 1083 1084 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); 1085 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
+80
arch/x86/include/asm/mem_encrypt.h
···
··· 1 + /* 2 + * AMD Memory Encryption Support 3 + * 4 + * Copyright (C) 2016 Advanced Micro Devices, Inc. 5 + * 6 + * Author: Tom Lendacky <thomas.lendacky@amd.com> 7 + * 8 + * This program is free software; you can redistribute it and/or modify 9 + * it under the terms of the GNU General Public License version 2 as 10 + * published by the Free Software Foundation. 11 + */ 12 + 13 + #ifndef __X86_MEM_ENCRYPT_H__ 14 + #define __X86_MEM_ENCRYPT_H__ 15 + 16 + #ifndef __ASSEMBLY__ 17 + 18 + #include <linux/init.h> 19 + 20 + #include <asm/bootparam.h> 21 + 22 + #ifdef CONFIG_AMD_MEM_ENCRYPT 23 + 24 + extern unsigned long sme_me_mask; 25 + 26 + void sme_encrypt_execute(unsigned long encrypted_kernel_vaddr, 27 + unsigned long decrypted_kernel_vaddr, 28 + unsigned long kernel_len, 29 + unsigned long encryption_wa, 30 + unsigned long encryption_pgd); 31 + 32 + void __init sme_early_encrypt(resource_size_t paddr, 33 + unsigned long size); 34 + void __init sme_early_decrypt(resource_size_t paddr, 35 + unsigned long size); 36 + 37 + void __init sme_map_bootdata(char *real_mode_data); 38 + void __init sme_unmap_bootdata(char *real_mode_data); 39 + 40 + void __init sme_early_init(void); 41 + 42 + void __init sme_encrypt_kernel(void); 43 + void __init sme_enable(struct boot_params *bp); 44 + 45 + /* Architecture __weak replacement functions */ 46 + void __init mem_encrypt_init(void); 47 + 48 + void swiotlb_set_mem_attributes(void *vaddr, unsigned long size); 49 + 50 + #else /* !CONFIG_AMD_MEM_ENCRYPT */ 51 + 52 + #define sme_me_mask 0UL 53 + 54 + static inline void __init sme_early_encrypt(resource_size_t paddr, 55 + unsigned long size) { } 56 + static inline void __init sme_early_decrypt(resource_size_t paddr, 57 + unsigned long size) { } 58 + 59 + static inline void __init sme_map_bootdata(char *real_mode_data) { } 60 + static inline void __init sme_unmap_bootdata(char *real_mode_data) { } 61 + 62 + static inline void __init sme_early_init(void) { } 63 + 64 + static inline void __init sme_encrypt_kernel(void) { } 65 + static inline void __init sme_enable(struct boot_params *bp) { } 66 + 67 + #endif /* CONFIG_AMD_MEM_ENCRYPT */ 68 + 69 + /* 70 + * The __sme_pa() and __sme_pa_nodebug() macros are meant for use when 71 + * writing to or comparing values from the cr3 register. Having the 72 + * encryption mask set in cr3 enables the PGD entry to be encrypted and 73 + * avoid special case handling of PGD allocations. 74 + */ 75 + #define __sme_pa(x) (__pa(x) | sme_me_mask) 76 + #define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask) 77 + 78 + #endif /* __ASSEMBLY__ */ 79 + 80 + #endif /* __X86_MEM_ENCRYPT_H__ */
+23 -2
arch/x86/include/asm/mmu.h
··· 3 4 #include <linux/spinlock.h> 5 #include <linux/mutex.h> 6 7 /* 8 - * The x86 doesn't have a mmu context, but 9 - * we put the segment information here. 10 */ 11 typedef struct { 12 #ifdef CONFIG_MODIFY_LDT_SYSCALL 13 struct ldt_struct *ldt; 14 #endif ··· 52 void __user *bd_addr; 53 #endif 54 } mm_context_t; 55 56 void leave_mm(int cpu); 57
··· 3 4 #include <linux/spinlock.h> 5 #include <linux/mutex.h> 6 + #include <linux/atomic.h> 7 8 /* 9 + * x86 has arch-specific MMU state beyond what lives in mm_struct. 10 */ 11 typedef struct { 12 + /* 13 + * ctx_id uniquely identifies this mm_struct. A ctx_id will never 14 + * be reused, and zero is not a valid ctx_id. 15 + */ 16 + u64 ctx_id; 17 + 18 + /* 19 + * Any code that needs to do any sort of TLB flushing for this 20 + * mm will first make its changes to the page tables, then 21 + * increment tlb_gen, then flush. This lets the low-level 22 + * flushing code keep track of what needs flushing. 23 + * 24 + * This is not used on Xen PV. 25 + */ 26 + atomic64_t tlb_gen; 27 + 28 #ifdef CONFIG_MODIFY_LDT_SYSCALL 29 struct ldt_struct *ldt; 30 #endif ··· 36 void __user *bd_addr; 37 #endif 38 } mm_context_t; 39 + 40 + #define INIT_MM_CONTEXT(mm) \ 41 + .context = { \ 42 + .ctx_id = 1, \ 43 + } 44 45 void leave_mm(int cpu); 46
+13 -2
arch/x86/include/asm/mmu_context.h
··· 12 #include <asm/tlbflush.h> 13 #include <asm/paravirt.h> 14 #include <asm/mpx.h> 15 #ifndef CONFIG_PARAVIRT 16 static inline void paravirt_activate_mm(struct mm_struct *prev, 17 struct mm_struct *next) ··· 128 129 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) 130 { 131 - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) 132 - this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); 133 } 134 135 static inline int init_new_context(struct task_struct *tsk, 136 struct mm_struct *mm) 137 { 138 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS 139 if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { 140 /* pkey 0 is the default and always allocated */ ··· 297 static inline unsigned long __get_current_cr3_fast(void) 298 { 299 unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd); 300 301 /* For now, be very restrictive about when this can be called. */ 302 VM_WARN_ON(in_nmi() || preemptible());
··· 12 #include <asm/tlbflush.h> 13 #include <asm/paravirt.h> 14 #include <asm/mpx.h> 15 + 16 + extern atomic64_t last_mm_ctx_id; 17 + 18 #ifndef CONFIG_PARAVIRT 19 static inline void paravirt_activate_mm(struct mm_struct *prev, 20 struct mm_struct *next) ··· 125 126 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) 127 { 128 + int cpu = smp_processor_id(); 129 + 130 + if (cpumask_test_cpu(cpu, mm_cpumask(mm))) 131 + cpumask_clear_cpu(cpu, mm_cpumask(mm)); 132 } 133 134 static inline int init_new_context(struct task_struct *tsk, 135 struct mm_struct *mm) 136 { 137 + mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); 138 + atomic64_set(&mm->context.tlb_gen, 0); 139 + 140 #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS 141 if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { 142 /* pkey 0 is the default and always allocated */ ··· 289 static inline unsigned long __get_current_cr3_fast(void) 290 { 291 unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd); 292 + 293 + if (static_cpu_has(X86_FEATURE_PCID)) 294 + cr3 |= this_cpu_read(cpu_tlbstate.loaded_mm_asid); 295 296 /* For now, be very restrictive about when this can be called. */ 297 VM_WARN_ON(in_nmi() || preemptible());
+9
arch/x86/include/asm/mpx.h
··· 73 } 74 void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, 75 unsigned long start, unsigned long end); 76 #else 77 static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) 78 { ··· 96 struct vm_area_struct *vma, 97 unsigned long start, unsigned long end) 98 { 99 } 100 #endif /* CONFIG_X86_INTEL_MPX */ 101
··· 73 } 74 void mpx_notify_unmap(struct mm_struct *mm, struct vm_area_struct *vma, 75 unsigned long start, unsigned long end); 76 + 77 + unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len, 78 + unsigned long flags); 79 #else 80 static inline siginfo_t *mpx_generate_siginfo(struct pt_regs *regs) 81 { ··· 93 struct vm_area_struct *vma, 94 unsigned long start, unsigned long end) 95 { 96 + } 97 + 98 + static inline unsigned long mpx_unmapped_area_check(unsigned long addr, 99 + unsigned long len, unsigned long flags) 100 + { 101 + return addr; 102 } 103 #endif /* CONFIG_X86_INTEL_MPX */ 104
+2
arch/x86/include/asm/msr-index.h
··· 356 #define MSR_K8_TOP_MEM1 0xc001001a 357 #define MSR_K8_TOP_MEM2 0xc001001d 358 #define MSR_K8_SYSCFG 0xc0010010 359 #define MSR_K8_INT_PENDING_MSG 0xc0010055 360 /* C1E active bits in int pending message */ 361 #define K8_INTP_C1E_ACTIVE_MASK 0x18000000
··· 356 #define MSR_K8_TOP_MEM1 0xc001001a 357 #define MSR_K8_TOP_MEM2 0xc001001d 358 #define MSR_K8_SYSCFG 0xc0010010 359 + #define MSR_K8_SYSCFG_MEM_ENCRYPT_BIT 23 360 + #define MSR_K8_SYSCFG_MEM_ENCRYPT BIT_ULL(MSR_K8_SYSCFG_MEM_ENCRYPT_BIT) 361 #define MSR_K8_INT_PENDING_MSG 0xc0010055 362 /* C1E active bits in int pending message */ 363 #define K8_INTP_C1E_ACTIVE_MASK 0x18000000
+4
arch/x86/include/asm/page_64.h
··· 51 52 void copy_page(void *to, void *from); 53 54 #endif /* !__ASSEMBLY__ */ 55 56 #ifdef CONFIG_X86_VSYSCALL_EMULATION
··· 51 52 void copy_page(void *to, void *from); 53 54 + #ifdef CONFIG_X86_MCE 55 + #define arch_unmap_kpfn arch_unmap_kpfn 56 + #endif 57 + 58 #endif /* !__ASSEMBLY__ */ 59 60 #ifdef CONFIG_X86_VSYSCALL_EMULATION
+2 -1
arch/x86/include/asm/page_types.h
··· 3 4 #include <linux/const.h> 5 #include <linux/types.h> 6 7 /* PAGE_SHIFT determines the page size */ 8 #define PAGE_SHIFT 12 ··· 16 #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) 17 #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) 18 19 - #define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1)) 20 #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) 21 22 /* Cast *PAGE_MASK to a signed type so that it is sign-extended if
··· 3 4 #include <linux/const.h> 5 #include <linux/types.h> 6 + #include <linux/mem_encrypt.h> 7 8 /* PAGE_SHIFT determines the page size */ 9 #define PAGE_SHIFT 12 ··· 15 #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) 16 #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) 17 18 + #define __PHYSICAL_MASK ((phys_addr_t)(__sme_clr((1ULL << __PHYSICAL_MASK_SHIFT) - 1))) 19 #define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) 20 21 /* Cast *PAGE_MASK to a signed type so that it is sign-extended if
+21 -7
arch/x86/include/asm/pgtable.h
··· 1 #ifndef _ASM_X86_PGTABLE_H 2 #define _ASM_X86_PGTABLE_H 3 4 #include <asm/page.h> 5 #include <asm/pgtable_types.h> 6 ··· 14 cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \ 15 : (prot)) 16 17 #ifndef __ASSEMBLY__ 18 #include <asm/x86_init.h> 19 20 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); 21 void ptdump_walk_pgd_level_checkwx(void); ··· 47 extern struct list_head pgd_list; 48 49 extern struct mm_struct *pgd_page_get_mm(struct page *page); 50 51 #ifdef CONFIG_PARAVIRT 52 #include <asm/paravirt.h> ··· 205 static inline unsigned long p4d_pfn(p4d_t p4d) 206 { 207 return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT; 208 } 209 210 static inline int p4d_large(p4d_t p4d) ··· 721 * Currently stuck as a macro due to indirect forward reference to 722 * linux/mmzone.h's __section_mem_map_addr() definition: 723 */ 724 - #define pmd_page(pmd) \ 725 - pfn_to_page((pmd_val(pmd) & pmd_pfn_mask(pmd)) >> PAGE_SHIFT) 726 727 /* 728 * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] ··· 789 * Currently stuck as a macro due to indirect forward reference to 790 * linux/mmzone.h's __section_mem_map_addr() definition: 791 */ 792 - #define pud_page(pud) \ 793 - pfn_to_page((pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT) 794 795 /* Find an entry in the second-level page table.. */ 796 static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) ··· 839 * Currently stuck as a macro due to indirect forward reference to 840 * linux/mmzone.h's __section_mem_map_addr() definition: 841 */ 842 - #define p4d_page(p4d) \ 843 - pfn_to_page((p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT) 844 845 /* Find an entry in the third-level page table.. */ 846 static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) ··· 873 * Currently stuck as a macro due to indirect forward reference to 874 * linux/mmzone.h's __section_mem_map_addr() definition: 875 */ 876 - #define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) 877 878 /* to find an entry in a page-table-directory. */ 879 static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
··· 1 #ifndef _ASM_X86_PGTABLE_H 2 #define _ASM_X86_PGTABLE_H 3 4 + #include <linux/mem_encrypt.h> 5 #include <asm/page.h> 6 #include <asm/pgtable_types.h> 7 ··· 13 cachemode2protval(_PAGE_CACHE_MODE_UC_MINUS))) \ 14 : (prot)) 15 16 + /* 17 + * Macros to add or remove encryption attribute 18 + */ 19 + #define pgprot_encrypted(prot) __pgprot(__sme_set(pgprot_val(prot))) 20 + #define pgprot_decrypted(prot) __pgprot(__sme_clr(pgprot_val(prot))) 21 + 22 #ifndef __ASSEMBLY__ 23 #include <asm/x86_init.h> 24 + 25 + extern pgd_t early_top_pgt[PTRS_PER_PGD]; 26 + int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); 27 28 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); 29 void ptdump_walk_pgd_level_checkwx(void); ··· 37 extern struct list_head pgd_list; 38 39 extern struct mm_struct *pgd_page_get_mm(struct page *page); 40 + 41 + extern pmdval_t early_pmd_flags; 42 43 #ifdef CONFIG_PARAVIRT 44 #include <asm/paravirt.h> ··· 193 static inline unsigned long p4d_pfn(p4d_t p4d) 194 { 195 return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT; 196 + } 197 + 198 + static inline unsigned long pgd_pfn(pgd_t pgd) 199 + { 200 + return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT; 201 } 202 203 static inline int p4d_large(p4d_t p4d) ··· 704 * Currently stuck as a macro due to indirect forward reference to 705 * linux/mmzone.h's __section_mem_map_addr() definition: 706 */ 707 + #define pmd_page(pmd) pfn_to_page(pmd_pfn(pmd)) 708 709 /* 710 * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] ··· 773 * Currently stuck as a macro due to indirect forward reference to 774 * linux/mmzone.h's __section_mem_map_addr() definition: 775 */ 776 + #define pud_page(pud) pfn_to_page(pud_pfn(pud)) 777 778 /* Find an entry in the second-level page table.. */ 779 static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) ··· 824 * Currently stuck as a macro due to indirect forward reference to 825 * linux/mmzone.h's __section_mem_map_addr() definition: 826 */ 827 + #define p4d_page(p4d) pfn_to_page(p4d_pfn(p4d)) 828 829 /* Find an entry in the third-level page table.. */ 830 static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) ··· 859 * Currently stuck as a macro due to indirect forward reference to 860 * linux/mmzone.h's __section_mem_map_addr() definition: 861 */ 862 + #define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd)) 863 864 /* to find an entry in a page-table-directory. */ 865 static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
+43 -15
arch/x86/include/asm/pgtable_types.h
··· 2 #define _ASM_X86_PGTABLE_DEFS_H 3 4 #include <linux/const.h> 5 #include <asm/page_types.h> 6 7 #define FIRST_USER_ADDRESS 0UL ··· 123 124 #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) 125 126 - #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ 127 - _PAGE_ACCESSED | _PAGE_DIRTY) 128 - #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ 129 - _PAGE_DIRTY) 130 131 /* 132 * Set of bits not changed in pte_modify. The pte's ··· 161 162 #define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT) 163 #define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC)) 164 165 #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) 166 #define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ ··· 190 #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) 191 #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) 192 #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) 193 194 #define __PAGE_KERNEL_IO (__PAGE_KERNEL) 195 #define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) 196 197 - #define PAGE_KERNEL __pgprot(__PAGE_KERNEL) 198 - #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) 199 - #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) 200 - #define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) 201 - #define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) 202 - #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) 203 - #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) 204 - #define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL) 205 - #define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR) 206 207 - #define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) 208 - #define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) 209 210 /* xwr */ 211 #define __P000 PAGE_NONE ··· 309 } 310 #else 311 #include <asm-generic/pgtable-nop4d.h> 312 313 static inline p4dval_t native_p4d_val(p4d_t p4d) 314 {
··· 2 #define _ASM_X86_PGTABLE_DEFS_H 3 4 #include <linux/const.h> 5 + #include <linux/mem_encrypt.h> 6 + 7 #include <asm/page_types.h> 8 9 #define FIRST_USER_ADDRESS 0UL ··· 121 122 #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) 123 124 + #define _PAGE_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |\ 125 + _PAGE_ACCESSED | _PAGE_DIRTY) 126 + #define _KERNPG_TABLE_NOENC (_PAGE_PRESENT | _PAGE_RW | \ 127 + _PAGE_ACCESSED | _PAGE_DIRTY) 128 129 /* 130 * Set of bits not changed in pte_modify. The pte's ··· 159 160 #define _PAGE_CACHE_MASK (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT) 161 #define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC)) 162 + #define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP)) 163 164 #define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED) 165 #define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ ··· 187 #define __PAGE_KERNEL_VVAR (__PAGE_KERNEL_RO | _PAGE_USER) 188 #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) 189 #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) 190 + #define __PAGE_KERNEL_WP (__PAGE_KERNEL | _PAGE_CACHE_WP) 191 192 #define __PAGE_KERNEL_IO (__PAGE_KERNEL) 193 #define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE) 194 195 + #ifndef __ASSEMBLY__ 196 197 + #define _PAGE_ENC (_AT(pteval_t, sme_me_mask)) 198 + 199 + #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ 200 + _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_ENC) 201 + #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ 202 + _PAGE_DIRTY | _PAGE_ENC) 203 + 204 + #define __PAGE_KERNEL_ENC (__PAGE_KERNEL | _PAGE_ENC) 205 + #define __PAGE_KERNEL_ENC_WP (__PAGE_KERNEL_WP | _PAGE_ENC) 206 + 207 + #define __PAGE_KERNEL_NOENC (__PAGE_KERNEL) 208 + #define __PAGE_KERNEL_NOENC_WP (__PAGE_KERNEL_WP) 209 + 210 + #define PAGE_KERNEL __pgprot(__PAGE_KERNEL | _PAGE_ENC) 211 + #define PAGE_KERNEL_NOENC __pgprot(__PAGE_KERNEL) 212 + #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO | _PAGE_ENC) 213 + #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC | _PAGE_ENC) 214 + #define PAGE_KERNEL_EXEC_NOENC __pgprot(__PAGE_KERNEL_EXEC) 215 + #define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX | _PAGE_ENC) 216 + #define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE | _PAGE_ENC) 217 + #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE | _PAGE_ENC) 218 + #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC | _PAGE_ENC) 219 + #define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL | _PAGE_ENC) 220 + #define PAGE_KERNEL_VVAR __pgprot(__PAGE_KERNEL_VVAR | _PAGE_ENC) 221 + 222 + #define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO) 223 + #define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE) 224 + 225 + #endif /* __ASSEMBLY__ */ 226 227 /* xwr */ 228 #define __P000 PAGE_NONE ··· 286 } 287 #else 288 #include <asm-generic/pgtable-nop4d.h> 289 + 290 + static inline p4d_t native_make_p4d(pudval_t val) 291 + { 292 + return (p4d_t) { .pgd = native_make_pgd((pgdval_t)val) }; 293 + } 294 295 static inline p4dval_t native_p4d_val(p4d_t p4d) 296 {
+8 -5
arch/x86/include/asm/processor-flags.h
··· 2 #define _ASM_X86_PROCESSOR_FLAGS_H 3 4 #include <uapi/asm/processor-flags.h> 5 6 #ifdef CONFIG_VM86 7 #define X86_VM_MASK X86_EFLAGS_VM ··· 33 * CR3_ADDR_MASK is the mask used by read_cr3_pa(). 34 */ 35 #ifdef CONFIG_X86_64 36 - /* Mask off the address space ID bits. */ 37 - #define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull 38 - #define CR3_PCID_MASK 0xFFFull 39 #else 40 /* 41 * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save 42 * a tiny bit of code size by setting all the bits. 43 */ 44 - #define CR3_ADDR_MASK 0xFFFFFFFFull 45 - #define CR3_PCID_MASK 0ull 46 #endif 47 48 #endif /* _ASM_X86_PROCESSOR_FLAGS_H */
··· 2 #define _ASM_X86_PROCESSOR_FLAGS_H 3 4 #include <uapi/asm/processor-flags.h> 5 + #include <linux/mem_encrypt.h> 6 7 #ifdef CONFIG_VM86 8 #define X86_VM_MASK X86_EFLAGS_VM ··· 32 * CR3_ADDR_MASK is the mask used by read_cr3_pa(). 33 */ 34 #ifdef CONFIG_X86_64 35 + /* Mask off the address space ID and SME encryption bits. */ 36 + #define CR3_ADDR_MASK __sme_clr(0x7FFFFFFFFFFFF000ull) 37 + #define CR3_PCID_MASK 0xFFFull 38 + #define CR3_NOFLUSH BIT_ULL(63) 39 #else 40 /* 41 * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save 42 * a tiny bit of code size by setting all the bits. 43 */ 44 + #define CR3_ADDR_MASK 0xFFFFFFFFull 45 + #define CR3_PCID_MASK 0ull 46 + #define CR3_NOFLUSH 0 47 #endif 48 49 #endif /* _ASM_X86_PROCESSOR_FLAGS_H */
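
With these masks, a CR3 value on a PCID-capable CPU combines the (SME-masked) physical address of the page-table root, a 12-bit PCID/ASID tag, and optionally bit 63 to ask the CPU not to flush entries already tagged with that PCID. A hedged sketch of how such a value can be composed (the helper name is illustrative, not necessarily the kernel's):

    /* Illustrative only: compose a PCID-tagged CR3 value. */
    static inline unsigned long make_cr3(pgd_t *pgd, u16 asid, bool noflush)
    {
        unsigned long cr3 = __sme_pa(pgd) | (asid & CR3_PCID_MASK);

        return noflush ? (cr3 | CR3_NOFLUSH) : cr3;
    }
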
+16 -4
arch/x86/include/asm/processor.h
··· 30 #include <linux/math64.h> 31 #include <linux/err.h> 32 #include <linux/irqflags.h> 33 34 /* 35 * We handle most unaligned accesses in hardware. On the other hand ··· 241 return __read_cr3() & CR3_ADDR_MASK; 242 } 243 244 static inline void load_cr3(pgd_t *pgdir) 245 { 246 - write_cr3(__pa(pgdir)); 247 } 248 249 #ifdef CONFIG_X86_32 ··· 811 */ 812 #define IA32_PAGE_OFFSET PAGE_OFFSET 813 #define TASK_SIZE PAGE_OFFSET 814 #define TASK_SIZE_MAX TASK_SIZE 815 #define STACK_TOP TASK_SIZE 816 #define STACK_TOP_MAX STACK_TOP 817 ··· 853 * particular problem by preventing anything from being mapped 854 * at the maximum canonical address. 855 */ 856 - #define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE) 857 858 /* This decides where the kernel will search for a free chunk of vm 859 * space during mmap's. ··· 863 #define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ 864 0xc0000000 : 0xFFFFe000) 865 866 #define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \ 867 IA32_PAGE_OFFSET : TASK_SIZE_MAX) 868 #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \ 869 IA32_PAGE_OFFSET : TASK_SIZE_MAX) 870 871 - #define STACK_TOP TASK_SIZE 872 #define STACK_TOP_MAX TASK_SIZE_MAX 873 874 #define INIT_THREAD { \ ··· 891 * space during mmap's. 892 */ 893 #define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3)) 894 - #define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE) 895 896 #define KSTK_EIP(task) (task_pt_regs(task)->ip) 897
··· 30 #include <linux/math64.h> 31 #include <linux/err.h> 32 #include <linux/irqflags.h> 33 + #include <linux/mem_encrypt.h> 34 35 /* 36 * We handle most unaligned accesses in hardware. On the other hand ··· 240 return __read_cr3() & CR3_ADDR_MASK; 241 } 242 243 + static inline unsigned long native_read_cr3_pa(void) 244 + { 245 + return __native_read_cr3() & CR3_ADDR_MASK; 246 + } 247 + 248 static inline void load_cr3(pgd_t *pgdir) 249 { 250 + write_cr3(__sme_pa(pgdir)); 251 } 252 253 #ifdef CONFIG_X86_32 ··· 805 */ 806 #define IA32_PAGE_OFFSET PAGE_OFFSET 807 #define TASK_SIZE PAGE_OFFSET 808 + #define TASK_SIZE_LOW TASK_SIZE 809 #define TASK_SIZE_MAX TASK_SIZE 810 + #define DEFAULT_MAP_WINDOW TASK_SIZE 811 #define STACK_TOP TASK_SIZE 812 #define STACK_TOP_MAX STACK_TOP 813 ··· 845 * particular problem by preventing anything from being mapped 846 * at the maximum canonical address. 847 */ 848 + #define TASK_SIZE_MAX ((1UL << __VIRTUAL_MASK_SHIFT) - PAGE_SIZE) 849 + 850 + #define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) 851 852 /* This decides where the kernel will search for a free chunk of vm 853 * space during mmap's. ··· 853 #define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ 854 0xc0000000 : 0xFFFFe000) 855 856 + #define TASK_SIZE_LOW (test_thread_flag(TIF_ADDR32) ? \ 857 + IA32_PAGE_OFFSET : DEFAULT_MAP_WINDOW) 858 #define TASK_SIZE (test_thread_flag(TIF_ADDR32) ? \ 859 IA32_PAGE_OFFSET : TASK_SIZE_MAX) 860 #define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_ADDR32)) ? \ 861 IA32_PAGE_OFFSET : TASK_SIZE_MAX) 862 863 + #define STACK_TOP TASK_SIZE_LOW 864 #define STACK_TOP_MAX TASK_SIZE_MAX 865 866 #define INIT_THREAD { \ ··· 879 * space during mmap's. 880 */ 881 #define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3)) 882 + #define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE_LOW) 883 884 #define KSTK_EIP(task) (task_pt_regs(task)->ip) 885
+12
arch/x86/include/asm/realmode.h
··· 1 #ifndef _ARCH_X86_REALMODE_H 2 #define _ARCH_X86_REALMODE_H 3 4 #include <linux/types.h> 5 #include <asm/io.h> 6 ··· 47 u64 start; 48 u64 efer; 49 u32 cr4; 50 #endif 51 }; 52 ··· 78 79 void set_real_mode_mem(phys_addr_t mem, size_t size); 80 void reserve_real_mode(void); 81 82 #endif /* _ARCH_X86_REALMODE_H */
··· 1 #ifndef _ARCH_X86_REALMODE_H 2 #define _ARCH_X86_REALMODE_H 3 4 + /* 5 + * Flag bit definitions for use with the flags field of the trampoline header 6 + * in the CONFIG_X86_64 variant. 7 + */ 8 + #define TH_FLAGS_SME_ACTIVE_BIT 0 9 + #define TH_FLAGS_SME_ACTIVE BIT(TH_FLAGS_SME_ACTIVE_BIT) 10 + 11 + #ifndef __ASSEMBLY__ 12 + 13 #include <linux/types.h> 14 #include <asm/io.h> 15 ··· 38 u64 start; 39 u64 efer; 40 u32 cr4; 41 + u32 flags; 42 #endif 43 }; 44 ··· 68 69 void set_real_mode_mem(phys_addr_t mem, size_t size); 70 void reserve_real_mode(void); 71 + 72 + #endif /* __ASSEMBLY__ */ 73 74 #endif /* _ARCH_X86_REALMODE_H */
+3
arch/x86/include/asm/set_memory.h
··· 11 * Executability : eXeutable, NoteXecutable 12 * Read/Write : ReadOnly, ReadWrite 13 * Presence : NotPresent 14 * 15 * Within a category, the attributes are mutually exclusive. 16 * ··· 43 int set_memory_wb(unsigned long addr, int numpages); 44 int set_memory_np(unsigned long addr, int numpages); 45 int set_memory_4k(unsigned long addr, int numpages); 46 47 int set_memory_array_uc(unsigned long *addr, int addrinarray); 48 int set_memory_array_wc(unsigned long *addr, int addrinarray);
··· 11 * Executability : eXeutable, NoteXecutable 12 * Read/Write : ReadOnly, ReadWrite 13 * Presence : NotPresent 14 + * Encryption : Encrypted, Decrypted 15 * 16 * Within a category, the attributes are mutually exclusive. 17 * ··· 42 int set_memory_wb(unsigned long addr, int numpages); 43 int set_memory_np(unsigned long addr, int numpages); 44 int set_memory_4k(unsigned long addr, int numpages); 45 + int set_memory_encrypted(unsigned long addr, int numpages); 46 + int set_memory_decrypted(unsigned long addr, int numpages); 47 48 int set_memory_array_uc(unsigned long *addr, int addrinarray); 49 int set_memory_array_wc(unsigned long *addr, int addrinarray);
+14
arch/x86/include/asm/tlb.h
··· 15 16 #include <asm-generic/tlb.h> 17 18 #endif /* _ASM_X86_TLB_H */
··· 15 16 #include <asm-generic/tlb.h> 17 18 + /* 19 + * While x86 architecture in general requires an IPI to perform TLB 20 + * shootdown, enablement code for several hypervisors overrides 21 + * .flush_tlb_others hook in pv_mmu_ops and implements it by issuing 22 + * a hypercall. To keep software pagetable walkers safe in this case we 23 + * switch to RCU based table free (HAVE_RCU_TABLE_FREE). See the comment 24 + * below 'ifdef CONFIG_HAVE_RCU_TABLE_FREE' in include/asm-generic/tlb.h 25 + * for more details. 26 + */ 27 + static inline void __tlb_remove_table(void *table) 28 + { 29 + free_page_and_swap_cache(table); 30 + } 31 + 32 #endif /* _ASM_X86_TLB_H */
+80 -7
arch/x86/include/asm/tlbflush.h
··· 57 __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); 58 } 59 60 #ifdef CONFIG_PARAVIRT 61 #include <asm/paravirt.h> 62 #else ··· 81 #define __flush_tlb_global() __native_flush_tlb_global() 82 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) 83 #endif 84 85 struct tlb_state { 86 /* ··· 101 * mode even if we've already switched back to swapper_pg_dir. 102 */ 103 struct mm_struct *loaded_mm; 104 - int state; 105 106 /* 107 * Access to this CR4 shadow and to H/W CR4 is protected by 108 * disabling interrupts when modifying either one. 109 */ 110 unsigned long cr4; 111 }; 112 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); 113 ··· 257 __flush_tlb_global(); 258 else 259 __flush_tlb(); 260 } 261 262 static inline void __flush_tlb_one(unsigned long addr) ··· 289 * and page-granular flushes are available only on i486 and up. 290 */ 291 struct flush_tlb_info { 292 - struct mm_struct *mm; 293 - unsigned long start; 294 - unsigned long end; 295 }; 296 297 #define local_flush_tlb() __flush_tlb() ··· 331 void native_flush_tlb_others(const struct cpumask *cpumask, 332 const struct flush_tlb_info *info); 333 334 - #define TLBSTATE_OK 1 335 - #define TLBSTATE_LAZY 2 336 - 337 static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, 338 struct mm_struct *mm) 339 { 340 cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); 341 } 342
··· 57 __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); 58 } 59 60 + static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) 61 + { 62 + u64 new_tlb_gen; 63 + 64 + /* 65 + * Bump the generation count. This also serves as a full barrier 66 + * that synchronizes with switch_mm(): callers are required to order 67 + * their read of mm_cpumask after their writes to the paging 68 + * structures. 69 + */ 70 + smp_mb__before_atomic(); 71 + new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen); 72 + smp_mb__after_atomic(); 73 + 74 + return new_tlb_gen; 75 + } 76 + 77 #ifdef CONFIG_PARAVIRT 78 #include <asm/paravirt.h> 79 #else ··· 64 #define __flush_tlb_global() __native_flush_tlb_global() 65 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) 66 #endif 67 + 68 + /* 69 + * 6 because 6 should be plenty and struct tlb_state will fit in 70 + * two cache lines. 71 + */ 72 + #define TLB_NR_DYN_ASIDS 6 73 + 74 + struct tlb_context { 75 + u64 ctx_id; 76 + u64 tlb_gen; 77 + }; 78 79 struct tlb_state { 80 /* ··· 73 * mode even if we've already switched back to swapper_pg_dir. 74 */ 75 struct mm_struct *loaded_mm; 76 + u16 loaded_mm_asid; 77 + u16 next_asid; 78 79 /* 80 * Access to this CR4 shadow and to H/W CR4 is protected by 81 * disabling interrupts when modifying either one. 82 */ 83 unsigned long cr4; 84 + 85 + /* 86 + * This is a list of all contexts that might exist in the TLB. 87 + * There is one per ASID that we use, and the ASID (what the 88 + * CPU calls PCID) is the index into ctxts. 89 + * 90 + * For each context, ctx_id indicates which mm the TLB's user 91 + * entries came from. As an invariant, the TLB will never 92 + * contain entries that are out-of-date as when that mm reached 93 + * the tlb_gen in the list. 94 + * 95 + * To be clear, this means that it's legal for the TLB code to 96 + * flush the TLB without updating tlb_gen. This can happen 97 + * (for now, at least) due to paravirt remote flushes. 98 + * 99 + * NB: context 0 is a bit special, since it's also used by 100 + * various bits of init code. This is fine -- code that 101 + * isn't aware of PCID will end up harmlessly flushing 102 + * context 0. 103 + */ 104 + struct tlb_context ctxs[TLB_NR_DYN_ASIDS]; 105 }; 106 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); 107 ··· 207 __flush_tlb_global(); 208 else 209 __flush_tlb(); 210 + 211 + /* 212 + * Note: if we somehow had PCID but not PGE, then this wouldn't work -- 213 + * we'd end up flushing kernel translations for the current ASID but 214 + * we might fail to flush kernel translations for other cached ASIDs. 215 + * 216 + * To avoid this issue, we force PCID off if PGE is off. 217 + */ 218 } 219 220 static inline void __flush_tlb_one(unsigned long addr) ··· 231 * and page-granular flushes are available only on i486 and up. 232 */ 233 struct flush_tlb_info { 234 + /* 235 + * We support several kinds of flushes. 236 + * 237 + * - Fully flush a single mm. .mm will be set, .end will be 238 + * TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to 239 + * which the IPI sender is trying to catch us up. 240 + * 241 + * - Partially flush a single mm. .mm will be set, .start and 242 + * .end will indicate the range, and .new_tlb_gen will be set 243 + * such that the changes between generation .new_tlb_gen-1 and 244 + * .new_tlb_gen are entirely contained in the indicated range. 245 + * 246 + * - Fully flush all mms whose tlb_gens have been updated. .mm 247 + * will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen 248 + * will be zero. 
249 + */ 250 + struct mm_struct *mm; 251 + unsigned long start; 252 + unsigned long end; 253 + u64 new_tlb_gen; 254 }; 255 256 #define local_flush_tlb() __flush_tlb() ··· 256 void native_flush_tlb_others(const struct cpumask *cpumask, 257 const struct flush_tlb_info *info); 258 259 static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, 260 struct mm_struct *mm) 261 { 262 + inc_mm_tlb_gen(mm); 263 cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); 264 } 265
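The per-CPU ctx_id/tlb_gen bookkeeping introduced above can be pictured with a small userspace model (not kernel code; the struct and function names below are invented for illustration): every address space carries a monotonically increasing generation, each CPU remembers which generation it has already flushed for the context it is running, and a flush request only does work when the local copy is behind.

/* Minimal userspace model of generation-based TLB flush tracking.
 * Illustration only; this is not the kernel's implementation.
 */
#include <stdint.h>
#include <stdio.h>

struct mm_model {
    uint64_t ctx_id;   /* identifies the address space */
    uint64_t tlb_gen;  /* bumped on every page-table change */
};

struct cpu_ctx {
    uint64_t ctx_id;   /* which mm the cached entries came from */
    uint64_t tlb_gen;  /* generation this CPU is known to be caught up to */
};

/* A page-table change bumps the generation (the kernel uses an atomic). */
static uint64_t inc_gen(struct mm_model *mm)
{
    return ++mm->tlb_gen;
}

/* Flush only if this CPU's cached context is stale. */
static void maybe_flush(struct cpu_ctx *cpu, const struct mm_model *mm)
{
    if (cpu->ctx_id != mm->ctx_id || cpu->tlb_gen < mm->tlb_gen) {
        printf("flush: cpu gen %llu -> mm gen %llu\n",
               (unsigned long long)cpu->tlb_gen,
               (unsigned long long)mm->tlb_gen);
        cpu->ctx_id = mm->ctx_id;
        cpu->tlb_gen = mm->tlb_gen;
    } else {
        printf("no flush needed (gen %llu)\n",
               (unsigned long long)cpu->tlb_gen);
    }
}

int main(void)
{
    struct mm_model mm  = { .ctx_id = 1, .tlb_gen = 1 };
    struct cpu_ctx  cpu = { .ctx_id = 1, .tlb_gen = 1 };

    maybe_flush(&cpu, &mm);   /* already current: no flush */
    inc_gen(&mm);             /* an unmap happened somewhere */
    maybe_flush(&cpu, &mm);   /* stale: flush and catch up */
    return 0;
}

The real code additionally distinguishes full from partial flushes via flush_tlb_info::new_tlb_gen, but the catch-up comparison above is the core of the scheme.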
+13 -1
arch/x86/include/asm/vga.h
··· 7 #ifndef _ASM_X86_VGA_H 8 #define _ASM_X86_VGA_H 9 10 /* 11 * On the PC, we can just recalculate addresses and then 12 * access the videoram directly without any black magic. 13 */ 14 15 - #define VGA_MAP_MEM(x, s) (unsigned long)phys_to_virt(x) 16 17 #define vga_readb(x) (*(x)) 18 #define vga_writeb(x, y) (*(y) = (x))
··· 7 #ifndef _ASM_X86_VGA_H 8 #define _ASM_X86_VGA_H 9 10 + #include <asm/set_memory.h> 11 + 12 /* 13 * On the PC, we can just recalculate addresses and then 14 * access the videoram directly without any black magic. 15 + * To support memory encryption however, we need to access 16 + * the videoram as decrypted memory. 17 */ 18 19 + #define VGA_MAP_MEM(x, s) \ 20 + ({ \ 21 + unsigned long start = (unsigned long)phys_to_virt(x); \ 22 + \ 23 + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) \ 24 + set_memory_decrypted(start, (s) >> PAGE_SHIFT); \ 25 + \ 26 + start; \ 27 + }) 28 29 #define vga_readb(x) (*(x)) 30 #define vga_writeb(x, y) (*(y) = (x))
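The reworked VGA_MAP_MEM() above relies on a GCC statement expression so the macro can perform the conditional set_memory_decrypted() call and still evaluate to the mapped address. A self-contained sketch of that ({ ... }) pattern, with made-up helper names and a fake address transformation standing in for phys_to_virt():

/* Standalone illustration of a ({ ... }) statement-expression macro
 * that performs a side effect and evaluates to a value, like the
 * reworked VGA_MAP_MEM(). Build with gcc/clang (GNU C extension).
 */
#include <stdio.h>

static int decrypt_calls;

static void fake_decrypt(unsigned long addr, unsigned long npages)
{
    decrypt_calls++;
    printf("decrypt %lu page(s) at 0x%lx\n", npages, addr);
}

#define MAP_MEM(x, s)                                          \
({                                                             \
    unsigned long start = (unsigned long)(x) + 0x1000UL;       \
    fake_decrypt(start, (s) / 4096UL);                         \
    start; /* value of the whole ({ ... }) expression */       \
})

int main(void)
{
    unsigned long va = MAP_MEM(0xa0000UL, 8192UL);

    printf("mapped at 0x%lx, decrypt called %d time(s)\n",
           va, decrypt_calls);
    return 0;
}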
+3 -3
arch/x86/kernel/acpi/boot.c
··· 115 #define ACPI_INVALID_GSI INT_MIN 116 117 /* 118 - * This is just a simple wrapper around early_ioremap(), 119 * with sanity checks for phys == 0 and size == 0. 120 */ 121 char *__init __acpi_map_table(unsigned long phys, unsigned long size) ··· 124 if (!phys || !size) 125 return NULL; 126 127 - return early_ioremap(phys, size); 128 } 129 130 void __init __acpi_unmap_table(char *map, unsigned long size) ··· 132 if (!map || !size) 133 return; 134 135 - early_iounmap(map, size); 136 } 137 138 #ifdef CONFIG_X86_LOCAL_APIC
··· 115 #define ACPI_INVALID_GSI INT_MIN 116 117 /* 118 + * This is just a simple wrapper around early_memremap(), 119 * with sanity checks for phys == 0 and size == 0. 120 */ 121 char *__init __acpi_map_table(unsigned long phys, unsigned long size) ··· 124 if (!phys || !size) 125 return NULL; 126 127 + return early_memremap(phys, size); 128 } 129 130 void __init __acpi_unmap_table(char *map, unsigned long size) ··· 132 if (!map || !size) 133 return; 134 135 + early_memunmap(map, size); 136 } 137 138 #ifdef CONFIG_X86_LOCAL_APIC
+25 -4
arch/x86/kernel/cpu/amd.c
··· 558 559 static void early_init_amd(struct cpuinfo_x86 *c) 560 { 561 early_init_amd_mc(c); 562 563 /* 564 * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate ··· 626 */ 627 if (cpu_has_amd_erratum(c, amd_erratum_400)) 628 set_cpu_bug(c, X86_BUG_AMD_E400); 629 } 630 631 static void init_amd_k8(struct cpuinfo_x86 *c) ··· 765 766 static void init_amd(struct cpuinfo_x86 *c) 767 { 768 - u32 dummy; 769 - 770 early_init_amd(c); 771 772 /* ··· 825 */ 826 if (c->x86 > 0x11) 827 set_cpu_cap(c, X86_FEATURE_ARAT); 828 - 829 - rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); 830 831 /* 3DNow or LM implies PREFETCHW */ 832 if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
··· 558 559 static void early_init_amd(struct cpuinfo_x86 *c) 560 { 561 + u32 dummy; 562 + 563 early_init_amd_mc(c); 564 + 565 + rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); 566 567 /* 568 * c->x86_power is 8000_0007 edx. Bit 8 is TSC runs at constant rate ··· 622 */ 623 if (cpu_has_amd_erratum(c, amd_erratum_400)) 624 set_cpu_bug(c, X86_BUG_AMD_E400); 625 + 626 + /* 627 + * BIOS support is required for SME. If BIOS has enabled SME then 628 + * adjust x86_phys_bits by the SME physical address space reduction 629 + * value. If BIOS has not enabled SME then don't advertise the 630 + * feature (set in scattered.c). Also, since the SME support requires 631 + * long mode, don't advertise the feature under CONFIG_X86_32. 632 + */ 633 + if (cpu_has(c, X86_FEATURE_SME)) { 634 + u64 msr; 635 + 636 + /* Check if SME is enabled */ 637 + rdmsrl(MSR_K8_SYSCFG, msr); 638 + if (msr & MSR_K8_SYSCFG_MEM_ENCRYPT) { 639 + c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f; 640 + if (IS_ENABLED(CONFIG_X86_32)) 641 + clear_cpu_cap(c, X86_FEATURE_SME); 642 + } else { 643 + clear_cpu_cap(c, X86_FEATURE_SME); 644 + } 645 + } 646 } 647 648 static void init_amd_k8(struct cpuinfo_x86 *c) ··· 740 741 static void init_amd(struct cpuinfo_x86 *c) 742 { 743 early_init_amd(c); 744 745 /* ··· 802 */ 803 if (c->x86 > 0x11) 804 set_cpu_cap(c, X86_FEATURE_ARAT); 805 806 /* 3DNow or LM implies PREFETCHW */ 807 if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
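The x86_phys_bits adjustment above uses the reduction value that CPUID leaf 0x8000001f reports in EBX[11:6] (bits 5:0 give the C-bit position). A tiny sketch of that arithmetic; the EBX sample value here is invented:

/* Illustrate the physical address space reduction applied when SME
 * is enabled: CPUID 0x8000001f EBX[11:6] holds the number of bits
 * the encryption support steals from the physical address width.
 */
#include <stdio.h>

int main(void)
{
    unsigned int phys_bits = 48;        /* typical x86_phys_bits       */
    unsigned int ebx = 0x0000016f;      /* made-up CPUID example:      */
                                        /* bits 5:0  = C-bit pos 47    */
                                        /* bits 11:6 = reduction of 5  */
    unsigned int cbit_pos  = ebx & 0x3f;
    unsigned int reduction = (ebx >> 6) & 0x3f;

    printf("C-bit position : %u\n", cbit_pos);
    printf("phys bits      : %u -> %u\n", phys_bits, phys_bits - reduction);
    return 0;
}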
+8
arch/x86/kernel/cpu/bugs.c
··· 21 22 void __init check_bugs(void) 23 { 24 identify_boot_cpu(); 25 26 if (!IS_ENABLED(CONFIG_SMP)) {
··· 21 22 void __init check_bugs(void) 23 { 24 + #ifdef CONFIG_X86_32 25 + /* 26 + * Regardless of whether PCID is enumerated, the SDM says 27 + * that it can't be enabled in 32-bit mode. 28 + */ 29 + setup_clear_cpu_cap(X86_FEATURE_PCID); 30 + #endif 31 + 32 identify_boot_cpu(); 33 34 if (!IS_ENABLED(CONFIG_SMP)) {
+40
arch/x86/kernel/cpu/common.c
··· 168 } 169 __setup("nompx", x86_mpx_setup); 170 171 static int __init x86_noinvpcid_setup(char *s) 172 { 173 /* noinvpcid doesn't accept parameters */ ··· 326 #else 327 cr4_clear_bits(X86_CR4_SMAP); 328 #endif 329 } 330 } 331 ··· 1161 /* Set up SMEP/SMAP */ 1162 setup_smep(c); 1163 setup_smap(c); 1164 1165 /* 1166 * The vendor-specific functions might have changed features.
··· 168 } 169 __setup("nompx", x86_mpx_setup); 170 171 + #ifdef CONFIG_X86_64 172 + static int __init x86_pcid_setup(char *s) 173 + { 174 + /* require an exact match without trailing characters */ 175 + if (strlen(s)) 176 + return 0; 177 + 178 + /* do not emit a message if the feature is not present */ 179 + if (!boot_cpu_has(X86_FEATURE_PCID)) 180 + return 1; 181 + 182 + setup_clear_cpu_cap(X86_FEATURE_PCID); 183 + pr_info("nopcid: PCID feature disabled\n"); 184 + return 1; 185 + } 186 + __setup("nopcid", x86_pcid_setup); 187 + #endif 188 + 189 static int __init x86_noinvpcid_setup(char *s) 190 { 191 /* noinvpcid doesn't accept parameters */ ··· 308 #else 309 cr4_clear_bits(X86_CR4_SMAP); 310 #endif 311 + } 312 + } 313 + 314 + static void setup_pcid(struct cpuinfo_x86 *c) 315 + { 316 + if (cpu_has(c, X86_FEATURE_PCID)) { 317 + if (cpu_has(c, X86_FEATURE_PGE)) { 318 + cr4_set_bits(X86_CR4_PCIDE); 319 + } else { 320 + /* 321 + * flush_tlb_all(), as currently implemented, won't 322 + * work if PCID is on but PGE is not. Since that 323 + * combination doesn't exist on real hardware, there's 324 + * no reason to try to fully support it, but it's 325 + * polite to avoid corrupting data if we're on 326 + * an improperly configured VM. 327 + */ 328 + clear_cpu_cap(c, X86_FEATURE_PCID); 329 + } 330 } 331 } 332 ··· 1124 /* Set up SMEP/SMAP */ 1125 setup_smep(c); 1126 setup_smap(c); 1127 + 1128 + /* Set up PCID */ 1129 + setup_pcid(c); 1130 1131 /* 1132 * The vendor-specific functions might have changed features.
+43
arch/x86/kernel/cpu/mcheck/mce.c
··· 51 #include <asm/mce.h> 52 #include <asm/msr.h> 53 #include <asm/reboot.h> 54 55 #include "mce-internal.h" 56 ··· 1051 pr_err("Memory error not recovered"); 1052 return ret; 1053 } 1054 1055 /* 1056 * The actual machine check handler. This only handles real
··· 51 #include <asm/mce.h> 52 #include <asm/msr.h> 53 #include <asm/reboot.h> 54 + #include <asm/set_memory.h> 55 56 #include "mce-internal.h" 57 ··· 1050 pr_err("Memory error not recovered"); 1051 return ret; 1052 } 1053 + 1054 + #if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE) 1055 + 1056 + void arch_unmap_kpfn(unsigned long pfn) 1057 + { 1058 + unsigned long decoy_addr; 1059 + 1060 + /* 1061 + * Unmap this page from the kernel 1:1 mappings to make sure 1062 + * we don't log more errors because of speculative access to 1063 + * the page. 1064 + * We would like to just call: 1065 + * set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1); 1066 + * but doing that would radically increase the odds of a 1067 + * speculative access to the poison page because we'd have 1068 + * the virtual address of the kernel 1:1 mapping sitting 1069 + * around in registers. 1070 + * Instead we get tricky. We create a non-canonical address 1071 + * that looks just like the one we want, but has bit 63 flipped. 1072 + * This relies on set_memory_np() not checking whether we passed 1073 + * a legal address. 1074 + */ 1075 + 1076 + /* 1077 + * Build time check to see if we have a spare virtual bit. Don't want 1078 + * to leave this until run time because most developers don't have a 1079 + * system that can exercise this code path. This will only become a 1080 + * problem if/when we move beyond 5-level page tables. 1081 + * 1082 + * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD) 1083 + */ 1084 + #if PGDIR_SHIFT + 9 < 63 1085 + decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); 1086 + #else 1087 + #error "no unused virtual bit available" 1088 + #endif 1089 + 1090 + if (set_memory_np(decoy_addr, 1)) 1091 + pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); 1092 + 1093 + } 1094 + #endif 1095 1096 /* 1097 * The actual machine check handler. This only handles real
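The decoy address trick above can be checked with a few lines of userspace arithmetic; the PAGE_OFFSET value below is the classic 4-level direct-map base and the pfn is arbitrary:

/* Show how flipping bit 63 of the direct-map base yields a
 * non-canonical "decoy" alias of the same 1:1 address.
 */
#include <stdint.h>
#include <stdio.h>

#define BIT63       (1ULL << 63)
#define PAGE_SHIFT  12
#define PAGE_OFFSET 0xffff880000000000ULL  /* 4-level direct-map base */

int main(void)
{
    uint64_t pfn   = 0x12345;                 /* arbitrary example pfn */
    uint64_t real  = (pfn << PAGE_SHIFT) + PAGE_OFFSET;
    uint64_t decoy = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT63);

    printf("real 1:1 address: 0x%016llx\n", (unsigned long long)real);
    printf("decoy address   : 0x%016llx\n", (unsigned long long)decoy);
    printf("differ only in bit 63: %s\n",
           ((real ^ decoy) == BIT63) ? "yes" : "no");
    return 0;
}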
+1
arch/x86/kernel/cpu/scattered.c
··· 31 { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, 32 { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, 33 { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, 34 { 0, 0, 0, 0, 0 } 35 }; 36
··· 31 { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, 32 { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, 33 { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, 34 + { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 }, 35 { 0, 0, 0, 0, 0 } 36 }; 37
+23 -3
arch/x86/kernel/e820.c
··· 96 * Note: this function only works correctly once the E820 table is sorted and 97 * not-overlapping (at least for the range specified), which is the case normally. 98 */ 99 - bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type) 100 { 101 int i; 102 ··· 123 * coverage of the desired range exists: 124 */ 125 if (start >= end) 126 - return 1; 127 } 128 - return 0; 129 } 130 131 /*
··· 96 * Note: this function only works correctly once the E820 table is sorted and 97 * not-overlapping (at least for the range specified), which is the case normally. 98 */ 99 + static struct e820_entry *__e820__mapped_all(u64 start, u64 end, 100 + enum e820_type type) 101 { 102 int i; 103 ··· 122 * coverage of the desired range exists: 123 */ 124 if (start >= end) 125 + return entry; 126 } 127 + 128 + return NULL; 129 + } 130 + 131 + /* 132 + * This function checks if the entire range <start,end> is mapped with type. 133 + */ 134 + bool __init e820__mapped_all(u64 start, u64 end, enum e820_type type) 135 + { 136 + return __e820__mapped_all(start, end, type); 137 + } 138 + 139 + /* 140 + * This function returns the type associated with the range <start,end>. 141 + */ 142 + int e820__get_entry_type(u64 start, u64 end) 143 + { 144 + struct e820_entry *entry = __e820__mapped_all(start, end, 0); 145 + 146 + return entry ? entry->type : -EINVAL; 147 } 148 149 /*
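The refactored scan walks a sorted, non-overlapping table and advances start across each overlapping entry until the whole range is covered. A compact userspace model of that idea, folding e820__mapped_all() and e820__get_entry_type() into one invented helper over a made-up table:

/* Model of the "advance start across matching entries" scan used by
 * __e820__mapped_all(): returns the covering entry's type, or -1.
 * Assumes a sorted, non-overlapping table, as the real code does.
 */
#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, end; int type; };

static const struct range table[] = {
    { 0x00000000, 0x0009f000, 1 },  /* RAM      */
    { 0x000f0000, 0x00100000, 2 },  /* reserved */
    { 0x00100000, 0x40000000, 1 },  /* RAM      */
};

static int mapped_all_type(uint64_t start, uint64_t end)
{
    for (unsigned int i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
        const struct range *r = &table[i];

        if (r->end <= start || r->start >= end)
            continue;            /* no overlap with [start, end) */
        if (r->start > start)
            return -1;           /* hole before this entry */
        start = r->end;          /* covered up to here */
        if (start >= end)
            return r->type;      /* fully covered */
    }
    return -1;
}

int main(void)
{
    printf("type of [1MB,16MB): %d\n",
           mapped_all_type(0x00100000, 0x01000000));  /* 1  */
    printf("type of [0,2MB)   : %d\n",
           mapped_all_type(0x00000000, 0x00200000));  /* -1 */
    return 0;
}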
+1 -1
arch/x86/kernel/espfix_64.c
··· 195 196 pte_p = pte_offset_kernel(&pmd, addr); 197 stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0)); 198 - pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); 199 for (n = 0; n < ESPFIX_PTE_CLONES; n++) 200 set_pte(&pte_p[n*PTE_STRIDE], pte); 201
··· 195 196 pte_p = pte_offset_kernel(&pmd, addr); 197 stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0)); 198 + pte = __pte(__pa(stack_page) | ((__PAGE_KERNEL_RO | _PAGE_ENC) & ptemask)); 199 for (n = 0; n < ESPFIX_PTE_CLONES; n++) 200 set_pte(&pte_p[n*PTE_STRIDE], pte); 201
+79 -16
arch/x86/kernel/head64.c
··· 14 #include <linux/start_kernel.h> 15 #include <linux/io.h> 16 #include <linux/memblock.h> 17 18 #include <asm/processor.h> 19 #include <asm/proto.h> ··· 34 /* 35 * Manage page tables very early on. 36 */ 37 - extern pgd_t early_top_pgt[PTRS_PER_PGD]; 38 extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; 39 static unsigned int __initdata next_early_pgt; 40 pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); ··· 45 return ptr - (void *)_text + (void *)physaddr; 46 } 47 48 - void __head __startup_64(unsigned long physaddr) 49 { 50 unsigned long load_delta, *p; 51 pgdval_t *pgd; 52 p4dval_t *p4d; 53 pudval_t *pud; ··· 70 /* Is the address not 2M aligned? */ 71 if (load_delta & ~PMD_PAGE_MASK) 72 for (;;); 73 74 /* Fixup the physical addresses in the page table */ 75 ··· 100 * creates a bunch of nonsense entries but that is fine -- 101 * it avoids problems around wraparound. 102 */ 103 next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr); 104 pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); 105 pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); 106 107 if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 108 p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); 109 110 i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; 111 - pgd[i + 0] = (pgdval_t)p4d + _KERNPG_TABLE; 112 - pgd[i + 1] = (pgdval_t)p4d + _KERNPG_TABLE; 113 114 i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D; 115 - p4d[i + 0] = (pgdval_t)pud + _KERNPG_TABLE; 116 - p4d[i + 1] = (pgdval_t)pud + _KERNPG_TABLE; 117 } else { 118 i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; 119 - pgd[i + 0] = (pgdval_t)pud + _KERNPG_TABLE; 120 - pgd[i + 1] = (pgdval_t)pud + _KERNPG_TABLE; 121 } 122 123 i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD; 124 - pud[i + 0] = (pudval_t)pmd + _KERNPG_TABLE; 125 - pud[i + 1] = (pudval_t)pmd + _KERNPG_TABLE; 126 127 pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; 128 pmd_entry += physaddr; 129 130 for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) { ··· 149 pmd[i] += load_delta; 150 } 151 152 - /* Fixup phys_base */ 153 p = fixup_pointer(&phys_base, physaddr); 154 - *p += load_delta; 155 } 156 157 /* Wipe all early page tables except for the kernel symbol map */ ··· 180 { 181 memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); 182 next_early_pgt = 0; 183 - write_cr3(__pa_nodebug(early_top_pgt)); 184 } 185 186 /* Create a new PMD entry */ 187 - int __init early_make_pgtable(unsigned long address) 188 { 189 unsigned long physaddr = address - __PAGE_OFFSET; 190 pgdval_t pgd, *pgd_p; 191 p4dval_t p4d, *p4d_p; 192 pudval_t pud, *pud_p; 193 - pmdval_t pmd, *pmd_p; 194 195 /* Invalid address or early pgt is done ? */ 196 if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt)) ··· 249 memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); 250 *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; 251 } 252 - pmd = (physaddr & PMD_MASK) + early_pmd_flags; 253 pmd_p[pmd_index(address)] = pmd; 254 255 return 0; 256 } 257 258 /* Don't add a printk in there. 
printk relies on the PDA which is not initialized ··· 286 char * command_line; 287 unsigned long cmd_line_ptr; 288 289 memcpy(&boot_params, real_mode_data, sizeof boot_params); 290 sanitize_boot_params(&boot_params); 291 cmd_line_ptr = get_cmd_line_ptr(); ··· 299 command_line = __va(cmd_line_ptr); 300 memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); 301 } 302 } 303 304 asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) ··· 335 clear_bss(); 336 337 clear_page(init_top_pgt); 338 339 kasan_early_init(); 340
··· 14 #include <linux/start_kernel.h> 15 #include <linux/io.h> 16 #include <linux/memblock.h> 17 + #include <linux/mem_encrypt.h> 18 19 #include <asm/processor.h> 20 #include <asm/proto.h> ··· 33 /* 34 * Manage page tables very early on. 35 */ 36 extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; 37 static unsigned int __initdata next_early_pgt; 38 pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); ··· 45 return ptr - (void *)_text + (void *)physaddr; 46 } 47 48 + unsigned long __head __startup_64(unsigned long physaddr, 49 + struct boot_params *bp) 50 { 51 unsigned long load_delta, *p; 52 + unsigned long pgtable_flags; 53 pgdval_t *pgd; 54 p4dval_t *p4d; 55 pudval_t *pud; ··· 68 /* Is the address not 2M aligned? */ 69 if (load_delta & ~PMD_PAGE_MASK) 70 for (;;); 71 + 72 + /* Activate Secure Memory Encryption (SME) if supported and enabled */ 73 + sme_enable(bp); 74 + 75 + /* Include the SME encryption mask in the fixup value */ 76 + load_delta += sme_get_me_mask(); 77 78 /* Fixup the physical addresses in the page table */ 79 ··· 92 * creates a bunch of nonsense entries but that is fine -- 93 * it avoids problems around wraparound. 94 */ 95 + 96 next_pgt_ptr = fixup_pointer(&next_early_pgt, physaddr); 97 pud = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); 98 pmd = fixup_pointer(early_dynamic_pgts[(*next_pgt_ptr)++], physaddr); 99 + 100 + pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); 101 102 if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 103 p4d = fixup_pointer(early_dynamic_pgts[next_early_pgt++], physaddr); 104 105 i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; 106 + pgd[i + 0] = (pgdval_t)p4d + pgtable_flags; 107 + pgd[i + 1] = (pgdval_t)p4d + pgtable_flags; 108 109 i = (physaddr >> P4D_SHIFT) % PTRS_PER_P4D; 110 + p4d[i + 0] = (pgdval_t)pud + pgtable_flags; 111 + p4d[i + 1] = (pgdval_t)pud + pgtable_flags; 112 } else { 113 i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; 114 + pgd[i + 0] = (pgdval_t)pud + pgtable_flags; 115 + pgd[i + 1] = (pgdval_t)pud + pgtable_flags; 116 } 117 118 i = (physaddr >> PUD_SHIFT) % PTRS_PER_PUD; 119 + pud[i + 0] = (pudval_t)pmd + pgtable_flags; 120 + pud[i + 1] = (pudval_t)pmd + pgtable_flags; 121 122 pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; 123 + pmd_entry += sme_get_me_mask(); 124 pmd_entry += physaddr; 125 126 for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) { ··· 137 pmd[i] += load_delta; 138 } 139 140 + /* 141 + * Fixup phys_base - remove the memory encryption mask to obtain 142 + * the true physical address. 143 + */ 144 p = fixup_pointer(&phys_base, physaddr); 145 + *p += load_delta - sme_get_me_mask(); 146 + 147 + /* Encrypt the kernel (if SME is active) */ 148 + sme_encrypt_kernel(); 149 + 150 + /* 151 + * Return the SME encryption mask (if SME is active) to be used as a 152 + * modifier for the initial pgdir entry programmed into CR3. 153 + */ 154 + return sme_get_me_mask(); 155 + } 156 + 157 + unsigned long __startup_secondary_64(void) 158 + { 159 + /* 160 + * Return the SME encryption mask (if SME is active) to be used as a 161 + * modifier for the initial pgdir entry programmed into CR3. 
162 + */ 163 + return sme_get_me_mask(); 164 } 165 166 /* Wipe all early page tables except for the kernel symbol map */ ··· 147 { 148 memset(early_top_pgt, 0, sizeof(pgd_t)*(PTRS_PER_PGD-1)); 149 next_early_pgt = 0; 150 + write_cr3(__sme_pa_nodebug(early_top_pgt)); 151 } 152 153 /* Create a new PMD entry */ 154 + int __init __early_make_pgtable(unsigned long address, pmdval_t pmd) 155 { 156 unsigned long physaddr = address - __PAGE_OFFSET; 157 pgdval_t pgd, *pgd_p; 158 p4dval_t p4d, *p4d_p; 159 pudval_t pud, *pud_p; 160 + pmdval_t *pmd_p; 161 162 /* Invalid address or early pgt is done ? */ 163 if (physaddr >= MAXMEM || read_cr3_pa() != __pa_nodebug(early_top_pgt)) ··· 216 memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); 217 *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + phys_base + _KERNPG_TABLE; 218 } 219 pmd_p[pmd_index(address)] = pmd; 220 221 return 0; 222 + } 223 + 224 + int __init early_make_pgtable(unsigned long address) 225 + { 226 + unsigned long physaddr = address - __PAGE_OFFSET; 227 + pmdval_t pmd; 228 + 229 + pmd = (physaddr & PMD_MASK) + early_pmd_flags; 230 + 231 + return __early_make_pgtable(address, pmd); 232 } 233 234 /* Don't add a printk in there. printk relies on the PDA which is not initialized ··· 244 char * command_line; 245 unsigned long cmd_line_ptr; 246 247 + /* 248 + * If SME is active, this will create decrypted mappings of the 249 + * boot data in advance of the copy operations. 250 + */ 251 + sme_map_bootdata(real_mode_data); 252 + 253 memcpy(&boot_params, real_mode_data, sizeof boot_params); 254 sanitize_boot_params(&boot_params); 255 cmd_line_ptr = get_cmd_line_ptr(); ··· 251 command_line = __va(cmd_line_ptr); 252 memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); 253 } 254 + 255 + /* 256 + * The old boot data is no longer needed and won't be reserved, 257 + * freeing up that memory for use by the system. If SME is active, 258 + * we need to remove the mappings that were created so that the 259 + * memory doesn't remain mapped as decrypted. 260 + */ 261 + sme_unmap_bootdata(real_mode_data); 262 } 263 264 asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) ··· 279 clear_bss(); 280 281 clear_page(init_top_pgt); 282 + 283 + /* 284 + * SME support may update early_pmd_flags to include the memory 285 + * encryption mask, so it needs to be called before anything 286 + * that may generate a page fault. 287 + */ 288 + sme_early_init(); 289 290 kasan_early_init(); 291
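A pattern that repeats through the SME boot-path changes above is "physical address plus encryption mask" when building page-table entries, and stripping the mask again wherever a true physical address is needed (as the phys_base fixup does). A toy model of that encoding, with an invented mask position:

/* Toy model of the SME encryption-mask handling in the early boot
 * page tables: entries carry (phys | sme_me_mask), and the mask is
 * removed again wherever a raw physical address is required.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t sme_me_mask = 1ULL << 47;   /* example C-bit position */

static uint64_t sme_set(uint64_t pa)  { return pa |  sme_me_mask; }
static uint64_t sme_clr(uint64_t val) { return val & ~sme_me_mask; }

int main(void)
{
    uint64_t pud_phys = 0x0000000002a00000ULL;  /* made-up table page   */
    uint64_t flags    = 0x0000000000000063ULL;  /* PRESENT|RW|A|D style */

    /* what goes into the parent table entry when SME is active */
    uint64_t entry = sme_set(pud_phys) + flags;

    printf("entry        : 0x%016llx\n", (unsigned long long)entry);
    printf("true physaddr: 0x%016llx\n",
           (unsigned long long)sme_clr(entry & ~0xfffULL));
    return 0;
}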
+28 -12
arch/x86/kernel/head_64.S
··· 73 /* Sanitize CPU configuration */ 74 call verify_cpu 75 76 leaq _text(%rip), %rdi 77 pushq %rsi 78 call __startup_64 79 popq %rsi 80 81 - movq $(early_top_pgt - __START_KERNEL_map), %rax 82 jmp 1f 83 ENTRY(secondary_startup_64) 84 /* ··· 105 /* Sanitize CPU configuration */ 106 call verify_cpu 107 108 - movq $(init_top_pgt - __START_KERNEL_map), %rax 109 1: 110 111 /* Enable PAE mode, PGE and LA57 */ ··· 351 NEXT_PAGE(early_top_pgt) 352 .fill 511,8,0 353 #ifdef CONFIG_X86_5LEVEL 354 - .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE 355 #else 356 - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE 357 #endif 358 359 NEXT_PAGE(early_dynamic_pgts) ··· 366 .fill 512,8,0 367 #else 368 NEXT_PAGE(init_top_pgt) 369 - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 370 .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 371 - .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 372 .org init_top_pgt + PGD_START_KERNEL*8, 0 373 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ 374 - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE 375 376 NEXT_PAGE(level3_ident_pgt) 377 - .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE 378 .fill 511, 8, 0 379 NEXT_PAGE(level2_ident_pgt) 380 /* Since I easily can, map the first 1G. ··· 386 #ifdef CONFIG_X86_5LEVEL 387 NEXT_PAGE(level4_kernel_pgt) 388 .fill 511,8,0 389 - .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE 390 #endif 391 392 NEXT_PAGE(level3_kernel_pgt) 393 .fill L3_START_KERNEL,8,0 394 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ 395 - .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE 396 - .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE 397 398 NEXT_PAGE(level2_kernel_pgt) 399 /* ··· 411 412 NEXT_PAGE(level2_fixmap_pgt) 413 .fill 506,8,0 414 - .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE 415 /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ 416 .fill 5,8,0 417
··· 73 /* Sanitize CPU configuration */ 74 call verify_cpu 75 76 + /* 77 + * Perform pagetable fixups. Additionally, if SME is active, encrypt 78 + * the kernel and retrieve the modifier (SME encryption mask if SME 79 + * is active) to be added to the initial pgdir entry that will be 80 + * programmed into CR3. 81 + */ 82 leaq _text(%rip), %rdi 83 pushq %rsi 84 call __startup_64 85 popq %rsi 86 87 + /* Form the CR3 value being sure to include the CR3 modifier */ 88 + addq $(early_top_pgt - __START_KERNEL_map), %rax 89 jmp 1f 90 ENTRY(secondary_startup_64) 91 /* ··· 98 /* Sanitize CPU configuration */ 99 call verify_cpu 100 101 + /* 102 + * Retrieve the modifier (SME encryption mask if SME is active) to be 103 + * added to the initial pgdir entry that will be programmed into CR3. 104 + */ 105 + pushq %rsi 106 + call __startup_secondary_64 107 + popq %rsi 108 + 109 + /* Form the CR3 value being sure to include the CR3 modifier */ 110 + addq $(init_top_pgt - __START_KERNEL_map), %rax 111 1: 112 113 /* Enable PAE mode, PGE and LA57 */ ··· 335 NEXT_PAGE(early_top_pgt) 336 .fill 511,8,0 337 #ifdef CONFIG_X86_5LEVEL 338 + .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC 339 #else 340 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC 341 #endif 342 343 NEXT_PAGE(early_dynamic_pgts) ··· 350 .fill 512,8,0 351 #else 352 NEXT_PAGE(init_top_pgt) 353 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC 354 .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 355 + .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC 356 .org init_top_pgt + PGD_START_KERNEL*8, 0 357 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ 358 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC 359 360 NEXT_PAGE(level3_ident_pgt) 361 + .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC 362 .fill 511, 8, 0 363 NEXT_PAGE(level2_ident_pgt) 364 /* Since I easily can, map the first 1G. ··· 370 #ifdef CONFIG_X86_5LEVEL 371 NEXT_PAGE(level4_kernel_pgt) 372 .fill 511,8,0 373 + .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC 374 #endif 375 376 NEXT_PAGE(level3_kernel_pgt) 377 .fill L3_START_KERNEL,8,0 378 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ 379 + .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC 380 + .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC 381 382 NEXT_PAGE(level2_kernel_pgt) 383 /* ··· 395 396 NEXT_PAGE(level2_fixmap_pgt) 397 .fill 506,8,0 398 + .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC 399 /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ 400 .fill 5,8,0 401
+11 -23
arch/x86/kernel/kdebugfs.c
··· 33 struct setup_data_node *node = file->private_data; 34 unsigned long remain; 35 loff_t pos = *ppos; 36 - struct page *pg; 37 void *p; 38 u64 pa; 39 ··· 46 count = node->len - pos; 47 48 pa = node->paddr + sizeof(struct setup_data) + pos; 49 - pg = pfn_to_page((pa + count - 1) >> PAGE_SHIFT); 50 - if (PageHighMem(pg)) { 51 - p = ioremap_cache(pa, count); 52 - if (!p) 53 - return -ENXIO; 54 - } else 55 - p = __va(pa); 56 57 remain = copy_to_user(user_buf, p, count); 58 59 - if (PageHighMem(pg)) 60 - iounmap(p); 61 62 if (remain) 63 return -EFAULT; ··· 103 struct setup_data *data; 104 int error; 105 struct dentry *d; 106 - struct page *pg; 107 u64 pa_data; 108 int no = 0; 109 ··· 119 goto err_dir; 120 } 121 122 - pg = pfn_to_page((pa_data+sizeof(*data)-1) >> PAGE_SHIFT); 123 - if (PageHighMem(pg)) { 124 - data = ioremap_cache(pa_data, sizeof(*data)); 125 - if (!data) { 126 - kfree(node); 127 - error = -ENXIO; 128 - goto err_dir; 129 - } 130 - } else 131 - data = __va(pa_data); 132 133 node->paddr = pa_data; 134 node->type = data->type; ··· 132 error = create_setup_data_node(d, no, node); 133 pa_data = data->next; 134 135 - if (PageHighMem(pg)) 136 - iounmap(data); 137 if (error) 138 goto err_dir; 139 no++;
··· 33 struct setup_data_node *node = file->private_data; 34 unsigned long remain; 35 loff_t pos = *ppos; 36 void *p; 37 u64 pa; 38 ··· 47 count = node->len - pos; 48 49 pa = node->paddr + sizeof(struct setup_data) + pos; 50 + p = memremap(pa, count, MEMREMAP_WB); 51 + if (!p) 52 + return -ENOMEM; 53 54 remain = copy_to_user(user_buf, p, count); 55 56 + memunmap(p); 57 58 if (remain) 59 return -EFAULT; ··· 109 struct setup_data *data; 110 int error; 111 struct dentry *d; 112 u64 pa_data; 113 int no = 0; 114 ··· 126 goto err_dir; 127 } 128 129 + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); 130 + if (!data) { 131 + kfree(node); 132 + error = -ENOMEM; 133 + goto err_dir; 134 + } 135 136 node->paddr = pa_data; 137 node->type = data->type; ··· 143 error = create_setup_data_node(d, no, node); 144 pa_data = data->next; 145 146 + memunmap(data); 147 if (error) 148 goto err_dir; 149 no++;
+14 -14
arch/x86/kernel/ksysfs.c
··· 16 #include <linux/stat.h> 17 #include <linux/slab.h> 18 #include <linux/mm.h> 19 20 - #include <asm/io.h> 21 #include <asm/setup.h> 22 23 static ssize_t version_show(struct kobject *kobj, ··· 79 *paddr = pa_data; 80 return 0; 81 } 82 - data = ioremap_cache(pa_data, sizeof(*data)); 83 if (!data) 84 return -ENOMEM; 85 86 pa_data = data->next; 87 - iounmap(data); 88 i++; 89 } 90 return -EINVAL; ··· 97 u64 pa_data = boot_params.hdr.setup_data; 98 99 while (pa_data) { 100 - data = ioremap_cache(pa_data, sizeof(*data)); 101 if (!data) 102 return -ENOMEM; 103 if (nr == i) { 104 *size = data->len; 105 - iounmap(data); 106 return 0; 107 } 108 109 pa_data = data->next; 110 - iounmap(data); 111 i++; 112 } 113 return -EINVAL; ··· 127 ret = get_setup_data_paddr(nr, &paddr); 128 if (ret) 129 return ret; 130 - data = ioremap_cache(paddr, sizeof(*data)); 131 if (!data) 132 return -ENOMEM; 133 134 ret = sprintf(buf, "0x%x\n", data->type); 135 - iounmap(data); 136 return ret; 137 } 138 ··· 154 ret = get_setup_data_paddr(nr, &paddr); 155 if (ret) 156 return ret; 157 - data = ioremap_cache(paddr, sizeof(*data)); 158 if (!data) 159 return -ENOMEM; 160 ··· 170 goto out; 171 172 ret = count; 173 - p = ioremap_cache(paddr + sizeof(*data), data->len); 174 if (!p) { 175 ret = -ENOMEM; 176 goto out; 177 } 178 memcpy(buf, p + off, count); 179 - iounmap(p); 180 out: 181 - iounmap(data); 182 return ret; 183 } 184 ··· 250 *nr = 0; 251 while (pa_data) { 252 *nr += 1; 253 - data = ioremap_cache(pa_data, sizeof(*data)); 254 if (!data) { 255 ret = -ENOMEM; 256 goto out; 257 } 258 pa_data = data->next; 259 - iounmap(data); 260 } 261 262 out:
··· 16 #include <linux/stat.h> 17 #include <linux/slab.h> 18 #include <linux/mm.h> 19 + #include <linux/io.h> 20 21 #include <asm/setup.h> 22 23 static ssize_t version_show(struct kobject *kobj, ··· 79 *paddr = pa_data; 80 return 0; 81 } 82 + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); 83 if (!data) 84 return -ENOMEM; 85 86 pa_data = data->next; 87 + memunmap(data); 88 i++; 89 } 90 return -EINVAL; ··· 97 u64 pa_data = boot_params.hdr.setup_data; 98 99 while (pa_data) { 100 + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); 101 if (!data) 102 return -ENOMEM; 103 if (nr == i) { 104 *size = data->len; 105 + memunmap(data); 106 return 0; 107 } 108 109 pa_data = data->next; 110 + memunmap(data); 111 i++; 112 } 113 return -EINVAL; ··· 127 ret = get_setup_data_paddr(nr, &paddr); 128 if (ret) 129 return ret; 130 + data = memremap(paddr, sizeof(*data), MEMREMAP_WB); 131 if (!data) 132 return -ENOMEM; 133 134 ret = sprintf(buf, "0x%x\n", data->type); 135 + memunmap(data); 136 return ret; 137 } 138 ··· 154 ret = get_setup_data_paddr(nr, &paddr); 155 if (ret) 156 return ret; 157 + data = memremap(paddr, sizeof(*data), MEMREMAP_WB); 158 if (!data) 159 return -ENOMEM; 160 ··· 170 goto out; 171 172 ret = count; 173 + p = memremap(paddr + sizeof(*data), data->len, MEMREMAP_WB); 174 if (!p) { 175 ret = -ENOMEM; 176 goto out; 177 } 178 memcpy(buf, p + off, count); 179 + memunmap(p); 180 out: 181 + memunmap(data); 182 return ret; 183 } 184 ··· 250 *nr = 0; 251 while (pa_data) { 252 *nr += 1; 253 + data = memremap(pa_data, sizeof(*data), MEMREMAP_WB); 254 if (!data) { 255 ret = -ENOMEM; 256 goto out; 257 } 258 pa_data = data->next; 259 + memunmap(data); 260 } 261 262 out:
+23 -2
arch/x86/kernel/machine_kexec_64.c
··· 87 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); 88 } 89 pte = pte_offset_kernel(pmd, vaddr); 90 - set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); 91 return 0; 92 err: 93 free_transition_pgtable(image); ··· 115 .alloc_pgt_page = alloc_pgt_page, 116 .context = image, 117 .page_flag = __PAGE_KERNEL_LARGE_EXEC, 118 }; 119 unsigned long mstart, mend; 120 pgd_t *level4p; ··· 335 image->start = relocate_kernel((unsigned long)image->head, 336 (unsigned long)page_list, 337 image->start, 338 - image->preserve_context); 339 340 #ifdef CONFIG_KEXEC_JUMP 341 if (image->preserve_context) ··· 603 void arch_kexec_unprotect_crashkres(void) 604 { 605 kexec_mark_crashkres(false); 606 }
··· 87 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE)); 88 } 89 pte = pte_offset_kernel(pmd, vaddr); 90 + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC_NOENC)); 91 return 0; 92 err: 93 free_transition_pgtable(image); ··· 115 .alloc_pgt_page = alloc_pgt_page, 116 .context = image, 117 .page_flag = __PAGE_KERNEL_LARGE_EXEC, 118 + .kernpg_flag = _KERNPG_TABLE_NOENC, 119 }; 120 unsigned long mstart, mend; 121 pgd_t *level4p; ··· 334 image->start = relocate_kernel((unsigned long)image->head, 335 (unsigned long)page_list, 336 image->start, 337 + image->preserve_context, 338 + sme_active()); 339 340 #ifdef CONFIG_KEXEC_JUMP 341 if (image->preserve_context) ··· 601 void arch_kexec_unprotect_crashkres(void) 602 { 603 kexec_mark_crashkres(false); 604 + } 605 + 606 + int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) 607 + { 608 + /* 609 + * If SME is active we need to be sure that kexec pages are 610 + * not encrypted because when we boot to the new kernel the 611 + * pages won't be accessed encrypted (initially). 612 + */ 613 + return set_memory_decrypted((unsigned long)vaddr, pages); 614 + } 615 + 616 + void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) 617 + { 618 + /* 619 + * If SME is active we need to reset the pages back to being 620 + * an encrypted mapping before freeing them. 621 + */ 622 + set_memory_encrypted((unsigned long)vaddr, pages); 623 }
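The two new kexec hooks above bracket the lifetime of control pages: decrypt right after allocation so the next kernel can read them before it sets up SME mappings, and re-encrypt just before the pages go back to the allocator. A minimal sketch of that pairing discipline; every function here is a stand-in, not a kernel API:

/* Sketch of the allocate/decrypt ... encrypt/free pairing used for
 * kexec control pages under SME. All functions below are stand-ins.
 */
#include <stdio.h>
#include <stdlib.h>

static int set_decrypted(void *p, unsigned int pages)
{
    printf("map %u page(s) at %p decrypted\n", pages, p);
    return 0;
}

static void set_encrypted(void *p, unsigned int pages)
{
    printf("map %u page(s) at %p encrypted again\n", pages, p);
}

static void *alloc_control_pages(unsigned int pages)
{
    void *p = calloc(pages, 4096);

    if (p && set_decrypted(p, pages)) {   /* post-alloc hook */
        free(p);
        return NULL;
    }
    return p;
}

static void free_control_pages(void *p, unsigned int pages)
{
    set_encrypted(p, pages);              /* pre-free hook */
    free(p);
}

int main(void)
{
    void *ctl = alloc_control_pages(1);

    if (ctl)
        free_control_pages(ctl, 1);
    return 0;
}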
+75 -33
arch/x86/kernel/mpparse.c
··· 429 } 430 } 431 432 - static struct mpf_intel *mpf_found; 433 434 static unsigned long __init get_mpc_size(unsigned long physptr) 435 { 436 struct mpc_table *mpc; 437 unsigned long size; 438 439 - mpc = early_ioremap(physptr, PAGE_SIZE); 440 size = mpc->length; 441 - early_iounmap(mpc, PAGE_SIZE); 442 apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); 443 444 return size; ··· 450 unsigned long size; 451 452 size = get_mpc_size(mpf->physptr); 453 - mpc = early_ioremap(mpf->physptr, size); 454 /* 455 * Read the physical hardware table. Anything here will 456 * override the defaults. ··· 462 #endif 463 pr_err("BIOS bug, MP table errors detected!...\n"); 464 pr_cont("... disabling SMP support. (tell your hw vendor)\n"); 465 - early_iounmap(mpc, size); 466 return -1; 467 } 468 - early_iounmap(mpc, size); 469 470 if (early) 471 return -1; ··· 498 */ 499 void __init default_get_smp_config(unsigned int early) 500 { 501 - struct mpf_intel *mpf = mpf_found; 502 503 if (!smp_found_config) 504 return; 505 506 - if (!mpf) 507 return; 508 509 if (acpi_lapic && early) ··· 515 */ 516 if (acpi_lapic && acpi_ioapic) 517 return; 518 519 pr_info("Intel MultiProcessor Specification v1.%d\n", 520 mpf->specification); ··· 536 /* 537 * Now see if we need to read further. 538 */ 539 - if (mpf->feature1 != 0) { 540 if (early) { 541 /* 542 * local APIC has default address ··· 549 construct_default_ISA_mptable(mpf->feature1); 550 551 } else if (mpf->physptr) { 552 - if (check_physptr(mpf, early)) 553 return; 554 } else 555 BUG(); 556 ··· 561 /* 562 * Only use the first configuration found. 563 */ 564 } 565 566 static void __init smp_reserve_memory(struct mpf_intel *mpf) ··· 572 573 static int __init smp_scan_config(unsigned long base, unsigned long length) 574 { 575 - unsigned int *bp = phys_to_virt(base); 576 struct mpf_intel *mpf; 577 - unsigned long mem; 578 579 apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n", 580 base, base + length - 1); 581 BUILD_BUG_ON(sizeof(*mpf) != 16); 582 583 while (length > 0) { 584 mpf = (struct mpf_intel *)bp; 585 if ((*bp == SMP_MAGIC_IDENT) && 586 (mpf->length == 1) && ··· 591 #ifdef CONFIG_X86_LOCAL_APIC 592 smp_found_config = 1; 593 #endif 594 - mpf_found = mpf; 595 596 - pr_info("found SMP MP-table at [mem %#010llx-%#010llx] mapped at [%p]\n", 597 - (unsigned long long) virt_to_phys(mpf), 598 - (unsigned long long) virt_to_phys(mpf) + 599 - sizeof(*mpf) - 1, mpf); 600 601 - mem = virt_to_phys(mpf); 602 - memblock_reserve(mem, sizeof(*mpf)); 603 if (mpf->physptr) 604 smp_reserve_memory(mpf); 605 606 - return 1; 607 } 608 - bp += 4; 609 length -= 16; 610 } 611 - return 0; 612 } 613 614 void __init default_find_smp_config(void) ··· 852 char oem[10]; 853 struct mpf_intel *mpf; 854 struct mpc_table *mpc, *mpc_new; 855 856 if (!enable_update_mptable) 857 return 0; 858 859 - mpf = mpf_found; 860 - if (!mpf) 861 return 0; 862 863 /* 864 * Now see if we need to go further. 
865 */ 866 - if (mpf->feature1 != 0) 867 - return 0; 868 869 if (!mpf->physptr) 870 - return 0; 871 872 - mpc = phys_to_virt(mpf->physptr); 873 874 if (!smp_check_mpc(mpc, oem, str)) 875 - return 0; 876 877 - pr_info("mpf: %llx\n", (u64)virt_to_phys(mpf)); 878 pr_info("physptr: %x\n", mpf->physptr); 879 880 if (mpc_new_phys && mpc->length > mpc_new_length) { ··· 903 new = mpf_checksum((unsigned char *)mpc, mpc->length); 904 if (old == new) { 905 pr_info("mpc is readonly, please try alloc_mptable instead\n"); 906 - return 0; 907 } 908 pr_info("use in-position replacing\n"); 909 } else { 910 mpf->physptr = mpc_new_phys; 911 - mpc_new = phys_to_virt(mpc_new_phys); 912 memcpy(mpc_new, mpc, mpc->length); 913 mpc = mpc_new; 914 /* check if we can modify that */ 915 if (mpc_new_phys - mpf->physptr) { 916 struct mpf_intel *mpf_new; 917 /* steal 16 bytes from [0, 1k) */ 918 pr_info("mpf new: %x\n", 0x400 - 16); 919 - mpf_new = phys_to_virt(0x400 - 16); 920 memcpy(mpf_new, mpf, 16); 921 mpf = mpf_new; 922 mpf->physptr = mpc_new_phys; 923 } ··· 944 * may need pci=routeirq for all coverage 945 */ 946 replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length); 947 948 return 0; 949 }
··· 429 } 430 } 431 432 + static unsigned long mpf_base; 433 434 static unsigned long __init get_mpc_size(unsigned long physptr) 435 { 436 struct mpc_table *mpc; 437 unsigned long size; 438 439 + mpc = early_memremap(physptr, PAGE_SIZE); 440 size = mpc->length; 441 + early_memunmap(mpc, PAGE_SIZE); 442 apic_printk(APIC_VERBOSE, " mpc: %lx-%lx\n", physptr, physptr + size); 443 444 return size; ··· 450 unsigned long size; 451 452 size = get_mpc_size(mpf->physptr); 453 + mpc = early_memremap(mpf->physptr, size); 454 + 455 /* 456 * Read the physical hardware table. Anything here will 457 * override the defaults. ··· 461 #endif 462 pr_err("BIOS bug, MP table errors detected!...\n"); 463 pr_cont("... disabling SMP support. (tell your hw vendor)\n"); 464 + early_memunmap(mpc, size); 465 return -1; 466 } 467 + early_memunmap(mpc, size); 468 469 if (early) 470 return -1; ··· 497 */ 498 void __init default_get_smp_config(unsigned int early) 499 { 500 + struct mpf_intel *mpf; 501 502 if (!smp_found_config) 503 return; 504 505 + if (!mpf_base) 506 return; 507 508 if (acpi_lapic && early) ··· 514 */ 515 if (acpi_lapic && acpi_ioapic) 516 return; 517 + 518 + mpf = early_memremap(mpf_base, sizeof(*mpf)); 519 + if (!mpf) { 520 + pr_err("MPTABLE: error mapping MP table\n"); 521 + return; 522 + } 523 524 pr_info("Intel MultiProcessor Specification v1.%d\n", 525 mpf->specification); ··· 529 /* 530 * Now see if we need to read further. 531 */ 532 + if (mpf->feature1) { 533 if (early) { 534 /* 535 * local APIC has default address ··· 542 construct_default_ISA_mptable(mpf->feature1); 543 544 } else if (mpf->physptr) { 545 + if (check_physptr(mpf, early)) { 546 + early_memunmap(mpf, sizeof(*mpf)); 547 return; 548 + } 549 } else 550 BUG(); 551 ··· 552 /* 553 * Only use the first configuration found. 554 */ 555 + 556 + early_memunmap(mpf, sizeof(*mpf)); 557 } 558 559 static void __init smp_reserve_memory(struct mpf_intel *mpf) ··· 561 562 static int __init smp_scan_config(unsigned long base, unsigned long length) 563 { 564 + unsigned int *bp; 565 struct mpf_intel *mpf; 566 + int ret = 0; 567 568 apic_printk(APIC_VERBOSE, "Scan for SMP in [mem %#010lx-%#010lx]\n", 569 base, base + length - 1); 570 BUILD_BUG_ON(sizeof(*mpf) != 16); 571 572 while (length > 0) { 573 + bp = early_memremap(base, length); 574 mpf = (struct mpf_intel *)bp; 575 if ((*bp == SMP_MAGIC_IDENT) && 576 (mpf->length == 1) && ··· 579 #ifdef CONFIG_X86_LOCAL_APIC 580 smp_found_config = 1; 581 #endif 582 + mpf_base = base; 583 584 + pr_info("found SMP MP-table at [mem %#010lx-%#010lx] mapped at [%p]\n", 585 + base, base + sizeof(*mpf) - 1, mpf); 586 587 + memblock_reserve(base, sizeof(*mpf)); 588 if (mpf->physptr) 589 smp_reserve_memory(mpf); 590 591 + ret = 1; 592 } 593 + early_memunmap(bp, length); 594 + 595 + if (ret) 596 + break; 597 + 598 + base += 16; 599 length -= 16; 600 } 601 + return ret; 602 } 603 604 void __init default_find_smp_config(void) ··· 838 char oem[10]; 839 struct mpf_intel *mpf; 840 struct mpc_table *mpc, *mpc_new; 841 + unsigned long size; 842 843 if (!enable_update_mptable) 844 return 0; 845 846 + if (!mpf_base) 847 return 0; 848 + 849 + mpf = early_memremap(mpf_base, sizeof(*mpf)); 850 + if (!mpf) { 851 + pr_err("MPTABLE: mpf early_memremap() failed\n"); 852 + return 0; 853 + } 854 855 /* 856 * Now see if we need to go further. 
857 */ 858 + if (mpf->feature1) 859 + goto do_unmap_mpf; 860 861 if (!mpf->physptr) 862 + goto do_unmap_mpf; 863 864 + size = get_mpc_size(mpf->physptr); 865 + mpc = early_memremap(mpf->physptr, size); 866 + if (!mpc) { 867 + pr_err("MPTABLE: mpc early_memremap() failed\n"); 868 + goto do_unmap_mpf; 869 + } 870 871 if (!smp_check_mpc(mpc, oem, str)) 872 + goto do_unmap_mpc; 873 874 + pr_info("mpf: %llx\n", (u64)mpf_base); 875 pr_info("physptr: %x\n", mpf->physptr); 876 877 if (mpc_new_phys && mpc->length > mpc_new_length) { ··· 878 new = mpf_checksum((unsigned char *)mpc, mpc->length); 879 if (old == new) { 880 pr_info("mpc is readonly, please try alloc_mptable instead\n"); 881 + goto do_unmap_mpc; 882 } 883 pr_info("use in-position replacing\n"); 884 } else { 885 + mpc_new = early_memremap(mpc_new_phys, mpc_new_length); 886 + if (!mpc_new) { 887 + pr_err("MPTABLE: new mpc early_memremap() failed\n"); 888 + goto do_unmap_mpc; 889 + } 890 mpf->physptr = mpc_new_phys; 891 memcpy(mpc_new, mpc, mpc->length); 892 + early_memunmap(mpc, size); 893 mpc = mpc_new; 894 + size = mpc_new_length; 895 /* check if we can modify that */ 896 if (mpc_new_phys - mpf->physptr) { 897 struct mpf_intel *mpf_new; 898 /* steal 16 bytes from [0, 1k) */ 899 + mpf_new = early_memremap(0x400 - 16, sizeof(*mpf_new)); 900 + if (!mpf_new) { 901 + pr_err("MPTABLE: new mpf early_memremap() failed\n"); 902 + goto do_unmap_mpc; 903 + } 904 pr_info("mpf new: %x\n", 0x400 - 16); 905 memcpy(mpf_new, mpf, 16); 906 + early_memunmap(mpf, sizeof(*mpf)); 907 mpf = mpf_new; 908 mpf->physptr = mpc_new_phys; 909 } ··· 908 * may need pci=routeirq for all coverage 909 */ 910 replace_intsrc_all(mpc, mpc_new_phys, mpc_new_length); 911 + 912 + do_unmap_mpc: 913 + early_memunmap(mpc, size); 914 + 915 + do_unmap_mpf: 916 + early_memunmap(mpf, sizeof(*mpf)); 917 918 return 0; 919 }
+7 -4
arch/x86/kernel/pci-dma.c
··· 93 if (gfpflags_allow_blocking(flag)) { 94 page = dma_alloc_from_contiguous(dev, count, get_order(size), 95 flag); 96 - if (page && page_to_phys(page) + size > dma_mask) { 97 - dma_release_from_contiguous(dev, page, count); 98 - page = NULL; 99 } 100 } 101 /* fallback */ ··· 107 if (!page) 108 return NULL; 109 110 - addr = page_to_phys(page); 111 if (addr + size > dma_mask) { 112 __free_pages(page, get_order(size)); 113
··· 93 if (gfpflags_allow_blocking(flag)) { 94 page = dma_alloc_from_contiguous(dev, count, get_order(size), 95 flag); 96 + if (page) { 97 + addr = phys_to_dma(dev, page_to_phys(page)); 98 + if (addr + size > dma_mask) { 99 + dma_release_from_contiguous(dev, page, count); 100 + page = NULL; 101 + } 102 } 103 } 104 /* fallback */ ··· 104 if (!page) 105 return NULL; 106 107 + addr = phys_to_dma(dev, page_to_phys(page)); 108 if (addr + size > dma_mask) { 109 __free_pages(page, get_order(size)); 110
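With SME active the DMA address handed to a device includes the encryption mask, so the dma_mask comparison above is what steers devices with a narrow addressing range toward bounce buffers. A toy model of that decision; the mask position and device masks are invented:

/* Toy model: adding the SME mask to a physical address can push it
 * beyond a device's DMA mask, which triggers the fallback path.
 */
#include <stdint.h>
#include <stdio.h>

static const uint64_t sme_me_mask = 1ULL << 47;   /* example C-bit */

static uint64_t phys_to_dma_model(uint64_t paddr)
{
    return paddr | sme_me_mask;
}

static void try_device(const char *name, uint64_t dma_mask,
                       uint64_t paddr, uint64_t size)
{
    uint64_t addr = phys_to_dma_model(paddr);

    if (addr + size > dma_mask)
        printf("%s: 0x%llx out of reach -> use bounce buffer\n",
               name, (unsigned long long)addr);
    else
        printf("%s: 0x%llx directly addressable\n",
               name, (unsigned long long)addr);
}

int main(void)
{
    uint64_t buf = 0x0000000080000000ULL;   /* made-up 2 GB buffer */

    try_device("32-bit device", (1ULL << 32) - 1, buf, 4096);
    try_device("64-bit device", ~0ULL,            buf, 4096);
    return 0;
}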
+1 -1
arch/x86/kernel/pci-nommu.c
··· 32 enum dma_data_direction dir, 33 unsigned long attrs) 34 { 35 - dma_addr_t bus = page_to_phys(page) + offset; 36 WARN_ON(size == 0); 37 if (!check_addr("map_single", dev, bus, size)) 38 return NOMMU_MAPPING_ERROR;
··· 32 enum dma_data_direction dir, 33 unsigned long attrs) 34 { 35 + dma_addr_t bus = phys_to_dma(dev, page_to_phys(page)) + offset; 36 WARN_ON(size == 0); 37 if (!check_addr("map_single", dev, bus, size)) 38 return NOMMU_MAPPING_ERROR;
+13 -2
arch/x86/kernel/pci-swiotlb.c
··· 6 #include <linux/swiotlb.h> 7 #include <linux/bootmem.h> 8 #include <linux/dma-mapping.h> 9 10 #include <asm/iommu.h> 11 #include <asm/swiotlb.h> 12 #include <asm/dma.h> 13 #include <asm/xen/swiotlb-xen.h> 14 #include <asm/iommu_table.h> 15 int swiotlb __read_mostly; 16 17 void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, ··· 81 pci_swiotlb_late_init); 82 83 /* 84 - * if 4GB or more detected (and iommu=off not set) return 1 85 - * and set swiotlb to 1. 86 */ 87 int __init pci_swiotlb_detect_4gb(void) 88 { ··· 91 if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN) 92 swiotlb = 1; 93 #endif 94 return swiotlb; 95 } 96 IOMMU_INIT(pci_swiotlb_detect_4gb,
··· 6 #include <linux/swiotlb.h> 7 #include <linux/bootmem.h> 8 #include <linux/dma-mapping.h> 9 + #include <linux/mem_encrypt.h> 10 11 #include <asm/iommu.h> 12 #include <asm/swiotlb.h> 13 #include <asm/dma.h> 14 #include <asm/xen/swiotlb-xen.h> 15 #include <asm/iommu_table.h> 16 + 17 int swiotlb __read_mostly; 18 19 void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, ··· 79 pci_swiotlb_late_init); 80 81 /* 82 + * If 4GB or more detected (and iommu=off not set) or if SME is active 83 + * then set swiotlb to 1 and return 1. 84 */ 85 int __init pci_swiotlb_detect_4gb(void) 86 { ··· 89 if (!no_iommu && max_possible_pfn > MAX_DMA32_PFN) 90 swiotlb = 1; 91 #endif 92 + 93 + /* 94 + * If SME is active then swiotlb will be set to 1 so that bounce 95 + * buffers are allocated and used for devices that do not support 96 + * the addressing range required for the encryption mask. 97 + */ 98 + if (sme_active()) 99 + swiotlb = 1; 100 + 101 return swiotlb; 102 } 103 IOMMU_INIT(pci_swiotlb_detect_4gb,
+15 -2
arch/x86/kernel/process.c
··· 355 return ret; 356 } 357 #endif 358 void stop_this_cpu(void *dummy) 359 { 360 local_irq_disable(); ··· 366 disable_local_APIC(); 367 mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); 368 369 - for (;;) 370 - halt(); 371 } 372 373 /*
··· 355 return ret; 356 } 357 #endif 358 + 359 void stop_this_cpu(void *dummy) 360 { 361 local_irq_disable(); ··· 365 disable_local_APIC(); 366 mcheck_cpu_clear(this_cpu_ptr(&cpu_info)); 367 368 + for (;;) { 369 + /* 370 + * Use wbinvd followed by hlt to stop the processor. This 371 + * provides support for kexec on a processor that supports 372 + * SME. With kexec, going from SME inactive to SME active 373 + * requires clearing cache entries so that addresses without 374 + * the encryption bit set don't corrupt the same physical 375 + * address that has the encryption bit set when caches are 376 + * flushed. To achieve this a wbinvd is performed followed by 377 + * a hlt. Even if the processor is not in the kexec/SME 378 + * scenario this only adds a wbinvd to a halting processor. 379 + */ 380 + asm volatile("wbinvd; hlt" : : : "memory"); 381 + } 382 } 383 384 /*
+14
arch/x86/kernel/relocate_kernel_64.S
··· 47 * %rsi page_list 48 * %rdx start address 49 * %rcx preserve_context 50 */ 51 52 /* Save the CPU context, used for jumping back */ ··· 71 /* zero out flags, and disable interrupts */ 72 pushq $0 73 popfq 74 75 /* 76 * get physical address of control page now ··· 135 136 /* Flush the TLB (needed?) */ 137 movq %r9, %cr3 138 139 movq %rcx, %r11 140 call swap_pages
··· 47 * %rsi page_list 48 * %rdx start address 49 * %rcx preserve_context 50 + * %r8 sme_active 51 */ 52 53 /* Save the CPU context, used for jumping back */ ··· 70 /* zero out flags, and disable interrupts */ 71 pushq $0 72 popfq 73 + 74 + /* Save SME active flag */ 75 + movq %r8, %r12 76 77 /* 78 * get physical address of control page now ··· 131 132 /* Flush the TLB (needed?) */ 133 movq %r9, %cr3 134 + 135 + /* 136 + * If SME is active, there could be old encrypted cache line 137 + * entries that will conflict with the now unencrypted memory 138 + * used by kexec. Flush the caches before copying the kernel. 139 + */ 140 + testq %r12, %r12 141 + jz 1f 142 + wbinvd 143 + 1: 144 145 movq %rcx, %r11 146 call swap_pages
+9
arch/x86/kernel/setup.c
··· 69 #include <linux/crash_dump.h> 70 #include <linux/tboot.h> 71 #include <linux/jiffies.h> 72 73 #include <linux/usb/xhci-dbgp.h> 74 #include <video/edid.h> ··· 375 if (!boot_params.hdr.type_of_loader || 376 !ramdisk_image || !ramdisk_size) 377 return; /* No initrd provided by bootloader */ 378 379 initrd_start = 0; 380
··· 69 #include <linux/crash_dump.h> 70 #include <linux/tboot.h> 71 #include <linux/jiffies.h> 72 + #include <linux/mem_encrypt.h> 73 74 #include <linux/usb/xhci-dbgp.h> 75 #include <video/edid.h> ··· 374 if (!boot_params.hdr.type_of_loader || 375 !ramdisk_image || !ramdisk_size) 376 return; /* No initrd provided by bootloader */ 377 + 378 + /* 379 + * If SME is active, this memory will be marked encrypted by the 380 + * kernel when it is accessed (including relocation). However, the 381 + * ramdisk image was loaded decrypted by the bootloader, so make 382 + * sure that it is encrypted before accessing it. 383 + */ 384 + sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image); 385 386 initrd_start = 0; 387
+26 -4
arch/x86/kernel/sys_x86_64.c
··· 21 #include <asm/compat.h> 22 #include <asm/ia32.h> 23 #include <asm/syscalls.h> 24 25 /* 26 * Align a virtual address to avoid aliasing in the I$ on AMD F15h. ··· 101 return error; 102 } 103 104 - static void find_start_end(unsigned long flags, unsigned long *begin, 105 - unsigned long *end) 106 { 107 if (!in_compat_syscall() && (flags & MAP_32BIT)) { 108 /* This is usually used needed to map code in small ··· 121 } 122 123 *begin = get_mmap_base(1); 124 - *end = in_compat_syscall() ? tasksize_32bit() : tasksize_64bit(); 125 } 126 127 unsigned long ··· 136 struct vm_unmapped_area_info info; 137 unsigned long begin, end; 138 139 if (flags & MAP_FIXED) 140 return addr; 141 142 - find_start_end(flags, &begin, &end); 143 144 if (len > end) 145 return -ENOMEM; ··· 179 unsigned long addr = addr0; 180 struct vm_unmapped_area_info info; 181 182 /* requested length too big for entire address space */ 183 if (len > TASK_SIZE) 184 return -ENOMEM; ··· 207 info.length = len; 208 info.low_limit = PAGE_SIZE; 209 info.high_limit = get_mmap_base(0); 210 info.align_mask = 0; 211 info.align_offset = pgoff << PAGE_SHIFT; 212 if (filp) {
··· 21 #include <asm/compat.h> 22 #include <asm/ia32.h> 23 #include <asm/syscalls.h> 24 + #include <asm/mpx.h> 25 26 /* 27 * Align a virtual address to avoid aliasing in the I$ on AMD F15h. ··· 100 return error; 101 } 102 103 + static void find_start_end(unsigned long addr, unsigned long flags, 104 + unsigned long *begin, unsigned long *end) 105 { 106 if (!in_compat_syscall() && (flags & MAP_32BIT)) { 107 /* This is usually used needed to map code in small ··· 120 } 121 122 *begin = get_mmap_base(1); 123 + if (in_compat_syscall()) 124 + *end = task_size_32bit(); 125 + else 126 + *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW); 127 } 128 129 unsigned long ··· 132 struct vm_unmapped_area_info info; 133 unsigned long begin, end; 134 135 + addr = mpx_unmapped_area_check(addr, len, flags); 136 + if (IS_ERR_VALUE(addr)) 137 + return addr; 138 + 139 if (flags & MAP_FIXED) 140 return addr; 141 142 + find_start_end(addr, flags, &begin, &end); 143 144 if (len > end) 145 return -ENOMEM; ··· 171 unsigned long addr = addr0; 172 struct vm_unmapped_area_info info; 173 174 + addr = mpx_unmapped_area_check(addr, len, flags); 175 + if (IS_ERR_VALUE(addr)) 176 + return addr; 177 + 178 /* requested length too big for entire address space */ 179 if (len > TASK_SIZE) 180 return -ENOMEM; ··· 195 info.length = len; 196 info.low_limit = PAGE_SIZE; 197 info.high_limit = get_mmap_base(0); 198 + 199 + /* 200 + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area 201 + * in the full address space. 202 + * 203 + * !in_compat_syscall() check to avoid high addresses for x32. 204 + */ 205 + if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) 206 + info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; 207 + 208 info.align_mask = 0; 209 info.align_offset = pgoff << PAGE_SHIFT; 210 if (filp) {
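The mmap changes above hand out addresses beyond the legacy 47-bit window only when the caller passes a hint above it; otherwise the search limit stays at DEFAULT_MAP_WINDOW. A small model of that opt-in rule, assuming the 47-bit/56-bit split used by 4- and 5-level paging:

/* Model of the "high addresses only on explicit opt-in" rule: the
 * search limit stays below the 47-bit window unless the hint address
 * itself is above it.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE           4096ULL
#define DEFAULT_MAP_WINDOW  ((1ULL << 47) - PAGE_SIZE)
#define TASK_SIZE_MAX_57    ((1ULL << 56) - PAGE_SIZE)  /* 5-level */

static uint64_t high_limit(uint64_t hint)
{
    uint64_t limit = DEFAULT_MAP_WINDOW;

    if (hint > DEFAULT_MAP_WINDOW)
        limit += TASK_SIZE_MAX_57 - DEFAULT_MAP_WINDOW;
    return limit;
}

int main(void)
{
    printf("no hint       -> limit 0x%llx\n",
           (unsigned long long)high_limit(0));
    printf("hint at 1<<50 -> limit 0x%llx\n",
           (unsigned long long)high_limit(1ULL << 50));
    return 0;
}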
+34 -7
arch/x86/kvm/mmu.c
··· 108 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) 109 110 111 - #define PT64_BASE_ADDR_MASK (((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)) 112 #define PT64_DIR_BASE_ADDR_MASK \ 113 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) 114 #define PT64_LVL_ADDR_MASK(level) \ ··· 126 * PT32_LEVEL_BITS))) - 1)) 127 128 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ 129 - | shadow_x_mask | shadow_nx_mask) 130 131 #define ACC_EXEC_MASK 1 132 #define ACC_WRITE_MASK PT_WRITABLE_MASK ··· 186 static u64 __read_mostly shadow_mmio_mask; 187 static u64 __read_mostly shadow_mmio_value; 188 static u64 __read_mostly shadow_present_mask; 189 190 /* 191 * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value. ··· 350 */ 351 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 352 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, 353 - u64 acc_track_mask) 354 { 355 BUG_ON(!dirty_mask != !accessed_mask); 356 BUG_ON(!accessed_mask && !acc_track_mask); ··· 363 shadow_x_mask = x_mask; 364 shadow_present_mask = p_mask; 365 shadow_acc_track_mask = acc_track_mask; 366 } 367 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); 368 ··· 2435 BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); 2436 2437 spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK | 2438 - shadow_user_mask | shadow_x_mask; 2439 2440 if (sp_ad_disabled(sp)) 2441 spte |= shadow_acc_track_value; ··· 2747 pte_access &= ~ACC_WRITE_MASK; 2748 2749 spte |= (u64)pfn << PAGE_SHIFT; 2750 2751 if (pte_access & ACC_WRITE_MASK) { 2752 ··· 4109 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) 4110 { 4111 bool uses_nx = context->nx || context->base_role.smep_andnot_wp; 4112 4113 /* 4114 * Passing "true" to the last argument is okay; it adds a check 4115 * on bit 8 of the SPTEs which KVM doesn't use anyway. 4116 */ 4117 - __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, 4118 boot_cpu_data.x86_phys_bits, 4119 context->shadow_root_level, uses_nx, 4120 guest_cpuid_has_gbpages(vcpu), is_pse(vcpu), 4121 true); 4122 } 4123 EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask); 4124 ··· 4148 reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, 4149 struct kvm_mmu *context) 4150 { 4151 if (boot_cpu_is_amd()) 4152 - __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, 4153 boot_cpu_data.x86_phys_bits, 4154 context->shadow_root_level, false, 4155 boot_cpu_has(X86_FEATURE_GBPAGES), 4156 true, true); 4157 else 4158 - __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, 4159 boot_cpu_data.x86_phys_bits, 4160 false); 4161 4162 } 4163 4164 /*
··· 108 (((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1)) 109 110 111 + #define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))) 112 #define PT64_DIR_BASE_ADDR_MASK \ 113 (PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1)) 114 #define PT64_LVL_ADDR_MASK(level) \ ··· 126 * PT32_LEVEL_BITS))) - 1)) 127 128 #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \ 129 + | shadow_x_mask | shadow_nx_mask | shadow_me_mask) 130 131 #define ACC_EXEC_MASK 1 132 #define ACC_WRITE_MASK PT_WRITABLE_MASK ··· 186 static u64 __read_mostly shadow_mmio_mask; 187 static u64 __read_mostly shadow_mmio_value; 188 static u64 __read_mostly shadow_present_mask; 189 + static u64 __read_mostly shadow_me_mask; 190 191 /* 192 * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value. ··· 349 */ 350 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 351 u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask, 352 + u64 acc_track_mask, u64 me_mask) 353 { 354 BUG_ON(!dirty_mask != !accessed_mask); 355 BUG_ON(!accessed_mask && !acc_track_mask); ··· 362 shadow_x_mask = x_mask; 363 shadow_present_mask = p_mask; 364 shadow_acc_track_mask = acc_track_mask; 365 + shadow_me_mask = me_mask; 366 } 367 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); 368 ··· 2433 BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK); 2434 2435 spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK | 2436 + shadow_user_mask | shadow_x_mask | shadow_me_mask; 2437 2438 if (sp_ad_disabled(sp)) 2439 spte |= shadow_acc_track_value; ··· 2745 pte_access &= ~ACC_WRITE_MASK; 2746 2747 spte |= (u64)pfn << PAGE_SHIFT; 2748 + spte |= shadow_me_mask; 2749 2750 if (pte_access & ACC_WRITE_MASK) { 2751 ··· 4106 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) 4107 { 4108 bool uses_nx = context->nx || context->base_role.smep_andnot_wp; 4109 + struct rsvd_bits_validate *shadow_zero_check; 4110 + int i; 4111 4112 /* 4113 * Passing "true" to the last argument is okay; it adds a check 4114 * on bit 8 of the SPTEs which KVM doesn't use anyway. 
4115 */ 4116 + shadow_zero_check = &context->shadow_zero_check; 4117 + __reset_rsvds_bits_mask(vcpu, shadow_zero_check, 4118 boot_cpu_data.x86_phys_bits, 4119 context->shadow_root_level, uses_nx, 4120 guest_cpuid_has_gbpages(vcpu), is_pse(vcpu), 4121 true); 4122 + 4123 + if (!shadow_me_mask) 4124 + return; 4125 + 4126 + for (i = context->shadow_root_level; --i >= 0;) { 4127 + shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; 4128 + shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; 4129 + } 4130 + 4131 } 4132 EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask); 4133 ··· 4133 reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, 4134 struct kvm_mmu *context) 4135 { 4136 + struct rsvd_bits_validate *shadow_zero_check; 4137 + int i; 4138 + 4139 + shadow_zero_check = &context->shadow_zero_check; 4140 + 4141 if (boot_cpu_is_amd()) 4142 + __reset_rsvds_bits_mask(vcpu, shadow_zero_check, 4143 boot_cpu_data.x86_phys_bits, 4144 context->shadow_root_level, false, 4145 boot_cpu_has(X86_FEATURE_GBPAGES), 4146 true, true); 4147 else 4148 + __reset_rsvds_bits_mask_ept(shadow_zero_check, 4149 boot_cpu_data.x86_phys_bits, 4150 false); 4151 4152 + if (!shadow_me_mask) 4153 + return; 4154 + 4155 + for (i = context->shadow_root_level; --i >= 0;) { 4156 + shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask; 4157 + shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask; 4158 + } 4159 } 4160 4161 /*
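The net effect of the mmu.c changes: the new shadow_me_mask is ORed into every shadow PTE that carries a host physical address, and then stripped out of the reserved-bits masks so the encryption bit is not misread as a reserved-bit violation. A standalone sketch of that second step, with the C-bit position and physical-address width chosen purely for illustration (real hardware reports both via CPUID):

    #include <stdio.h>
    #include <stdint.h>

    /* Toy model of the shadow_me_mask handling; the 40-bit physical width
     * and bit 47 are illustrative values only. */
    #define PHYS_BITS   40
    #define ME_MASK     (1ULL << 47)

    int main(void)
    {
        /* Reserved physical-address bits 51:PHYS_BITS, as KVM would compute. */
        uint64_t rsvd = ((1ULL << 52) - 1) & ~((1ULL << PHYS_BITS) - 1);
        uint64_t spte = 0x123456000ULL | ME_MASK;   /* host PA with C-bit set */

        printf("C-bit looks reserved before the fix: %s\n",
               (spte & rsvd) ? "yes" : "no");

        /* What reset_shadow_zero_bits_mask() now does: drop shadow_me_mask
         * from the reserved mask at every level. */
        rsvd &= ~ME_MASK;

        printf("C-bit looks reserved after the fix:  %s\n",
               (spte & rsvd) ? "yes" : "no");
        return 0;
    }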
+18 -17
arch/x86/kvm/svm.c
··· 1167 { 1168 struct vmcb *vmcb = svm->vmcb; 1169 struct kvm_arch *vm_data = &svm->vcpu.kvm->arch; 1170 - phys_addr_t bpa = page_to_phys(svm->avic_backing_page); 1171 - phys_addr_t lpa = page_to_phys(vm_data->avic_logical_id_table_page); 1172 - phys_addr_t ppa = page_to_phys(vm_data->avic_physical_id_table_page); 1173 1174 vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; 1175 vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; ··· 1232 set_intercept(svm, INTERCEPT_MWAIT); 1233 } 1234 1235 - control->iopm_base_pa = iopm_base; 1236 - control->msrpm_base_pa = __pa(svm->msrpm); 1237 control->int_ctl = V_INTR_MASKING_MASK; 1238 1239 init_seg(&save->es); ··· 1377 return -EINVAL; 1378 1379 new_entry = READ_ONCE(*entry); 1380 - new_entry = (page_to_phys(svm->avic_backing_page) & 1381 - AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | 1382 - AVIC_PHYSICAL_ID_ENTRY_VALID_MASK; 1383 WRITE_ONCE(*entry, new_entry); 1384 1385 svm->avic_physical_id_cache = entry; ··· 1647 1648 svm->vmcb = page_address(page); 1649 clear_page(svm->vmcb); 1650 - svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; 1651 svm->asid_generation = 0; 1652 init_vmcb(svm); 1653 ··· 1675 { 1676 struct vcpu_svm *svm = to_svm(vcpu); 1677 1678 - __free_page(pfn_to_page(svm->vmcb_pa >> PAGE_SHIFT)); 1679 __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); 1680 __free_page(virt_to_page(svm->nested.hsave)); 1681 __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); ··· 2330 u64 pdpte; 2331 int ret; 2332 2333 - ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(cr3), &pdpte, 2334 offset_in_page(cr3) + index * 8, 8); 2335 if (ret) 2336 return 0; ··· 2342 { 2343 struct vcpu_svm *svm = to_svm(vcpu); 2344 2345 - svm->vmcb->control.nested_cr3 = root; 2346 mark_dirty(svm->vmcb, VMCB_NPT); 2347 svm_flush_tlb(vcpu); 2348 } ··· 2873 svm->nested.msrpm[p] = svm->msrpm[p] | value; 2874 } 2875 2876 - svm->vmcb->control.msrpm_base_pa = __pa(svm->nested.msrpm); 2877 2878 return true; 2879 } ··· 4506 pr_debug("SVM: %s: use GA mode for irq %u\n", __func__, 4507 irq.vector); 4508 *svm = to_svm(vcpu); 4509 - vcpu_info->pi_desc_addr = page_to_phys((*svm)->avic_backing_page); 4510 vcpu_info->vector = irq.vector; 4511 4512 return 0; ··· 4557 struct amd_iommu_pi_data pi; 4558 4559 /* Try to enable guest_mode in IRTE */ 4560 - pi.base = page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK; 4561 pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id, 4562 svm->vcpu.vcpu_id); 4563 pi.is_guest_mode = true; ··· 5007 { 5008 struct vcpu_svm *svm = to_svm(vcpu); 5009 5010 - svm->vmcb->save.cr3 = root; 5011 mark_dirty(svm->vmcb, VMCB_CR); 5012 svm_flush_tlb(vcpu); 5013 } ··· 5016 { 5017 struct vcpu_svm *svm = to_svm(vcpu); 5018 5019 - svm->vmcb->control.nested_cr3 = root; 5020 mark_dirty(svm->vmcb, VMCB_NPT); 5021 5022 /* Also sync guest cr3 here in case we live migrate */
··· 1167 { 1168 struct vmcb *vmcb = svm->vmcb; 1169 struct kvm_arch *vm_data = &svm->vcpu.kvm->arch; 1170 + phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page)); 1171 + phys_addr_t lpa = __sme_set(page_to_phys(vm_data->avic_logical_id_table_page)); 1172 + phys_addr_t ppa = __sme_set(page_to_phys(vm_data->avic_physical_id_table_page)); 1173 1174 vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; 1175 vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; ··· 1232 set_intercept(svm, INTERCEPT_MWAIT); 1233 } 1234 1235 + control->iopm_base_pa = __sme_set(iopm_base); 1236 + control->msrpm_base_pa = __sme_set(__pa(svm->msrpm)); 1237 control->int_ctl = V_INTR_MASKING_MASK; 1238 1239 init_seg(&save->es); ··· 1377 return -EINVAL; 1378 1379 new_entry = READ_ONCE(*entry); 1380 + new_entry = __sme_set((page_to_phys(svm->avic_backing_page) & 1381 + AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | 1382 + AVIC_PHYSICAL_ID_ENTRY_VALID_MASK); 1383 WRITE_ONCE(*entry, new_entry); 1384 1385 svm->avic_physical_id_cache = entry; ··· 1647 1648 svm->vmcb = page_address(page); 1649 clear_page(svm->vmcb); 1650 + svm->vmcb_pa = __sme_set(page_to_pfn(page) << PAGE_SHIFT); 1651 svm->asid_generation = 0; 1652 init_vmcb(svm); 1653 ··· 1675 { 1676 struct vcpu_svm *svm = to_svm(vcpu); 1677 1678 + __free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT)); 1679 __free_pages(virt_to_page(svm->msrpm), MSRPM_ALLOC_ORDER); 1680 __free_page(virt_to_page(svm->nested.hsave)); 1681 __free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER); ··· 2330 u64 pdpte; 2331 int ret; 2332 2333 + ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(__sme_clr(cr3)), &pdpte, 2334 offset_in_page(cr3) + index * 8, 8); 2335 if (ret) 2336 return 0; ··· 2342 { 2343 struct vcpu_svm *svm = to_svm(vcpu); 2344 2345 + svm->vmcb->control.nested_cr3 = __sme_set(root); 2346 mark_dirty(svm->vmcb, VMCB_NPT); 2347 svm_flush_tlb(vcpu); 2348 } ··· 2873 svm->nested.msrpm[p] = svm->msrpm[p] | value; 2874 } 2875 2876 + svm->vmcb->control.msrpm_base_pa = __sme_set(__pa(svm->nested.msrpm)); 2877 2878 return true; 2879 } ··· 4506 pr_debug("SVM: %s: use GA mode for irq %u\n", __func__, 4507 irq.vector); 4508 *svm = to_svm(vcpu); 4509 + vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page)); 4510 vcpu_info->vector = irq.vector; 4511 4512 return 0; ··· 4557 struct amd_iommu_pi_data pi; 4558 4559 /* Try to enable guest_mode in IRTE */ 4560 + pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & 4561 + AVIC_HPA_MASK); 4562 pi.ga_tag = AVIC_GATAG(kvm->arch.avic_vm_id, 4563 svm->vcpu.vcpu_id); 4564 pi.is_guest_mode = true; ··· 5006 { 5007 struct vcpu_svm *svm = to_svm(vcpu); 5008 5009 + svm->vmcb->save.cr3 = __sme_set(root); 5010 mark_dirty(svm->vmcb, VMCB_CR); 5011 svm_flush_tlb(vcpu); 5012 } ··· 5015 { 5016 struct vcpu_svm *svm = to_svm(vcpu); 5017 5018 + svm->vmcb->control.nested_cr3 = __sme_set(root); 5019 mark_dirty(svm->vmcb, VMCB_NPT); 5020 5021 /* Also sync guest cr3 here in case we live migrate */
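Every physical address the svm.c code hands to hardware structures (VMCB fields, AVIC tables, the MSR/IO permission maps) is now wrapped in __sme_set(), and __sme_clr() removes the bit again before an address is turned back into a pfn or gfn. A standalone model of those helpers, assuming the usual single-mask OR-in/AND-out semantics and an illustrative C-bit position:

    #include <stdio.h>
    #include <stdint.h>

    /* Stand-in for the kernel's sme_me_mask; zero when SME is inactive. */
    static uint64_t sme_me_mask = 1ULL << 47;   /* example C-bit position */

    #define __sme_set(x)    ((x) | sme_me_mask)
    #define __sme_clr(x)    ((x) & ~sme_me_mask)

    #define PAGE_SHIFT      12

    int main(void)
    {
        uint64_t vmcb_pa, pfn = 0x1234;

        /* What svm_create_vcpu() now does when storing svm->vmcb_pa. */
        vmcb_pa = __sme_set(pfn << PAGE_SHIFT);

        /* What svm_free_vcpu() does before pfn_to_page(). */
        printf("stored pa %#llx -> pfn %#llx\n",
               (unsigned long long)vmcb_pa,
               (unsigned long long)(__sme_clr(vmcb_pa) >> PAGE_SHIFT));
        return 0;
    }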
+1 -1
arch/x86/kvm/vmx.c
··· 6556 enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, 6557 0ull, VMX_EPT_EXECUTABLE_MASK, 6558 cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, 6559 - VMX_EPT_RWX_MASK); 6560 6561 ept_set_mmio_spte_mask(); 6562 kvm_enable_tdp();
··· 6556 enable_ept_ad_bits ? VMX_EPT_DIRTY_BIT : 0ull, 6557 0ull, VMX_EPT_EXECUTABLE_MASK, 6558 cpu_has_vmx_ept_execute_only() ? 0ull : VMX_EPT_READABLE_MASK, 6559 + VMX_EPT_RWX_MASK, 0ull); 6560 6561 ept_set_mmio_spte_mask(); 6562 kvm_enable_tdp();
+2 -1
arch/x86/kvm/x86.c
··· 54 #include <linux/kvm_irqfd.h> 55 #include <linux/irqbypass.h> 56 #include <linux/sched/stat.h> 57 58 #include <trace/events/kvm.h> 59 ··· 6126 6127 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 6128 PT_DIRTY_MASK, PT64_NX_MASK, 0, 6129 - PT_PRESENT_MASK, 0); 6130 kvm_timer_init(); 6131 6132 perf_register_guest_info_callbacks(&kvm_guest_cbs);
··· 54 #include <linux/kvm_irqfd.h> 55 #include <linux/irqbypass.h> 56 #include <linux/sched/stat.h> 57 + #include <linux/mem_encrypt.h> 58 59 #include <trace/events/kvm.h> 60 ··· 6125 6126 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 6127 PT_DIRTY_MASK, PT64_NX_MASK, 0, 6128 + PT_PRESENT_MASK, 0, sme_me_mask); 6129 kvm_timer_init(); 6130 6131 perf_register_guest_info_callbacks(&kvm_guest_cbs);
+105
arch/x86/lib/cmdline.c
··· 104 return 0; /* Buffer overrun */ 105 } 106 107 int cmdline_find_option_bool(const char *cmdline, const char *option) 108 { 109 return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option); 110 }
··· 104 return 0; /* Buffer overrun */ 105 } 106 107 + /* 108 + * Find a non-boolean option (i.e. option=argument). In accordance with 109 + * standard Linux practice, if this option is repeated, this returns the 110 + * last instance on the command line. 111 + * 112 + * @cmdline: the cmdline string 113 + * @max_cmdline_size: the maximum size of cmdline 114 + * @option: option string to look for 115 + * @buffer: memory buffer to return the option argument 116 + * @bufsize: size of the supplied memory buffer 117 + * 118 + * Returns the length of the argument (regardless of if it was 119 + * truncated to fit in the buffer), or -1 on not found. 120 + */ 121 + static int 122 + __cmdline_find_option(const char *cmdline, int max_cmdline_size, 123 + const char *option, char *buffer, int bufsize) 124 + { 125 + char c; 126 + int pos = 0, len = -1; 127 + const char *opptr = NULL; 128 + char *bufptr = buffer; 129 + enum { 130 + st_wordstart = 0, /* Start of word/after whitespace */ 131 + st_wordcmp, /* Comparing this word */ 132 + st_wordskip, /* Miscompare, skip */ 133 + st_bufcpy, /* Copying this to buffer */ 134 + } state = st_wordstart; 135 + 136 + if (!cmdline) 137 + return -1; /* No command line */ 138 + 139 + /* 140 + * This 'pos' check ensures we do not overrun 141 + * a non-NULL-terminated 'cmdline' 142 + */ 143 + while (pos++ < max_cmdline_size) { 144 + c = *(char *)cmdline++; 145 + if (!c) 146 + break; 147 + 148 + switch (state) { 149 + case st_wordstart: 150 + if (myisspace(c)) 151 + break; 152 + 153 + state = st_wordcmp; 154 + opptr = option; 155 + /* fall through */ 156 + 157 + case st_wordcmp: 158 + if ((c == '=') && !*opptr) { 159 + /* 160 + * We matched all the way to the end of the 161 + * option we were looking for, prepare to 162 + * copy the argument. 163 + */ 164 + len = 0; 165 + bufptr = buffer; 166 + state = st_bufcpy; 167 + break; 168 + } else if (c == *opptr++) { 169 + /* 170 + * We are currently matching, so continue 171 + * to the next character on the cmdline. 172 + */ 173 + break; 174 + } 175 + state = st_wordskip; 176 + /* fall through */ 177 + 178 + case st_wordskip: 179 + if (myisspace(c)) 180 + state = st_wordstart; 181 + break; 182 + 183 + case st_bufcpy: 184 + if (myisspace(c)) { 185 + state = st_wordstart; 186 + } else { 187 + /* 188 + * Increment len, but don't overrun the 189 + * supplied buffer and leave room for the 190 + * NULL terminator. 191 + */ 192 + if (++len < bufsize) 193 + *bufptr++ = c; 194 + } 195 + break; 196 + } 197 + } 198 + 199 + if (bufsize) 200 + *bufptr = '\0'; 201 + 202 + return len; 203 + } 204 + 205 int cmdline_find_option_bool(const char *cmdline, const char *option) 206 { 207 return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option); 208 + } 209 + 210 + int cmdline_find_option(const char *cmdline, const char *option, char *buffer, 211 + int bufsize) 212 + { 213 + return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option, 214 + buffer, bufsize); 215 }
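__cmdline_find_option() is a small state machine (wordstart, wordcmp, wordskip, bufcpy) that copies the argument of the last matching "option=" instance into the caller's buffer and returns the argument length, or -1 when the option is absent. The userspace model below reproduces that contract in simplified form; it is not the kernel parser, just a sketch of the expected behaviour:

    #include <stdio.h>
    #include <string.h>

    /* Simplified model: find "option=arg" in a command line, copy the
     * argument of the last match, return its length or -1 if absent. */
    static int find_option(const char *cmdline, const char *option,
                           char *buf, int bufsize)
    {
        size_t optlen = strlen(option);
        int len = -1;

        while (*cmdline) {
            while (*cmdline == ' ')
                cmdline++;
            if (!strncmp(cmdline, option, optlen) &&
                cmdline[optlen] == '=') {
                const char *arg = cmdline + optlen + 1;
                int n = 0;

                while (arg[n] && arg[n] != ' ')
                    n++;
                len = n;
                snprintf(buf, bufsize, "%.*s", n, arg);
            }
            while (*cmdline && *cmdline != ' ')
                cmdline++;
        }
        return len;
    }

    int main(void)
    {
        char buf[16];
        int len = find_option("console=ttyS0 mem_encrypt=off mem_encrypt=on",
                              "mem_encrypt", buf, sizeof(buf));

        printf("len=%d arg=%s\n", len, len < 0 ? "(none)" : buf); /* "on" */
        return 0;
    }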
+2
arch/x86/mm/Makefile
··· 39 obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o 40 obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o 41
··· 39 obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o 40 obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o 41 42 + obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt.o 43 + obj-$(CONFIG_AMD_MEM_ENCRYPT) += mem_encrypt_boot.o
+56 -37
arch/x86/mm/dump_pagetables.c
··· 13 */ 14 15 #include <linux/debugfs.h> 16 #include <linux/mm.h> 17 #include <linux/init.h> 18 #include <linux/sched.h> 19 #include <linux/seq_file.h> 20 21 - #include <asm/kasan.h> 22 #include <asm/pgtable.h> 23 24 /* ··· 138 { 139 pgprotval_t pr = pgprot_val(prot); 140 static const char * const level_name[] = 141 - { "cr3", "pgd", "pud", "pmd", "pte" }; 142 143 if (!pgprot_val(prot)) { 144 /* Not present */ ··· 162 pt_dump_cont_printf(m, dmsg, " "); 163 164 /* Bit 7 has a different meaning on level 3 vs 4 */ 165 - if (level <= 3 && pr & _PAGE_PSE) 166 pt_dump_cont_printf(m, dmsg, "PSE "); 167 else 168 pt_dump_cont_printf(m, dmsg, " "); 169 - if ((level == 4 && pr & _PAGE_PAT) || 170 - ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE)) 171 pt_dump_cont_printf(m, dmsg, "PAT "); 172 else 173 pt_dump_cont_printf(m, dmsg, " "); ··· 188 */ 189 static unsigned long normalize_addr(unsigned long u) 190 { 191 - #ifdef CONFIG_X86_64 192 - return (signed long)(u << 16) >> 16; 193 - #else 194 - return u; 195 - #endif 196 } 197 198 /* ··· 298 for (i = 0; i < PTRS_PER_PTE; i++) { 299 prot = pte_flags(*start); 300 st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); 301 - note_page(m, st, __pgprot(prot), 4); 302 start++; 303 } 304 } 305 306 #if PTRS_PER_PMD > 1 307 308 static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P) 309 { 310 int i; 311 - pmd_t *start; 312 pgprotval_t prot; 313 314 - start = (pmd_t *)pud_page_vaddr(addr); 315 for (i = 0; i < PTRS_PER_PMD; i++) { 316 st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); 317 if (!pmd_none(*start)) { 318 if (pmd_large(*start) || !pmd_present(*start)) { 319 prot = pmd_flags(*start); 320 - note_page(m, st, __pgprot(prot), 3); 321 - } else { 322 walk_pte_level(m, st, *start, 323 P + i * PMD_LEVEL_MULT); 324 } 325 } else 326 - note_page(m, st, __pgprot(0), 3); 327 start++; 328 } 329 } ··· 366 367 #if PTRS_PER_PUD > 1 368 369 - /* 370 - * This is an optimization for CONFIG_DEBUG_WX=y + CONFIG_KASAN=y 371 - * KASAN fills page tables with the same values. Since there is no 372 - * point in checking page table more than once we just skip repeated 373 - * entries. This saves us dozens of seconds during boot. 
374 - */ 375 - static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx) 376 - { 377 - return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud)); 378 - } 379 - 380 static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P) 381 { 382 int i; 383 - pud_t *start; 384 pgprotval_t prot; 385 pud_t *prev_pud = NULL; 386 387 - start = (pud_t *)p4d_page_vaddr(addr); 388 389 for (i = 0; i < PTRS_PER_PUD; i++) { 390 st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); 391 - if (!pud_none(*start) && 392 - !pud_already_checked(prev_pud, start, st->check_wx)) { 393 if (pud_large(*start) || !pud_present(*start)) { 394 prot = pud_flags(*start); 395 - note_page(m, st, __pgprot(prot), 2); 396 - } else { 397 walk_pmd_level(m, st, *start, 398 P + i * PUD_LEVEL_MULT); 399 } 400 } else 401 - note_page(m, st, __pgprot(0), 2); 402 403 prev_pud = start; 404 start++; ··· 404 static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P) 405 { 406 int i; 407 - p4d_t *start; 408 pgprotval_t prot; 409 410 - start = (p4d_t *)pgd_page_vaddr(addr); 411 412 for (i = 0; i < PTRS_PER_P4D; i++) { 413 st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT); ··· 415 if (p4d_large(*start) || !p4d_present(*start)) { 416 prot = p4d_flags(*start); 417 note_page(m, st, __pgprot(prot), 2); 418 - } else { 419 walk_pud_level(m, st, *start, 420 P + i * P4D_LEVEL_MULT); 421 }
··· 13 */ 14 15 #include <linux/debugfs.h> 16 + #include <linux/kasan.h> 17 #include <linux/mm.h> 18 #include <linux/init.h> 19 #include <linux/sched.h> 20 #include <linux/seq_file.h> 21 22 #include <asm/pgtable.h> 23 24 /* ··· 138 { 139 pgprotval_t pr = pgprot_val(prot); 140 static const char * const level_name[] = 141 + { "cr3", "pgd", "p4d", "pud", "pmd", "pte" }; 142 143 if (!pgprot_val(prot)) { 144 /* Not present */ ··· 162 pt_dump_cont_printf(m, dmsg, " "); 163 164 /* Bit 7 has a different meaning on level 3 vs 4 */ 165 + if (level <= 4 && pr & _PAGE_PSE) 166 pt_dump_cont_printf(m, dmsg, "PSE "); 167 else 168 pt_dump_cont_printf(m, dmsg, " "); 169 + if ((level == 5 && pr & _PAGE_PAT) || 170 + ((level == 4 || level == 3) && pr & _PAGE_PAT_LARGE)) 171 pt_dump_cont_printf(m, dmsg, "PAT "); 172 else 173 pt_dump_cont_printf(m, dmsg, " "); ··· 188 */ 189 static unsigned long normalize_addr(unsigned long u) 190 { 191 + int shift; 192 + if (!IS_ENABLED(CONFIG_X86_64)) 193 + return u; 194 + 195 + shift = 64 - (__VIRTUAL_MASK_SHIFT + 1); 196 + return (signed long)(u << shift) >> shift; 197 } 198 199 /* ··· 297 for (i = 0; i < PTRS_PER_PTE; i++) { 298 prot = pte_flags(*start); 299 st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); 300 + note_page(m, st, __pgprot(prot), 5); 301 start++; 302 } 303 } 304 + #ifdef CONFIG_KASAN 305 + 306 + /* 307 + * This is an optimization for KASAN=y case. Since all kasan page tables 308 + * eventually point to the kasan_zero_page we could call note_page() 309 + * right away without walking through lower level page tables. This saves 310 + * us dozens of seconds (minutes for 5-level config) while checking for 311 + * W+X mapping or reading kernel_page_tables debugfs file. 312 + */ 313 + static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, 314 + void *pt) 315 + { 316 + if (__pa(pt) == __pa(kasan_zero_pmd) || 317 + #ifdef CONFIG_X86_5LEVEL 318 + __pa(pt) == __pa(kasan_zero_p4d) || 319 + #endif 320 + __pa(pt) == __pa(kasan_zero_pud)) { 321 + pgprotval_t prot = pte_flags(kasan_zero_pte[0]); 322 + note_page(m, st, __pgprot(prot), 5); 323 + return true; 324 + } 325 + return false; 326 + } 327 + #else 328 + static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st, 329 + void *pt) 330 + { 331 + return false; 332 + } 333 + #endif 334 335 #if PTRS_PER_PMD > 1 336 337 static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P) 338 { 339 int i; 340 + pmd_t *start, *pmd_start; 341 pgprotval_t prot; 342 343 + pmd_start = start = (pmd_t *)pud_page_vaddr(addr); 344 for (i = 0; i < PTRS_PER_PMD; i++) { 345 st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); 346 if (!pmd_none(*start)) { 347 if (pmd_large(*start) || !pmd_present(*start)) { 348 prot = pmd_flags(*start); 349 + note_page(m, st, __pgprot(prot), 4); 350 + } else if (!kasan_page_table(m, st, pmd_start)) { 351 walk_pte_level(m, st, *start, 352 P + i * PMD_LEVEL_MULT); 353 } 354 } else 355 + note_page(m, st, __pgprot(0), 4); 356 start++; 357 } 358 } ··· 335 336 #if PTRS_PER_PUD > 1 337 338 static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P) 339 { 340 int i; 341 + pud_t *start, *pud_start; 342 pgprotval_t prot; 343 pud_t *prev_pud = NULL; 344 345 + pud_start = start = (pud_t *)p4d_page_vaddr(addr); 346 347 for (i = 0; i < PTRS_PER_PUD; i++) { 348 st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); 349 + if (!pud_none(*start)) { 350 if (pud_large(*start) || 
!pud_present(*start)) { 351 prot = pud_flags(*start); 352 + note_page(m, st, __pgprot(prot), 3); 353 + } else if (!kasan_page_table(m, st, pud_start)) { 354 walk_pmd_level(m, st, *start, 355 P + i * PUD_LEVEL_MULT); 356 } 357 } else 358 + note_page(m, st, __pgprot(0), 3); 359 360 prev_pud = start; 361 start++; ··· 385 static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P) 386 { 387 int i; 388 + p4d_t *start, *p4d_start; 389 pgprotval_t prot; 390 391 + p4d_start = start = (p4d_t *)pgd_page_vaddr(addr); 392 393 for (i = 0; i < PTRS_PER_P4D; i++) { 394 st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT); ··· 396 if (p4d_large(*start) || !p4d_present(*start)) { 397 prot = p4d_flags(*start); 398 note_page(m, st, __pgprot(prot), 2); 399 + } else if (!kasan_page_table(m, st, p4d_start)) { 400 walk_pud_level(m, st, *start, 401 P + i * P4D_LEVEL_MULT); 402 }
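Besides the KASAN shortcut and the extra p4d level in the printouts, normalize_addr() now derives its sign-extension width from __VIRTUAL_MASK_SHIFT, so the dump stays canonical on both 4-level and 5-level configurations. A standalone illustration of that arithmetic, with the 48/57-bit widths hard-coded only for the demonstration:

    #include <stdio.h>
    #include <stdint.h>

    /* Sign-extend a virtual address to canonical form, given the number of
     * significant virtual-address bits (48 for 4-level, 57 for 5-level). */
    static uint64_t normalize(uint64_t u, int va_bits)
    {
        int shift = 64 - va_bits;

        return (uint64_t)((int64_t)(u << shift) >> shift);
    }

    int main(void)
    {
        uint64_t addr = 1ULL << 47;     /* first upper-half address on 4-level */

        printf("4-level: %#llx\n", (unsigned long long)normalize(addr, 48));
        printf("5-level: %#llx\n", (unsigned long long)normalize(addr, 57));
        return 0;
    }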
+15 -11
arch/x86/mm/fault.c
··· 396 pte_t *pte; 397 398 #ifdef CONFIG_X86_PAE 399 - printk("*pdpt = %016Lx ", pgd_val(*pgd)); 400 if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) 401 goto out; 402 #endif 403 p4d = p4d_offset(pgd, address); 404 pud = pud_offset(p4d, address); 405 pmd = pmd_offset(pud, address); 406 - printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); 407 408 /* 409 * We must not directly access the pte in the highpte ··· 419 goto out; 420 421 pte = pte_offset_kernel(pmd, address); 422 - printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); 423 out: 424 - printk("\n"); 425 } 426 427 #else /* CONFIG_X86_64: */ ··· 569 if (bad_address(pgd)) 570 goto bad; 571 572 - printk("PGD %lx ", pgd_val(*pgd)); 573 574 if (!pgd_present(*pgd)) 575 goto out; ··· 578 if (bad_address(p4d)) 579 goto bad; 580 581 - printk("P4D %lx ", p4d_val(*p4d)); 582 if (!p4d_present(*p4d) || p4d_large(*p4d)) 583 goto out; 584 ··· 586 if (bad_address(pud)) 587 goto bad; 588 589 - printk("PUD %lx ", pud_val(*pud)); 590 if (!pud_present(*pud) || pud_large(*pud)) 591 goto out; 592 ··· 594 if (bad_address(pmd)) 595 goto bad; 596 597 - printk("PMD %lx ", pmd_val(*pmd)); 598 if (!pmd_present(*pmd) || pmd_large(*pmd)) 599 goto out; 600 ··· 602 if (bad_address(pte)) 603 goto bad; 604 605 - printk("PTE %lx", pte_val(*pte)); 606 out: 607 - printk("\n"); 608 return; 609 bad: 610 - printk("BAD\n"); 611 } 612 613 #endif /* CONFIG_X86_64 */
··· 396 pte_t *pte; 397 398 #ifdef CONFIG_X86_PAE 399 + pr_info("*pdpt = %016Lx ", pgd_val(*pgd)); 400 if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) 401 goto out; 402 + #define pr_pde pr_cont 403 + #else 404 + #define pr_pde pr_info 405 #endif 406 p4d = p4d_offset(pgd, address); 407 pud = pud_offset(p4d, address); 408 pmd = pmd_offset(pud, address); 409 + pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); 410 + #undef pr_pde 411 412 /* 413 * We must not directly access the pte in the highpte ··· 415 goto out; 416 417 pte = pte_offset_kernel(pmd, address); 418 + pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte)); 419 out: 420 + pr_cont("\n"); 421 } 422 423 #else /* CONFIG_X86_64: */ ··· 565 if (bad_address(pgd)) 566 goto bad; 567 568 + pr_info("PGD %lx ", pgd_val(*pgd)); 569 570 if (!pgd_present(*pgd)) 571 goto out; ··· 574 if (bad_address(p4d)) 575 goto bad; 576 577 + pr_cont("P4D %lx ", p4d_val(*p4d)); 578 if (!p4d_present(*p4d) || p4d_large(*p4d)) 579 goto out; 580 ··· 582 if (bad_address(pud)) 583 goto bad; 584 585 + pr_cont("PUD %lx ", pud_val(*pud)); 586 if (!pud_present(*pud) || pud_large(*pud)) 587 goto out; 588 ··· 590 if (bad_address(pmd)) 591 goto bad; 592 593 + pr_cont("PMD %lx ", pmd_val(*pmd)); 594 if (!pmd_present(*pmd) || pmd_large(*pmd)) 595 goto out; 596 ··· 598 if (bad_address(pte)) 599 goto bad; 600 601 + pr_cont("PTE %lx", pte_val(*pte)); 602 out: 603 + pr_cont("\n"); 604 return; 605 bad: 606 + pr_info("BAD\n"); 607 } 608 609 #endif /* CONFIG_X86_64 */
+23 -4
arch/x86/mm/hugetlbpage.c
··· 18 #include <asm/tlbflush.h> 19 #include <asm/pgalloc.h> 20 #include <asm/elf.h> 21 22 #if 0 /* This is just for testing */ 23 struct page * ··· 86 info.flags = 0; 87 info.length = len; 88 info.low_limit = get_mmap_base(1); 89 info.high_limit = in_compat_syscall() ? 90 - tasksize_32bit() : tasksize_64bit(); 91 info.align_mask = PAGE_MASK & ~huge_page_mask(h); 92 info.align_offset = 0; 93 return vm_unmapped_area(&info); 94 } 95 96 static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, 97 - unsigned long addr0, unsigned long len, 98 unsigned long pgoff, unsigned long flags) 99 { 100 struct hstate *h = hstate_file(file); 101 struct vm_unmapped_area_info info; 102 - unsigned long addr; 103 104 info.flags = VM_UNMAPPED_AREA_TOPDOWN; 105 info.length = len; 106 info.low_limit = PAGE_SIZE; 107 info.high_limit = get_mmap_base(0); 108 info.align_mask = PAGE_MASK & ~huge_page_mask(h); 109 info.align_offset = 0; 110 addr = vm_unmapped_area(&info); ··· 132 VM_BUG_ON(addr != -ENOMEM); 133 info.flags = 0; 134 info.low_limit = TASK_UNMAPPED_BASE; 135 - info.high_limit = TASK_SIZE; 136 addr = vm_unmapped_area(&info); 137 } 138 ··· 149 150 if (len & ~huge_page_mask(h)) 151 return -EINVAL; 152 if (len > TASK_SIZE) 153 return -ENOMEM; 154
··· 18 #include <asm/tlbflush.h> 19 #include <asm/pgalloc.h> 20 #include <asm/elf.h> 21 + #include <asm/mpx.h> 22 23 #if 0 /* This is just for testing */ 24 struct page * ··· 85 info.flags = 0; 86 info.length = len; 87 info.low_limit = get_mmap_base(1); 88 + 89 + /* 90 + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area 91 + * in the full address space. 92 + */ 93 info.high_limit = in_compat_syscall() ? 94 + task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW); 95 + 96 info.align_mask = PAGE_MASK & ~huge_page_mask(h); 97 info.align_offset = 0; 98 return vm_unmapped_area(&info); 99 } 100 101 static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, 102 + unsigned long addr, unsigned long len, 103 unsigned long pgoff, unsigned long flags) 104 { 105 struct hstate *h = hstate_file(file); 106 struct vm_unmapped_area_info info; 107 108 info.flags = VM_UNMAPPED_AREA_TOPDOWN; 109 info.length = len; 110 info.low_limit = PAGE_SIZE; 111 info.high_limit = get_mmap_base(0); 112 + 113 + /* 114 + * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area 115 + * in the full address space. 116 + */ 117 + if (addr > DEFAULT_MAP_WINDOW && !in_compat_syscall()) 118 + info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; 119 + 120 info.align_mask = PAGE_MASK & ~huge_page_mask(h); 121 info.align_offset = 0; 122 addr = vm_unmapped_area(&info); ··· 118 VM_BUG_ON(addr != -ENOMEM); 119 info.flags = 0; 120 info.low_limit = TASK_UNMAPPED_BASE; 121 + info.high_limit = TASK_SIZE_LOW; 122 addr = vm_unmapped_area(&info); 123 } 124 ··· 135 136 if (len & ~huge_page_mask(h)) 137 return -EINVAL; 138 + 139 + addr = mpx_unmapped_area_check(addr, len, flags); 140 + if (IS_ERR_VALUE(addr)) 141 + return addr; 142 + 143 if (len > TASK_SIZE) 144 return -ENOMEM; 145
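The hugetlb allocator only searches above DEFAULT_MAP_WINDOW (the old 47-bit limit) when the caller passes a hint address beyond it, so existing binaries keep the legacy layout. A userspace sketch of how a program opts in under these semantics; the hint value is illustrative and huge pages must be reserved for the mmap() to succeed:

    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sys/mman.h>

    int main(void)
    {
        size_t len = 2UL << 20;                 /* one 2 MB huge page */
        void *hint = (void *)(1UL << 47);       /* above the legacy 47-bit window */
        void *p;

        /*
         * Without a high hint the search is still confined to the old
         * DEFAULT_MAP_WINDOW; the hint is what opens the larger address
         * space on 5-level hardware.
         */
        p = mmap(hint, len, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
        if (p == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        printf("mapped at %p\n", p);
        munmap(p, len);
        return 0;
    }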
+8 -4
arch/x86/mm/ident_map.c
··· 51 if (!pmd) 52 return -ENOMEM; 53 ident_pmd_init(info, pmd, addr, next); 54 - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); 55 } 56 57 return 0; ··· 79 if (!pud) 80 return -ENOMEM; 81 ident_pud_init(info, pud, addr, next); 82 - set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); 83 } 84 85 return 0; ··· 92 unsigned long end = pend + info->offset; 93 unsigned long next; 94 int result; 95 96 for (; addr < end; addr = next) { 97 pgd_t *pgd = pgd_page + pgd_index(addr); ··· 120 if (result) 121 return result; 122 if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 123 - set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); 124 } else { 125 /* 126 * With p4d folded, pgd is equal to p4d. 127 * The pgd entry has to point to the pud page table in this case. 128 */ 129 pud_t *pud = pud_offset(p4d, 0); 130 - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); 131 } 132 } 133
··· 51 if (!pmd) 52 return -ENOMEM; 53 ident_pmd_init(info, pmd, addr, next); 54 + set_pud(pud, __pud(__pa(pmd) | info->kernpg_flag)); 55 } 56 57 return 0; ··· 79 if (!pud) 80 return -ENOMEM; 81 ident_pud_init(info, pud, addr, next); 82 + set_p4d(p4d, __p4d(__pa(pud) | info->kernpg_flag)); 83 } 84 85 return 0; ··· 92 unsigned long end = pend + info->offset; 93 unsigned long next; 94 int result; 95 + 96 + /* Set the default pagetable flags if not supplied */ 97 + if (!info->kernpg_flag) 98 + info->kernpg_flag = _KERNPG_TABLE; 99 100 for (; addr < end; addr = next) { 101 pgd_t *pgd = pgd_page + pgd_index(addr); ··· 116 if (result) 117 return result; 118 if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 119 + set_pgd(pgd, __pgd(__pa(p4d) | info->kernpg_flag)); 120 } else { 121 /* 122 * With p4d folded, pgd is equal to p4d. 123 * The pgd entry has to point to the pud page table in this case. 124 */ 125 pud_t *pud = pud_offset(p4d, 0); 126 + set_pgd(pgd, __pgd(__pa(pud) | info->kernpg_flag)); 127 } 128 } 129
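kernel_ident_mapping_init() now takes its intermediate page-table flags from info->kernpg_flag, falling back to _KERNPG_TABLE when the caller leaves it zero, so identity maps can be built with or without the encryption bit. A hedged kernel-context sketch of a caller choosing the flags explicitly; the allocator callback and the address range are placeholders:

        /*
         * Sketch only: field names from struct x86_mapping_info, other
         * fields and error handling abbreviated.
         */
        struct x86_mapping_info info = {
                .alloc_pgt_page = alloc_pgt_page,       /* caller-supplied allocator */
                .page_flag      = __PAGE_KERNEL_LARGE_EXEC | sme_me_mask,
                .kernpg_flag    = _KERNPG_TABLE,        /* or a no-encryption variant */
        };

        if (kernel_ident_mapping_init(&info, pgd, mstart, mend))
                return -ENOMEM;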
+1 -1
arch/x86/mm/init.c
··· 815 816 DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { 817 .loaded_mm = &init_mm, 818 - .state = 0, 819 .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ 820 }; 821 EXPORT_SYMBOL_GPL(cpu_tlbstate);
··· 815 816 DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { 817 .loaded_mm = &init_mm, 818 + .next_asid = 1, 819 .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ 820 }; 821 EXPORT_SYMBOL_GPL(cpu_tlbstate);
+270 -19
arch/x86/mm/ioremap.c
··· 13 #include <linux/slab.h> 14 #include <linux/vmalloc.h> 15 #include <linux/mmiotrace.h> 16 17 #include <asm/set_memory.h> 18 #include <asm/e820/api.h> ··· 23 #include <asm/tlbflush.h> 24 #include <asm/pgalloc.h> 25 #include <asm/pat.h> 26 27 #include "physaddr.h" 28 ··· 107 WARN_ON_ONCE(1); 108 return NULL; 109 } 110 - 111 - /* 112 - * Don't remap the low PCI/ISA area, it's always mapped.. 113 - */ 114 - if (is_ISA_range(phys_addr, last_addr)) 115 - return (__force void __iomem *)phys_to_virt(phys_addr); 116 117 /* 118 * Don't allow anybody to remap normal RAM that we're using.. ··· 337 return; 338 339 /* 340 - * __ioremap special-cases the PCI/ISA range by not instantiating a 341 - * vm_area and by simply returning an address into the kernel mapping 342 - * of ISA space. So handle that here. 343 */ 344 if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) && 345 - (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) 346 return; 347 348 addr = (volatile void __iomem *) 349 (PAGE_MASK & (unsigned long __force)addr); ··· 400 unsigned long offset = phys & ~PAGE_MASK; 401 void *vaddr; 402 403 - /* If page is RAM, we can use __va. Otherwise ioremap and unmap. */ 404 - if (page_is_ram(start >> PAGE_SHIFT)) 405 - return __va(phys); 406 407 - vaddr = ioremap_cache(start, PAGE_SIZE); 408 - /* Only add the offset on success and return NULL if the ioremap() failed: */ 409 if (vaddr) 410 vaddr += offset; 411 ··· 412 413 void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) 414 { 415 - if (page_is_ram(phys >> PAGE_SHIFT)) 416 - return; 417 - 418 - iounmap((void __iomem *)((unsigned long)addr & PAGE_MASK)); 419 } 420 421 static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; 422
··· 13 #include <linux/slab.h> 14 #include <linux/vmalloc.h> 15 #include <linux/mmiotrace.h> 16 + #include <linux/mem_encrypt.h> 17 + #include <linux/efi.h> 18 19 #include <asm/set_memory.h> 20 #include <asm/e820/api.h> ··· 21 #include <asm/tlbflush.h> 22 #include <asm/pgalloc.h> 23 #include <asm/pat.h> 24 + #include <asm/setup.h> 25 26 #include "physaddr.h" 27 ··· 104 WARN_ON_ONCE(1); 105 return NULL; 106 } 107 108 /* 109 * Don't allow anybody to remap normal RAM that we're using.. ··· 340 return; 341 342 /* 343 + * The PCI/ISA range special-casing was removed from __ioremap() 344 + * so this check, in theory, can be removed. However, there are 345 + * cases where iounmap() is called for addresses not obtained via 346 + * ioremap() (vga16fb for example). Add a warning so that these 347 + * cases can be caught and fixed. 348 */ 349 if ((void __force *)addr >= phys_to_virt(ISA_START_ADDRESS) && 350 + (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) { 351 + WARN(1, "iounmap() called for ISA range not obtained using ioremap()\n"); 352 return; 353 + } 354 355 addr = (volatile void __iomem *) 356 (PAGE_MASK & (unsigned long __force)addr); ··· 399 unsigned long offset = phys & ~PAGE_MASK; 400 void *vaddr; 401 402 + /* memremap() maps if RAM, otherwise falls back to ioremap() */ 403 + vaddr = memremap(start, PAGE_SIZE, MEMREMAP_WB); 404 405 + /* Only add the offset on success and return NULL if memremap() failed */ 406 if (vaddr) 407 vaddr += offset; 408 ··· 413 414 void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr) 415 { 416 + memunmap((void *)((unsigned long)addr & PAGE_MASK)); 417 } 418 + 419 + /* 420 + * Examine the physical address to determine if it is an area of memory 421 + * that should be mapped decrypted. If the memory is not part of the 422 + * kernel usable area it was accessed and created decrypted, so these 423 + * areas should be mapped decrypted. And since the encryption key can 424 + * change across reboots, persistent memory should also be mapped 425 + * decrypted. 426 + */ 427 + static bool memremap_should_map_decrypted(resource_size_t phys_addr, 428 + unsigned long size) 429 + { 430 + int is_pmem; 431 + 432 + /* 433 + * Check if the address is part of a persistent memory region. 434 + * This check covers areas added by E820, EFI and ACPI. 435 + */ 436 + is_pmem = region_intersects(phys_addr, size, IORESOURCE_MEM, 437 + IORES_DESC_PERSISTENT_MEMORY); 438 + if (is_pmem != REGION_DISJOINT) 439 + return true; 440 + 441 + /* 442 + * Check if the non-volatile attribute is set for an EFI 443 + * reserved area. 444 + */ 445 + if (efi_enabled(EFI_BOOT)) { 446 + switch (efi_mem_type(phys_addr)) { 447 + case EFI_RESERVED_TYPE: 448 + if (efi_mem_attributes(phys_addr) & EFI_MEMORY_NV) 449 + return true; 450 + break; 451 + default: 452 + break; 453 + } 454 + } 455 + 456 + /* Check if the address is outside kernel usable area */ 457 + switch (e820__get_entry_type(phys_addr, phys_addr + size - 1)) { 458 + case E820_TYPE_RESERVED: 459 + case E820_TYPE_ACPI: 460 + case E820_TYPE_NVS: 461 + case E820_TYPE_UNUSABLE: 462 + case E820_TYPE_PRAM: 463 + return true; 464 + default: 465 + break; 466 + } 467 + 468 + return false; 469 + } 470 + 471 + /* 472 + * Examine the physical address to determine if it is EFI data. Check 473 + * it against the boot params structure and EFI tables and memory types. 
474 + */ 475 + static bool memremap_is_efi_data(resource_size_t phys_addr, 476 + unsigned long size) 477 + { 478 + u64 paddr; 479 + 480 + /* Check if the address is part of EFI boot/runtime data */ 481 + if (!efi_enabled(EFI_BOOT)) 482 + return false; 483 + 484 + paddr = boot_params.efi_info.efi_memmap_hi; 485 + paddr <<= 32; 486 + paddr |= boot_params.efi_info.efi_memmap; 487 + if (phys_addr == paddr) 488 + return true; 489 + 490 + paddr = boot_params.efi_info.efi_systab_hi; 491 + paddr <<= 32; 492 + paddr |= boot_params.efi_info.efi_systab; 493 + if (phys_addr == paddr) 494 + return true; 495 + 496 + if (efi_is_table_address(phys_addr)) 497 + return true; 498 + 499 + switch (efi_mem_type(phys_addr)) { 500 + case EFI_BOOT_SERVICES_DATA: 501 + case EFI_RUNTIME_SERVICES_DATA: 502 + return true; 503 + default: 504 + break; 505 + } 506 + 507 + return false; 508 + } 509 + 510 + /* 511 + * Examine the physical address to determine if it is boot data by checking 512 + * it against the boot params setup_data chain. 513 + */ 514 + static bool memremap_is_setup_data(resource_size_t phys_addr, 515 + unsigned long size) 516 + { 517 + struct setup_data *data; 518 + u64 paddr, paddr_next; 519 + 520 + paddr = boot_params.hdr.setup_data; 521 + while (paddr) { 522 + unsigned int len; 523 + 524 + if (phys_addr == paddr) 525 + return true; 526 + 527 + data = memremap(paddr, sizeof(*data), 528 + MEMREMAP_WB | MEMREMAP_DEC); 529 + 530 + paddr_next = data->next; 531 + len = data->len; 532 + 533 + memunmap(data); 534 + 535 + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) 536 + return true; 537 + 538 + paddr = paddr_next; 539 + } 540 + 541 + return false; 542 + } 543 + 544 + /* 545 + * Examine the physical address to determine if it is boot data by checking 546 + * it against the boot params setup_data chain (early boot version). 547 + */ 548 + static bool __init early_memremap_is_setup_data(resource_size_t phys_addr, 549 + unsigned long size) 550 + { 551 + struct setup_data *data; 552 + u64 paddr, paddr_next; 553 + 554 + paddr = boot_params.hdr.setup_data; 555 + while (paddr) { 556 + unsigned int len; 557 + 558 + if (phys_addr == paddr) 559 + return true; 560 + 561 + data = early_memremap_decrypted(paddr, sizeof(*data)); 562 + 563 + paddr_next = data->next; 564 + len = data->len; 565 + 566 + early_memunmap(data, sizeof(*data)); 567 + 568 + if ((phys_addr > paddr) && (phys_addr < (paddr + len))) 569 + return true; 570 + 571 + paddr = paddr_next; 572 + } 573 + 574 + return false; 575 + } 576 + 577 + /* 578 + * Architecture function to determine if RAM remap is allowed. By default, a 579 + * RAM remap will map the data as encrypted. Determine if a RAM remap should 580 + * not be done so that the data will be mapped decrypted. 581 + */ 582 + bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size, 583 + unsigned long flags) 584 + { 585 + if (!sme_active()) 586 + return true; 587 + 588 + if (flags & MEMREMAP_ENC) 589 + return true; 590 + 591 + if (flags & MEMREMAP_DEC) 592 + return false; 593 + 594 + if (memremap_is_setup_data(phys_addr, size) || 595 + memremap_is_efi_data(phys_addr, size) || 596 + memremap_should_map_decrypted(phys_addr, size)) 597 + return false; 598 + 599 + return true; 600 + } 601 + 602 + /* 603 + * Architecture override of __weak function to adjust the protection attributes 604 + * used when remapping memory. By default, early_memremap() will map the data 605 + * as encrypted. 
Determine if an encrypted mapping should not be done and set 606 + * the appropriate protection attributes. 607 + */ 608 + pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr, 609 + unsigned long size, 610 + pgprot_t prot) 611 + { 612 + if (!sme_active()) 613 + return prot; 614 + 615 + if (early_memremap_is_setup_data(phys_addr, size) || 616 + memremap_is_efi_data(phys_addr, size) || 617 + memremap_should_map_decrypted(phys_addr, size)) 618 + prot = pgprot_decrypted(prot); 619 + else 620 + prot = pgprot_encrypted(prot); 621 + 622 + return prot; 623 + } 624 + 625 + bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size) 626 + { 627 + return arch_memremap_can_ram_remap(phys_addr, size, 0); 628 + } 629 + 630 + #ifdef CONFIG_ARCH_USE_MEMREMAP_PROT 631 + /* Remap memory with encryption */ 632 + void __init *early_memremap_encrypted(resource_size_t phys_addr, 633 + unsigned long size) 634 + { 635 + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC); 636 + } 637 + 638 + /* 639 + * Remap memory with encryption and write-protected - cannot be called 640 + * before pat_init() is called 641 + */ 642 + void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, 643 + unsigned long size) 644 + { 645 + /* Be sure the write-protect PAT entry is set for write-protect */ 646 + if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) 647 + return NULL; 648 + 649 + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP); 650 + } 651 + 652 + /* Remap memory without encryption */ 653 + void __init *early_memremap_decrypted(resource_size_t phys_addr, 654 + unsigned long size) 655 + { 656 + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC); 657 + } 658 + 659 + /* 660 + * Remap memory without encryption and write-protected - cannot be called 661 + * before pat_init() is called 662 + */ 663 + void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, 664 + unsigned long size) 665 + { 666 + /* Be sure the write-protect PAT entry is set for write-protect */ 667 + if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) 668 + return NULL; 669 + 670 + return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP); 671 + } 672 + #endif /* CONFIG_ARCH_USE_MEMREMAP_PROT */ 673 674 static pte_t bm_pte[PAGE_SIZE/sizeof(pte_t)] __page_aligned_bss; 675
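With these hooks in place, memremap() maps normal RAM encrypted by default, and a caller that needs the firmware's unencrypted view of reserved data can force the decision with MEMREMAP_DEC (or MEMREMAP_ENC), which short-circuits arch_memremap_can_ram_remap(). A minimal kernel-context sketch of the decrypted path, mirroring how memremap_is_setup_data() itself walks the setup_data chain (paddr is a placeholder):

        /* Map a firmware-provided physical range without the C-bit, read a
         * couple of fields, and drop the mapping again. */
        struct setup_data *data;

        data = memremap(paddr, sizeof(*data), MEMREMAP_WB | MEMREMAP_DEC);
        if (!data)
                return -ENOMEM;

        pr_info("setup_data type %u, len %u\n", data->type, data->len);
        memunmap(data);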
+3 -3
arch/x86/mm/kasan_init_64.c
··· 11 #include <asm/e820/types.h> 12 #include <asm/tlbflush.h> 13 #include <asm/sections.h> 14 15 - extern pgd_t early_top_pgt[PTRS_PER_PGD]; 16 extern struct range pfn_mapped[E820_MAX_ENTRIES]; 17 18 static int __init map_range(struct range *range) ··· 87 void __init kasan_early_init(void) 88 { 89 int i; 90 - pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL; 91 pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE; 92 pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE; 93 p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE; ··· 153 */ 154 memset(kasan_zero_page, 0, PAGE_SIZE); 155 for (i = 0; i < PTRS_PER_PTE; i++) { 156 - pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO); 157 set_pte(&kasan_zero_pte[i], pte); 158 } 159 /* Flush TLBs again to be sure that write protection applied. */
··· 11 #include <asm/e820/types.h> 12 #include <asm/tlbflush.h> 13 #include <asm/sections.h> 14 + #include <asm/pgtable.h> 15 16 extern struct range pfn_mapped[E820_MAX_ENTRIES]; 17 18 static int __init map_range(struct range *range) ··· 87 void __init kasan_early_init(void) 88 { 89 int i; 90 + pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL | _PAGE_ENC; 91 pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE; 92 pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE; 93 p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE; ··· 153 */ 154 memset(kasan_zero_page, 0, PAGE_SIZE); 155 for (i = 0; i < PTRS_PER_PTE; i++) { 156 + pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO | _PAGE_ENC); 157 set_pte(&kasan_zero_pte[i], pte); 158 } 159 /* Flush TLBs again to be sure that write protection applied. */
+593
arch/x86/mm/mem_encrypt.c
···
··· 1 + /* 2 + * AMD Memory Encryption Support 3 + * 4 + * Copyright (C) 2016 Advanced Micro Devices, Inc. 5 + * 6 + * Author: Tom Lendacky <thomas.lendacky@amd.com> 7 + * 8 + * This program is free software; you can redistribute it and/or modify 9 + * it under the terms of the GNU General Public License version 2 as 10 + * published by the Free Software Foundation. 11 + */ 12 + 13 + #include <linux/linkage.h> 14 + #include <linux/init.h> 15 + #include <linux/mm.h> 16 + #include <linux/dma-mapping.h> 17 + #include <linux/swiotlb.h> 18 + #include <linux/mem_encrypt.h> 19 + 20 + #include <asm/tlbflush.h> 21 + #include <asm/fixmap.h> 22 + #include <asm/setup.h> 23 + #include <asm/bootparam.h> 24 + #include <asm/set_memory.h> 25 + #include <asm/cacheflush.h> 26 + #include <asm/sections.h> 27 + #include <asm/processor-flags.h> 28 + #include <asm/msr.h> 29 + #include <asm/cmdline.h> 30 + 31 + static char sme_cmdline_arg[] __initdata = "mem_encrypt"; 32 + static char sme_cmdline_on[] __initdata = "on"; 33 + static char sme_cmdline_off[] __initdata = "off"; 34 + 35 + /* 36 + * Since SME related variables are set early in the boot process they must 37 + * reside in the .data section so as not to be zeroed out when the .bss 38 + * section is later cleared. 39 + */ 40 + unsigned long sme_me_mask __section(.data) = 0; 41 + EXPORT_SYMBOL_GPL(sme_me_mask); 42 + 43 + /* Buffer used for early in-place encryption by BSP, no locking needed */ 44 + static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE); 45 + 46 + /* 47 + * This routine does not change the underlying encryption setting of the 48 + * page(s) that map this memory. It assumes that eventually the memory is 49 + * meant to be accessed as either encrypted or decrypted but the contents 50 + * are currently not in the desired state. 51 + * 52 + * This routine follows the steps outlined in the AMD64 Architecture 53 + * Programmer's Manual Volume 2, Section 7.10.8 Encrypt-in-Place. 54 + */ 55 + static void __init __sme_early_enc_dec(resource_size_t paddr, 56 + unsigned long size, bool enc) 57 + { 58 + void *src, *dst; 59 + size_t len; 60 + 61 + if (!sme_me_mask) 62 + return; 63 + 64 + local_flush_tlb(); 65 + wbinvd(); 66 + 67 + /* 68 + * There are limited number of early mapping slots, so map (at most) 69 + * one page at time. 70 + */ 71 + while (size) { 72 + len = min_t(size_t, sizeof(sme_early_buffer), size); 73 + 74 + /* 75 + * Create mappings for the current and desired format of 76 + * the memory. Use a write-protected mapping for the source. 77 + */ 78 + src = enc ? early_memremap_decrypted_wp(paddr, len) : 79 + early_memremap_encrypted_wp(paddr, len); 80 + 81 + dst = enc ? early_memremap_encrypted(paddr, len) : 82 + early_memremap_decrypted(paddr, len); 83 + 84 + /* 85 + * If a mapping can't be obtained to perform the operation, 86 + * then eventual access of that area in the desired mode 87 + * will cause a crash. 88 + */ 89 + BUG_ON(!src || !dst); 90 + 91 + /* 92 + * Use a temporary buffer, of cache-line multiple size, to 93 + * avoid data corruption as documented in the APM. 
94 + */ 95 + memcpy(sme_early_buffer, src, len); 96 + memcpy(dst, sme_early_buffer, len); 97 + 98 + early_memunmap(dst, len); 99 + early_memunmap(src, len); 100 + 101 + paddr += len; 102 + size -= len; 103 + } 104 + } 105 + 106 + void __init sme_early_encrypt(resource_size_t paddr, unsigned long size) 107 + { 108 + __sme_early_enc_dec(paddr, size, true); 109 + } 110 + 111 + void __init sme_early_decrypt(resource_size_t paddr, unsigned long size) 112 + { 113 + __sme_early_enc_dec(paddr, size, false); 114 + } 115 + 116 + static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size, 117 + bool map) 118 + { 119 + unsigned long paddr = (unsigned long)vaddr - __PAGE_OFFSET; 120 + pmdval_t pmd_flags, pmd; 121 + 122 + /* Use early_pmd_flags but remove the encryption mask */ 123 + pmd_flags = __sme_clr(early_pmd_flags); 124 + 125 + do { 126 + pmd = map ? (paddr & PMD_MASK) + pmd_flags : 0; 127 + __early_make_pgtable((unsigned long)vaddr, pmd); 128 + 129 + vaddr += PMD_SIZE; 130 + paddr += PMD_SIZE; 131 + size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE; 132 + } while (size); 133 + 134 + __native_flush_tlb(); 135 + } 136 + 137 + void __init sme_unmap_bootdata(char *real_mode_data) 138 + { 139 + struct boot_params *boot_data; 140 + unsigned long cmdline_paddr; 141 + 142 + if (!sme_active()) 143 + return; 144 + 145 + /* Get the command line address before unmapping the real_mode_data */ 146 + boot_data = (struct boot_params *)real_mode_data; 147 + cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); 148 + 149 + __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), false); 150 + 151 + if (!cmdline_paddr) 152 + return; 153 + 154 + __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, false); 155 + } 156 + 157 + void __init sme_map_bootdata(char *real_mode_data) 158 + { 159 + struct boot_params *boot_data; 160 + unsigned long cmdline_paddr; 161 + 162 + if (!sme_active()) 163 + return; 164 + 165 + __sme_early_map_unmap_mem(real_mode_data, sizeof(boot_params), true); 166 + 167 + /* Get the command line address after mapping the real_mode_data */ 168 + boot_data = (struct boot_params *)real_mode_data; 169 + cmdline_paddr = boot_data->hdr.cmd_line_ptr | ((u64)boot_data->ext_cmd_line_ptr << 32); 170 + 171 + if (!cmdline_paddr) 172 + return; 173 + 174 + __sme_early_map_unmap_mem(__va(cmdline_paddr), COMMAND_LINE_SIZE, true); 175 + } 176 + 177 + void __init sme_early_init(void) 178 + { 179 + unsigned int i; 180 + 181 + if (!sme_me_mask) 182 + return; 183 + 184 + early_pmd_flags = __sme_set(early_pmd_flags); 185 + 186 + __supported_pte_mask = __sme_set(__supported_pte_mask); 187 + 188 + /* Update the protection map with memory encryption mask */ 189 + for (i = 0; i < ARRAY_SIZE(protection_map); i++) 190 + protection_map[i] = pgprot_encrypted(protection_map[i]); 191 + } 192 + 193 + /* Architecture __weak replacement functions */ 194 + void __init mem_encrypt_init(void) 195 + { 196 + if (!sme_me_mask) 197 + return; 198 + 199 + /* Call into SWIOTLB to update the SWIOTLB DMA buffers */ 200 + swiotlb_update_mem_attributes(); 201 + 202 + pr_info("AMD Secure Memory Encryption (SME) active\n"); 203 + } 204 + 205 + void swiotlb_set_mem_attributes(void *vaddr, unsigned long size) 206 + { 207 + WARN(PAGE_ALIGN(size) != size, 208 + "size is not page-aligned (%#lx)\n", size); 209 + 210 + /* Make the SWIOTLB buffer area decrypted */ 211 + set_memory_decrypted((unsigned long)vaddr, size >> PAGE_SHIFT); 212 + } 213 + 214 + static void __init 
sme_clear_pgd(pgd_t *pgd_base, unsigned long start, 215 + unsigned long end) 216 + { 217 + unsigned long pgd_start, pgd_end, pgd_size; 218 + pgd_t *pgd_p; 219 + 220 + pgd_start = start & PGDIR_MASK; 221 + pgd_end = end & PGDIR_MASK; 222 + 223 + pgd_size = (((pgd_end - pgd_start) / PGDIR_SIZE) + 1); 224 + pgd_size *= sizeof(pgd_t); 225 + 226 + pgd_p = pgd_base + pgd_index(start); 227 + 228 + memset(pgd_p, 0, pgd_size); 229 + } 230 + 231 + #define PGD_FLAGS _KERNPG_TABLE_NOENC 232 + #define P4D_FLAGS _KERNPG_TABLE_NOENC 233 + #define PUD_FLAGS _KERNPG_TABLE_NOENC 234 + #define PMD_FLAGS (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) 235 + 236 + static void __init *sme_populate_pgd(pgd_t *pgd_base, void *pgtable_area, 237 + unsigned long vaddr, pmdval_t pmd_val) 238 + { 239 + pgd_t *pgd_p; 240 + p4d_t *p4d_p; 241 + pud_t *pud_p; 242 + pmd_t *pmd_p; 243 + 244 + pgd_p = pgd_base + pgd_index(vaddr); 245 + if (native_pgd_val(*pgd_p)) { 246 + if (IS_ENABLED(CONFIG_X86_5LEVEL)) 247 + p4d_p = (p4d_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); 248 + else 249 + pud_p = (pud_t *)(native_pgd_val(*pgd_p) & ~PTE_FLAGS_MASK); 250 + } else { 251 + pgd_t pgd; 252 + 253 + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 254 + p4d_p = pgtable_area; 255 + memset(p4d_p, 0, sizeof(*p4d_p) * PTRS_PER_P4D); 256 + pgtable_area += sizeof(*p4d_p) * PTRS_PER_P4D; 257 + 258 + pgd = native_make_pgd((pgdval_t)p4d_p + PGD_FLAGS); 259 + } else { 260 + pud_p = pgtable_area; 261 + memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); 262 + pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; 263 + 264 + pgd = native_make_pgd((pgdval_t)pud_p + PGD_FLAGS); 265 + } 266 + native_set_pgd(pgd_p, pgd); 267 + } 268 + 269 + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 270 + p4d_p += p4d_index(vaddr); 271 + if (native_p4d_val(*p4d_p)) { 272 + pud_p = (pud_t *)(native_p4d_val(*p4d_p) & ~PTE_FLAGS_MASK); 273 + } else { 274 + p4d_t p4d; 275 + 276 + pud_p = pgtable_area; 277 + memset(pud_p, 0, sizeof(*pud_p) * PTRS_PER_PUD); 278 + pgtable_area += sizeof(*pud_p) * PTRS_PER_PUD; 279 + 280 + p4d = native_make_p4d((pudval_t)pud_p + P4D_FLAGS); 281 + native_set_p4d(p4d_p, p4d); 282 + } 283 + } 284 + 285 + pud_p += pud_index(vaddr); 286 + if (native_pud_val(*pud_p)) { 287 + if (native_pud_val(*pud_p) & _PAGE_PSE) 288 + goto out; 289 + 290 + pmd_p = (pmd_t *)(native_pud_val(*pud_p) & ~PTE_FLAGS_MASK); 291 + } else { 292 + pud_t pud; 293 + 294 + pmd_p = pgtable_area; 295 + memset(pmd_p, 0, sizeof(*pmd_p) * PTRS_PER_PMD); 296 + pgtable_area += sizeof(*pmd_p) * PTRS_PER_PMD; 297 + 298 + pud = native_make_pud((pmdval_t)pmd_p + PUD_FLAGS); 299 + native_set_pud(pud_p, pud); 300 + } 301 + 302 + pmd_p += pmd_index(vaddr); 303 + if (!native_pmd_val(*pmd_p) || !(native_pmd_val(*pmd_p) & _PAGE_PSE)) 304 + native_set_pmd(pmd_p, native_make_pmd(pmd_val)); 305 + 306 + out: 307 + return pgtable_area; 308 + } 309 + 310 + static unsigned long __init sme_pgtable_calc(unsigned long len) 311 + { 312 + unsigned long p4d_size, pud_size, pmd_size; 313 + unsigned long total; 314 + 315 + /* 316 + * Perform a relatively simplistic calculation of the pagetable 317 + * entries that are needed. That mappings will be covered by 2MB 318 + * PMD entries so we can conservatively calculate the required 319 + * number of P4D, PUD and PMD structures needed to perform the 320 + * mappings. Incrementing the count for each covers the case where 321 + * the addresses cross entries. 
322 + */ 323 + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 324 + p4d_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; 325 + p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D; 326 + pud_size = (ALIGN(len, P4D_SIZE) / P4D_SIZE) + 1; 327 + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; 328 + } else { 329 + p4d_size = 0; 330 + pud_size = (ALIGN(len, PGDIR_SIZE) / PGDIR_SIZE) + 1; 331 + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; 332 + } 333 + pmd_size = (ALIGN(len, PUD_SIZE) / PUD_SIZE) + 1; 334 + pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; 335 + 336 + total = p4d_size + pud_size + pmd_size; 337 + 338 + /* 339 + * Now calculate the added pagetable structures needed to populate 340 + * the new pagetables. 341 + */ 342 + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 343 + p4d_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE; 344 + p4d_size *= sizeof(p4d_t) * PTRS_PER_P4D; 345 + pud_size = ALIGN(total, P4D_SIZE) / P4D_SIZE; 346 + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; 347 + } else { 348 + p4d_size = 0; 349 + pud_size = ALIGN(total, PGDIR_SIZE) / PGDIR_SIZE; 350 + pud_size *= sizeof(pud_t) * PTRS_PER_PUD; 351 + } 352 + pmd_size = ALIGN(total, PUD_SIZE) / PUD_SIZE; 353 + pmd_size *= sizeof(pmd_t) * PTRS_PER_PMD; 354 + 355 + total += p4d_size + pud_size + pmd_size; 356 + 357 + return total; 358 + } 359 + 360 + void __init sme_encrypt_kernel(void) 361 + { 362 + unsigned long workarea_start, workarea_end, workarea_len; 363 + unsigned long execute_start, execute_end, execute_len; 364 + unsigned long kernel_start, kernel_end, kernel_len; 365 + unsigned long pgtable_area_len; 366 + unsigned long paddr, pmd_flags; 367 + unsigned long decrypted_base; 368 + void *pgtable_area; 369 + pgd_t *pgd; 370 + 371 + if (!sme_active()) 372 + return; 373 + 374 + /* 375 + * Prepare for encrypting the kernel by building new pagetables with 376 + * the necessary attributes needed to encrypt the kernel in place. 377 + * 378 + * One range of virtual addresses will map the memory occupied 379 + * by the kernel as encrypted. 380 + * 381 + * Another range of virtual addresses will map the memory occupied 382 + * by the kernel as decrypted and write-protected. 383 + * 384 + * The use of write-protect attribute will prevent any of the 385 + * memory from being cached. 386 + */ 387 + 388 + /* Physical addresses gives us the identity mapped virtual addresses */ 389 + kernel_start = __pa_symbol(_text); 390 + kernel_end = ALIGN(__pa_symbol(_end), PMD_PAGE_SIZE); 391 + kernel_len = kernel_end - kernel_start; 392 + 393 + /* Set the encryption workarea to be immediately after the kernel */ 394 + workarea_start = kernel_end; 395 + 396 + /* 397 + * Calculate required number of workarea bytes needed: 398 + * executable encryption area size: 399 + * stack page (PAGE_SIZE) 400 + * encryption routine page (PAGE_SIZE) 401 + * intermediate copy buffer (PMD_PAGE_SIZE) 402 + * pagetable structures for the encryption of the kernel 403 + * pagetable structures for workarea (in case not currently mapped) 404 + */ 405 + execute_start = workarea_start; 406 + execute_end = execute_start + (PAGE_SIZE * 2) + PMD_PAGE_SIZE; 407 + execute_len = execute_end - execute_start; 408 + 409 + /* 410 + * One PGD for both encrypted and decrypted mappings and a set of 411 + * PUDs and PMDs for each of the encrypted and decrypted mappings. 
412 + */ 413 + pgtable_area_len = sizeof(pgd_t) * PTRS_PER_PGD; 414 + pgtable_area_len += sme_pgtable_calc(execute_end - kernel_start) * 2; 415 + 416 + /* PUDs and PMDs needed in the current pagetables for the workarea */ 417 + pgtable_area_len += sme_pgtable_calc(execute_len + pgtable_area_len); 418 + 419 + /* 420 + * The total workarea includes the executable encryption area and 421 + * the pagetable area. 422 + */ 423 + workarea_len = execute_len + pgtable_area_len; 424 + workarea_end = workarea_start + workarea_len; 425 + 426 + /* 427 + * Set the address to the start of where newly created pagetable 428 + * structures (PGDs, PUDs and PMDs) will be allocated. New pagetable 429 + * structures are created when the workarea is added to the current 430 + * pagetables and when the new encrypted and decrypted kernel 431 + * mappings are populated. 432 + */ 433 + pgtable_area = (void *)execute_end; 434 + 435 + /* 436 + * Make sure the current pagetable structure has entries for 437 + * addressing the workarea. 438 + */ 439 + pgd = (pgd_t *)native_read_cr3_pa(); 440 + paddr = workarea_start; 441 + while (paddr < workarea_end) { 442 + pgtable_area = sme_populate_pgd(pgd, pgtable_area, 443 + paddr, 444 + paddr + PMD_FLAGS); 445 + 446 + paddr += PMD_PAGE_SIZE; 447 + } 448 + 449 + /* Flush the TLB - no globals so cr3 is enough */ 450 + native_write_cr3(__native_read_cr3()); 451 + 452 + /* 453 + * A new pagetable structure is being built to allow for the kernel 454 + * to be encrypted. It starts with an empty PGD that will then be 455 + * populated with new PUDs and PMDs as the encrypted and decrypted 456 + * kernel mappings are created. 457 + */ 458 + pgd = pgtable_area; 459 + memset(pgd, 0, sizeof(*pgd) * PTRS_PER_PGD); 460 + pgtable_area += sizeof(*pgd) * PTRS_PER_PGD; 461 + 462 + /* Add encrypted kernel (identity) mappings */ 463 + pmd_flags = PMD_FLAGS | _PAGE_ENC; 464 + paddr = kernel_start; 465 + while (paddr < kernel_end) { 466 + pgtable_area = sme_populate_pgd(pgd, pgtable_area, 467 + paddr, 468 + paddr + pmd_flags); 469 + 470 + paddr += PMD_PAGE_SIZE; 471 + } 472 + 473 + /* 474 + * A different PGD index/entry must be used to get different 475 + * pagetable entries for the decrypted mapping. Choose the next 476 + * PGD index and convert it to a virtual address to be used as 477 + * the base of the mapping. 478 + */ 479 + decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1); 480 + decrypted_base <<= PGDIR_SHIFT; 481 + 482 + /* Add decrypted, write-protected kernel (non-identity) mappings */ 483 + pmd_flags = (PMD_FLAGS & ~_PAGE_CACHE_MASK) | (_PAGE_PAT | _PAGE_PWT); 484 + paddr = kernel_start; 485 + while (paddr < kernel_end) { 486 + pgtable_area = sme_populate_pgd(pgd, pgtable_area, 487 + paddr + decrypted_base, 488 + paddr + pmd_flags); 489 + 490 + paddr += PMD_PAGE_SIZE; 491 + } 492 + 493 + /* Add decrypted workarea mappings to both kernel mappings */ 494 + paddr = workarea_start; 495 + while (paddr < workarea_end) { 496 + pgtable_area = sme_populate_pgd(pgd, pgtable_area, 497 + paddr, 498 + paddr + PMD_FLAGS); 499 + 500 + pgtable_area = sme_populate_pgd(pgd, pgtable_area, 501 + paddr + decrypted_base, 502 + paddr + PMD_FLAGS); 503 + 504 + paddr += PMD_PAGE_SIZE; 505 + } 506 + 507 + /* Perform the encryption */ 508 + sme_encrypt_execute(kernel_start, kernel_start + decrypted_base, 509 + kernel_len, workarea_start, (unsigned long)pgd); 510 + 511 + /* 512 + * At this point we are running encrypted. 
Remove the mappings for 513 + * the decrypted areas - all that is needed for this is to remove 514 + * the PGD entry/entries. 515 + */ 516 + sme_clear_pgd(pgd, kernel_start + decrypted_base, 517 + kernel_end + decrypted_base); 518 + 519 + sme_clear_pgd(pgd, workarea_start + decrypted_base, 520 + workarea_end + decrypted_base); 521 + 522 + /* Flush the TLB - no globals so cr3 is enough */ 523 + native_write_cr3(__native_read_cr3()); 524 + } 525 + 526 + void __init __nostackprotector sme_enable(struct boot_params *bp) 527 + { 528 + const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off; 529 + unsigned int eax, ebx, ecx, edx; 530 + bool active_by_default; 531 + unsigned long me_mask; 532 + char buffer[16]; 533 + u64 msr; 534 + 535 + /* Check for the SME support leaf */ 536 + eax = 0x80000000; 537 + ecx = 0; 538 + native_cpuid(&eax, &ebx, &ecx, &edx); 539 + if (eax < 0x8000001f) 540 + return; 541 + 542 + /* 543 + * Check for the SME feature: 544 + * CPUID Fn8000_001F[EAX] - Bit 0 545 + * Secure Memory Encryption support 546 + * CPUID Fn8000_001F[EBX] - Bits 5:0 547 + * Pagetable bit position used to indicate encryption 548 + */ 549 + eax = 0x8000001f; 550 + ecx = 0; 551 + native_cpuid(&eax, &ebx, &ecx, &edx); 552 + if (!(eax & 1)) 553 + return; 554 + 555 + me_mask = 1UL << (ebx & 0x3f); 556 + 557 + /* Check if SME is enabled */ 558 + msr = __rdmsr(MSR_K8_SYSCFG); 559 + if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) 560 + return; 561 + 562 + /* 563 + * Fixups have not been applied to phys_base yet and we're running 564 + * identity mapped, so we must obtain the address to the SME command 565 + * line argument data using rip-relative addressing. 566 + */ 567 + asm ("lea sme_cmdline_arg(%%rip), %0" 568 + : "=r" (cmdline_arg) 569 + : "p" (sme_cmdline_arg)); 570 + asm ("lea sme_cmdline_on(%%rip), %0" 571 + : "=r" (cmdline_on) 572 + : "p" (sme_cmdline_on)); 573 + asm ("lea sme_cmdline_off(%%rip), %0" 574 + : "=r" (cmdline_off) 575 + : "p" (sme_cmdline_off)); 576 + 577 + if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT)) 578 + active_by_default = true; 579 + else 580 + active_by_default = false; 581 + 582 + cmdline_ptr = (const char *)((u64)bp->hdr.cmd_line_ptr | 583 + ((u64)bp->ext_cmd_line_ptr << 32)); 584 + 585 + cmdline_find_option(cmdline_ptr, cmdline_arg, buffer, sizeof(buffer)); 586 + 587 + if (!strncmp(buffer, cmdline_on, sizeof(buffer))) 588 + sme_me_mask = me_mask; 589 + else if (!strncmp(buffer, cmdline_off, sizeof(buffer))) 590 + sme_me_mask = 0; 591 + else 592 + sme_me_mask = active_by_default ? me_mask : 0; 593 + }
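The decrypted alias of the kernel is placed simply by taking the PGD slot just past the workarea and turning that index back into a virtual offset. A minimal, self-contained sketch of that index arithmetic, assuming the 4-level constants PGDIR_SHIFT == 39 and PTRS_PER_PGD == 512 and an illustrative workarea_end value (none of this is taken from the kernel headers; the 5-level values would differ):

#include <stdio.h>

/* Assumed 4-level paging constants, not taken from the kernel headers. */
#define PGDIR_SHIFT	39
#define PTRS_PER_PGD	512ULL

static unsigned long long pgd_index(unsigned long long addr)
{
	return (addr >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
}

int main(void)
{
	/* Hypothetical end of the encryption workarea (identity mapped). */
	unsigned long long workarea_end = 0x100000000ULL;	/* 4 GB */
	unsigned long long decrypted_base;

	/* Same arithmetic as sme_encrypt_kernel(): next PGD slot, as a VA. */
	decrypted_base = (pgd_index(workarea_end) + 1) & (PTRS_PER_PGD - 1);
	decrypted_base <<= PGDIR_SHIFT;

	printf("identity mapping uses PGD index %llu\n", pgd_index(workarea_end));
	printf("decrypted alias starts at 0x%llx (PGD index %llu)\n",
	       decrypted_base, pgd_index(decrypted_base));
	return 0;
}

Because the alias lives entirely in its own PGD slot, tearing it down after the encryption pass only requires clearing that PGD entry, which is what the sme_clear_pgd() calls above do.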
+149
arch/x86/mm/mem_encrypt_boot.S
···
··· 1 + /* 2 + * AMD Memory Encryption Support 3 + * 4 + * Copyright (C) 2016 Advanced Micro Devices, Inc. 5 + * 6 + * Author: Tom Lendacky <thomas.lendacky@amd.com> 7 + * 8 + * This program is free software; you can redistribute it and/or modify 9 + * it under the terms of the GNU General Public License version 2 as 10 + * published by the Free Software Foundation. 11 + */ 12 + 13 + #include <linux/linkage.h> 14 + #include <asm/pgtable.h> 15 + #include <asm/page.h> 16 + #include <asm/processor-flags.h> 17 + #include <asm/msr-index.h> 18 + 19 + .text 20 + .code64 21 + ENTRY(sme_encrypt_execute) 22 + 23 + /* 24 + * Entry parameters: 25 + * RDI - virtual address for the encrypted kernel mapping 26 + * RSI - virtual address for the decrypted kernel mapping 27 + * RDX - length of kernel 28 + * RCX - virtual address of the encryption workarea, including: 29 + * - stack page (PAGE_SIZE) 30 + * - encryption routine page (PAGE_SIZE) 31 + * - intermediate copy buffer (PMD_PAGE_SIZE) 32 + * R8 - physical address of the pagetables to use for encryption 33 + */ 34 + 35 + push %rbp 36 + movq %rsp, %rbp /* RBP now has original stack pointer */ 37 + 38 + /* Set up a one page stack in the non-encrypted memory area */ 39 + movq %rcx, %rax /* Workarea stack page */ 40 + leaq PAGE_SIZE(%rax), %rsp /* Set new stack pointer */ 41 + addq $PAGE_SIZE, %rax /* Workarea encryption routine */ 42 + 43 + push %r12 44 + movq %rdi, %r10 /* Encrypted kernel */ 45 + movq %rsi, %r11 /* Decrypted kernel */ 46 + movq %rdx, %r12 /* Kernel length */ 47 + 48 + /* Copy encryption routine into the workarea */ 49 + movq %rax, %rdi /* Workarea encryption routine */ 50 + leaq __enc_copy(%rip), %rsi /* Encryption routine */ 51 + movq $(.L__enc_copy_end - __enc_copy), %rcx /* Encryption routine length */ 52 + rep movsb 53 + 54 + /* Setup registers for call */ 55 + movq %r10, %rdi /* Encrypted kernel */ 56 + movq %r11, %rsi /* Decrypted kernel */ 57 + movq %r8, %rdx /* Pagetables used for encryption */ 58 + movq %r12, %rcx /* Kernel length */ 59 + movq %rax, %r8 /* Workarea encryption routine */ 60 + addq $PAGE_SIZE, %r8 /* Workarea intermediate copy buffer */ 61 + 62 + call *%rax /* Call the encryption routine */ 63 + 64 + pop %r12 65 + 66 + movq %rbp, %rsp /* Restore original stack pointer */ 67 + pop %rbp 68 + 69 + ret 70 + ENDPROC(sme_encrypt_execute) 71 + 72 + ENTRY(__enc_copy) 73 + /* 74 + * Routine used to encrypt the kernel. 75 + * This routine must be run outside of the kernel proper since 76 + * the kernel will be encrypted during the process. So this 77 + * routine is defined here and then copied to an area outside 78 + * of the kernel where it will remain and run decrypted 79 + * during execution. 80 + * 81 + * On entry the registers must be: 82 + * RDI - virtual address for the encrypted kernel mapping 83 + * RSI - virtual address for the decrypted kernel mapping 84 + * RDX - address of the pagetables to use for encryption 85 + * RCX - length of kernel 86 + * R8 - intermediate copy buffer 87 + * 88 + * RAX - points to this routine 89 + * 90 + * The kernel will be encrypted by copying from the non-encrypted 91 + * kernel space to an intermediate buffer and then copying from the 92 + * intermediate buffer back to the encrypted kernel space. The physical 93 + * addresses of the two kernel space mappings are the same which 94 + * results in the kernel being encrypted "in place".
95 + */ 96 + /* Enable the new page tables */ 97 + mov %rdx, %cr3 98 + 99 + /* Flush any global TLBs */ 100 + mov %cr4, %rdx 101 + andq $~X86_CR4_PGE, %rdx 102 + mov %rdx, %cr4 103 + orq $X86_CR4_PGE, %rdx 104 + mov %rdx, %cr4 105 + 106 + /* Set the PAT register PA5 entry to write-protect */ 107 + push %rcx 108 + movl $MSR_IA32_CR_PAT, %ecx 109 + rdmsr 110 + push %rdx /* Save original PAT value */ 111 + andl $0xffff00ff, %edx /* Clear PA5 */ 112 + orl $0x00000500, %edx /* Set PA5 to WP */ 113 + wrmsr 114 + pop %rdx /* RDX contains original PAT value */ 115 + pop %rcx 116 + 117 + movq %rcx, %r9 /* Save kernel length */ 118 + movq %rdi, %r10 /* Save encrypted kernel address */ 119 + movq %rsi, %r11 /* Save decrypted kernel address */ 120 + 121 + wbinvd /* Invalidate any cache entries */ 122 + 123 + /* Copy/encrypt 2MB at a time */ 124 + 1: 125 + movq %r11, %rsi /* Source - decrypted kernel */ 126 + movq %r8, %rdi /* Dest - intermediate copy buffer */ 127 + movq $PMD_PAGE_SIZE, %rcx /* 2MB length */ 128 + rep movsb 129 + 130 + movq %r8, %rsi /* Source - intermediate copy buffer */ 131 + movq %r10, %rdi /* Dest - encrypted kernel */ 132 + movq $PMD_PAGE_SIZE, %rcx /* 2MB length */ 133 + rep movsb 134 + 135 + addq $PMD_PAGE_SIZE, %r11 136 + addq $PMD_PAGE_SIZE, %r10 137 + subq $PMD_PAGE_SIZE, %r9 /* Kernel length decrement */ 138 + jnz 1b /* Kernel length not zero? */ 139 + 140 + /* Restore PAT register */ 141 + push %rdx /* Save original PAT value */ 142 + movl $MSR_IA32_CR_PAT, %ecx 143 + rdmsr 144 + pop %rdx /* Restore original PAT value */ 145 + wrmsr 146 + 147 + ret 148 + .L__enc_copy_end: 149 + ENDPROC(__enc_copy)
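The assembly above is easier to follow as C: __enc_copy bounces each 2MB chunk through the intermediate buffer, reading through the decrypted window and writing back through the encrypted window of the same physical pages, so the stores leave the memory controller encrypted. A rough user-space rendering of just that loop, assuming the caller sets up the three pointers exactly as the assembly expects and that len is a multiple of the 2MB chunk size:

#include <stddef.h>
#include <string.h>

#define SME_CHUNK	(2UL << 20)	/* stands in for PMD_PAGE_SIZE */

/*
 * Illustrative only: 'enc' and 'dec' would be the encrypted and decrypted
 * virtual mappings of the same physical kernel image, and 'buf' the
 * intermediate copy buffer inside the decrypted workarea.
 */
static void enc_copy_sketch(void *enc, const void *dec, void *buf, size_t len)
{
	while (len) {
		/* Read the current chunk through the decrypted window. */
		memcpy(buf, dec, SME_CHUNK);

		/*
		 * Write it back through the encrypted window; in the real
		 * routine the same physical pages now hold ciphertext.
		 */
		memcpy(enc, buf, SME_CHUNK);

		enc = (char *)enc + SME_CHUNK;
		dec = (const char *)dec + SME_CHUNK;
		len -= SME_CHUNK;
	}
}

The rdmsr/wrmsr pair around the real loop temporarily programs PAT entry 5 to write-protect, which is the memory type the decrypted source mapping selects.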
+6 -6
arch/x86/mm/mmap.c
··· 37 .flags = -1, 38 }; 39 40 - unsigned long tasksize_32bit(void) 41 { 42 return IA32_PAGE_OFFSET; 43 } 44 45 - unsigned long tasksize_64bit(void) 46 { 47 - return TASK_SIZE_MAX; 48 } 49 50 static unsigned long stack_maxrandom_size(unsigned long task_size) 51 { 52 unsigned long max = 0; 53 if (current->flags & PF_RANDOMIZE) { 54 - max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit()); 55 max <<= PAGE_SHIFT; 56 } 57 ··· 141 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 142 143 arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, 144 - arch_rnd(mmap64_rnd_bits), tasksize_64bit()); 145 146 #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES 147 /* ··· 151 * mmap_base, the compat syscall uses mmap_compat_base. 152 */ 153 arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base, 154 - arch_rnd(mmap32_rnd_bits), tasksize_32bit()); 155 #endif 156 } 157
··· 37 .flags = -1, 38 }; 39 40 + unsigned long task_size_32bit(void) 41 { 42 return IA32_PAGE_OFFSET; 43 } 44 45 + unsigned long task_size_64bit(int full_addr_space) 46 { 47 + return full_addr_space ? TASK_SIZE_MAX : DEFAULT_MAP_WINDOW; 48 } 49 50 static unsigned long stack_maxrandom_size(unsigned long task_size) 51 { 52 unsigned long max = 0; 53 if (current->flags & PF_RANDOMIZE) { 54 + max = (-1UL) & __STACK_RND_MASK(task_size == task_size_32bit()); 55 max <<= PAGE_SHIFT; 56 } 57 ··· 141 mm->get_unmapped_area = arch_get_unmapped_area_topdown; 142 143 arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, 144 + arch_rnd(mmap64_rnd_bits), task_size_64bit(0)); 145 146 #ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES 147 /* ··· 151 * mmap_base, the compat syscall uses mmap_compat_base. 152 */ 153 arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base, 154 + arch_rnd(mmap32_rnd_bits), task_size_32bit()); 155 #endif 156 } 157
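task_size_64bit() now takes a flag selecting between the legacy 47-bit DEFAULT_MAP_WINDOW and the full TASK_SIZE_MAX. A hypothetical caller, not part of this hunk, sketching the opt-in convention the larger-address-space work relies on: only hand out the full range when the hint address is already above the 47-bit window.

/*
 * Hypothetical helper, assuming kernel context. Passing 0 keeps
 * allocations below DEFAULT_MAP_WINDOW, so applications that stash tags
 * in the upper pointer bits keep working; passing 1 exposes everything
 * up to TASK_SIZE_MAX.
 */
static unsigned long pick_mmap_limit(unsigned long hint_addr)
{
	return task_size_64bit(hint_addr > DEFAULT_MAP_WINDOW);
}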
+32 -1
arch/x86/mm/mpx.c
··· 355 */ 356 bd_base = mpx_get_bounds_dir(); 357 down_write(&mm->mmap_sem); 358 mm->context.bd_addr = bd_base; 359 if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR) 360 ret = -ENXIO; 361 - 362 up_write(&mm->mmap_sem); 363 return ret; 364 } ··· 1038 ret = mpx_unmap_tables(mm, start, end); 1039 if (ret) 1040 force_sig(SIGSEGV, current); 1041 }
··· 355 */ 356 bd_base = mpx_get_bounds_dir(); 357 down_write(&mm->mmap_sem); 358 + 359 + /* MPX doesn't support addresses above 47 bits yet. */ 360 + if (find_vma(mm, DEFAULT_MAP_WINDOW)) { 361 + pr_warn_once("%s (%d): MPX cannot handle addresses " 362 + "above 47-bits. Disabling.", 363 + current->comm, current->pid); 364 + ret = -ENXIO; 365 + goto out; 366 + } 367 mm->context.bd_addr = bd_base; 368 if (mm->context.bd_addr == MPX_INVALID_BOUNDS_DIR) 369 ret = -ENXIO; 370 + out: 371 up_write(&mm->mmap_sem); 372 return ret; 373 } ··· 1029 ret = mpx_unmap_tables(mm, start, end); 1030 if (ret) 1031 force_sig(SIGSEGV, current); 1032 + } 1033 + 1034 + /* MPX cannot handle addresses above 47 bits yet. */ 1035 + unsigned long mpx_unmapped_area_check(unsigned long addr, unsigned long len, 1036 + unsigned long flags) 1037 + { 1038 + if (!kernel_managing_mpx_tables(current->mm)) 1039 + return addr; 1040 + if (addr + len <= DEFAULT_MAP_WINDOW) 1041 + return addr; 1042 + if (flags & MAP_FIXED) 1043 + return -ENOMEM; 1044 + 1045 + /* 1046 + * Requested len is larger than the whole area we're allowed to map in. 1047 + * Resetting hinting address wouldn't do much good -- fail early. 1048 + */ 1049 + if (len > DEFAULT_MAP_WINDOW) 1050 + return -ENOMEM; 1051 + 1052 + /* Look for unmap area within DEFAULT_MAP_WINDOW */ 1053 + return 0; 1054 }
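mpx_unmapped_area_check() has three outcomes: the untouched hint (nothing to restrict), 0 (redo the search, capped at DEFAULT_MAP_WINDOW), or -ENOMEM. A hypothetical caller in an arch_get_unmapped_area()-style path, assuming kernel context, only to show how those outcomes would be consumed:

/* Hypothetical wrapper; IS_ERR_VALUE() comes from <linux/err.h>. */
static unsigned long mpx_checked_hint(unsigned long addr, unsigned long len,
				      unsigned long flags)
{
	addr = mpx_unmapped_area_check(addr, len, flags);

	/* -ENOMEM: a MAP_FIXED request (or the length itself) is above 47 bits. */
	if (IS_ERR_VALUE(addr))
		return addr;

	/* 0: drop the hint and search below DEFAULT_MAP_WINDOW instead. */
	/* Otherwise: the hint is compatible with MPX, use it unchanged. */
	return addr;
}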
+67
arch/x86/mm/pageattr.c
··· 1775 __pgprot(0), 1, 0, NULL); 1776 } 1777 1778 int set_pages_uc(struct page *page, int numpages) 1779 { 1780 unsigned long addr = (unsigned long)page_address(page); ··· 2083 2084 if (!(page_flags & _PAGE_RW)) 2085 cpa.mask_clr = __pgprot(_PAGE_RW); 2086 2087 cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); 2088
··· 1775 __pgprot(0), 1, 0, NULL); 1776 } 1777 1778 + static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) 1779 + { 1780 + struct cpa_data cpa; 1781 + unsigned long start; 1782 + int ret; 1783 + 1784 + /* Nothing to do if the SME is not active */ 1785 + if (!sme_active()) 1786 + return 0; 1787 + 1788 + /* Should not be working on unaligned addresses */ 1789 + if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr)) 1790 + addr &= PAGE_MASK; 1791 + 1792 + start = addr; 1793 + 1794 + memset(&cpa, 0, sizeof(cpa)); 1795 + cpa.vaddr = &addr; 1796 + cpa.numpages = numpages; 1797 + cpa.mask_set = enc ? __pgprot(_PAGE_ENC) : __pgprot(0); 1798 + cpa.mask_clr = enc ? __pgprot(0) : __pgprot(_PAGE_ENC); 1799 + cpa.pgd = init_mm.pgd; 1800 + 1801 + /* Must avoid aliasing mappings in the highmem code */ 1802 + kmap_flush_unused(); 1803 + vm_unmap_aliases(); 1804 + 1805 + /* 1806 + * Before changing the encryption attribute, we need to flush caches. 1807 + */ 1808 + if (static_cpu_has(X86_FEATURE_CLFLUSH)) 1809 + cpa_flush_range(start, numpages, 1); 1810 + else 1811 + cpa_flush_all(1); 1812 + 1813 + ret = __change_page_attr_set_clr(&cpa, 1); 1814 + 1815 + /* 1816 + * After changing the encryption attribute, we need to flush TLBs 1817 + * again in case any speculative TLB caching occurred (but no need 1818 + * to flush caches again). We could just use cpa_flush_all(), but 1819 + * in case TLB flushing gets optimized in the cpa_flush_range() 1820 + * path use the same logic as above. 1821 + */ 1822 + if (static_cpu_has(X86_FEATURE_CLFLUSH)) 1823 + cpa_flush_range(start, numpages, 0); 1824 + else 1825 + cpa_flush_all(0); 1826 + 1827 + return ret; 1828 + } 1829 + 1830 + int set_memory_encrypted(unsigned long addr, int numpages) 1831 + { 1832 + return __set_memory_enc_dec(addr, numpages, true); 1833 + } 1834 + EXPORT_SYMBOL_GPL(set_memory_encrypted); 1835 + 1836 + int set_memory_decrypted(unsigned long addr, int numpages) 1837 + { 1838 + return __set_memory_enc_dec(addr, numpages, false); 1839 + } 1840 + EXPORT_SYMBOL_GPL(set_memory_decrypted); 1841 + 1842 int set_pages_uc(struct page *page, int numpages) 1843 { 1844 unsigned long addr = (unsigned long)page_address(page); ··· 2019 2020 if (!(page_flags & _PAGE_RW)) 2021 cpa.mask_clr = __pgprot(_PAGE_RW); 2022 + 2023 + if (!(page_flags & _PAGE_ENC)) 2024 + cpa.mask_clr = pgprot_encrypted(cpa.mask_clr); 2025 2026 cpa.mask_set = __pgprot(_PAGE_PRESENT | page_flags); 2027
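set_memory_encrypted()/set_memory_decrypted() flip the _PAGE_ENC attribute on existing kernel mappings, with the cache and TLB flushing shown above. A hypothetical driver-style usage sketch, assuming kernel context, for a page that a device must be able to read in the clear:

#include <linux/gfp.h>
#include <linux/mm.h>
#include <asm/set_memory.h>

/* Hypothetical helper: carve out one page shared with a device. */
static void *alloc_shared_page(void)
{
	unsigned long va = __get_free_page(GFP_KERNEL | __GFP_ZERO);

	if (!va)
		return NULL;

	/* Clear the C-bit in the kernel mapping of this page. */
	if (set_memory_decrypted(va, 1)) {
		free_page(va);
		return NULL;
	}
	return (void *)va;
}

static void free_shared_page(void *page)
{
	unsigned long va = (unsigned long)page;

	/* Restore encryption before handing the page back to the allocator. */
	set_memory_encrypted(va, 1);
	free_page(va);
}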
+6 -3
arch/x86/mm/pat.c
··· 293 * pat_init - Initialize PAT MSR and PAT table 294 * 295 * This function initializes PAT MSR and PAT table with an OS-defined value 296 - * to enable additional cache attributes, WC and WT. 297 * 298 * This function must be called on all CPUs using the specific sequence of 299 * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this ··· 352 * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS 353 * 011 3 UC : _PAGE_CACHE_MODE_UC 354 * 100 4 WB : Reserved 355 - * 101 5 WC : Reserved 356 * 110 6 UC-: Reserved 357 * 111 7 WT : _PAGE_CACHE_MODE_WT 358 * ··· 360 * corresponding types in the presence of PAT errata. 361 */ 362 pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | 363 - PAT(4, WB) | PAT(5, WC) | PAT(6, UC_MINUS) | PAT(7, WT); 364 } 365 366 if (!boot_cpu_done) { ··· 744 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, 745 unsigned long size, pgprot_t vma_prot) 746 { 747 return vma_prot; 748 } 749
··· 293 * pat_init - Initialize PAT MSR and PAT table 294 * 295 * This function initializes PAT MSR and PAT table with an OS-defined value 296 + * to enable additional cache attributes, WC, WT and WP. 297 * 298 * This function must be called on all CPUs using the specific sequence of 299 * operations defined in Intel SDM. mtrr_rendezvous_handler() provides this ··· 352 * 010 2 UC-: _PAGE_CACHE_MODE_UC_MINUS 353 * 011 3 UC : _PAGE_CACHE_MODE_UC 354 * 100 4 WB : Reserved 355 + * 101 5 WP : _PAGE_CACHE_MODE_WP 356 * 110 6 UC-: Reserved 357 * 111 7 WT : _PAGE_CACHE_MODE_WT 358 * ··· 360 * corresponding types in the presence of PAT errata. 361 */ 362 pat = PAT(0, WB) | PAT(1, WC) | PAT(2, UC_MINUS) | PAT(3, UC) | 363 + PAT(4, WB) | PAT(5, WP) | PAT(6, UC_MINUS) | PAT(7, WT); 364 } 365 366 if (!boot_cpu_done) { ··· 744 pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, 745 unsigned long size, pgprot_t vma_prot) 746 { 747 + if (!phys_mem_access_encrypted(pfn << PAGE_SHIFT, size)) 748 + vma_prot = pgprot_decrypted(vma_prot); 749 + 750 return vma_prot; 751 } 752
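The reason entry 5 specifically has to become WP is visible back in sme_encrypt_kernel(): the decrypted, write-protected alias sets _PAGE_PAT | _PAGE_PWT with PCD clear, and the PAT index is simply the PAT:PCD:PWT bits read as a 3-bit number. A standalone sketch of that arithmetic, using the 4K-PTE bit positions (large pages place the PAT bit elsewhere but form the same index):

#include <stdio.h>

/* x86 page-table cache-control bits, 4K-PTE positions. */
#define _PAGE_PWT	(1UL << 3)
#define _PAGE_PCD	(1UL << 4)
#define _PAGE_PAT	(1UL << 7)

/* PAT index is PAT:PCD:PWT, i.e. bit2:bit1:bit0. */
static unsigned int pat_index(unsigned long prot)
{
	return (!!(prot & _PAGE_PAT) << 2) |
	       (!!(prot & _PAGE_PCD) << 1) |
	        !!(prot & _PAGE_PWT);
}

int main(void)
{
	/* The combination the SME write-protected alias uses. */
	unsigned long wp_flags = _PAGE_PAT | _PAGE_PWT;

	printf("PAT index %u -> WP after this series\n", pat_index(wp_flags));
	return 0;
}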
+4 -4
arch/x86/mm/pgtable.c
··· 56 { 57 pgtable_page_dtor(pte); 58 paravirt_release_pte(page_to_pfn(pte)); 59 - tlb_remove_page(tlb, pte); 60 } 61 62 #if CONFIG_PGTABLE_LEVELS > 2 ··· 72 tlb->need_flush_all = 1; 73 #endif 74 pgtable_pmd_page_dtor(page); 75 - tlb_remove_page(tlb, page); 76 } 77 78 #if CONFIG_PGTABLE_LEVELS > 3 79 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) 80 { 81 paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); 82 - tlb_remove_page(tlb, virt_to_page(pud)); 83 } 84 85 #if CONFIG_PGTABLE_LEVELS > 4 86 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) 87 { 88 paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); 89 - tlb_remove_page(tlb, virt_to_page(p4d)); 90 } 91 #endif /* CONFIG_PGTABLE_LEVELS > 4 */ 92 #endif /* CONFIG_PGTABLE_LEVELS > 3 */
··· 56 { 57 pgtable_page_dtor(pte); 58 paravirt_release_pte(page_to_pfn(pte)); 59 + tlb_remove_table(tlb, pte); 60 } 61 62 #if CONFIG_PGTABLE_LEVELS > 2 ··· 72 tlb->need_flush_all = 1; 73 #endif 74 pgtable_pmd_page_dtor(page); 75 + tlb_remove_table(tlb, page); 76 } 77 78 #if CONFIG_PGTABLE_LEVELS > 3 79 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) 80 { 81 paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); 82 + tlb_remove_table(tlb, virt_to_page(pud)); 83 } 84 85 #if CONFIG_PGTABLE_LEVELS > 4 86 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) 87 { 88 paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); 89 + tlb_remove_table(tlb, virt_to_page(p4d)); 90 } 91 #endif /* CONFIG_PGTABLE_LEVELS > 4 */ 92 #endif /* CONFIG_PGTABLE_LEVELS > 3 */
+251 -88
arch/x86/mm/tlb.c
··· 28 * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi 29 */ 30 31 void leave_mm(int cpu) 32 { 33 struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); ··· 79 if (loaded_mm == &init_mm) 80 return; 81 82 - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) 83 - BUG(); 84 85 switch_mm(NULL, &init_mm, NULL); 86 } 87 - EXPORT_SYMBOL_GPL(leave_mm); 88 89 void switch_mm(struct mm_struct *prev, struct mm_struct *next, 90 struct task_struct *tsk) ··· 98 void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, 99 struct task_struct *tsk) 100 { 101 - unsigned cpu = smp_processor_id(); 102 struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); 103 104 /* 105 - * NB: The scheduler will call us with prev == next when 106 - * switching from lazy TLB mode to normal mode if active_mm 107 - * isn't changing. When this happens, there is no guarantee 108 - * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next. 109 * 110 * NB: leave_mm() calls us with prev == NULL and tsk == NULL. 111 */ 112 113 - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); 114 115 if (real_prev == next) { 116 /* 117 - * There's nothing to do: we always keep the per-mm control 118 - * regs in sync with cpu_tlbstate.loaded_mm. Just 119 - * sanity-check mm_cpumask. 120 */ 121 - if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next)))) 122 - cpumask_set_cpu(cpu, mm_cpumask(next)); 123 - return; 124 } 125 126 - if (IS_ENABLED(CONFIG_VMAP_STACK)) { 127 - /* 128 - * If our current stack is in vmalloc space and isn't 129 - * mapped in the new pgd, we'll double-fault. Forcibly 130 - * map it. 131 - */ 132 - unsigned int stack_pgd_index = pgd_index(current_stack_pointer()); 133 - 134 - pgd_t *pgd = next->pgd + stack_pgd_index; 135 - 136 - if (unlikely(pgd_none(*pgd))) 137 - set_pgd(pgd, init_mm.pgd[stack_pgd_index]); 138 - } 139 - 140 - this_cpu_write(cpu_tlbstate.loaded_mm, next); 141 - 142 - WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); 143 - cpumask_set_cpu(cpu, mm_cpumask(next)); 144 - 145 - /* 146 - * Re-load page tables. 147 - * 148 - * This logic has an ordering constraint: 149 - * 150 - * CPU 0: Write to a PTE for 'next' 151 - * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. 152 - * CPU 1: set bit 1 in next's mm_cpumask 153 - * CPU 1: load from the PTE that CPU 0 writes (implicit) 154 - * 155 - * We need to prevent an outcome in which CPU 1 observes 156 - * the new PTE value and CPU 0 observes bit 1 clear in 157 - * mm_cpumask. (If that occurs, then the IPI will never 158 - * be sent, and CPU 0's TLB will contain a stale entry.) 159 - * 160 - * The bad outcome can occur if either CPU's load is 161 - * reordered before that CPU's store, so both CPUs must 162 - * execute full barriers to prevent this from happening. 163 - * 164 - * Thus, switch_mm needs a full barrier between the 165 - * store to mm_cpumask and any operation that could load 166 - * from next->pgd. TLB fills are special and can happen 167 - * due to instruction fetches or for no reason at all, 168 - * and neither LOCK nor MFENCE orders them. 169 - * Fortunately, load_cr3() is serializing and gives the 170 - * ordering guarantee we need. 171 - */ 172 - load_cr3(next->pgd); 173 - 174 - /* 175 - * This gets called via leave_mm() in the idle path where RCU 176 - * functions differently. Tracing normally uses RCU, so we have to 177 - * call the tracepoint specially here. 
178 - */ 179 - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); 180 - 181 - /* Stop flush ipis for the previous mm */ 182 - WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) && 183 - real_prev != &init_mm); 184 - cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); 185 - 186 - /* Load per-mm CR4 and LDTR state */ 187 load_mm_cr4(next); 188 switch_ldt(real_prev, next); 189 } 190 191 static void flush_tlb_func_common(const struct flush_tlb_info *f, 192 bool local, enum tlb_flush_reason reason) 193 { 194 /* This code cannot presently handle being reentered. */ 195 VM_WARN_ON(!irqs_disabled()); 196 197 - if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) { 198 - leave_mm(smp_processor_id()); 199 return; 200 } 201 202 - if (f->end == TLB_FLUSH_ALL) { 203 - local_flush_tlb(); 204 - if (local) 205 - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); 206 - trace_tlb_flush(reason, TLB_FLUSH_ALL); 207 - } else { 208 unsigned long addr; 209 unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT; 210 addr = f->start; 211 while (addr < f->end) { 212 __flush_tlb_single(addr); ··· 319 if (local) 320 count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages); 321 trace_tlb_flush(reason, nr_pages); 322 } 323 } 324 325 static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) ··· 362 (info->end - info->start) >> PAGE_SHIFT); 363 364 if (is_uv_system()) { 365 unsigned int cpu; 366 367 cpu = smp_processor_id(); ··· 413 414 cpu = get_cpu(); 415 416 - /* Synchronize with switch_mm. */ 417 - smp_mb(); 418 419 /* Should we flush just the requested range? */ 420 if ((end != TLB_FLUSH_ALL) && ··· 436 437 if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) 438 flush_tlb_others(mm_cpumask(mm), &info); 439 put_cpu(); 440 } 441 ··· 445 { 446 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); 447 __flush_tlb_all(); 448 - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) 449 - leave_mm(smp_processor_id()); 450 } 451 452 void flush_tlb_all(void) ··· 497 498 if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) 499 flush_tlb_others(&batch->cpumask, &info); 500 cpumask_clear(&batch->cpumask); 501 502 put_cpu();
··· 28 * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi 29 */ 30 31 + atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1); 32 + 33 + static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, 34 + u16 *new_asid, bool *need_flush) 35 + { 36 + u16 asid; 37 + 38 + if (!static_cpu_has(X86_FEATURE_PCID)) { 39 + *new_asid = 0; 40 + *need_flush = true; 41 + return; 42 + } 43 + 44 + for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) { 45 + if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) != 46 + next->context.ctx_id) 47 + continue; 48 + 49 + *new_asid = asid; 50 + *need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) < 51 + next_tlb_gen); 52 + return; 53 + } 54 + 55 + /* 56 + * We don't currently own an ASID slot on this CPU. 57 + * Allocate a slot. 58 + */ 59 + *new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1; 60 + if (*new_asid >= TLB_NR_DYN_ASIDS) { 61 + *new_asid = 0; 62 + this_cpu_write(cpu_tlbstate.next_asid, 1); 63 + } 64 + *need_flush = true; 65 + } 66 + 67 void leave_mm(int cpu) 68 { 69 struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); ··· 43 if (loaded_mm == &init_mm) 44 return; 45 46 + /* Warn if we're not lazy. */ 47 + WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))); 48 49 switch_mm(NULL, &init_mm, NULL); 50 } 51 52 void switch_mm(struct mm_struct *prev, struct mm_struct *next, 53 struct task_struct *tsk) ··· 63 void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, 64 struct task_struct *tsk) 65 { 66 struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm); 67 + u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); 68 + unsigned cpu = smp_processor_id(); 69 + u64 next_tlb_gen; 70 71 /* 72 + * NB: The scheduler will call us with prev == next when switching 73 + * from lazy TLB mode to normal mode if active_mm isn't changing. 74 + * When this happens, we don't assume that CR3 (and hence 75 + * cpu_tlbstate.loaded_mm) matches next. 76 * 77 * NB: leave_mm() calls us with prev == NULL and tsk == NULL. 78 */ 79 80 + /* We don't want flush_tlb_func_* to run concurrently with us. */ 81 + if (IS_ENABLED(CONFIG_PROVE_LOCKING)) 82 + WARN_ON_ONCE(!irqs_disabled()); 83 + 84 + /* 85 + * Verify that CR3 is what we think it is. This will catch 86 + * hypothetical buggy code that directly switches to swapper_pg_dir 87 + * without going through leave_mm() / switch_mm_irqs_off() or that 88 + * does something like write_cr3(read_cr3_pa()). 89 + */ 90 + VM_BUG_ON(__read_cr3() != (__sme_pa(real_prev->pgd) | prev_asid)); 91 92 if (real_prev == next) { 93 + VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != 94 + next->context.ctx_id); 95 + 96 + if (cpumask_test_cpu(cpu, mm_cpumask(next))) { 97 + /* 98 + * There's nothing to do: we weren't lazy, and we 99 + * aren't changing our mm. We don't need to flush 100 + * anything, nor do we need to update CR3, CR4, or 101 + * LDTR. 102 + */ 103 + return; 104 + } 105 + 106 + /* Resume remote flushes and then read tlb_gen. */ 107 + cpumask_set_cpu(cpu, mm_cpumask(next)); 108 + next_tlb_gen = atomic64_read(&next->context.tlb_gen); 109 + 110 + if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) < 111 + next_tlb_gen) { 112 + /* 113 + * Ideally, we'd have a flush_tlb() variant that 114 + * takes the known CR3 value as input. This would 115 + * be faster on Xen PV and on hypothetical CPUs 116 + * on which INVPCID is fast. 
117 + */ 118 + this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen, 119 + next_tlb_gen); 120 + write_cr3(__sme_pa(next->pgd) | prev_asid); 121 + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 122 + TLB_FLUSH_ALL); 123 + } 124 + 125 /* 126 + * We just exited lazy mode, which means that CR4 and/or LDTR 127 + * may be stale. (Changes to the required CR4 and LDTR states 128 + * are not reflected in tlb_gen.) 129 */ 130 + } else { 131 + u16 new_asid; 132 + bool need_flush; 133 + 134 + if (IS_ENABLED(CONFIG_VMAP_STACK)) { 135 + /* 136 + * If our current stack is in vmalloc space and isn't 137 + * mapped in the new pgd, we'll double-fault. Forcibly 138 + * map it. 139 + */ 140 + unsigned int index = pgd_index(current_stack_pointer()); 141 + pgd_t *pgd = next->pgd + index; 142 + 143 + if (unlikely(pgd_none(*pgd))) 144 + set_pgd(pgd, init_mm.pgd[index]); 145 + } 146 + 147 + /* Stop remote flushes for the previous mm */ 148 + if (cpumask_test_cpu(cpu, mm_cpumask(real_prev))) 149 + cpumask_clear_cpu(cpu, mm_cpumask(real_prev)); 150 + 151 + VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next))); 152 + 153 + /* 154 + * Start remote flushes and then read tlb_gen. 155 + */ 156 + cpumask_set_cpu(cpu, mm_cpumask(next)); 157 + next_tlb_gen = atomic64_read(&next->context.tlb_gen); 158 + 159 + choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); 160 + 161 + if (need_flush) { 162 + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); 163 + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); 164 + write_cr3(__sme_pa(next->pgd) | new_asid); 165 + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 166 + TLB_FLUSH_ALL); 167 + } else { 168 + /* The new ASID is already up to date. */ 169 + write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH); 170 + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0); 171 + } 172 + 173 + this_cpu_write(cpu_tlbstate.loaded_mm, next); 174 + this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); 175 } 176 177 load_mm_cr4(next); 178 switch_ldt(real_prev, next); 179 } 180 181 + /* 182 + * flush_tlb_func_common()'s memory ordering requirement is that any 183 + * TLB fills that happen after we flush the TLB are ordered after we 184 + * read active_mm's tlb_gen. We don't need any explicit barriers 185 + * because all x86 flush operations are serializing and the 186 + * atomic64_read operation won't be reordered by the compiler. 187 + */ 188 static void flush_tlb_func_common(const struct flush_tlb_info *f, 189 bool local, enum tlb_flush_reason reason) 190 { 191 + /* 192 + * We have three different tlb_gen values in here. They are: 193 + * 194 + * - mm_tlb_gen: the latest generation. 195 + * - local_tlb_gen: the generation that this CPU has already caught 196 + * up to. 197 + * - f->new_tlb_gen: the generation that the requester of the flush 198 + * wants us to catch up to. 199 + */ 200 + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); 201 + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); 202 + u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen); 203 + u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); 204 + 205 /* This code cannot presently handle being reentered. */ 206 VM_WARN_ON(!irqs_disabled()); 207 208 + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != 209 + loaded_mm->context.ctx_id); 210 + 211 + if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) { 212 + /* 213 + * We're in lazy mode -- don't flush. 
We can get here on 214 + * remote flushes due to races and on local flushes if a 215 + * kernel thread coincidentally flushes the mm it's lazily 216 + * still using. 217 + */ 218 + return; 219 + } 220 + 221 + if (unlikely(local_tlb_gen == mm_tlb_gen)) { 222 + /* 223 + * There's nothing to do: we're already up to date. This can 224 + * happen if two concurrent flushes happen -- the first flush to 225 + * be handled can catch us all the way up, leaving no work for 226 + * the second flush. 227 + */ 228 + trace_tlb_flush(reason, 0); 229 + return; 230 + } 231 + 232 + WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen); 233 + WARN_ON_ONCE(f->new_tlb_gen > mm_tlb_gen); 234 + 235 + /* 236 + * If we get to this point, we know that our TLB is out of date. 237 + * This does not strictly imply that we need to flush (it's 238 + * possible that f->new_tlb_gen <= local_tlb_gen), but we're 239 + * going to need to flush in the very near future, so we might 240 + * as well get it over with. 241 + * 242 + * The only question is whether to do a full or partial flush. 243 + * 244 + * We do a partial flush if requested and two extra conditions 245 + * are met: 246 + * 247 + * 1. f->new_tlb_gen == local_tlb_gen + 1. We have an invariant that 248 + * we've always done all needed flushes to catch up to 249 + * local_tlb_gen. If, for example, local_tlb_gen == 2 and 250 + * f->new_tlb_gen == 3, then we know that the flush needed to bring 251 + * us up to date for tlb_gen 3 is the partial flush we're 252 + * processing. 253 + * 254 + * As an example of why this check is needed, suppose that there 255 + * are two concurrent flushes. The first is a full flush that 256 + * changes context.tlb_gen from 1 to 2. The second is a partial 257 + * flush that changes context.tlb_gen from 2 to 3. If they get 258 + * processed on this CPU in reverse order, we'll see 259 + * local_tlb_gen == 1, mm_tlb_gen == 3, and end != TLB_FLUSH_ALL. 260 + * If we were to use __flush_tlb_single() and set local_tlb_gen to 261 + * 3, we'd break the invariant: we'd update local_tlb_gen above 262 + * 1 without the full flush that's needed for tlb_gen 2. 263 + * 264 + * 2. f->new_tlb_gen == mm_tlb_gen. This is purely an optimization. 265 + * Partial TLB flushes are not all that much cheaper than full TLB 266 + * flushes, so it seems unlikely that it would be a performance win 267 + * to do a partial flush if that won't bring our TLB fully up to 268 + * date. By doing a full flush instead, we can increase 269 + * local_tlb_gen all the way to mm_tlb_gen and we can probably 270 + * avoid another flush in the very near future. 271 + */ 272 + if (f->end != TLB_FLUSH_ALL && 273 + f->new_tlb_gen == local_tlb_gen + 1 && 274 + f->new_tlb_gen == mm_tlb_gen) { 275 + /* Partial flush */ 276 unsigned long addr; 277 unsigned long nr_pages = (f->end - f->start) >> PAGE_SHIFT; 278 + 279 addr = f->start; 280 while (addr < f->end) { 281 __flush_tlb_single(addr); ··· 180 if (local) 181 count_vm_tlb_events(NR_TLB_LOCAL_FLUSH_ONE, nr_pages); 182 trace_tlb_flush(reason, nr_pages); 183 + } else { 184 + /* Full flush. */ 185 + local_flush_tlb(); 186 + if (local) 187 + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); 188 + trace_tlb_flush(reason, TLB_FLUSH_ALL); 189 + } 190 + 191 + /* Both paths above update our state to mm_tlb_gen.
*/ 192 + this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); 193 } 194 195 static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) ··· 214 (info->end - info->start) >> PAGE_SHIFT); 215 216 if (is_uv_system()) { 217 + /* 218 + * This whole special case is confused. UV has a "Broadcast 219 + * Assist Unit", which seems to be a fancy way to send IPIs. 220 + * Back when x86 used an explicit TLB flush IPI, UV was 221 + * optimized to use its own mechanism. These days, x86 uses 222 + * smp_call_function_many(), but UV still uses a manual IPI, 223 + * and that IPI's action is out of date -- it does a manual 224 + * flush instead of calling flush_tlb_func_remote(). This 225 + * means that the percpu tlb_gen variables won't be updated 226 + * and we'll do pointless flushes on future context switches. 227 + * 228 + * Rather than hooking native_flush_tlb_others() here, I think 229 + * that UV should be updated so that smp_call_function_many(), 230 + * etc, are optimal on UV. 231 + */ 232 unsigned int cpu; 233 234 cpu = smp_processor_id(); ··· 250 251 cpu = get_cpu(); 252 253 + /* This is also a barrier that synchronizes with switch_mm(). */ 254 + info.new_tlb_gen = inc_mm_tlb_gen(mm); 255 256 /* Should we flush just the requested range? */ 257 if ((end != TLB_FLUSH_ALL) && ··· 273 274 if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) 275 flush_tlb_others(mm_cpumask(mm), &info); 276 + 277 put_cpu(); 278 } 279 ··· 281 { 282 count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); 283 __flush_tlb_all(); 284 } 285 286 void flush_tlb_all(void) ··· 335 336 if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) 337 flush_tlb_others(&batch->cpumask, &info); 338 + 339 cpumask_clear(&batch->cpumask); 340 341 put_cpu();
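The heart of the new flush path is the comparison of the three generation counters described in the comments above. A simplified, self-contained model of that decision, not kernel code, merely restating the conditions used in flush_tlb_func_common():

#include <stdint.h>

#define TLB_FLUSH_ALL	(~0UL)

enum flush_kind { FLUSH_NONE, FLUSH_PARTIAL, FLUSH_FULL };

/*
 * local_tlb_gen: the generation this CPU has already caught up to.
 * mm_tlb_gen:    the mm's latest generation.
 * new_tlb_gen:   the generation the requester of this flush asked for.
 * end:           the end of the requested range, or TLB_FLUSH_ALL.
 */
static enum flush_kind pick_flush(uint64_t local_tlb_gen, uint64_t mm_tlb_gen,
				  uint64_t new_tlb_gen, unsigned long end)
{
	if (local_tlb_gen == mm_tlb_gen)
		return FLUSH_NONE;	/* a concurrent flush already caught us up */

	if (end != TLB_FLUSH_ALL &&
	    new_tlb_gen == local_tlb_gen + 1 &&
	    new_tlb_gen == mm_tlb_gen)
		return FLUSH_PARTIAL;	/* this request is the only gap */

	return FLUSH_FULL;		/* jump straight to mm_tlb_gen */
}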
+2 -2
arch/x86/pci/common.c
··· 674 675 pa_data = boot_params.hdr.setup_data; 676 while (pa_data) { 677 - data = ioremap(pa_data, sizeof(*rom)); 678 if (!data) 679 return -ENOMEM; 680 ··· 693 } 694 } 695 pa_data = data->next; 696 - iounmap(data); 697 } 698 set_dma_domain_ops(dev); 699 set_dev_domain_options(dev);
··· 674 675 pa_data = boot_params.hdr.setup_data; 676 while (pa_data) { 677 + data = memremap(pa_data, sizeof(*rom), MEMREMAP_WB); 678 if (!data) 679 return -ENOMEM; 680 ··· 693 } 694 } 695 pa_data = data->next; 696 + memunmap(data); 697 } 698 set_dma_domain_ops(dev); 699 set_dev_domain_options(dev);
+3 -3
arch/x86/platform/efi/efi.c
··· 1035 /* 1036 * Convenience functions to obtain memory types and attributes 1037 */ 1038 - u32 efi_mem_type(unsigned long phys_addr) 1039 { 1040 efi_memory_desc_t *md; 1041 1042 if (!efi_enabled(EFI_MEMMAP)) 1043 - return 0; 1044 1045 for_each_efi_memory_desc(md) { 1046 if ((md->phys_addr <= phys_addr) && ··· 1048 (md->num_pages << EFI_PAGE_SHIFT)))) 1049 return md->type; 1050 } 1051 - return 0; 1052 } 1053 1054 static int __init arch_parse_efi_cmdline(char *str)
··· 1035 /* 1036 * Convenience functions to obtain memory types and attributes 1037 */ 1038 + int efi_mem_type(unsigned long phys_addr) 1039 { 1040 efi_memory_desc_t *md; 1041 1042 if (!efi_enabled(EFI_MEMMAP)) 1043 + return -ENOTSUPP; 1044 1045 for_each_efi_memory_desc(md) { 1046 if ((md->phys_addr <= phys_addr) && ··· 1048 (md->num_pages << EFI_PAGE_SHIFT)))) 1049 return md->type; 1050 } 1051 + return -EINVAL; 1052 } 1053 1054 static int __init arch_parse_efi_cmdline(char *str)
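Since efi_mem_type() now returns an int, failure is reported as a negative errno instead of overloading the (valid) memory type value 0. A hypothetical caller, assuming kernel context, showing the check callers now need:

#include <linux/efi.h>

/* Hypothetical helper: is this physical address EFI runtime memory? */
static bool phys_addr_is_efi_runtime(unsigned long phys_addr)
{
	int type = efi_mem_type(phys_addr);

	if (type < 0)
		return false;	/* no memmap, or address not described by it */

	return type == EFI_RUNTIME_SERVICES_CODE ||
	       type == EFI_RUNTIME_SERVICES_DATA;
}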
+11 -4
arch/x86/platform/efi/efi_64.c
··· 327 328 int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) 329 { 330 - unsigned long pfn, text; 331 struct page *page; 332 unsigned npages; 333 pgd_t *pgd; ··· 335 if (efi_enabled(EFI_OLD_MEMMAP)) 336 return 0; 337 338 - efi_scratch.efi_pgt = (pgd_t *)__pa(efi_pgd); 339 pgd = efi_pgd; 340 341 /* ··· 350 * phys_efi_set_virtual_address_map(). 351 */ 352 pfn = pa_memmap >> PAGE_SHIFT; 353 - if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, _PAGE_NX | _PAGE_RW)) { 354 pr_err("Error ident-mapping new memmap (0x%lx)!\n", pa_memmap); 355 return 1; 356 } ··· 394 text = __pa(_text); 395 pfn = text >> PAGE_SHIFT; 396 397 - if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, _PAGE_RW)) { 398 pr_err("Failed to map kernel text 1:1\n"); 399 return 1; 400 }
··· 327 328 int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) 329 { 330 + unsigned long pfn, text, pf; 331 struct page *page; 332 unsigned npages; 333 pgd_t *pgd; ··· 335 if (efi_enabled(EFI_OLD_MEMMAP)) 336 return 0; 337 338 + /* 339 + * Since the PGD is encrypted, set the encryption mask so that when 340 + * this value is loaded into cr3 the PGD will be decrypted during 341 + * the pagetable walk. 342 + */ 343 + efi_scratch.efi_pgt = (pgd_t *)__sme_pa(efi_pgd); 344 pgd = efi_pgd; 345 346 /* ··· 345 * phys_efi_set_virtual_address_map(). 346 */ 347 pfn = pa_memmap >> PAGE_SHIFT; 348 + pf = _PAGE_NX | _PAGE_RW | _PAGE_ENC; 349 + if (kernel_map_pages_in_pgd(pgd, pfn, pa_memmap, num_pages, pf)) { 350 pr_err("Error ident-mapping new memmap (0x%lx)!\n", pa_memmap); 351 return 1; 352 } ··· 388 text = __pa(_text); 389 pfn = text >> PAGE_SHIFT; 390 391 + pf = _PAGE_RW | _PAGE_ENC; 392 + if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, pf)) { 393 pr_err("Failed to map kernel text 1:1\n"); 394 return 1; 395 }
+12
arch/x86/realmode/init.c
··· 1 #include <linux/io.h> 2 #include <linux/slab.h> 3 #include <linux/memblock.h> 4 5 #include <asm/set_memory.h> 6 #include <asm/pgtable.h> ··· 60 61 base = (unsigned char *)real_mode_header; 62 63 memcpy(base, real_mode_blob, size); 64 65 phys_base = __pa(base); ··· 107 trampoline_header->start = (u64) secondary_startup_64; 108 trampoline_cr4_features = &trampoline_header->cr4; 109 *trampoline_cr4_features = mmu_cr4_features; 110 111 trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); 112 trampoline_pgd[0] = trampoline_pgd_entry.pgd;
··· 1 #include <linux/io.h> 2 #include <linux/slab.h> 3 #include <linux/memblock.h> 4 + #include <linux/mem_encrypt.h> 5 6 #include <asm/set_memory.h> 7 #include <asm/pgtable.h> ··· 59 60 base = (unsigned char *)real_mode_header; 61 62 + /* 63 + * If SME is active, the trampoline area will need to be in 64 + * decrypted memory in order to bring up other processors 65 + * successfully. 66 + */ 67 + set_memory_decrypted((unsigned long)base, size >> PAGE_SHIFT); 68 + 69 memcpy(base, real_mode_blob, size); 70 71 phys_base = __pa(base); ··· 99 trampoline_header->start = (u64) secondary_startup_64; 100 trampoline_cr4_features = &trampoline_header->cr4; 101 *trampoline_cr4_features = mmu_cr4_features; 102 + 103 + trampoline_header->flags = 0; 104 + if (sme_active()) 105 + trampoline_header->flags |= TH_FLAGS_SME_ACTIVE; 106 107 trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); 108 trampoline_pgd[0] = trampoline_pgd_entry.pgd;
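The trampoline copy is a concrete in-tree user of the new set_memory_decrypted() API: the real-mode blob has to be readable by APs before they have enabled SME paging attributes, so its pages are switched to decrypted before the memcpy(). As a rough rule of thumb (an observation, not something this hunk states), anything that firmware or a not-yet-fully-initialized CPU must read goes through the same decrypt-before-use step.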
+24
arch/x86/realmode/rm/trampoline_64.S
··· 30 #include <asm/msr.h> 31 #include <asm/segment.h> 32 #include <asm/processor-flags.h> 33 #include "realmode.h" 34 35 .text ··· 93 movl %edx, %fs 94 movl %edx, %gs 95 96 movl pa_tr_cr4, %eax 97 movl %eax, %cr4 # Enable PAE mode 98 ··· 170 tr_start: .space 8 171 GLOBAL(tr_efer) .space 8 172 GLOBAL(tr_cr4) .space 4 173 END(trampoline_header) 174 175 #include "trampoline_common.S"
··· 30 #include <asm/msr.h> 31 #include <asm/segment.h> 32 #include <asm/processor-flags.h> 33 + #include <asm/realmode.h> 34 #include "realmode.h" 35 36 .text ··· 92 movl %edx, %fs 93 movl %edx, %gs 94 95 + /* 96 + * Check for memory encryption support. This is a safety net in 97 + * case BIOS hasn't done the necessary step of setting the bit in 98 + * the MSR for this AP. If SME is active and we've gotten this far 99 + * then it is safe for us to set the MSR bit and continue. If we 100 + * don't, we'll eventually crash trying to execute encrypted 101 + * instructions. 102 + */ 103 + bt $TH_FLAGS_SME_ACTIVE_BIT, pa_tr_flags 104 + jnc .Ldone 105 + movl $MSR_K8_SYSCFG, %ecx 106 + rdmsr 107 + bts $MSR_K8_SYSCFG_MEM_ENCRYPT_BIT, %eax 108 + jc .Ldone 109 + 110 + /* 111 + * Memory encryption is enabled but the SME enable bit for this 112 + * CPU has not been set. It is safe to set it, so do so. 113 + */ 114 + wrmsr 115 + .Ldone: 116 + 117 movl pa_tr_cr4, %eax 118 movl %eax, %cr4 # Enable PAE mode ··· 147 tr_start: .space 8 148 GLOBAL(tr_efer) .space 8 149 GLOBAL(tr_cr4) .space 4 150 + GLOBAL(tr_flags) .space 4 151 END(trampoline_header) 152 153 + GLOBAL(tr_flags) is consumed by the SME check above via pa_tr_flags. #include "trampoline_common.S"
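For readers who do not want to trace the early trampoline assembly, the new check is conceptually the following C. This is a rendering only: the real code has to run in AP startup assembly where none of this infrastructure exists; rdmsrl()/wrmsrl() and the MSR constants are the kernel's, the helper itself is hypothetical.

#include <asm/msr.h>
#include <asm/msr-index.h>

/*
 * Conceptual equivalent of the trampoline safety net: if the boot CPU
 * reports SME active (tr_flags) but this AP's SYSCFG MSR does not have
 * the SME enable bit set, set it before any encrypted memory is touched.
 */
static void ap_sme_safety_net(bool sme_active_on_bsp)
{
	u64 syscfg;

	if (!sme_active_on_bsp)
		return;				/* tr_flags bit clear: nothing to do */

	rdmsrl(MSR_K8_SYSCFG, syscfg);
	if (syscfg & MSR_K8_SYSCFG_MEM_ENCRYPT)
		return;				/* BIOS already set it */

	wrmsrl(MSR_K8_SYSCFG, syscfg | MSR_K8_SYSCFG_MEM_ENCRYPT);
}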
+5
arch/x86/xen/Kconfig
··· 17 bool "Xen PV guest support" 18 default y 19 depends on XEN 20 select XEN_HAVE_PVMMU 21 select XEN_HAVE_VPMU 22 help ··· 78 config XEN_PVH 79 bool "Support for running as a PVH guest" 80 depends on XEN && XEN_PVHVM && ACPI 81 def_bool n
··· 17 bool "Xen PV guest support" 18 default y 19 depends on XEN 20 + # XEN_PV is not ready to work with 5-level paging. 21 + # Changes to hypervisor are also required. 22 + depends on !X86_5LEVEL 23 select XEN_HAVE_PVMMU 24 select XEN_HAVE_VPMU 25 help ··· 75 config XEN_PVH 76 bool "Support for running as a PVH guest" 77 depends on XEN && XEN_PVHVM && ACPI 78 + # Pre-built page tables are not ready to handle 5-level paging. 79 + depends on !X86_5LEVEL 80 def_bool n
+7
arch/x86/xen/enlighten_pv.c
··· 263 setup_clear_cpu_cap(X86_FEATURE_MTRR); 264 setup_clear_cpu_cap(X86_FEATURE_ACC); 265 setup_clear_cpu_cap(X86_FEATURE_X2APIC); 266 267 if (!xen_initial_domain()) 268 setup_clear_cpu_cap(X86_FEATURE_ACPI);
··· 263 setup_clear_cpu_cap(X86_FEATURE_MTRR); 264 setup_clear_cpu_cap(X86_FEATURE_ACC); 265 setup_clear_cpu_cap(X86_FEATURE_X2APIC); 266 + setup_clear_cpu_cap(X86_FEATURE_SME); 267 + 268 + /* 269 + * Xen PV would need some work to support PCID: CR3 handling as well 270 + * as xen_flush_tlb_others() would need updating. 271 + */ 272 + setup_clear_cpu_cap(X86_FEATURE_PCID); 273 274 if (!xen_initial_domain()) 275 setup_clear_cpu_cap(X86_FEATURE_ACPI);
+2 -3
arch/x86/xen/mmu_pv.c
··· 1005 /* Get the "official" set of cpus referring to our pagetable. */ 1006 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { 1007 for_each_online_cpu(cpu) { 1008 - if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) 1009 - && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) 1010 continue; 1011 smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1); 1012 } 1013 return; 1014 } 1015 - cpumask_copy(mask, mm_cpumask(mm)); 1016 1017 /* 1018 * It's possible that a vcpu may have a stale reference to our ··· 1019 * look at its actual current cr3 value, and force it to flush 1020 * if needed. 1021 */ 1022 for_each_online_cpu(cpu) { 1023 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) 1024 cpumask_set_cpu(cpu, mask);
··· 1005 /* Get the "official" set of cpus referring to our pagetable. */ 1006 if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { 1007 for_each_online_cpu(cpu) { 1008 + if (per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) 1009 continue; 1010 smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1); 1011 } 1012 return; 1013 } 1014 1015 /* 1016 * It's possible that a vcpu may have a stale reference to our ··· 1021 * look at its actual current cr3 value, and force it to flush 1022 * if needed. 1023 */ 1024 + cpumask_clear(mask); 1025 for_each_online_cpu(cpu) { 1026 if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) 1027 cpumask_set_cpu(cpu, mask);
+1 -1
arch/x86/xen/xen-head.S
··· 58 #else 59 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map) 60 /* Map the p2m table to a 512GB-aligned user address. */ 61 - ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad PGDIR_SIZE) 62 #endif 63 #ifdef CONFIG_XEN_PV 64 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
··· 58 #else 59 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map) 60 /* Map the p2m table to a 512GB-aligned user address. */ 61 + ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad (PUD_SIZE * PTRS_PER_PUD)) 62 #endif 63 #ifdef CONFIG_XEN_PV 64 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
-2
drivers/acpi/processor_idle.c
··· 708 static void acpi_idle_enter_bm(struct acpi_processor *pr, 709 struct acpi_processor_cx *cx, bool timer_bc) 710 { 711 - acpi_unlazy_tlb(smp_processor_id()); 712 - 713 /* 714 * Must be done before busmaster disable as we might need to 715 * access HPET !
··· 708 static void acpi_idle_enter_bm(struct acpi_processor *pr, 709 struct acpi_processor_cx *cx, bool timer_bc) 710 { 711 /* 712 * Must be done before busmaster disable as we might need to 713 * access HPET !
+3 -2
drivers/firmware/dmi-sysfs.c
··· 25 #include <linux/slab.h> 26 #include <linux/list.h> 27 #include <linux/io.h> 28 29 #define MAX_ENTRY_TYPE 255 /* Most of these aren't used, but we consider 30 the top entry type is only 8 bits */ ··· 381 u8 __iomem *mapped; 382 ssize_t wrote = 0; 383 384 - mapped = ioremap(sel->access_method_address, sel->area_length); 385 if (!mapped) 386 return -EIO; 387 ··· 391 wrote++; 392 } 393 394 - iounmap(mapped); 395 return wrote; 396 } 397
··· 25 #include <linux/slab.h> 26 #include <linux/list.h> 27 #include <linux/io.h> 28 + #include <asm/dmi.h> 29 30 #define MAX_ENTRY_TYPE 255 /* Most of these aren't used, but we consider 31 the top entry type is only 8 bits */ ··· 380 u8 __iomem *mapped; 381 ssize_t wrote = 0; 382 383 + mapped = dmi_remap(sel->access_method_address, sel->area_length); 384 if (!mapped) 385 return -EIO; 386 ··· 390 wrote++; 391 } 392 393 + dmi_unmap(mapped); 394 return wrote; 395 } 396
+33
drivers/firmware/efi/efi.c
··· 55 }; 56 EXPORT_SYMBOL(efi); 57 58 static bool disable_runtime; 59 static int __init setup_noefi(char *arg) 60 { ··· 872 } 873 874 return err; 875 } 876 877 #ifdef CONFIG_KEXEC
··· 55 }; 56 EXPORT_SYMBOL(efi); 57 58 + static unsigned long *efi_tables[] = { 59 + &efi.mps, 60 + &efi.acpi, 61 + &efi.acpi20, 62 + &efi.smbios, 63 + &efi.smbios3, 64 + &efi.sal_systab, 65 + &efi.boot_info, 66 + &efi.hcdp, 67 + &efi.uga, 68 + &efi.uv_systab, 69 + &efi.fw_vendor, 70 + &efi.runtime, 71 + &efi.config_table, 72 + &efi.esrt, 73 + &efi.properties_table, 74 + &efi.mem_attr_table, 75 + }; 76 + 77 static bool disable_runtime; 78 static int __init setup_noefi(char *arg) 79 { ··· 853 } 854 855 return err; 856 + } 857 + 858 + bool efi_is_table_address(unsigned long phys_addr) 859 + { 860 + unsigned int i; 861 + 862 + if (phys_addr == EFI_INVALID_TABLE_ADDR) 863 + return false; 864 + 865 + for (i = 0; i < ARRAY_SIZE(efi_tables); i++) 866 + if (*(efi_tables[i]) == phys_addr) 867 + return true; 868 + 869 + return false; 870 } 871 872 #ifdef CONFIG_KEXEC
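efi_is_table_address() gives architecture code a way to ask whether a physical address is one of the firmware-provided EFI tables. A hypothetical consumer, assuming kernel context, of the kind the SME work needs when deciding whether early boot data has to be mapped decrypted (firmware wrote these tables before any encrypted mappings existed):

#include <linux/efi.h>

/* Hypothetical early-boot helper. */
static bool boot_data_is_efi_table(unsigned long phys_addr)
{
	if (!efi_enabled(EFI_BOOT))
		return false;

	return efi_is_table_address(phys_addr);
}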
+2 -2
drivers/firmware/pcdp.c
··· 95 if (efi.hcdp == EFI_INVALID_TABLE_ADDR) 96 return -ENODEV; 97 98 - pcdp = early_ioremap(efi.hcdp, 4096); 99 printk(KERN_INFO "PCDP: v%d at 0x%lx\n", pcdp->rev, efi.hcdp); 100 101 if (strstr(cmdline, "console=hcdp")) { ··· 131 } 132 133 out: 134 - early_iounmap(pcdp, 4096); 135 return rc; 136 }
··· 95 if (efi.hcdp == EFI_INVALID_TABLE_ADDR) 96 return -ENODEV; 97 98 + pcdp = early_memremap(efi.hcdp, 4096); 99 printk(KERN_INFO "PCDP: v%d at 0x%lx\n", pcdp->rev, efi.hcdp); 100 101 if (strstr(cmdline, "console=hcdp")) { ··· 131 } 132 133 out: 134 + early_memunmap(pcdp, 4096); 135 return rc; 136 }
+2
drivers/gpu/drm/drm_gem.c
··· 36 #include <linux/pagemap.h> 37 #include <linux/shmem_fs.h> 38 #include <linux/dma-buf.h> 39 #include <drm/drmP.h> 40 #include <drm/drm_vma_manager.h> 41 #include <drm/drm_gem.h> ··· 966 vma->vm_ops = dev->driver->gem_vm_ops; 967 vma->vm_private_data = obj; 968 vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); 969 970 /* Take a ref for this mapping of the object, so that the fault 971 * handler can dereference the mmap offset's pointer to the object.
··· 36 #include <linux/pagemap.h> 37 #include <linux/shmem_fs.h> 38 #include <linux/dma-buf.h> 39 + #include <linux/mem_encrypt.h> 40 #include <drm/drmP.h> 41 #include <drm/drm_vma_manager.h> 42 #include <drm/drm_gem.h> ··· 965 vma->vm_ops = dev->driver->gem_vm_ops; 966 vma->vm_private_data = obj; 967 vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags)); 968 + vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); 969 970 /* Take a ref for this mapping of the object, so that the fault 971 * handler can dereference the mmap offset's pointer to the object.
+4
drivers/gpu/drm/drm_vm.c
··· 40 #include <linux/efi.h> 41 #include <linux/slab.h> 42 #endif 43 #include <asm/pgtable.h> 44 #include "drm_internal.h" 45 #include "drm_legacy.h" ··· 58 struct vm_area_struct *vma) 59 { 60 pgprot_t tmp = vm_get_page_prot(vma->vm_flags); 61 62 #if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) 63 if (map->type == _DRM_REGISTERS && !(map->flags & _DRM_WRITE_COMBINING))
··· 40 #include <linux/efi.h> 41 #include <linux/slab.h> 42 #endif 43 + #include <linux/mem_encrypt.h> 44 #include <asm/pgtable.h> 45 #include "drm_internal.h" 46 #include "drm_legacy.h" ··· 57 struct vm_area_struct *vma) 58 { 59 pgprot_t tmp = vm_get_page_prot(vma->vm_flags); 60 + 61 + /* We don't want graphics memory to be mapped encrypted */ 62 + tmp = pgprot_decrypted(tmp); 63 64 #if defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) 65 if (map->type == _DRM_REGISTERS && !(map->flags & _DRM_WRITE_COMBINING))
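The DRM changes all follow the same pattern: any mapping that a device (or a firmware-initialized framebuffer) must see in the clear gets its protection run through pgprot_decrypted(). A hypothetical driver mmap handler, assuming kernel context; mydrv_fb_base and mydrv_fb_size are made-up placeholders for the device's framebuffer, not anything defined by this series:

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mem_encrypt.h>

static phys_addr_t mydrv_fb_base;	/* hypothetical framebuffer base */
static unsigned long mydrv_fb_size;	/* hypothetical framebuffer size */

static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
	vma->vm_page_prot = pgprot_writecombine(vm_get_page_prot(vma->vm_flags));

	/* Device-visible memory must not carry the encryption bit. */
	vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);

	return vm_iomap_memory(vma, mydrv_fb_base, mydrv_fb_size);
}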
+5 -2
drivers/gpu/drm/ttm/ttm_bo_vm.c
··· 39 #include <linux/rbtree.h> 40 #include <linux/module.h> 41 #include <linux/uaccess.h> 42 43 #define TTM_BO_VM_NUM_PREFAULT 16 44 ··· 231 * first page. 232 */ 233 for (i = 0; i < TTM_BO_VM_NUM_PREFAULT; ++i) { 234 - if (bo->mem.bus.is_iomem) 235 pfn = bdev->driver->io_mem_pfn(bo, page_offset); 236 - else { 237 page = ttm->pages[page_offset]; 238 if (unlikely(!page && i == 0)) { 239 retval = VM_FAULT_OOM;
··· 39 #include <linux/rbtree.h> 40 #include <linux/module.h> 41 #include <linux/uaccess.h> 42 + #include <linux/mem_encrypt.h> 43 44 #define TTM_BO_VM_NUM_PREFAULT 16 45 ··· 230 * first page. 231 */ 232 for (i = 0; i < TTM_BO_VM_NUM_PREFAULT; ++i) { 233 + if (bo->mem.bus.is_iomem) { 234 + /* Iomem should not be marked encrypted */ 235 + cvma.vm_page_prot = pgprot_decrypted(cvma.vm_page_prot); 236 pfn = bdev->driver->io_mem_pfn(bo, page_offset); 237 + } else { 238 page = ttm->pages[page_offset]; 239 if (unlikely(!page && i == 0)) { 240 retval = VM_FAULT_OOM;
+4
drivers/gpu/drm/udl/udl_fb.c
··· 14 #include <linux/slab.h> 15 #include <linux/fb.h> 16 #include <linux/dma-buf.h> 17 18 #include <drm/drmP.h> 19 #include <drm/drm_crtc.h> ··· 169 170 pr_notice("mmap() framebuffer addr:%lu size:%lu\n", 171 pos, size); 172 173 while (size > 0) { 174 page = vmalloc_to_pfn((void *)pos);
··· 14 #include <linux/slab.h> 15 #include <linux/fb.h> 16 #include <linux/dma-buf.h> 17 + #include <linux/mem_encrypt.h> 18 19 #include <drm/drmP.h> 20 #include <drm/drm_crtc.h> ··· 168 169 pr_notice("mmap() framebuffer addr:%lu size:%lu\n", 170 pos, size); 171 + 172 + /* We don't want the framebuffer to be mapped encrypted */ 173 + vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); 174 175 while (size > 0) { 176 page = vmalloc_to_pfn((void *)pos);
+4 -5
drivers/idle/intel_idle.c
··· 913 struct cpuidle_state *state = &drv->states[index]; 914 unsigned long eax = flg2MWAIT(state->flags); 915 unsigned int cstate; 916 - int cpu = smp_processor_id(); 917 918 cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1; 919 920 /* 921 - * leave_mm() to avoid costly and often unnecessary wakeups 922 - * for flushing the user TLB's associated with the active mm. 923 */ 924 - if (state->flags & CPUIDLE_FLAG_TLB_FLUSHED) 925 - leave_mm(cpu); 926 927 if (!(lapic_timer_reliable_states & (1 << (cstate)))) 928 tick_broadcast_enter();
··· 913 struct cpuidle_state *state = &drv->states[index]; 914 unsigned long eax = flg2MWAIT(state->flags); 915 unsigned int cstate; 916 917 cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1; 918 919 /* 920 + * NB: if CPUIDLE_FLAG_TLB_FLUSHED is set, this idle transition 921 + * will probably flush the TLB. It's not guaranteed to flush 922 + * the TLB, though, so it's not clear that we can do anything 923 + * useful with this knowledge. 924 */ 925 926 if (!(lapic_timer_reliable_states & (1 << (cstate)))) 927 tick_broadcast_enter();
+16 -14
drivers/iommu/amd_iommu.c
··· 575 576 static void dump_command(unsigned long phys_addr) 577 { 578 - struct iommu_cmd *cmd = phys_to_virt(phys_addr); 579 int i; 580 581 for (i = 0; i < 4; ++i) ··· 919 920 static void build_completion_wait(struct iommu_cmd *cmd, u64 address) 921 { 922 WARN_ON(address & 0x7ULL); 923 924 memset(cmd, 0, sizeof(*cmd)); 925 - cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK; 926 - cmd->data[1] = upper_32_bits(__pa(address)); 927 cmd->data[2] = 1; 928 CMD_SET_TYPE(cmd, CMD_COMPL_WAIT); 929 } ··· 1385 return false; 1386 1387 *pte = PM_LEVEL_PDE(domain->mode, 1388 - virt_to_phys(domain->pt_root)); 1389 domain->pt_root = pte; 1390 domain->mode += 1; 1391 domain->updated = true; ··· 1422 if (!page) 1423 return NULL; 1424 1425 - __npte = PM_LEVEL_PDE(level, virt_to_phys(page)); 1426 1427 /* pte could have been changed somewhere. */ 1428 if (cmpxchg64(pte, __pte, __npte) != __pte) { ··· 1538 return -EBUSY; 1539 1540 if (count > 1) { 1541 - __pte = PAGE_SIZE_PTE(phys_addr, page_size); 1542 __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC; 1543 } else 1544 - __pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC; 1545 1546 if (prot & IOMMU_PROT_IR) 1547 __pte |= IOMMU_PTE_IR; ··· 1757 if (!(tbl[i] & GCR3_VALID)) 1758 continue; 1759 1760 - ptr = __va(tbl[i] & PAGE_MASK); 1761 1762 free_page((unsigned long)ptr); 1763 } ··· 1772 if (!(tbl[i] & GCR3_VALID)) 1773 continue; 1774 1775 - ptr = __va(tbl[i] & PAGE_MASK); 1776 1777 free_gcr3_tbl_level1(ptr); 1778 } ··· 2051 u64 flags = 0; 2052 2053 if (domain->mode != PAGE_MODE_NONE) 2054 - pte_root = virt_to_phys(domain->pt_root); 2055 2056 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) 2057 << DEV_ENTRY_MODE_SHIFT; ··· 2063 flags |= DTE_FLAG_IOTLB; 2064 2065 if (domain->flags & PD_IOMMUV2_MASK) { 2066 - u64 gcr3 = __pa(domain->gcr3_tbl); 2067 u64 glx = domain->glx; 2068 u64 tmp; 2069 ··· 3608 if (root == NULL) 3609 return NULL; 3610 3611 - *pte = __pa(root) | GCR3_VALID; 3612 } 3613 3614 - root = __va(*pte & PAGE_MASK); 3615 3616 level -= 1; 3617 } ··· 3790 3791 dte = amd_iommu_dev_table[devid].data[2]; 3792 dte &= ~DTE_IRQ_PHYS_ADDR_MASK; 3793 - dte |= virt_to_phys(table->table); 3794 dte |= DTE_IRQ_REMAP_INTCTL; 3795 dte |= DTE_IRQ_TABLE_LEN; 3796 dte |= DTE_IRQ_REMAP_ENABLE;
··· 575 576 static void dump_command(unsigned long phys_addr) 577 { 578 + struct iommu_cmd *cmd = iommu_phys_to_virt(phys_addr); 579 int i; 580 581 for (i = 0; i < 4; ++i) ··· 919 920 static void build_completion_wait(struct iommu_cmd *cmd, u64 address) 921 { 922 + u64 paddr = iommu_virt_to_phys((void *)address); 923 + 924 WARN_ON(address & 0x7ULL); 925 926 memset(cmd, 0, sizeof(*cmd)); 927 + cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK; 928 + cmd->data[1] = upper_32_bits(paddr); 929 cmd->data[2] = 1; 930 CMD_SET_TYPE(cmd, CMD_COMPL_WAIT); 931 } ··· 1383 return false; 1384 1385 *pte = PM_LEVEL_PDE(domain->mode, 1386 + iommu_virt_to_phys(domain->pt_root)); 1387 domain->pt_root = pte; 1388 domain->mode += 1; 1389 domain->updated = true; ··· 1420 if (!page) 1421 return NULL; 1422 1423 + __npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page)); 1424 1425 /* pte could have been changed somewhere. */ 1426 if (cmpxchg64(pte, __pte, __npte) != __pte) { ··· 1536 return -EBUSY; 1537 1538 if (count > 1) { 1539 + __pte = PAGE_SIZE_PTE(__sme_set(phys_addr), page_size); 1540 __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC; 1541 } else 1542 + __pte = __sme_set(phys_addr) | IOMMU_PTE_P | IOMMU_PTE_FC; 1543 1544 if (prot & IOMMU_PROT_IR) 1545 __pte |= IOMMU_PTE_IR; ··· 1755 if (!(tbl[i] & GCR3_VALID)) 1756 continue; 1757 1758 + ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK); 1759 1760 free_page((unsigned long)ptr); 1761 } ··· 1770 if (!(tbl[i] & GCR3_VALID)) 1771 continue; 1772 1773 + ptr = iommu_phys_to_virt(tbl[i] & PAGE_MASK); 1774 1775 free_gcr3_tbl_level1(ptr); 1776 } ··· 2049 u64 flags = 0; 2050 2051 if (domain->mode != PAGE_MODE_NONE) 2052 + pte_root = iommu_virt_to_phys(domain->pt_root); 2053 2054 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) 2055 << DEV_ENTRY_MODE_SHIFT; ··· 2061 flags |= DTE_FLAG_IOTLB; 2062 2063 if (domain->flags & PD_IOMMUV2_MASK) { 2064 + u64 gcr3 = iommu_virt_to_phys(domain->gcr3_tbl); 2065 u64 glx = domain->glx; 2066 u64 tmp; 2067 ··· 3606 if (root == NULL) 3607 return NULL; 3608 3609 + *pte = iommu_virt_to_phys(root) | GCR3_VALID; 3610 } 3611 3612 + root = iommu_phys_to_virt(*pte & PAGE_MASK); 3613 3614 level -= 1; 3615 } ··· 3788 3789 dte = amd_iommu_dev_table[devid].data[2]; 3790 dte &= ~DTE_IRQ_PHYS_ADDR_MASK; 3791 + dte |= iommu_virt_to_phys(table->table); 3792 dte |= DTE_IRQ_REMAP_INTCTL; 3793 dte |= DTE_IRQ_TABLE_LEN; 3794 dte |= DTE_IRQ_REMAP_ENABLE;
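The iommu_virt_to_phys()/iommu_phys_to_virt() helpers are not defined in this hunk; presumably they wrap the ordinary conversions with the SME mask, since the IOMMU walks its device table, command buffer and GCR3 tables in encrypted kernel memory. A sketch of what such helpers would look like under that assumption (the names below carry a _sketch suffix precisely because they are not the in-tree definitions):

#include <linux/mem_encrypt.h>
#include <asm/io.h>

/*
 * Addresses programmed into the IOMMU must carry the encryption mask so
 * the hardware reads the tables through the encrypted view; addresses
 * read back must have the mask cleared before phys_to_virt().
 */
static inline u64 iommu_virt_to_phys_sketch(void *vaddr)
{
	return (u64)__sme_set(virt_to_phys(vaddr));
}

static inline void *iommu_phys_to_virt_sketch(unsigned long paddr)
{
	return phys_to_virt(__sme_clr(paddr));
}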
+28 -6
drivers/iommu/amd_iommu_init.c
··· 30 #include <linux/iommu.h> 31 #include <linux/kmemleak.h> 32 #include <linux/crash_dump.h> 33 #include <asm/pci-direct.h> 34 #include <asm/iommu.h> 35 #include <asm/gart.h> ··· 349 350 BUG_ON(iommu->mmio_base == NULL); 351 352 - entry = virt_to_phys(amd_iommu_dev_table); 353 entry |= (dev_table_size >> 12) - 1; 354 memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET, 355 &entry, sizeof(entry)); ··· 607 608 BUG_ON(iommu->cmd_buf == NULL); 609 610 - entry = (u64)virt_to_phys(iommu->cmd_buf); 611 entry |= MMIO_CMD_SIZE_512; 612 613 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, ··· 636 637 BUG_ON(iommu->evt_buf == NULL); 638 639 - entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; 640 641 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, 642 &entry, sizeof(entry)); ··· 669 if (iommu->ppr_log == NULL) 670 return; 671 672 - entry = (u64)virt_to_phys(iommu->ppr_log) | PPR_LOG_SIZE_512; 673 674 memcpy_toio(iommu->mmio_base + MMIO_PPR_LOG_OFFSET, 675 &entry, sizeof(entry)); ··· 749 if (!iommu->ga_log_tail) 750 goto err_out; 751 752 - entry = (u64)virt_to_phys(iommu->ga_log) | GA_LOG_SIZE_512; 753 memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_BASE_OFFSET, 754 &entry, sizeof(entry)); 755 - entry = ((u64)virt_to_phys(iommu->ga_log) & 0xFFFFFFFFFFFFFULL) & ~7ULL; 756 memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_TAIL_OFFSET, 757 &entry, sizeof(entry)); 758 writel(0x00, iommu->mmio_base + MMIO_GA_HEAD_OFFSET); ··· 2565 return ret; 2566 } 2567 2568 /**************************************************************************** 2569 * 2570 * Early detect code. This code runs at IOMMU detection time in the DMA ··· 2595 int ret; 2596 2597 if (no_iommu || (iommu_detected && !gart_iommu_aperture)) 2598 return -ENODEV; 2599 2600 ret = iommu_go_to_state(IOMMU_IVRS_DETECTED);
··· 30 #include <linux/iommu.h> 31 #include <linux/kmemleak.h> 32 #include <linux/crash_dump.h> 33 + #include <linux/mem_encrypt.h> 34 #include <asm/pci-direct.h> 35 #include <asm/iommu.h> 36 #include <asm/gart.h> ··· 348 349 BUG_ON(iommu->mmio_base == NULL); 350 351 + entry = iommu_virt_to_phys(amd_iommu_dev_table); 352 entry |= (dev_table_size >> 12) - 1; 353 memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET, 354 &entry, sizeof(entry)); ··· 606 607 BUG_ON(iommu->cmd_buf == NULL); 608 609 + entry = iommu_virt_to_phys(iommu->cmd_buf); 610 entry |= MMIO_CMD_SIZE_512; 611 612 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, ··· 635 636 BUG_ON(iommu->evt_buf == NULL); 637 638 + entry = iommu_virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; 639 640 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, 641 &entry, sizeof(entry)); ··· 668 if (iommu->ppr_log == NULL) 669 return; 670 671 + entry = iommu_virt_to_phys(iommu->ppr_log) | PPR_LOG_SIZE_512; 672 673 memcpy_toio(iommu->mmio_base + MMIO_PPR_LOG_OFFSET, 674 &entry, sizeof(entry)); ··· 748 if (!iommu->ga_log_tail) 749 goto err_out; 750 751 + entry = iommu_virt_to_phys(iommu->ga_log) | GA_LOG_SIZE_512; 752 memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_BASE_OFFSET, 753 &entry, sizeof(entry)); 754 + entry = (iommu_virt_to_phys(iommu->ga_log) & 0xFFFFFFFFFFFFFULL) & ~7ULL; 755 memcpy_toio(iommu->mmio_base + MMIO_GA_LOG_TAIL_OFFSET, 756 &entry, sizeof(entry)); 757 writel(0x00, iommu->mmio_base + MMIO_GA_HEAD_OFFSET); ··· 2564 return ret; 2565 } 2566 2567 + static bool amd_iommu_sme_check(void) 2568 + { 2569 + if (!sme_active() || (boot_cpu_data.x86 != 0x17)) 2570 + return true; 2571 + 2572 + /* For Fam17h, a specific level of support is required */ 2573 + if (boot_cpu_data.microcode >= 0x08001205) 2574 + return true; 2575 + 2576 + if ((boot_cpu_data.microcode >= 0x08001126) && 2577 + (boot_cpu_data.microcode <= 0x080011ff)) 2578 + return true; 2579 + 2580 + pr_notice("AMD-Vi: IOMMU not currently supported when SME is active\n"); 2581 + 2582 + return false; 2583 + } 2584 + 2585 /**************************************************************************** 2586 * 2587 * Early detect code. This code runs at IOMMU detection time in the DMA ··· 2576 int ret; 2577 2578 if (no_iommu || (iommu_detected && !gart_iommu_aperture)) 2579 + return -ENODEV; 2580 + 2581 + if (!amd_iommu_sme_check()) 2582 return -ENODEV; 2583 2584 ret = iommu_go_to_state(IOMMU_IVRS_DETECTED);
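Besides switching the init path to iommu_virt_to_phys(), the change refuses to enable the IOMMU when SME is active on family 0x17 parts without fixed microcode. A standalone model of that gate, with the inputs as plain parameters instead of boot_cpu_data (the version constants are taken from the hunk, everything else is illustrative):

#include <stdbool.h>
#include <stdio.h>

static bool iommu_ok_with_sme(bool sme_on, unsigned int family, unsigned int microcode)
{
	if (!sme_on || family != 0x17)
		return true;                    /* no constraint outside Fam17h + SME */

	if (microcode >= 0x08001205)
		return true;                    /* new enough microcode line */

	if (microcode >= 0x08001126 && microcode <= 0x080011ff)
		return true;                    /* older line that carries the fix */

	return false;                           /* otherwise keep the IOMMU disabled */
}

int main(void)
{
	printf("%d\n", iommu_ok_with_sme(true, 0x17, 0x08001200));  /* 0: refused */
	printf("%d\n", iommu_ok_with_sme(true, 0x17, 0x08001205));  /* 1: allowed */
	return 0;
}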
+10
drivers/iommu/amd_iommu_proto.h
··· 87 return !!(iommu->features & f); 88 } 89 90 #endif /* _ASM_X86_AMD_IOMMU_PROTO_H */
··· 87 return !!(iommu->features & f); 88 } 89 90 + static inline u64 iommu_virt_to_phys(void *vaddr) 91 + { 92 + return (u64)__sme_set(virt_to_phys(vaddr)); 93 + } 94 + 95 + static inline void *iommu_phys_to_virt(unsigned long paddr) 96 + { 97 + return phys_to_virt(__sme_clr(paddr)); 98 + } 99 + 100 #endif /* _ASM_X86_AMD_IOMMU_PROTO_H */
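These two helpers are the whole trick: compose the normal virt/phys conversion with __sme_set()/__sme_clr() so that only IOMMU-visible values carry the encryption mask. A userspace sketch of the pattern, with an identity "translation" and a made-up mask bit standing in for the real ones:

#include <stdint.h>
#include <assert.h>

static uint64_t sme_me_mask = 1ULL << 47;        /* pretend C-bit */

#define __sme_set(x) ((uint64_t)(x) | sme_me_mask)
#define __sme_clr(x) ((uint64_t)(x) & ~sme_me_mask)

/* identity stand-ins for the real virt<->phys conversion */
static uint64_t virt_to_phys(void *v) { return (uint64_t)(uintptr_t)v; }
static void *phys_to_virt(uint64_t p) { return (void *)(uintptr_t)p; }

static uint64_t iommu_virt_to_phys(void *vaddr) { return __sme_set(virt_to_phys(vaddr)); }
static void *iommu_phys_to_virt(uint64_t paddr) { return phys_to_virt(__sme_clr(paddr)); }

int main(void)
{
	int x;

	/* the tagged value round-trips back to the same virtual address */
	assert(iommu_phys_to_virt(iommu_virt_to_phys(&x)) == &x);
	return 0;
}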
+1 -1
drivers/iommu/amd_iommu_types.h
··· 344 345 #define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) 346 #define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P) 347 - #define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK)) 348 #define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07) 349 350 #define IOMMU_PROT_MASK 0x03
··· 344 345 #define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) 346 #define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P) 347 + #define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK)) 348 #define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07) 349 350 #define IOMMU_PROT_MASK 0x03
+12 -11
drivers/sfi/sfi_core.c
··· 68 #include <linux/init.h> 69 #include <linux/sfi.h> 70 #include <linux/slab.h> 71 72 #include "sfi_core.h" 73 ··· 87 /* 88 * FW creates and saves the SFI tables in memory. When these tables get 89 * used, they may need to be mapped to virtual address space, and the mapping 90 - * can happen before or after the ioremap() is ready, so a flag is needed 91 * to indicating this 92 */ 93 - static u32 sfi_use_ioremap __read_mostly; 94 95 /* 96 - * sfi_un/map_memory calls early_ioremap/iounmap which is a __init function 97 * and introduces section mismatch. So use __ref to make it calm. 98 */ 99 static void __iomem * __ref sfi_map_memory(u64 phys, u32 size) ··· 101 if (!phys || !size) 102 return NULL; 103 104 - if (sfi_use_ioremap) 105 - return ioremap_cache(phys, size); 106 else 107 - return early_ioremap(phys, size); 108 } 109 110 static void __ref sfi_unmap_memory(void __iomem *virt, u32 size) ··· 112 if (!virt || !size) 113 return; 114 115 - if (sfi_use_ioremap) 116 - iounmap(virt); 117 else 118 - early_iounmap(virt, size); 119 } 120 121 static void sfi_print_table_header(unsigned long long pa, ··· 508 length = syst_va->header.len; 509 sfi_unmap_memory(syst_va, sizeof(struct sfi_table_simple)); 510 511 - /* Use ioremap now after it is ready */ 512 - sfi_use_ioremap = 1; 513 syst_va = sfi_map_memory(syst_pa, length); 514 515 sfi_acpi_init();
··· 68 #include <linux/init.h> 69 #include <linux/sfi.h> 70 #include <linux/slab.h> 71 + #include <linux/io.h> 72 73 #include "sfi_core.h" 74 ··· 86 /* 87 * FW creates and saves the SFI tables in memory. When these tables get 88 * used, they may need to be mapped to virtual address space, and the mapping 89 + * can happen before or after the memremap() is ready, so a flag is needed 90 * to indicating this 91 */ 92 + static u32 sfi_use_memremap __read_mostly; 93 94 /* 95 + * sfi_un/map_memory calls early_memremap/memunmap which is a __init function 96 * and introduces section mismatch. So use __ref to make it calm. 97 */ 98 static void __iomem * __ref sfi_map_memory(u64 phys, u32 size) ··· 100 if (!phys || !size) 101 return NULL; 102 103 + if (sfi_use_memremap) 104 + return memremap(phys, size, MEMREMAP_WB); 105 else 106 + return early_memremap(phys, size); 107 } 108 109 static void __ref sfi_unmap_memory(void __iomem *virt, u32 size) ··· 111 if (!virt || !size) 112 return; 113 114 + if (sfi_use_memremap) 115 + memunmap(virt); 116 else 117 + early_memunmap(virt, size); 118 } 119 120 static void sfi_print_table_header(unsigned long long pa, ··· 507 length = syst_va->header.len; 508 sfi_unmap_memory(syst_va, sizeof(struct sfi_table_simple)); 509 510 + /* Use memremap now after it is ready */ 511 + sfi_use_memremap = 1; 512 syst_va = sfi_map_memory(syst_pa, length); 513 514 sfi_acpi_init();
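SFI tables live in ordinary RAM, so mapping them with memremap()/early_memremap() rather than ioremap() lets the architecture apply the right encryption attributes; the early/late dispatch flag stays the same. A toy model of that dispatch, with stub mappers standing in for the real APIs:

#include <stdio.h>

static int use_late_mapper;                      /* analogous to sfi_use_memremap */

static void *early_map(unsigned long long phys, unsigned int size)
{
	printf("early map of %#llx (%u bytes)\n", phys, size);
	return (void *)0;                        /* stub */
}

static void *late_map(unsigned long long phys, unsigned int size)
{
	printf("late map of %#llx (%u bytes)\n", phys, size);
	return (void *)0;                        /* stub */
}

static void *map_table(unsigned long long phys, unsigned int size)
{
	return use_late_mapper ? late_map(phys, size) : early_map(phys, size);
}

int main(void)
{
	map_table(0xfee00000ULL, 4096);          /* boot-time path */
	use_late_mapper = 1;                     /* flipped once the regular mapper is usable */
	map_table(0xfee00000ULL, 4096);          /* regular path */
	return 0;
}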
+12
drivers/video/fbdev/core/fbmem.c
··· 32 #include <linux/device.h> 33 #include <linux/efi.h> 34 #include <linux/fb.h> 35 36 #include <asm/fb.h> 37 ··· 1397 mutex_lock(&info->mm_lock); 1398 if (fb->fb_mmap) { 1399 int res; 1400 res = fb->fb_mmap(info, vma); 1401 mutex_unlock(&info->mm_lock); 1402 return res; ··· 1428 mutex_unlock(&info->mm_lock); 1429 1430 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 1431 fb_pgprotect(file, vma, start); 1432 1433 return vm_iomap_memory(vma, start, len);
··· 32 #include <linux/device.h> 33 #include <linux/efi.h> 34 #include <linux/fb.h> 35 + #include <linux/mem_encrypt.h> 36 37 #include <asm/fb.h> 38 ··· 1396 mutex_lock(&info->mm_lock); 1397 if (fb->fb_mmap) { 1398 int res; 1399 + 1400 + /* 1401 + * The framebuffer needs to be accessed decrypted, be sure 1402 + * SME protection is removed ahead of the call 1403 + */ 1404 + vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); 1405 res = fb->fb_mmap(info, vma); 1406 mutex_unlock(&info->mm_lock); 1407 return res; ··· 1421 mutex_unlock(&info->mm_lock); 1422 1423 vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); 1424 + /* 1425 + * The framebuffer needs to be accessed decrypted, be sure 1426 + * SME protection is removed 1427 + */ 1428 + vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot); 1429 fb_pgprotect(file, vma, start); 1430 1431 return vm_iomap_memory(vma, start, len);
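Framebuffer memory is device memory and must be accessed decrypted, so the encryption attribute is cleared from vma->vm_page_prot before either mmap path runs. A toy model of "clear one attribute bit in a protection word before the driver hook sees it"; the bit value and the hook are invented:

#include <stdint.h>
#include <stdio.h>

#define _PAGE_RW  (1ULL << 1)
#define _PAGE_ENC (1ULL << 47)                   /* pretend encryption attribute */

typedef uint64_t pgprot_t;

static pgprot_t pgprot_decrypted(pgprot_t prot) { return prot & ~_PAGE_ENC; }

static int fake_fb_mmap(pgprot_t prot)           /* stands in for fb->fb_mmap() */
{
	printf("driver sees prot %#llx (ENC bit %s)\n",
	       (unsigned long long)prot, (prot & _PAGE_ENC) ? "set" : "clear");
	return 0;
}

int main(void)
{
	pgprot_t prot = _PAGE_RW | _PAGE_ENC;    /* default protections would carry the C-bit */

	prot = pgprot_decrypted(prot);           /* stripped before the driver hook runs */
	return fake_fb_mmap(prot);
}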
+2
include/asm-generic/early_ioremap.h
··· 13 unsigned long size); 14 extern void *early_memremap_ro(resource_size_t phys_addr, 15 unsigned long size); 16 extern void early_iounmap(void __iomem *addr, unsigned long size); 17 extern void early_memunmap(void *addr, unsigned long size); 18
··· 13 unsigned long size); 14 extern void *early_memremap_ro(resource_size_t phys_addr, 15 unsigned long size); 16 + extern void *early_memremap_prot(resource_size_t phys_addr, 17 + unsigned long size, unsigned long prot_val); 18 extern void early_iounmap(void __iomem *addr, unsigned long size); 19 extern void early_memunmap(void *addr, unsigned long size); 20
+12
include/asm-generic/pgtable.h
··· 583 #endif /* CONFIG_MMU */ 584 585 /* 586 * A facility to provide lazy MMU batching. This allows PTE updates and 587 * page invalidations to be delayed until a call to leave lazy MMU mode 588 * is issued. Some architectures may benefit from doing this, and it is
··· 583 #endif /* CONFIG_MMU */ 584 585 /* 586 + * No-op macros that just return the current protection value. Defined here 587 + * because these macros can be used even if CONFIG_MMU is not defined. 588 + */ 589 + #ifndef pgprot_encrypted 590 + #define pgprot_encrypted(prot) (prot) 591 + #endif 592 + 593 + #ifndef pgprot_decrypted 594 + #define pgprot_decrypted(prot) (prot) 595 + #endif 596 + 597 + /* 598 * A facility to provide lazy MMU batching. This allows PTE updates and 599 * page invalidations to be delayed until a call to leave lazy MMU mode 600 * is issued. Some architectures may benefit from doing this, and it is
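The generic header supplies identity fallbacks so common code can call pgprot_encrypted()/pgprot_decrypted() unconditionally; an architecture that implements SME simply defines them first. A minimal compile-and-run demonstration of that pattern (the pretend arch definition and bit position are hypothetical):

#include <stdio.h>

#define pgprot_decrypted(prot) ((prot) & ~(1ULL << 47))  /* pretend arch-provided version */

#ifndef pgprot_decrypted
#define pgprot_decrypted(prot) (prot)                    /* generic no-op fallback */
#endif

#ifndef pgprot_encrypted
#define pgprot_encrypted(prot) (prot)                    /* generic no-op fallback */
#endif

int main(void)
{
	unsigned long long prot = (1ULL << 1) | (1ULL << 47);

	/* the "arch" version strips bit 47; comment out the first #define and
	 * rebuild to watch the no-op default take over instead */
	printf("decrypted: %#llx -> %#llx\n", prot, pgprot_decrypted(prot));
	printf("encrypted: %#llx\n", pgprot_encrypted(prot));
	return 0;
}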
+2
include/linux/compiler-gcc.h
··· 166 167 #if GCC_VERSION >= 40100 168 # define __compiletime_object_size(obj) __builtin_object_size(obj, 0) 169 #endif 170 171 #if GCC_VERSION >= 40300
··· 166 167 #if GCC_VERSION >= 40100 168 # define __compiletime_object_size(obj) __builtin_object_size(obj, 0) 169 + 170 + #define __nostackprotector __attribute__((__optimize__("no-stack-protector"))) 171 #endif 172 173 #if GCC_VERSION >= 40300
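__nostackprotector exists so the SME early-boot code can opt individual functions out of stack-protector instrumentation before the canary infrastructure is usable. A standalone example of defining and applying such a per-function opt-out, with a fallback for compilers that lack the attribute (the function body is just a placeholder):

#include <string.h>
#include <stdio.h>

#if defined(__GNUC__) && !defined(__clang__)
#define __nostackprotector __attribute__((__optimize__("no-stack-protector")))
#else
#define __nostackprotector
#endif

static void __nostackprotector early_copy(char *dst, const char *src)
{
	char tmp[32];                            /* would normally attract a canary */

	strncpy(tmp, src, sizeof(tmp) - 1);
	tmp[sizeof(tmp) - 1] = '\0';
	strcpy(dst, tmp);
}

int main(void)
{
	char buf[32];

	early_copy(buf, "runs before the canary exists");
	puts(buf);
	return 0;
}

Building with something like "gcc -fstack-protector-strong" shows the point: the file-wide hardening stays on, only the marked function is exempt.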
+4
include/linux/compiler.h
··· 501 #define __visible 502 #endif 503 504 /* 505 * Assume alignment of return value. 506 */
··· 501 #define __visible 502 #endif 503 504 + #ifndef __nostackprotector 505 + # define __nostackprotector 506 + #endif 507 + 508 /* 509 * Assume alignment of return value. 510 */
+13
include/linux/dma-mapping.h
··· 10 #include <linux/scatterlist.h> 11 #include <linux/kmemcheck.h> 12 #include <linux/bug.h> 13 14 /** 15 * List of possible attributes associated with a DMA mapping. The semantics ··· 573 return 0; 574 } 575 576 static inline int dma_supported(struct device *dev, u64 mask) 577 { 578 const struct dma_map_ops *ops = get_dma_ops(dev); ··· 595 { 596 if (!dev->dma_mask || !dma_supported(dev, mask)) 597 return -EIO; 598 *dev->dma_mask = mask; 599 return 0; 600 } ··· 617 { 618 if (!dma_supported(dev, mask)) 619 return -EIO; 620 dev->coherent_dma_mask = mask; 621 return 0; 622 }
··· 10 #include <linux/scatterlist.h> 11 #include <linux/kmemcheck.h> 12 #include <linux/bug.h> 13 + #include <linux/mem_encrypt.h> 14 15 /** 16 * List of possible attributes associated with a DMA mapping. The semantics ··· 572 return 0; 573 } 574 575 + static inline void dma_check_mask(struct device *dev, u64 mask) 576 + { 577 + if (sme_active() && (mask < (((u64)sme_get_me_mask() << 1) - 1))) 578 + dev_warn(dev, "SME is active, device will require DMA bounce buffers\n"); 579 + } 580 + 581 static inline int dma_supported(struct device *dev, u64 mask) 582 { 583 const struct dma_map_ops *ops = get_dma_ops(dev); ··· 588 { 589 if (!dev->dma_mask || !dma_supported(dev, mask)) 590 return -EIO; 591 + 592 + dma_check_mask(dev, mask); 593 + 594 *dev->dma_mask = mask; 595 return 0; 596 } ··· 607 { 608 if (!dma_supported(dev, mask)) 609 return -EIO; 610 + 611 + dma_check_mask(dev, mask); 612 + 613 dev->coherent_dma_mask = mask; 614 return 0; 615 }
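The warning fires when a device's DMA mask cannot express addresses that carry the encryption bit, i.e. when the mask is smaller than (sme_mask << 1) - 1; such devices end up going through SWIOTLB bounce buffers. A small model of that threshold check, assuming a hypothetical C-bit at bit 47:

#include <stdint.h>
#include <stdio.h>

static uint64_t sme_me_mask = 1ULL << 47;        /* pretend encryption bit */

static void dma_check_mask(const char *dev, uint64_t mask)
{
	if (sme_me_mask && mask < ((sme_me_mask << 1) - 1))
		printf("%s: SME active, device will need DMA bounce buffers\n", dev);
}

int main(void)
{
	dma_check_mask("32-bit-dev", (1ULL << 32) - 1);  /* warns */
	dma_check_mask("48-bit-dev", (1ULL << 48) - 1);  /* ok: covers bit 47 */
	dma_check_mask("64-bit-dev", ~0ULL);             /* ok */
	return 0;
}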
+8 -1
include/linux/efi.h
··· 985 extern int efi_config_parse_tables(void *config_tables, int count, int sz, 986 efi_config_table_type_t *arch_tables); 987 extern u64 efi_get_iobase (void); 988 - extern u32 efi_mem_type (unsigned long phys_addr); 989 extern u64 efi_mem_attributes (unsigned long phys_addr); 990 extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size); 991 extern int __init efi_uart_console_only (void); ··· 1113 return test_bit(feature, &efi.flags) != 0; 1114 } 1115 extern void efi_reboot(enum reboot_mode reboot_mode, const char *__unused); 1116 #else 1117 static inline bool efi_enabled(int feature) 1118 { ··· 1125 1126 static inline bool 1127 efi_capsule_pending(int *reset_type) 1128 { 1129 return false; 1130 }
··· 985 extern int efi_config_parse_tables(void *config_tables, int count, int sz, 986 efi_config_table_type_t *arch_tables); 987 extern u64 efi_get_iobase (void); 988 + extern int efi_mem_type(unsigned long phys_addr); 989 extern u64 efi_mem_attributes (unsigned long phys_addr); 990 extern u64 efi_mem_attribute (unsigned long phys_addr, unsigned long size); 991 extern int __init efi_uart_console_only (void); ··· 1113 return test_bit(feature, &efi.flags) != 0; 1114 } 1115 extern void efi_reboot(enum reboot_mode reboot_mode, const char *__unused); 1116 + 1117 + extern bool efi_is_table_address(unsigned long phys_addr); 1118 #else 1119 static inline bool efi_enabled(int feature) 1120 { ··· 1123 1124 static inline bool 1125 efi_capsule_pending(int *reset_type) 1126 + { 1127 + return false; 1128 + } 1129 + 1130 + static inline bool efi_is_table_address(unsigned long phys_addr) 1131 { 1132 return false; 1133 }
+2
include/linux/io.h
··· 157 MEMREMAP_WB = 1 << 0, 158 MEMREMAP_WT = 1 << 1, 159 MEMREMAP_WC = 1 << 2, 160 }; 161 162 void *memremap(resource_size_t offset, size_t size, unsigned long flags);
··· 157 MEMREMAP_WB = 1 << 0, 158 MEMREMAP_WT = 1 << 1, 159 MEMREMAP_WC = 1 << 2, 160 + MEMREMAP_ENC = 1 << 3, 161 + MEMREMAP_DEC = 1 << 4, 162 }; 163 164 void *memremap(resource_size_t offset, size_t size, unsigned long flags);
+8
include/linux/kexec.h
··· 327 return phys_to_virt(boot_phys_to_phys(entry)); 328 } 329 330 #else /* !CONFIG_KEXEC_CORE */ 331 struct pt_regs; 332 struct task_struct;
··· 327 return phys_to_virt(boot_phys_to_phys(entry)); 328 } 329 330 + #ifndef arch_kexec_post_alloc_pages 331 + static inline int arch_kexec_post_alloc_pages(void *vaddr, unsigned int pages, gfp_t gfp) { return 0; } 332 + #endif 333 + 334 + #ifndef arch_kexec_pre_free_pages 335 + static inline void arch_kexec_pre_free_pages(void *vaddr, unsigned int pages) { } 336 + #endif 337 + 338 #else /* !CONFIG_KEXEC_CORE */ 339 struct pt_regs; 340 struct task_struct;
+48
include/linux/mem_encrypt.h
···
··· 1 + /* 2 + * AMD Memory Encryption Support 3 + * 4 + * Copyright (C) 2016 Advanced Micro Devices, Inc. 5 + * 6 + * Author: Tom Lendacky <thomas.lendacky@amd.com> 7 + * 8 + * This program is free software; you can redistribute it and/or modify 9 + * it under the terms of the GNU General Public License version 2 as 10 + * published by the Free Software Foundation. 11 + */ 12 + 13 + #ifndef __MEM_ENCRYPT_H__ 14 + #define __MEM_ENCRYPT_H__ 15 + 16 + #ifndef __ASSEMBLY__ 17 + 18 + #ifdef CONFIG_ARCH_HAS_MEM_ENCRYPT 19 + 20 + #include <asm/mem_encrypt.h> 21 + 22 + #else /* !CONFIG_ARCH_HAS_MEM_ENCRYPT */ 23 + 24 + #define sme_me_mask 0UL 25 + 26 + #endif /* CONFIG_ARCH_HAS_MEM_ENCRYPT */ 27 + 28 + static inline bool sme_active(void) 29 + { 30 + return !!sme_me_mask; 31 + } 32 + 33 + static inline unsigned long sme_get_me_mask(void) 34 + { 35 + return sme_me_mask; 36 + } 37 + 38 + /* 39 + * The __sme_set() and __sme_clr() macros are useful for adding or removing 40 + * the encryption mask from a value (e.g. when dealing with pagetable 41 + * entries). 42 + */ 43 + #define __sme_set(x) ((unsigned long)(x) | sme_me_mask) 44 + #define __sme_clr(x) ((unsigned long)(x) & ~sme_me_mask) 45 + 46 + #endif /* __ASSEMBLY__ */ 47 + 48 + #endif /* __MEM_ENCRYPT_H__ */
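Everything in this header keys off a single mask that is zero unless the architecture provides one, so sme_active() is false and __sme_set()/__sme_clr() collapse to identity operations on non-SME builds. A runnable model of that behaviour; the mask is a runtime variable here purely for demonstration, whereas in the kernel it is fixed by the architecture at boot:

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

static uint64_t sme_me_mask;                     /* 0: SME off, the generic default */

#define __sme_set(x) ((uint64_t)(x) | sme_me_mask)
#define __sme_clr(x) ((uint64_t)(x) & ~sme_me_mask)

static bool sme_active(void) { return sme_me_mask != 0; }

static void show(uint64_t phys)
{
	printf("active=%d  set=%#llx  clr(set)=%#llx\n", sme_active(),
	       (unsigned long long)__sme_set(phys),
	       (unsigned long long)__sme_clr(__sme_set(phys)));
}

int main(void)
{
	uint64_t phys = 0x123456000ULL;

	show(phys);                              /* mask 0: both macros are identity */
	sme_me_mask = 1ULL << 47;                /* pretend the arch detected SME */
	show(phys);                              /* now the C-bit is added and removed */
	return 0;
}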
+6
include/linux/mm_inline.h
··· 126 127 #define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) 128 129 #endif
··· 126 127 #define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) 128 129 + #ifdef arch_unmap_kpfn 130 + extern void arch_unmap_kpfn(unsigned long pfn); 131 + #else 132 + static __always_inline void arch_unmap_kpfn(unsigned long pfn) { } 133 + #endif 134 + 135 #endif
+1
include/linux/swiotlb.h
··· 35 extern unsigned long swiotlb_nr_tbl(void); 36 unsigned long swiotlb_size_or_default(void); 37 extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); 38 39 /* 40 * Enumeration for sync targets
··· 35 extern unsigned long swiotlb_nr_tbl(void); 36 unsigned long swiotlb_size_or_default(void); 37 extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); 38 + extern void __init swiotlb_update_mem_attributes(void); 39 40 /* 41 * Enumeration for sync targets
+10
init/main.c
··· 487 } 488 #endif 489 490 /* 491 * Set up kernel memory allocators 492 */ ··· 641 * too: 642 */ 643 locking_selftest(); 644 645 #ifdef CONFIG_BLK_DEV_INITRD 646 if (initrd_start && !initrd_below_start_ok &&
··· 487 } 488 #endif 489 490 + void __init __weak mem_encrypt_init(void) { } 491 + 492 /* 493 * Set up kernel memory allocators 494 */ ··· 639 * too: 640 */ 641 locking_selftest(); 642 + 643 + /* 644 + * This needs to be called before any devices perform DMA 645 + * operations that might use the SWIOTLB bounce buffers. It will 646 + * mark the bounce buffers as decrypted so that their usage will 647 + * not cause "plain-text" data to be decrypted when accessed. 648 + */ 649 + mem_encrypt_init(); 650 651 #ifdef CONFIG_BLK_DEV_INITRD 652 if (initrd_start && !initrd_below_start_ok &&
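start_kernel() can call mem_encrypt_init() unconditionally because the generic kernel supplies an empty __weak definition; x86 links in the real one that fixes up the SWIOTLB buffers before any DMA happens. A single-file sketch of the weak-default idiom (it builds and runs as-is; linking in an object that defines a non-weak mem_encrypt_init() would silently replace the default):

#include <stdio.h>

void __attribute__((weak)) mem_encrypt_init(void)
{
	/* generic case: nothing to set up */
}

int main(void)
{
	/* called once, early, exactly like the start_kernel() call above */
	mem_encrypt_init();
	puts("continuing boot after mem_encrypt_init()");
	return 0;
}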
+11 -1
kernel/kexec_core.c
··· 301 { 302 struct page *pages; 303 304 - pages = alloc_pages(gfp_mask, order); 305 if (pages) { 306 unsigned int count, i; 307 ··· 310 count = 1 << order; 311 for (i = 0; i < count; i++) 312 SetPageReserved(pages + i); 313 } 314 315 return pages; ··· 328 329 order = page_private(page); 330 count = 1 << order; 331 for (i = 0; i < count; i++) 332 ClearPageReserved(page + i); 333 __free_pages(page, order);
··· 301 { 302 struct page *pages; 303 304 + pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order); 305 if (pages) { 306 unsigned int count, i; 307 ··· 310 count = 1 << order; 311 for (i = 0; i < count; i++) 312 SetPageReserved(pages + i); 313 + 314 + arch_kexec_post_alloc_pages(page_address(pages), count, 315 + gfp_mask); 316 + 317 + if (gfp_mask & __GFP_ZERO) 318 + for (i = 0; i < count; i++) 319 + clear_highpage(pages + i); 320 } 321 322 return pages; ··· 321 322 order = page_private(page); 323 count = 1 << order; 324 + 325 + arch_kexec_pre_free_pages(page_address(page), count); 326 + 327 for (i = 0; i < count; i++) 328 ClearPageReserved(page + i); 329 __free_pages(page, order);
+16 -4
kernel/memremap.c
··· 34 } 35 #endif 36 37 - static void *try_ram_remap(resource_size_t offset, size_t size) 38 { 39 unsigned long pfn = PHYS_PFN(offset); 40 41 /* In the simple case just return the existing linear address */ 42 - if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn))) 43 return __va(offset); 44 return NULL; /* fallback to arch_memremap_wb */ 45 } 46 ··· 59 * memremap() - remap an iomem_resource as cacheable memory 60 * @offset: iomem resource start address 61 * @size: size of remap 62 - * @flags: any of MEMREMAP_WB, MEMREMAP_WT and MEMREMAP_WC 63 * 64 * memremap() is "ioremap" for cases where it is known that the resource 65 * being mapped does not have i/o side effects and the __iomem ··· 107 * the requested range is potentially in System RAM. 108 */ 109 if (is_ram == REGION_INTERSECTS) 110 - addr = try_ram_remap(offset, size); 111 if (!addr) 112 addr = arch_memremap_wb(offset, size); 113 }
··· 34 } 35 #endif 36 37 + #ifndef arch_memremap_can_ram_remap 38 + static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, 39 + unsigned long flags) 40 + { 41 + return true; 42 + } 43 + #endif 44 + 45 + static void *try_ram_remap(resource_size_t offset, size_t size, 46 + unsigned long flags) 47 { 48 unsigned long pfn = PHYS_PFN(offset); 49 50 /* In the simple case just return the existing linear address */ 51 + if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) && 52 + arch_memremap_can_ram_remap(offset, size, flags)) 53 return __va(offset); 54 + 55 return NULL; /* fallback to arch_memremap_wb */ 56 } 57 ··· 48 * memremap() - remap an iomem_resource as cacheable memory 49 * @offset: iomem resource start address 50 * @size: size of remap 51 + * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC, 52 + * MEMREMAP_ENC, MEMREMAP_DEC 53 * 54 * memremap() is "ioremap" for cases where it is known that the resource 55 * being mapped does not have i/o side effects and the __iomem ··· 95 * the requested range is potentially in System RAM. 96 */ 97 if (is_ram == REGION_INTERSECTS) 98 + addr = try_ram_remap(offset, size, flags); 99 if (!addr) 100 addr = arch_memremap_wb(offset, size); 101 }
+49 -8
lib/swiotlb.c
··· 30 #include <linux/highmem.h> 31 #include <linux/gfp.h> 32 #include <linux/scatterlist.h> 33 34 #include <asm/io.h> 35 #include <asm/dma.h> ··· 156 return size ? size : (IO_TLB_DEFAULT_SIZE); 157 } 158 159 /* Note that this doesn't work with highmem page */ 160 static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, 161 volatile void *address) ··· 191 (unsigned long long)io_tlb_start, 192 (unsigned long long)io_tlb_end, 193 bytes >> 20, vstart, vend - 1); 194 } 195 196 int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) ··· 355 io_tlb_start = virt_to_phys(tlb); 356 io_tlb_end = io_tlb_start + bytes; 357 358 memset(tlb, 0, bytes); 359 360 /* ··· 366 if (!v_overflow_buffer) 367 goto cleanup2; 368 369 io_tlb_overflow_buffer = virt_to_phys(v_overflow_buffer); 370 371 /* ··· 507 if (no_iotlb_memory) 508 panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); 509 510 mask = dma_get_seg_boundary(hwdev); 511 512 tbl_dma_addr &= mask; ··· 622 return SWIOTLB_MAP_ERROR; 623 } 624 625 - start_dma_addr = phys_to_dma(hwdev, io_tlb_start); 626 return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size, 627 dir, attrs); 628 } ··· 743 goto err_warn; 744 745 ret = phys_to_virt(paddr); 746 - dev_addr = phys_to_dma(hwdev, paddr); 747 748 /* Confirm address can be DMA'd by device */ 749 if (dev_addr + size - 1 > dma_mask) { ··· 853 map = map_single(dev, phys, size, dir, attrs); 854 if (map == SWIOTLB_MAP_ERROR) { 855 swiotlb_full(dev, size, dir, 1); 856 - return phys_to_dma(dev, io_tlb_overflow_buffer); 857 } 858 859 - dev_addr = phys_to_dma(dev, map); 860 861 /* Ensure that the address returned is DMA'ble */ 862 if (dma_capable(dev, dev_addr, size)) ··· 865 attrs |= DMA_ATTR_SKIP_CPU_SYNC; 866 swiotlb_tbl_unmap_single(dev, map, size, dir, attrs); 867 868 - return phys_to_dma(dev, io_tlb_overflow_buffer); 869 } 870 EXPORT_SYMBOL_GPL(swiotlb_map_page); 871 ··· 999 sg_dma_len(sgl) = 0; 1000 return 0; 1001 } 1002 - sg->dma_address = phys_to_dma(hwdev, map); 1003 } else 1004 sg->dma_address = dev_addr; 1005 sg_dma_len(sg) = sg->length; ··· 1067 int 1068 swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) 1069 { 1070 - return (dma_addr == phys_to_dma(hwdev, io_tlb_overflow_buffer)); 1071 } 1072 EXPORT_SYMBOL(swiotlb_dma_mapping_error); 1073 ··· 1080 int 1081 swiotlb_dma_supported(struct device *hwdev, u64 mask) 1082 { 1083 - return phys_to_dma(hwdev, io_tlb_end - 1) <= mask; 1084 } 1085 EXPORT_SYMBOL(swiotlb_dma_supported);
··· 30 #include <linux/highmem.h> 31 #include <linux/gfp.h> 32 #include <linux/scatterlist.h> 33 + #include <linux/mem_encrypt.h> 34 35 #include <asm/io.h> 36 #include <asm/dma.h> ··· 155 return size ? size : (IO_TLB_DEFAULT_SIZE); 156 } 157 158 + void __weak swiotlb_set_mem_attributes(void *vaddr, unsigned long size) { } 159 + 160 + /* For swiotlb, clear memory encryption mask from dma addresses */ 161 + static dma_addr_t swiotlb_phys_to_dma(struct device *hwdev, 162 + phys_addr_t address) 163 + { 164 + return __sme_clr(phys_to_dma(hwdev, address)); 165 + } 166 + 167 /* Note that this doesn't work with highmem page */ 168 static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, 169 volatile void *address) ··· 181 (unsigned long long)io_tlb_start, 182 (unsigned long long)io_tlb_end, 183 bytes >> 20, vstart, vend - 1); 184 + } 185 + 186 + /* 187 + * Early SWIOTLB allocation may be too early to allow an architecture to 188 + * perform the desired operations. This function allows the architecture to 189 + * call SWIOTLB when the operations are possible. It needs to be called 190 + * before the SWIOTLB memory is used. 191 + */ 192 + void __init swiotlb_update_mem_attributes(void) 193 + { 194 + void *vaddr; 195 + unsigned long bytes; 196 + 197 + if (no_iotlb_memory || late_alloc) 198 + return; 199 + 200 + vaddr = phys_to_virt(io_tlb_start); 201 + bytes = PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT); 202 + swiotlb_set_mem_attributes(vaddr, bytes); 203 + memset(vaddr, 0, bytes); 204 + 205 + vaddr = phys_to_virt(io_tlb_overflow_buffer); 206 + bytes = PAGE_ALIGN(io_tlb_overflow); 207 + swiotlb_set_mem_attributes(vaddr, bytes); 208 + memset(vaddr, 0, bytes); 209 } 210 211 int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) ··· 320 io_tlb_start = virt_to_phys(tlb); 321 io_tlb_end = io_tlb_start + bytes; 322 323 + swiotlb_set_mem_attributes(tlb, bytes); 324 memset(tlb, 0, bytes); 325 326 /* ··· 330 if (!v_overflow_buffer) 331 goto cleanup2; 332 333 + swiotlb_set_mem_attributes(v_overflow_buffer, io_tlb_overflow); 334 + memset(v_overflow_buffer, 0, io_tlb_overflow); 335 io_tlb_overflow_buffer = virt_to_phys(v_overflow_buffer); 336 337 /* ··· 469 if (no_iotlb_memory) 470 panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); 471 472 + if (sme_active()) 473 + pr_warn_once("SME is active and system is using DMA bounce buffers\n"); 474 + 475 mask = dma_get_seg_boundary(hwdev); 476 477 tbl_dma_addr &= mask; ··· 581 return SWIOTLB_MAP_ERROR; 582 } 583 584 + start_dma_addr = swiotlb_phys_to_dma(hwdev, io_tlb_start); 585 return swiotlb_tbl_map_single(hwdev, start_dma_addr, phys, size, 586 dir, attrs); 587 } ··· 702 goto err_warn; 703 704 ret = phys_to_virt(paddr); 705 + dev_addr = swiotlb_phys_to_dma(hwdev, paddr); 706 707 /* Confirm address can be DMA'd by device */ 708 if (dev_addr + size - 1 > dma_mask) { ··· 812 map = map_single(dev, phys, size, dir, attrs); 813 if (map == SWIOTLB_MAP_ERROR) { 814 swiotlb_full(dev, size, dir, 1); 815 + return swiotlb_phys_to_dma(dev, io_tlb_overflow_buffer); 816 } 817 818 + dev_addr = swiotlb_phys_to_dma(dev, map); 819 820 /* Ensure that the address returned is DMA'ble */ 821 if (dma_capable(dev, dev_addr, size)) ··· 824 attrs |= DMA_ATTR_SKIP_CPU_SYNC; 825 swiotlb_tbl_unmap_single(dev, map, size, dir, attrs); 826 827 + return swiotlb_phys_to_dma(dev, io_tlb_overflow_buffer); 828 } 829 EXPORT_SYMBOL_GPL(swiotlb_map_page); 830 ··· 958 sg_dma_len(sgl) = 0; 959 return 0; 960 } 961 + sg->dma_address = 
swiotlb_phys_to_dma(hwdev, map); 962 } else 963 sg->dma_address = dev_addr; 964 sg_dma_len(sg) = sg->length; ··· 1026 int 1027 swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr) 1028 { 1029 + return (dma_addr == swiotlb_phys_to_dma(hwdev, io_tlb_overflow_buffer)); 1030 } 1031 EXPORT_SYMBOL(swiotlb_dma_mapping_error); 1032 ··· 1039 int 1040 swiotlb_dma_supported(struct device *hwdev, u64 mask) 1041 { 1042 + return swiotlb_phys_to_dma(hwdev, io_tlb_end - 1) <= mask; 1043 } 1044 EXPORT_SYMBOL(swiotlb_dma_supported);
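With SME active, the SWIOTLB changes boil down to two things: the bounce-buffer memory is remapped decrypted once the architecture is ready, and every bus address handed to a device has the encryption bit stripped, with devices that cannot reach the real pages falling back to the bounce buffers. A toy model of that mapping decision; addresses, masks and the bounce region are invented, only the control flow mirrors the code:

#include <stdint.h>
#include <stdio.h>

#define SME_MASK   (1ULL << 47)                  /* hypothetical C-bit */
#define IO_TLB_BUS 0x00000000fee00000ULL         /* pretend bounce-buffer bus address */

static int dma_capable(uint64_t dev_mask, uint64_t bus, uint64_t size)
{
	return bus + size - 1 <= dev_mask;
}

static uint64_t map_page(uint64_t dev_mask, uint64_t phys, uint64_t size)
{
	uint64_t bus = phys & ~SME_MASK;         /* swiotlb_phys_to_dma(): strip the tag */

	if (dma_capable(dev_mask, bus, size))
		return bus;                      /* fast path: no bouncing needed */

	printf("bouncing %#llx through io_tlb\n", (unsigned long long)phys);
	return IO_TLB_BUS;                       /* device gets the bounce buffer instead */
}

int main(void)
{
	uint64_t phys = 0x1234567000ULL | SME_MASK;

	printf("64-bit dev -> %#llx\n", (unsigned long long)map_page(~0ULL, phys, 4096));
	printf("32-bit dev -> %#llx\n",
	       (unsigned long long)map_page((1ULL << 32) - 1, phys, 4096));
	return 0;
}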
+25 -3
mm/early_ioremap.c
··· 30 31 static int after_paging_init __initdata; 32 33 void __init __weak early_ioremap_shutdown(void) 34 { 35 } ··· 222 void __init * 223 early_memremap(resource_size_t phys_addr, unsigned long size) 224 { 225 - return (__force void *)__early_ioremap(phys_addr, size, 226 - FIXMAP_PAGE_NORMAL); 227 } 228 #ifdef FIXMAP_PAGE_RO 229 void __init * 230 early_memremap_ro(resource_size_t phys_addr, unsigned long size) 231 { 232 - return (__force void *)__early_ioremap(phys_addr, size, FIXMAP_PAGE_RO); 233 } 234 #endif 235
··· 30 31 static int after_paging_init __initdata; 32 33 + pgprot_t __init __weak early_memremap_pgprot_adjust(resource_size_t phys_addr, 34 + unsigned long size, 35 + pgprot_t prot) 36 + { 37 + return prot; 38 + } 39 + 40 void __init __weak early_ioremap_shutdown(void) 41 { 42 } ··· 215 void __init * 216 early_memremap(resource_size_t phys_addr, unsigned long size) 217 { 218 + pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size, 219 + FIXMAP_PAGE_NORMAL); 220 + 221 + return (__force void *)__early_ioremap(phys_addr, size, prot); 222 } 223 #ifdef FIXMAP_PAGE_RO 224 void __init * 225 early_memremap_ro(resource_size_t phys_addr, unsigned long size) 226 { 227 + pgprot_t prot = early_memremap_pgprot_adjust(phys_addr, size, 228 + FIXMAP_PAGE_RO); 229 + 230 + return (__force void *)__early_ioremap(phys_addr, size, prot); 231 + } 232 + #endif 233 + 234 + #ifdef CONFIG_ARCH_USE_MEMREMAP_PROT 235 + void __init * 236 + early_memremap_prot(resource_size_t phys_addr, unsigned long size, 237 + unsigned long prot_val) 238 + { 239 + return (__force void *)__early_ioremap(phys_addr, size, 240 + __pgprot(prot_val)); 241 } 242 #endif 243
+2
mm/memory-failure.c
··· 1146 return 0; 1147 } 1148 1149 orig_head = hpage = compound_head(p); 1150 num_poisoned_pages_inc(); 1151
··· 1146 return 0; 1147 } 1148 1149 + arch_unmap_kpfn(pfn); 1150 + 1151 orig_head = hpage = compound_head(p); 1152 num_poisoned_pages_inc(); 1153