Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Ingo Molnar:
"Misc fixes: a binutils fix, an lguest fix, an mcelog fix and a missing
documentation fix"

* 'x86-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/mce: Avoid using object after free in genpool
lguest, x86/entry/32: Fix handling of guest syscalls using interrupt gates
x86/build: Build compressed x86 kernels as PIE
x86/mm/pkeys: Add missing Documentation

+89 -5
+27
Documentation/x86/protection-keys.txt
··· 1 + Memory Protection Keys for Userspace (PKU aka PKEYs) is a CPU feature 2 + which will be found on future Intel CPUs. 3 + 4 + Memory Protection Keys provides a mechanism for enforcing page-based 5 + protections, but without requiring modification of the page tables 6 + when an application changes protection domains. It works by 7 + dedicating 4 previously ignored bits in each page table entry to a 8 + "protection key", giving 16 possible keys. 9 + 10 + There is also a new user-accessible register (PKRU) with two separate 11 + bits (Access Disable and Write Disable) for each key. Being a CPU 12 + register, PKRU is inherently thread-local, potentially giving each 13 + thread a different set of protections from every other thread. 14 + 15 + There are two new instructions (RDPKRU/WRPKRU) for reading and writing 16 + to the new register. The feature is only available in 64-bit mode, 17 + even though there is theoretically space in the PAE PTEs. These 18 + permissions are enforced on data access only and have no effect on 19 + instruction fetches. 20 + 21 + =========================== Config Option =========================== 22 + 23 + This config option adds approximately 1.5kb of text and 50 bytes of 24 + data to the executable. A workload which does large O_DIRECT reads 25 + of holes in XFS files was run to exercise get_user_pages_fast(). No 26 + performance delta was observed with the config option 27 + enabled or disabled.
+13 -1
arch/x86/boot/compressed/Makefile
··· 26 26 vmlinux.bin.xz vmlinux.bin.lzo vmlinux.bin.lz4 27 27 28 28 KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 29 - KBUILD_CFLAGS += -fno-strict-aliasing -fPIC 29 + KBUILD_CFLAGS += -fno-strict-aliasing $(call cc-option, -fPIE, -fPIC) 30 30 KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING 31 31 cflags-$(CONFIG_X86_32) := -march=i386 32 32 cflags-$(CONFIG_X86_64) := -mcmodel=small ··· 40 40 UBSAN_SANITIZE :=n 41 41 42 42 LDFLAGS := -m elf_$(UTS_MACHINE) 43 + ifeq ($(CONFIG_RELOCATABLE),y) 44 + # If kernel is relocatable, build compressed kernel as PIE. 45 + ifeq ($(CONFIG_X86_32),y) 46 + LDFLAGS += $(call ld-option, -pie) $(call ld-option, --no-dynamic-linker) 47 + else 48 + # To build 64-bit compressed kernel as PIE, we disable relocation 49 + # overflow check to avoid relocation overflow error with a new linker 50 + # command-line option, -z noreloc-overflow. 51 + LDFLAGS += $(shell $(LD) --help 2>&1 | grep -q "\-z noreloc-overflow" \ 52 + && echo "-z noreloc-overflow -pie --no-dynamic-linker") 53 + endif 54 + endif 43 55 LDFLAGS_vmlinux := -T 44 56 45 57 hostprogs-y := mkpiggy
+28
arch/x86/boot/compressed/head_32.S
··· 31 31 #include <asm/asm-offsets.h> 32 32 #include <asm/bootparam.h> 33 33 34 + /* 35 + * The 32-bit x86 assembler in binutils 2.26 will generate R_386_GOT32X 36 + * relocation to get the symbol address in PIC. When the compressed x86 37 + * kernel isn't built as PIC, the linker optimizes R_386_GOT32X 38 + * relocations to their fixed symbol addresses. However, when the 39 + * compressed x86 kernel is loaded at a different address, it leads 40 + * to the following load failure: 41 + * 42 + * Failed to allocate space for phdrs 43 + * 44 + * during the decompression stage. 45 + * 46 + * If the compressed x86 kernel is relocatable at run-time, it should be 47 + * compiled with -fPIE, instead of -fPIC, if possible and should be built as 48 + * Position Independent Executable (PIE) so that linker won't optimize 49 + * R_386_GOT32X relocation to its fixed symbol address. Older 50 + * linkers generate R_386_32 relocations against locally defined symbols, 51 + * _bss, _ebss, _got and _egot, in PIE. It isn't wrong, just less 52 + * optimal than R_386_RELATIVE. But the x86 kernel fails to properly handle 53 + * R_386_32 relocations when relocating the kernel. To generate 54 + * R_386_RELATIVE relocations, we mark _bss, _ebss, _got and _egot as 55 + * hidden: 56 + */ 57 + .hidden _bss 58 + .hidden _ebss 59 + .hidden _got 60 + .hidden _egot 61 + 34 62 __HEAD 35 63 ENTRY(startup_32) 36 64 #ifdef CONFIG_EFI_STUB
+8
arch/x86/boot/compressed/head_64.S
··· 33 33 #include <asm/asm-offsets.h> 34 34 #include <asm/bootparam.h> 35 35 36 + /* 37 + * Locally defined symbols should be marked hidden: 38 + */ 39 + .hidden _bss 40 + .hidden _ebss 41 + .hidden _got 42 + .hidden _egot 43 + 36 44 __HEAD 37 45 .code32 38 46 ENTRY(startup_32)
+2 -2
arch/x86/kernel/cpu/mcheck/mce-genpool.c
··· 29 29 void mce_gen_pool_process(void) 30 30 { 31 31 struct llist_node *head; 32 - struct mce_evt_llist *node; 32 + struct mce_evt_llist *node, *tmp; 33 33 struct mce *mce; 34 34 35 35 head = llist_del_all(&mce_event_llist); ··· 37 37 return; 38 38 39 39 head = llist_reverse_order(head); 40 - llist_for_each_entry(node, head, llnode) { 40 + llist_for_each_entry_safe(node, tmp, head, llnode) { 41 41 mce = &node->mce; 42 42 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); 43 43 gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node));
+5 -1
drivers/lguest/interrupts_and_traps.c
··· 331 331 * Actually now I think of it, it's possible that Ron *is* half the Plan 9 332 332 * userbase. Oh well. 333 333 */ 334 - static bool could_be_syscall(unsigned int num) 334 + bool could_be_syscall(unsigned int num) 335 335 { 336 336 /* Normal Linux IA32_SYSCALL_VECTOR or reserved vector? */ 337 337 return num == IA32_SYSCALL_VECTOR || num == syscall_vector; ··· 416 416 * 417 417 * This routine indicates if a particular trap number could be delivered 418 418 * directly. 419 + * 420 + * Unfortunately, Linux 4.6 started using an interrupt gate instead of a 421 + * trap gate for syscalls, so this trick is ineffective. See Mastery for 422 + * how we could do this anyway... 419 423 */ 420 424 static bool direct_trap(unsigned int num) 421 425 {
+1
drivers/lguest/lg.h
··· 167 167 bool send_notify_to_eventfd(struct lg_cpu *cpu); 168 168 void init_clockdev(struct lg_cpu *cpu); 169 169 bool check_syscall_vector(struct lguest *lg); 170 + bool could_be_syscall(unsigned int num); 170 171 int init_interrupts(void); 171 172 void free_interrupts(void); 172 173
+5 -1
drivers/lguest/x86/core.c
··· 429 429 return; 430 430 break; 431 431 case 32 ... 255: 432 + /* This might be a syscall. */ 433 + if (could_be_syscall(cpu->regs->trapnum)) 434 + break; 435 + 432 436 /* 433 - * These values mean a real interrupt occurred, in which case 437 + * Other values mean a real interrupt occurred, in which case 434 438 * the Host handler has already been run. We just do a 435 439 * friendly check if another process should now be run, then 436 440 * return to run the Guest again.