Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

s390/kvm: avoid global config of vm.alloc_pgste=1

The system control vm.alloc_pgste is used to control the size of the
page tables, either 2K or 4K. The idea is that a KVM host sets the
vm.alloc_pgste control to 1 which causes *all* new processes to run
with 4K page tables. For a non-kvm system the control should stay off
to save on memory used for page tables.

The trouble is that distributions tend to set the control globally so
that KVM guests can be run. This wastes memory on non-KVM systems.

Introduce the PT_S390_PGSTE ELF segment type to "mark" the qemu
executable. All executables with this (empty) segment in their ELF
phdr array will be started with 4K page tables. Any executable
without PT_S390_PGSTE will run with the default 2K page tables.

This removes the need to set vm.alloc_pgste=1 for a KVM host and
minimizes the waste of memory for page tables.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>

+53 -2
+1
arch/s390/Kconfig
··· 64 64 65 65 config S390 66 66 def_bool y 67 + select ARCH_BINFMT_ELF_STATE 67 68 select ARCH_HAS_DEVMEM_IS_ALLOWED 68 69 select ARCH_HAS_ELF_RANDOMIZE 69 70 select ARCH_HAS_GCOV_PROFILE_ALL
+32
arch/s390/include/asm/elf.h
··· 117 117 #define ELF_DATA ELFDATA2MSB 118 118 #define ELF_ARCH EM_S390 119 119 120 + /* s390 specific phdr types */ 121 + #define PT_S390_PGSTE 0x70000000 122 + 120 123 /* 121 124 * ELF register definitions.. 122 125 */ ··· 153 150 (((x)->e_machine == EM_S390 || (x)->e_machine == EM_S390_OLD) \ 154 151 && (x)->e_ident[EI_CLASS] == ELF_CLASS) 155 152 #define compat_start_thread start_thread31 153 + 154 + struct arch_elf_state { 155 + int rc; 156 + }; 157 + 158 + #define INIT_ARCH_ELF_STATE { .rc = 0 } 159 + 160 + #define arch_check_elf(ehdr, interp, interp_ehdr, state) (0) 161 + #ifdef CONFIG_PGSTE 162 + #define arch_elf_pt_proc(ehdr, phdr, elf, interp, state) \ 163 + ({ \ 164 + struct arch_elf_state *_state = state; \ 165 + if ((phdr)->p_type == PT_S390_PGSTE && \ 166 + !page_table_allocate_pgste && \ 167 + !test_thread_flag(TIF_PGSTE) && \ 168 + !current->mm->context.alloc_pgste) { \ 169 + set_thread_flag(TIF_PGSTE); \ 170 + set_pt_regs_flag(task_pt_regs(current), \ 171 + PIF_SYSCALL_RESTART); \ 172 + _state->rc = -EAGAIN; \ 173 + } \ 174 + _state->rc; \ 175 + }) 176 + #else 177 + #define arch_elf_pt_proc(ehdr, phdr, elf, interp, state) \ 178 + ({ \ 179 + (state)->rc; \ 180 + }) 181 + #endif 156 182 157 183 /* For SVR4/S390 the function pointer to be registered with `atexit` is 158 184 passed in R14. */
+3 -1
arch/s390/include/asm/mmu_context.h
··· 25 25 mm->context.gmap_asce = 0; 26 26 mm->context.flush_mm = 0; 27 27 #ifdef CONFIG_PGSTE 28 - mm->context.alloc_pgste = page_table_allocate_pgste; 28 + mm->context.alloc_pgste = page_table_allocate_pgste || 29 + test_thread_flag(TIF_PGSTE) || 30 + current->mm->context.alloc_pgste; 29 31 mm->context.has_pgste = 0; 30 32 mm->context.use_skey = 0; 31 33 mm->context.use_cmma = 0;
+2
arch/s390/include/asm/ptrace.h
··· 11 11 12 12 #define PIF_SYSCALL 0 /* inside a system call */ 13 13 #define PIF_PER_TRAP 1 /* deliver sigtrap on return to user */ 14 + #define PIF_SYSCALL_RESTART 2 /* restart the current system call */ 14 15 15 16 #define _PIF_SYSCALL _BITUL(PIF_SYSCALL) 16 17 #define _PIF_PER_TRAP _BITUL(PIF_PER_TRAP) 18 + #define _PIF_SYSCALL_RESTART _BITUL(PIF_SYSCALL_RESTART) 17 19 18 20 #ifndef __ASSEMBLY__ 19 21
+1
arch/s390/include/asm/thread_info.h
··· 58 58 #define TIF_UPROBE 3 /* breakpointed or single-stepping */ 59 59 #define TIF_GUARDED_STORAGE 4 /* load guarded storage control block */ 60 60 #define TIF_PATCH_PENDING 5 /* pending live patching update */ 61 + #define TIF_PGSTE 6 /* New mm's will use 4K page tables */ 61 62 62 63 #define TIF_31BIT 16 /* 32bit process */ 63 64 #define TIF_MEMDIE 17 /* is terminating due to OOM killer */
+14 -1
arch/s390/kernel/entry.S
··· 52 52 _TIF_SYSCALL_TRACEPOINT) 53 53 _CIF_WORK = (_CIF_MCCK_PENDING | _CIF_ASCE_PRIMARY | \ 54 54 _CIF_ASCE_SECONDARY | _CIF_FPU) 55 - _PIF_WORK = (_PIF_PER_TRAP) 55 + _PIF_WORK = (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART) 56 56 57 57 #define BASED(name) name-cleanup_critical(%r13) 58 58 ··· 334 334 jo .Lsysc_mcck_pending 335 335 TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED 336 336 jo .Lsysc_reschedule 337 + TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL_RESTART 338 + jo .Lsysc_syscall_restart 337 339 #ifdef CONFIG_UPROBES 338 340 TSTMSK __TI_flags(%r12),_TIF_UPROBE 339 341 jo .Lsysc_uprobe_notify ··· 349 347 jo .Lsysc_patch_pending # handle live patching just before 350 348 # signals and possible syscall restart 351 349 #endif 350 + TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL_RESTART 351 + jo .Lsysc_syscall_restart 352 352 TSTMSK __TI_flags(%r12),_TIF_SIGPENDING 353 353 jo .Lsysc_sigpending 354 354 TSTMSK __TI_flags(%r12),_TIF_NOTIFY_RESUME ··· 450 446 lgr %r2,%r11 # pass pointer to pt_regs 451 447 larl %r14,.Lsysc_return 452 448 jg do_per_trap 449 + 450 + # 451 + # _PIF_SYSCALL_RESTART is set, repeat the current system call 452 + # 453 + .Lsysc_syscall_restart: 454 + ni __PT_FLAGS+7(%r11),255-_PIF_SYSCALL_RESTART 455 + lmg %r1,%r7,__PT_R1(%r11) # load svc arguments 456 + lg %r2,__PT_ORIG_GPR2(%r11) 457 + j .Lsysc_do_svc 453 458 454 459 # 455 460 # call tracehook_report_syscall_entry/tracehook_report_syscall_exit before