Merge tag 'x86_urgent_for_v5.11_rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Borislav Petkov:
"As expected, fixes started trickling in after the holidays so here is
the accumulated pile of x86 fixes for 5.11:

- A fix for fanotify_mark() on native x86-32: it was missed in the
conversion of 32-bit native syscalls taking 64-bit arguments to the
compat handlers because it has a generic compat handler. (Brian Gerst)

- Add a forgotten pmd page destructor call to pud_free_pmd_page()
where a pmd page is freed. (Dan Williams)

- Make the handling of IN/OUT insns with a u8 immediate port operand
more precise for SEV-ES guests by using only the single port byte and
not the whole s32 value of the insn decoder. (Peter Gonda)

- Correct a straddling end range check before returning the proper
MTRR type, when the end address is the same as the top of memory.
(Ying-Tsun Huang)

- Change PQR_ASSOC MSR update scheme when moving a task to a resctrl
resource group to avoid significant performance overhead with some
resctrl workloads. (Fenghua Yu)

- Avoid the actual task move overhead when the task is already in the
resource group. (Fenghua Yu)"

* tag 'x86_urgent_for_v5.11_rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
x86/resctrl: Don't move a task to the same resource group
x86/resctrl: Use an IPI instead of task_work_add() to update PQR_ASSOC MSR
x86/mtrr: Correct the range check before performing MTRR type lookups
x86/sev-es: Fix SEV-ES OUT/IN immediate opcode vc handling
x86/mm: Fix leak of pmd ptlock
fanotify: Fix sys_fanotify_mark() on native x86-32

+94 -83
+6
arch/Kconfig
@@ -1105,6 +1105,12 @@
 config ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	bool
 
+config ARCH_SPLIT_ARG64
+	bool
+	help
+	  If a 32-bit architecture requires 64-bit arguments to be split into
+	  pairs of 32-bit arguments, select this option.
+
 source "kernel/gcov/Kconfig"
 
 source "scripts/gcc-plugins/Kconfig"
+1
arch/x86/Kconfig
@@ -19,6 +19,7 @@
 	select KMAP_LOCAL
 	select MODULES_USE_ELF_REL
 	select OLD_SIGACTION
+	select ARCH_SPLIT_ARG64
 
 config X86_64
 	def_bool y
+3 -3
arch/x86/kernel/cpu/mtrr/generic.c
@@ -167,9 +167,6 @@
 	*repeat = 0;
 	*uniform = 1;
 
-	/* Make end inclusive instead of exclusive */
-	end--;
-
 	prev_match = MTRR_TYPE_INVALID;
 	for (i = 0; i < num_var_ranges; ++i) {
 		unsigned short start_state, end_state, inclusive;
@@ -257,6 +260,9 @@
 	u8 type, prev_type, is_uniform = 1, dummy;
 	int repeat;
 	u64 partial_end;
+
+	/* Make end inclusive instead of exclusive */
+	end--;
 
 	if (!mtrr_state_set)
 		return MTRR_TYPE_INVALID;
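
For intuition, a tiny userspace sketch (made-up addresses, nothing from the
kernel) of why the inclusive conversion has to happen before the range
checks: with an exclusive end, a lookup that ends exactly at the top of
memory appears to reach past the last valid byte.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t top = 0x100000000ULL;	/* hypothetical top of memory: 4 GiB */
	uint64_t end = 0x100000000ULL;	/* exclusive end of the queried range */

	/* Exclusive end compared against the last valid byte: rejected. */
	printf("before end--: in range? %d\n", end <= top - 1);

	end--;	/* make end inclusive first, as the fix does */
	printf("after  end--: in range? %d\n", end <= top - 1);
	return 0;
}
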
+49 -68
arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -525,89 +525,70 @@
 	kfree(rdtgrp);
 }
 
-struct task_move_callback {
-	struct callback_head	work;
-	struct rdtgroup		*rdtgrp;
-};
-
-static void move_myself(struct callback_head *head)
+static void _update_task_closid_rmid(void *task)
 {
-	struct task_move_callback *callback;
-	struct rdtgroup *rdtgrp;
-
-	callback = container_of(head, struct task_move_callback, work);
-	rdtgrp = callback->rdtgrp;
-
 	/*
-	 * If resource group was deleted before this task work callback
-	 * was invoked, then assign the task to root group and free the
-	 * resource group.
+	 * If the task is still current on this CPU, update PQR_ASSOC MSR.
+	 * Otherwise, the MSR is updated when the task is scheduled in.
	 */
-	if (atomic_dec_and_test(&rdtgrp->waitcount) &&
-	    (rdtgrp->flags & RDT_DELETED)) {
-		current->closid = 0;
-		current->rmid = 0;
-		rdtgroup_remove(rdtgrp);
-	}
+	if (task == current)
+		resctrl_sched_in();
+}
 
-	if (unlikely(current->flags & PF_EXITING))
-		goto out;
-
-	preempt_disable();
-	/* update PQR_ASSOC MSR to make resource group go into effect */
-	resctrl_sched_in();
-	preempt_enable();
-
-out:
-	kfree(callback);
+static void update_task_closid_rmid(struct task_struct *t)
+{
+	if (IS_ENABLED(CONFIG_SMP) && task_curr(t))
+		smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1);
+	else
+		_update_task_closid_rmid(t);
 }
 
 static int __rdtgroup_move_task(struct task_struct *tsk,
				struct rdtgroup *rdtgrp)
 {
-	struct task_move_callback *callback;
-	int ret;
-
-	callback = kzalloc(sizeof(*callback), GFP_KERNEL);
-	if (!callback)
-		return -ENOMEM;
-	callback->work.func = move_myself;
-	callback->rdtgrp = rdtgrp;
+	/* If the task is already in rdtgrp, no need to move the task. */
+	if ((rdtgrp->type == RDTCTRL_GROUP && tsk->closid == rdtgrp->closid &&
+	     tsk->rmid == rdtgrp->mon.rmid) ||
+	    (rdtgrp->type == RDTMON_GROUP && tsk->rmid == rdtgrp->mon.rmid &&
+	     tsk->closid == rdtgrp->mon.parent->closid))
+		return 0;
 
 	/*
-	 * Take a refcount, so rdtgrp cannot be freed before the
-	 * callback has been invoked.
+	 * Set the task's closid/rmid before the PQR_ASSOC MSR can be
+	 * updated by them.
+	 *
+	 * For ctrl_mon groups, move both closid and rmid.
+	 * For monitor groups, can move the tasks only from
+	 * their parent CTRL group.
	 */
-	atomic_inc(&rdtgrp->waitcount);
-	ret = task_work_add(tsk, &callback->work, TWA_RESUME);
-	if (ret) {
-		/*
-		 * Task is exiting. Drop the refcount and free the callback.
-		 * No need to check the refcount as the group cannot be
-		 * deleted before the write function unlocks rdtgroup_mutex.
-		 */
-		atomic_dec(&rdtgrp->waitcount);
-		kfree(callback);
-		rdt_last_cmd_puts("Task exited\n");
-	} else {
-		/*
-		 * For ctrl_mon groups move both closid and rmid.
-		 * For monitor groups, can move the tasks only from
-		 * their parent CTRL group.
-		 */
-		if (rdtgrp->type == RDTCTRL_GROUP) {
-			tsk->closid = rdtgrp->closid;
+
+	if (rdtgrp->type == RDTCTRL_GROUP) {
+		tsk->closid = rdtgrp->closid;
+		tsk->rmid = rdtgrp->mon.rmid;
+	} else if (rdtgrp->type == RDTMON_GROUP) {
+		if (rdtgrp->mon.parent->closid == tsk->closid) {
 			tsk->rmid = rdtgrp->mon.rmid;
-		} else if (rdtgrp->type == RDTMON_GROUP) {
-			if (rdtgrp->mon.parent->closid == tsk->closid) {
-				tsk->rmid = rdtgrp->mon.rmid;
-			} else {
-				rdt_last_cmd_puts("Can't move task to different control group\n");
-				ret = -EINVAL;
-			}
+		} else {
+			rdt_last_cmd_puts("Can't move task to different control group\n");
+			return -EINVAL;
 		}
 	}
-	return ret;
+
+	/*
+	 * Ensure the task's closid and rmid are written before determining if
+	 * the task is current that will decide if it will be interrupted.
+	 */
+	barrier();
+
+	/*
+	 * By now, the task's closid and rmid are set. If the task is current
+	 * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource
+	 * group go into effect. If the task is not current, the MSR will be
+	 * updated when the task is scheduled in.
+	 */
+	update_task_closid_rmid(tsk);
+
+	return 0;
 }
 
 static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
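
The shape of the new scheme, as a hedged userspace mock-up: write the new
closid/rmid first, then poke the task's CPU only if the task is running
right now. The IPI is stubbed out as a direct call here; task_curr() and
smp_call_function_single() exist only in the kernel.

#include <stdio.h>
#include <stdbool.h>

struct task { int closid; int rmid; bool on_cpu; int cpu; };

/* Stands in for the IPI target: rewrites PQR_ASSOC on the task's CPU. */
static void _update_task_closid_rmid(void *t)
{
	printf("refresh PQR_ASSOC on cpu %d\n", ((struct task *)t)->cpu);
}

static void update_task_closid_rmid(struct task *t)
{
	if (t->on_cpu)			/* kernel: task_curr(t) */
		_update_task_closid_rmid(t);	/* kernel: IPI to task_cpu(t) */
	/* else: nothing to do, the MSR is written at the next sched-in */
}

int main(void)
{
	struct task t = { .closid = 0, .rmid = 0, .on_cpu = true, .cpu = 3 };

	t.closid = 1;			/* move the task to another group... */
	t.rmid = 2;
	update_task_closid_rmid(&t);	/* ...and make it take effect now */
	return 0;
}
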
+2 -2
arch/x86/kernel/sev-es-shared.c
@@ -305,14 +305,14 @@
 	case 0xe4:
 	case 0xe5:
 		*exitinfo |= IOIO_TYPE_IN;
-		*exitinfo |= (u64)insn->immediate.value << 16;
+		*exitinfo |= (u8)insn->immediate.value << 16;
 		break;
 
 	/* OUT immediate opcodes */
 	case 0xe6:
 	case 0xe7:
 		*exitinfo |= IOIO_TYPE_OUT;
-		*exitinfo |= (u64)insn->immediate.value << 16;
+		*exitinfo |= (u8)insn->immediate.value << 16;
 		break;
 
 	/* IN register opcodes */
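
The one-byte cast matters because insn->immediate.value is a signed 32-bit
field: if the decoded immediate carries set high bits, the old (u64) cast
smears them across exitinfo above the port field. A standalone demo with an
illustrative port of 0x80:

#include <stdio.h>
#include <inttypes.h>

int main(void)
{
	int32_t imm = (int8_t)0x80;	/* port 0x80, sign-extended to -128 */

	uint64_t bad  = (uint64_t)imm << 16;	/* old cast: high bits polluted */
	uint64_t good = (uint8_t)imm << 16;	/* new cast: only the port byte */

	printf("(u64) cast: %#" PRIx64 "\n", bad);	/* 0xffffffffff800000 */
	printf("(u8)  cast: %#" PRIx64 "\n", good);	/* 0x800000 */
	return 0;
}
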
+2
arch/x86/mm/pgtable.c
@@ -829,6 +829,8 @@
 	}
 
 	free_page((unsigned long)pmd_sv);
+
+	pgtable_pmd_page_dtor(virt_to_page(pmd));
 	free_page((unsigned long)pmd);
 
 	return 1;
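
The missing call is the usual ctor/dtor pairing rule: the pmd page had
pgtable_pmd_page_ctor() run on it, so the ptlock it set up must be torn
down before the page is handed back. A rough userspace analogue of that
invariant (illustrative names, not kernel API):

#include <stdlib.h>
#include <pthread.h>

struct page_like {
	pthread_mutex_t *ptl;	/* stands in for the split pmd ptlock */
};

static int page_ctor(struct page_like *p)
{
	p->ptl = malloc(sizeof(*p->ptl));	/* what the ctor allocates */
	return p->ptl ? pthread_mutex_init(p->ptl, NULL) : -1;
}

static void page_dtor(struct page_like *p)
{
	pthread_mutex_destroy(p->ptl);
	free(p->ptl);		/* the teardown the fix restores */
}

int main(void)
{
	struct page_like p;

	if (page_ctor(&p))
		return 1;
	page_dtor(&p);	/* skipping this before freeing the page is the leak */
	return 0;
}
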
+7 -10
fs/notify/fanotify/fanotify_user.c
@@ -1285,26 +1285,23 @@
 	return ret;
 }
 
+#ifndef CONFIG_ARCH_SPLIT_ARG64
 SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags,
			      __u64, mask, int, dfd,
			      const char  __user *, pathname)
 {
 	return do_fanotify_mark(fanotify_fd, flags, mask, dfd, pathname);
 }
+#endif
 
-#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE6(fanotify_mark,
+#if defined(CONFIG_ARCH_SPLIT_ARG64) || defined(CONFIG_COMPAT)
+SYSCALL32_DEFINE6(fanotify_mark,
				int, fanotify_fd, unsigned int, flags,
-				__u32, mask0, __u32, mask1, int, dfd,
+				SC_ARG64(mask), int, dfd,
				const char  __user *, pathname)
 {
-	return do_fanotify_mark(fanotify_fd, flags,
-#ifdef __BIG_ENDIAN
-				((__u64)mask0 << 32) | mask1,
-#else
-				((__u64)mask1 << 32) | mask0,
-#endif
-				dfd, pathname);
+	return do_fanotify_mark(fanotify_fd, flags, SC_VAL64(__u64, mask),
+				dfd, pathname);
 }
 #endif
 
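
From the caller's side nothing changes: on a 32-bit little-endian build the
64-bit mask has always occupied two argument slots, low half first, which is
the layout the new SYSCALL32_DEFINE6 handler now matches. A raw-syscall
sketch of that split, normally hidden inside the libc wrapper (32-bit build
only; the file descriptors are deliberately bogus):

#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	uint64_t mask = 0x0000000100000008ULL;	/* made-up event mask */

	/* 32-bit ABI: the __u64 mask is passed as two u32 slots, lo then hi. */
	syscall(SYS_fanotify_mark, -1, 0U,
		(uint32_t)mask, (uint32_t)(mask >> 32),
		-1, "/tmp");	/* fails with EBADF; only the split matters here */
	return 0;
}
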
+24
include/linux/syscalls.h
@@ -251,6 +251,30 @@
 	static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
 #endif /* __SYSCALL_DEFINEx */
 
+/* For split 64-bit arguments on 32-bit architectures */
+#ifdef __LITTLE_ENDIAN
+#define SC_ARG64(name) u32, name##_lo, u32, name##_hi
+#else
+#define SC_ARG64(name) u32, name##_hi, u32, name##_lo
+#endif
+#define SC_VAL64(type, name) ((type) name##_hi << 32 | name##_lo)
+
+#ifdef CONFIG_COMPAT
+#define SYSCALL32_DEFINE1 COMPAT_SYSCALL_DEFINE1
+#define SYSCALL32_DEFINE2 COMPAT_SYSCALL_DEFINE2
+#define SYSCALL32_DEFINE3 COMPAT_SYSCALL_DEFINE3
+#define SYSCALL32_DEFINE4 COMPAT_SYSCALL_DEFINE4
+#define SYSCALL32_DEFINE5 COMPAT_SYSCALL_DEFINE5
+#define SYSCALL32_DEFINE6 COMPAT_SYSCALL_DEFINE6
+#else
+#define SYSCALL32_DEFINE1 SYSCALL_DEFINE1
+#define SYSCALL32_DEFINE2 SYSCALL_DEFINE2
+#define SYSCALL32_DEFINE3 SYSCALL_DEFINE3
+#define SYSCALL32_DEFINE4 SYSCALL_DEFINE4
+#define SYSCALL32_DEFINE5 SYSCALL_DEFINE5
+#define SYSCALL32_DEFINE6 SYSCALL_DEFINE6
+#endif
+
 /*
  * Called before coming back to user-mode. Returning to user-mode with an
  * address limit different than USER_DS can allow to overwrite kernel memory.
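
To see the macro mechanics outside the kernel, here is a minimal userspace
copy. Note one deliberate difference: the kernel's SC_ARG64 emits
comma-separated type/name pairs for the SYSCALL_DEFINEx machinery, while the
version below uses ordinary parameter syntax so it compiles standalone.

#include <stdio.h>
#include <inttypes.h>

typedef uint32_t u32;

/* Little-endian order assumed; a big-endian build swaps _lo and _hi. */
#define SC_ARG64(name) u32 name##_lo, u32 name##_hi
#define SC_VAL64(type, name) ((type) name##_hi << 32 | name##_lo)

/* Expands to: static uint64_t take_mask(u32 mask_lo, u32 mask_hi) */
static uint64_t take_mask(SC_ARG64(mask))
{
	return SC_VAL64(uint64_t, mask);
}

int main(void)
{
	/* Prints 0x123456789abcdef0: the two halves reassemble losslessly. */
	printf("%#" PRIx64 "\n", take_mask(0x9abcdef0u, 0x12345678u));
	return 0;
}
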