Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

x86/arch_prctl: Add controls for dynamic XSTATE components

Dynamically enabled XSTATE features are by default disabled for all
processes. A process has to request permission to use such a feature.

To support this, implement an architecture-specific prctl() with the options:

- ARCH_GET_XCOMP_SUPP

Copies the supported feature bitmap into the u64 storage provided by
user space. The pointer is handed in via arg2.

- ARCH_GET_XCOMP_PERM

Copies the process-wide permitted feature bitmap into the u64 storage
provided by user space. The pointer is handed in via arg2.

- ARCH_REQ_XCOMP_PERM

Request permission for a feature set. A feature set can be mapped to a
facility, e.g. AMX, and can require one or more XSTATE components to
be enabled.

The feature argument is the number of the highest XSTATE component
which is required for a facility to work.

The request argument is not a user-supplied bitmap because that would make
filtering harder (think seccomp), or even impossible: to support 32-bit
tasks the argument would have to be a pointer.

The permission mechanism works this way:

A task asks for permission for a facility and the kernel checks whether that
facility is supported. If it is supported, the kernel does the following:

1) Check whether permission has already been granted

2) Compute the required sizes of the kernel buffer and of the user space
buffer (sigframe).

3) Validate that no task has a sigaltstack installed
which is smaller than the resulting sigframe size

4) Add the requested feature bit(s) to the permission bitmap of
current->group_leader->fpu and store the sizes in the group
leader's fpu struct as well.

If that is successful then the feature is still not enabled for any of the
tasks. The first usage of a related instruction will result in a #NM
trap. The trap handler validates the permission bit of the task's group
leader and, if permitted, it installs a larger kernel buffer and transfers
the permission and size info to the new fpstate container which makes all
the FPU functions which require per task information aware of the extended
feature set.

[ tglx: Adapted to new base code, added missing serialization,
massaged namings, comments and changelog ]

Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Chang S. Bae <chang.seok.bae@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Link: https://lkml.kernel.org/r/20211021225527.10184-7-chang.seok.bae@intel.com

authored by

Chang S. Bae and committed by
Borislav Petkov
db8268df c33f0a81

+178 -3
+4
arch/x86/include/asm/fpu/api.h
··· 151 151 return gfpu->fpstate->is_confidential; 152 152 } 153 153 154 + /* prctl */ 155 + struct task_struct; 156 + extern long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2); 157 + 154 158 #endif /* _ASM_X86_FPU_API_H */
+1 -1
arch/x86/include/asm/proto.h
··· 40 40 extern int reboot_force; 41 41 42 42 long do_arch_prctl_common(struct task_struct *task, int option, 43 - unsigned long cpuid_enabled); 43 + unsigned long arg2); 44 44 45 45 #endif /* _ASM_X86_PROTO_H */
+4
arch/x86/include/uapi/asm/prctl.h
··· 10 10 #define ARCH_GET_CPUID 0x1011 11 11 #define ARCH_SET_CPUID 0x1012 12 12 13 + #define ARCH_GET_XCOMP_SUPP 0x1021 14 + #define ARCH_GET_XCOMP_PERM 0x1022 15 + #define ARCH_REQ_XCOMP_PERM 0x1023 16 + 13 17 #define ARCH_MAP_VDSO_X32 0x2001 14 18 #define ARCH_MAP_VDSO_32 0x2002 15 19 #define ARCH_MAP_VDSO_64 0x2003
+156
arch/x86/kernel/fpu/xstate.c
··· 8 8 #include <linux/compat.h> 9 9 #include <linux/cpu.h> 10 10 #include <linux/mman.h> 11 + #include <linux/nospec.h> 11 12 #include <linux/pkeys.h> 12 13 #include <linux/seq_file.h> 13 14 #include <linux/proc_fs.h> ··· 19 18 #include <asm/fpu/xcr.h> 20 19 21 20 #include <asm/tlbflush.h> 21 + #include <asm/prctl.h> 22 + #include <asm/elf.h> 22 23 23 24 #include "internal.h" 24 25 #include "legacy.h" ··· 1300 1297 } 1301 1298 EXPORT_SYMBOL_GPL(fpstate_clear_xstate_component); 1302 1299 #endif 1300 + 1301 + #ifdef CONFIG_X86_64 1302 + static int validate_sigaltstack(unsigned int usize) 1303 + { 1304 + struct task_struct *thread, *leader = current->group_leader; 1305 + unsigned long framesize = get_sigframe_size(); 1306 + 1307 + lockdep_assert_held(&current->sighand->siglock); 1308 + 1309 + /* get_sigframe_size() is based on fpu_user_cfg.max_size */ 1310 + framesize -= fpu_user_cfg.max_size; 1311 + framesize += usize; 1312 + for_each_thread(leader, thread) { 1313 + if (thread->sas_ss_size && thread->sas_ss_size < framesize) 1314 + return -ENOSPC; 1315 + } 1316 + return 0; 1317 + } 1318 + 1319 + static int __xstate_request_perm(u64 permitted, u64 requested) 1320 + { 1321 + /* 1322 + * This deliberately does not exclude !XSAVES as we still might 1323 + * decide to optionally context switch XCR0 or talk the silicon 1324 + * vendors into extending XFD for the pre AMX states. 
1325 + */ 1326 + bool compacted = cpu_feature_enabled(X86_FEATURE_XSAVES); 1327 + struct fpu *fpu = &current->group_leader->thread.fpu; 1328 + unsigned int ksize, usize; 1329 + u64 mask; 1330 + int ret; 1331 + 1332 + /* Check whether fully enabled */ 1333 + if ((permitted & requested) == requested) 1334 + return 0; 1335 + 1336 + /* Calculate the resulting kernel state size */ 1337 + mask = permitted | requested; 1338 + ksize = xstate_calculate_size(mask, compacted); 1339 + 1340 + /* Calculate the resulting user state size */ 1341 + mask &= XFEATURE_MASK_USER_SUPPORTED; 1342 + usize = xstate_calculate_size(mask, false); 1343 + 1344 + ret = validate_sigaltstack(usize); 1345 + if (ret) 1346 + return ret; 1347 + 1348 + /* Pairs with the READ_ONCE() in xstate_get_group_perm() */ 1349 + WRITE_ONCE(fpu->perm.__state_perm, requested); 1350 + /* Protected by sighand lock */ 1351 + fpu->perm.__state_size = ksize; 1352 + fpu->perm.__user_state_size = usize; 1353 + return ret; 1354 + } 1355 + 1356 + /* 1357 + * Permissions array to map facilities with more than one component 1358 + */ 1359 + static const u64 xstate_prctl_req[XFEATURE_MAX] = { 1360 + /* [XFEATURE_XTILE_DATA] = XFEATURE_MASK_XTILE, */ 1361 + }; 1362 + 1363 + static int xstate_request_perm(unsigned long idx) 1364 + { 1365 + u64 permitted, requested; 1366 + int ret; 1367 + 1368 + if (idx >= XFEATURE_MAX) 1369 + return -EINVAL; 1370 + 1371 + /* 1372 + * Look up the facility mask which can require more than 1373 + * one xstate component. 
1374 + */ 1375 + idx = array_index_nospec(idx, ARRAY_SIZE(xstate_prctl_req)); 1376 + requested = xstate_prctl_req[idx]; 1377 + if (!requested) 1378 + return -EOPNOTSUPP; 1379 + 1380 + if ((fpu_user_cfg.max_features & requested) != requested) 1381 + return -EOPNOTSUPP; 1382 + 1383 + /* Lockless quick check */ 1384 + permitted = xstate_get_host_group_perm(); 1385 + if ((permitted & requested) == requested) 1386 + return 0; 1387 + 1388 + /* Protect against concurrent modifications */ 1389 + spin_lock_irq(&current->sighand->siglock); 1390 + permitted = xstate_get_host_group_perm(); 1391 + ret = __xstate_request_perm(permitted, requested); 1392 + spin_unlock_irq(&current->sighand->siglock); 1393 + return ret; 1394 + } 1395 + #else /* CONFIG_X86_64 */ 1396 + static inline int xstate_request_perm(unsigned long idx) 1397 + { 1398 + return -EPERM; 1399 + } 1400 + #endif /* !CONFIG_X86_64 */ 1401 + 1402 + /** 1403 + * fpu_xstate_prctl - xstate permission operations 1404 + * @tsk: Redundant pointer to current 1405 + * @option: A subfunction of arch_prctl() 1406 + * @arg2: option argument 1407 + * Return: 0 if successful; otherwise, an error code 1408 + * 1409 + * Option arguments: 1410 + * 1411 + * ARCH_GET_XCOMP_SUPP: Pointer to user space u64 to store the info 1412 + * ARCH_GET_XCOMP_PERM: Pointer to user space u64 to store the info 1413 + * ARCH_REQ_XCOMP_PERM: Facility number requested 1414 + * 1415 + * For facilities which require more than one XSTATE component, the request 1416 + * must be the highest state component number related to that facility, 1417 + * e.g. for AMX which requires XFEATURE_XTILE_CFG(17) and 1418 + * XFEATURE_XTILE_DATA(18) this would be XFEATURE_XTILE_DATA(18). 
1419 + */ 1420 + long fpu_xstate_prctl(struct task_struct *tsk, int option, unsigned long arg2) 1421 + { 1422 + u64 __user *uptr = (u64 __user *)arg2; 1423 + u64 permitted, supported; 1424 + unsigned long idx = arg2; 1425 + 1426 + if (tsk != current) 1427 + return -EPERM; 1428 + 1429 + switch (option) { 1430 + case ARCH_GET_XCOMP_SUPP: 1431 + supported = fpu_user_cfg.max_features | fpu_user_cfg.legacy_features; 1432 + return put_user(supported, uptr); 1433 + 1434 + case ARCH_GET_XCOMP_PERM: 1435 + /* 1436 + * Lockless snapshot as it can also change right after the 1437 + * dropping the lock. 1438 + */ 1439 + permitted = xstate_get_host_group_perm(); 1440 + permitted &= XFEATURE_MASK_USER_SUPPORTED; 1441 + return put_user(permitted, uptr); 1442 + 1443 + case ARCH_REQ_XCOMP_PERM: 1444 + if (!IS_ENABLED(CONFIG_X86_64)) 1445 + return -EOPNOTSUPP; 1446 + 1447 + return xstate_request_perm(idx); 1448 + 1449 + default: 1450 + return -EINVAL; 1451 + } 1452 + } 1303 1453 1304 1454 #ifdef CONFIG_PROC_PID_ARCH_STATUS 1305 1455 /*
+6
arch/x86/kernel/fpu/xstate.h
··· 15 15 xsave->header.xcomp_bv = mask | XCOMP_BV_COMPACTED_FORMAT; 16 16 } 17 17 18 + static inline u64 xstate_get_host_group_perm(void) 19 + { 20 + /* Pairs with WRITE_ONCE() in xstate_request_perm() */ 21 + return READ_ONCE(current->group_leader->thread.fpu.perm.__state_perm); 22 + } 23 + 18 24 enum xstate_copy_mode { 19 25 XSTATE_COPY_FP, 20 26 XSTATE_COPY_FX,
+7 -2
arch/x86/kernel/process.c
··· 30 30 #include <asm/apic.h> 31 31 #include <linux/uaccess.h> 32 32 #include <asm/mwait.h> 33 + #include <asm/fpu/api.h> 33 34 #include <asm/fpu/sched.h> 34 35 #include <asm/debugreg.h> 35 36 #include <asm/nmi.h> ··· 1004 1003 } 1005 1004 1006 1005 long do_arch_prctl_common(struct task_struct *task, int option, 1007 - unsigned long cpuid_enabled) 1006 + unsigned long arg2) 1008 1007 { 1009 1008 switch (option) { 1010 1009 case ARCH_GET_CPUID: 1011 1010 return get_cpuid_mode(); 1012 1011 case ARCH_SET_CPUID: 1013 - return set_cpuid_mode(task, cpuid_enabled); 1012 + return set_cpuid_mode(task, arg2); 1013 + case ARCH_GET_XCOMP_SUPP: 1014 + case ARCH_GET_XCOMP_PERM: 1015 + case ARCH_REQ_XCOMP_PERM: 1016 + return fpu_xstate_prctl(task, option, arg2); 1014 1017 } 1015 1018 1016 1019 return -EINVAL;