Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'cgroup-for-6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo:

- Per-cpu cpu usage stats are now tracked

This currently isn't printed out in the cgroupfs interface and can
only be accessed through e.g. BPF. We still need to decide on a
not-too-ugly way to show per-cpu stats in cgroupfs.

- cpuset received some cleanups and preparatory patches for the pending
cpus.exclusive patchset which will allow cpuset partitions to be
created below non-partition parents, which should ease the management
of partition cpusets

- A lot of code and documentation cleanup patches

- tools/testing/selftests/cgroup/test_cpuset.c added

* tag 'cgroup-for-6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (32 commits)
cgroup: Avoid -Wstringop-overflow warnings
cgroup:namespace: Remove unused cgroup_namespaces_init()
cgroup/rstat: Record the cumulative per-cpu time of cgroup and its descendants
cgroup: clean up if condition in cgroup_pidlist_start()
cgroup: fix obsolete function name in cgroup_destroy_locked()
Documentation: cgroup-v2.rst: Correct number of stats entries
cgroup: fix obsolete function name above css_free_rwork_fn()
cgroup/cpuset: fix kernel-doc
cgroup: clean up printk()
cgroup: fix obsolete comment above cgroup_create()
docs: cgroup-v1: fix typo
docs: cgroup-v1: correct the term of Page Cache organization in inode
cgroup/misc: Store atomic64_t reads to u64
cgroup/misc: Change counters to be explicit 64bit types
cgroup/misc: update struct members descriptions
cgroup: remove cgrp->kn check in css_populate_dir()
cgroup: fix obsolete function name
cgroup: use cached local variable parent in for loop
cgroup: remove obsolete comment above struct cgroupstats
cgroup: put cgroup_tryget_css() inside CONFIG_CGROUP_SCHED
...

+560 -204
+3 -3
Documentation/admin-guide/cgroup-v1/memory.rst
@@ -195,11 +195,11 @@

 RSS pages are accounted at page_fault unless they've already been accounted
 for earlier. A file page will be accounted for as Page Cache when it's
-inserted into inode (radix-tree). While it's mapped into the page tables of
+inserted into inode (xarray). While it's mapped into the page tables of
 processes, duplicate accounting is carefully avoided.

 An RSS page is unaccounted when it's fully unmapped. A PageCache page is
-unaccounted when it's removed from radix-tree. Even if RSS pages are fully
+unaccounted when it's removed from xarray. Even if RSS pages are fully
 unmapped (by kswapd), they may exist as SwapCache in the system until they
 are really freed. Such SwapCaches are also accounted.
 A swapped-in page is accounted after adding into swapcache.
@@ -907,7 +907,7 @@
 notification, i.e. groups A and B will not receive it. This is done to avoid
 excessive "broadcasting" of messages, which disturbs the system and which is
 especially bad if we are low on memory or thrashing. Group B, will receive
-notification only if there are no event listers for group C.
+notification only if there are no event listeners for group C.

 There are three optional modes that specify different propagation behavior:
+1 -1
Documentation/admin-guide/cgroup-v2.rst
@@ -1045,7 +1045,7 @@
	  - user_usec
	  - system_usec

-	and the following three when the controller is enabled:
+	and the following five when the controller is enabled:

	  - nr_periods
	  - nr_throttled
+2
MAINTAINERS
@@ -5255,6 +5255,8 @@
 F:	Documentation/admin-guide/cgroup-v1/cpusets.rst
 F:	include/linux/cpuset.h
 F:	kernel/cgroup/cpuset.c
+F:	tools/testing/selftests/cgroup/test_cpuset.c
+F:	tools/testing/selftests/cgroup/test_cpuset_prs.sh

 CONTROL GROUP - MEMORY RESOURCE CONTROLLER (MEMCG)
 M:	Johannes Weiner <hannes@cmpxchg.org>
+14
include/linux/cgroup-defs.h
@@ -342,6 +342,20 @@
	struct cgroup_base_stat last_bstat;

	/*
+	 * This field is used to record the cumulative per-cpu time of
+	 * the cgroup and its descendants. Currently it can be read via
+	 * eBPF/drgn etc, and we are still trying to determine how to
+	 * expose it in the cgroupfs interface.
+	 */
+	struct cgroup_base_stat subtree_bstat;
+
+	/*
+	 * Snapshots at the last reading. These are used to calculate the
+	 * deltas to propagate to the per-cpu subtree_bstat.
+	 */
+	struct cgroup_base_stat last_subtree_bstat;
+
+	/*
	 * Child cgroups with stat updates on this cpu since the last read
	 * are linked on the parent's ->updated_children through
	 * ->updated_next.
+13 -15
include/linux/misc_cgroup.h
@@ -31,17 +31,18 @@
  * struct misc_res: Per cgroup per misc type resource
  * @max: Maximum limit on the resource.
  * @usage: Current usage of the resource.
- * @failed: True if charged failed for the resource in a cgroup.
+ * @events: Number of times, the resource limit exceeded.
  */
 struct misc_res {
-	unsigned long max;
-	atomic_long_t usage;
-	atomic_long_t events;
+	u64 max;
+	atomic64_t usage;
+	atomic64_t events;
 };

 /**
  * struct misc_cg - Miscellaneous controller's cgroup structure.
  * @css: cgroup subsys state object.
+ * @events_file: Handle for the misc resources events file.
  * @res: Array of misc resources usage in the cgroup.
  */
 struct misc_cg {
@@ -54,12 +53,10 @@
	struct misc_res res[MISC_CG_RES_TYPES];
 };

-unsigned long misc_cg_res_total_usage(enum misc_res_type type);
-int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity);
-int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
-		       unsigned long amount);
-void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg,
-		      unsigned long amount);
+u64 misc_cg_res_total_usage(enum misc_res_type type);
+int misc_cg_set_capacity(enum misc_res_type type, u64 capacity);
+int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount);
+void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount);

 /**
  * css_misc() - Get misc cgroup from the css.
@@ -98,27 +99,26 @@

 #else /* !CONFIG_CGROUP_MISC */

-static inline unsigned long misc_cg_res_total_usage(enum misc_res_type type)
+static inline u64 misc_cg_res_total_usage(enum misc_res_type type)
 {
	return 0;
 }

-static inline int misc_cg_set_capacity(enum misc_res_type type,
-				       unsigned long capacity)
+static inline int misc_cg_set_capacity(enum misc_res_type type, u64 capacity)
 {
	return 0;
 }

 static inline int misc_cg_try_charge(enum misc_res_type type,
				     struct misc_cg *cg,
-				     unsigned long amount)
+				     u64 amount)
 {
	return 0;
 }

 static inline void misc_cg_uncharge(enum misc_res_type type,
				    struct misc_cg *cg,
-				    unsigned long amount)
+				    u64 amount)
 {
 }

-2
include/uapi/linux/cgroupstats.h
@@ -24,8 +24,6 @@
  * basis. This data is shared using taskstats.
  *
  * Most of these states are derived by looking at the task->state value
- * For the nr_io_wait state, a flag in the delay accounting structure
- * indicates that the task is waiting on IO
  *
  * Each member is aligned to a 8 byte boundary.
  */
+1 -1
kernel/cgroup/cgroup-v1.c
@@ -431,7 +431,7 @@
		if (l->list[mid] == pid) {
			index = mid;
			break;
-		} else if (l->list[mid] <= pid)
+		} else if (l->list[mid] < pid)
			index = mid + 1;
		else
			end = mid;
+41 -44
kernel/cgroup/cgroup.c
··· 493 493 } 494 494 495 495 /** 496 - * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem 497 - * @cgrp: the cgroup of interest 498 - * @ss: the subsystem of interest 499 - * 500 - * Find and get @cgrp's css associated with @ss. If the css doesn't exist 501 - * or is offline, %NULL is returned. 502 - */ 503 - static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, 504 - struct cgroup_subsys *ss) 505 - { 506 - struct cgroup_subsys_state *css; 507 - 508 - rcu_read_lock(); 509 - css = cgroup_css(cgrp, ss); 510 - if (css && !css_tryget_online(css)) 511 - css = NULL; 512 - rcu_read_unlock(); 513 - 514 - return css; 515 - } 516 - 517 - /** 518 496 * cgroup_e_css_by_mask - obtain a cgroup's effective css for the specified ss 519 497 * @cgrp: the cgroup of interest 520 498 * @ss: the subsystem of interest (%NULL returns @cgrp->self) ··· 657 679 * @ssid: the index of the subsystem, CGROUP_SUBSYS_COUNT after reaching the end 658 680 * @cgrp: the target cgroup to iterate css's of 659 681 * 660 - * Should be called under cgroup_[tree_]mutex. 682 + * Should be called under cgroup_mutex. 
661 683 */ 662 684 #define for_each_css(css, ssid, cgrp) \ 663 685 for ((ssid) = 0; (ssid) < CGROUP_SUBSYS_COUNT; (ssid)++) \ ··· 907 929 #define CSS_SET_HASH_BITS 7 908 930 static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); 909 931 910 - static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) 932 + static unsigned long css_set_hash(struct cgroup_subsys_state **css) 911 933 { 912 934 unsigned long key = 0UL; 913 935 struct cgroup_subsys *ss; ··· 1048 1070 */ 1049 1071 static struct css_set *find_existing_css_set(struct css_set *old_cset, 1050 1072 struct cgroup *cgrp, 1051 - struct cgroup_subsys_state *template[]) 1073 + struct cgroup_subsys_state **template) 1052 1074 { 1053 1075 struct cgroup_root *root = cgrp->root; 1054 1076 struct cgroup_subsys *ss; ··· 1714 1736 struct cftype *cfts, *failed_cfts; 1715 1737 int ret; 1716 1738 1717 - if ((css->flags & CSS_VISIBLE) || !cgrp->kn) 1739 + if (css->flags & CSS_VISIBLE) 1718 1740 return 0; 1719 1741 1720 1742 if (!css->ss) { ··· 2477 2499 2478 2500 /* 2479 2501 * This function may be called both before and 2480 - * after cgroup_taskset_migrate(). The two cases 2502 + * after cgroup_migrate_execute(). The two cases 2481 2503 * can be distinguished by looking at whether @cset 2482 2504 * has its ->mg_dst_cset set. 2483 2505 */ ··· 3632 3654 return 0; 3633 3655 } 3634 3656 3635 - static int __maybe_unused cgroup_extra_stat_show(struct seq_file *seq, 3636 - struct cgroup *cgrp, int ssid) 3657 + #ifdef CONFIG_CGROUP_SCHED 3658 + /** 3659 + * cgroup_tryget_css - try to get a cgroup's css for the specified subsystem 3660 + * @cgrp: the cgroup of interest 3661 + * @ss: the subsystem of interest 3662 + * 3663 + * Find and get @cgrp's css associated with @ss. If the css doesn't exist 3664 + * or is offline, %NULL is returned. 
3665 + */ 3666 + static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, 3667 + struct cgroup_subsys *ss) 3637 3668 { 3669 + struct cgroup_subsys_state *css; 3670 + 3671 + rcu_read_lock(); 3672 + css = cgroup_css(cgrp, ss); 3673 + if (css && !css_tryget_online(css)) 3674 + css = NULL; 3675 + rcu_read_unlock(); 3676 + 3677 + return css; 3678 + } 3679 + 3680 + static int cgroup_extra_stat_show(struct seq_file *seq, int ssid) 3681 + { 3682 + struct cgroup *cgrp = seq_css(seq)->cgroup; 3638 3683 struct cgroup_subsys *ss = cgroup_subsys[ssid]; 3639 3684 struct cgroup_subsys_state *css; 3640 3685 int ret; ··· 3673 3672 css_put(css); 3674 3673 return ret; 3675 3674 } 3675 + #endif 3676 3676 3677 3677 static int cpu_stat_show(struct seq_file *seq, void *v) 3678 3678 { 3679 - struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; 3680 3679 int ret = 0; 3681 3680 3682 3681 cgroup_base_stat_cputime_show(seq); 3683 3682 #ifdef CONFIG_CGROUP_SCHED 3684 - ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id); 3683 + ret = cgroup_extra_stat_show(seq, cpu_cgrp_id); 3685 3684 #endif 3686 3685 return ret; 3687 3686 } ··· 4351 4350 return ret; 4352 4351 } 4353 4352 4354 - static int cgroup_rm_cftypes_locked(struct cftype *cfts) 4353 + static void cgroup_rm_cftypes_locked(struct cftype *cfts) 4355 4354 { 4356 4355 lockdep_assert_held(&cgroup_mutex); 4357 4356 4358 4357 list_del(&cfts->node); 4359 4358 cgroup_apply_cftypes(cfts, false); 4360 4359 cgroup_exit_cftypes(cfts); 4361 - return 0; 4362 4360 } 4363 4361 4364 4362 /** ··· 4373 4373 */ 4374 4374 int cgroup_rm_cftypes(struct cftype *cfts) 4375 4375 { 4376 - int ret; 4377 - 4378 4376 if (!cfts || cfts[0].name[0] == '\0') 4379 4377 return 0; 4380 4378 ··· 4380 4382 return -ENOENT; 4381 4383 4382 4384 cgroup_lock(); 4383 - ret = cgroup_rm_cftypes_locked(cfts); 4385 + cgroup_rm_cftypes_locked(cfts); 4384 4386 cgroup_unlock(); 4385 - return ret; 4387 + return 0; 4386 4388 } 4387 4389 4388 4390 /** ··· 5335 5337 * 
RCU callback. 5336 5338 * 5337 5339 * 4. After the grace period, the css can be freed. Implemented in 5338 - * css_free_work_fn(). 5340 + * css_free_rwork_fn(). 5339 5341 * 5340 5342 * It is actually hairier because both step 2 and 4 require process context 5341 5343 * and thus involve punting to css->destroy_work adding two additional ··· 5579 5581 5580 5582 /* 5581 5583 * The returned cgroup is fully initialized including its control mask, but 5582 - * it isn't associated with its kernfs_node and doesn't have the control 5583 - * mask applied. 5584 + * it doesn't have the control mask applied. 5584 5585 */ 5585 5586 static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, 5586 5587 umode_t mode) ··· 5905 5908 /* 5906 5909 * Mark @cgrp and the associated csets dead. The former prevents 5907 5910 * further task migration and child creation by disabling 5908 - * cgroup_lock_live_group(). The latter makes the csets ignored by 5911 + * cgroup_kn_lock_live(). The latter makes the csets ignored by 5909 5912 * the migration path. 5910 5913 */ 5911 5914 cgrp->self.flags &= ~CSS_ONLINE; ··· 5927 5930 parent->nr_threaded_children--; 5928 5931 5929 5932 spin_lock_irq(&css_set_lock); 5930 - for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) { 5933 + for (tcgrp = parent; tcgrp; tcgrp = cgroup_parent(tcgrp)) { 5931 5934 tcgrp->nr_descendants--; 5932 5935 tcgrp->nr_dying_descendants++; 5933 5936 /* ··· 6120 6123 continue; 6121 6124 6122 6125 if (cgroup1_ssid_disabled(ssid)) 6123 - printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n", 6124 - ss->name); 6126 + pr_info("Disabling %s control group subsystem in v1 mounts\n", 6127 + ss->name); 6125 6128 6126 6129 cgrp_dfl_root.subsys_mask |= 1 << ss->id; 6127 6130
+164 -100
kernel/cgroup/cpuset.c
··· 1230 1230 /* 1231 1231 * Percpu kthreads in top_cpuset are ignored 1232 1232 */ 1233 - if ((task->flags & PF_KTHREAD) && kthread_is_per_cpu(task)) 1233 + if (kthread_is_per_cpu(task)) 1234 1234 continue; 1235 1235 cpumask_andnot(new_cpus, possible_mask, cs->subparts_cpus); 1236 1236 } else { ··· 1255 1255 static void compute_effective_cpumask(struct cpumask *new_cpus, 1256 1256 struct cpuset *cs, struct cpuset *parent) 1257 1257 { 1258 - if (parent->nr_subparts_cpus) { 1258 + if (parent->nr_subparts_cpus && is_partition_valid(cs)) { 1259 1259 cpumask_or(new_cpus, parent->effective_cpus, 1260 1260 parent->subparts_cpus); 1261 1261 cpumask_and(new_cpus, new_cpus, cs->cpus_allowed); ··· 1277 1277 1278 1278 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, 1279 1279 int turning_on); 1280 + static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs, 1281 + struct tmpmasks *tmp); 1282 + 1283 + /* 1284 + * Update partition exclusive flag 1285 + * 1286 + * Return: 0 if successful, an error code otherwise 1287 + */ 1288 + static int update_partition_exclusive(struct cpuset *cs, int new_prs) 1289 + { 1290 + bool exclusive = (new_prs > 0); 1291 + 1292 + if (exclusive && !is_cpu_exclusive(cs)) { 1293 + if (update_flag(CS_CPU_EXCLUSIVE, cs, 1)) 1294 + return PERR_NOTEXCL; 1295 + } else if (!exclusive && is_cpu_exclusive(cs)) { 1296 + /* Turning off CS_CPU_EXCLUSIVE will not return error */ 1297 + update_flag(CS_CPU_EXCLUSIVE, cs, 0); 1298 + } 1299 + return 0; 1300 + } 1301 + 1302 + /* 1303 + * Update partition load balance flag and/or rebuild sched domain 1304 + * 1305 + * Changing load balance flag will automatically call 1306 + * rebuild_sched_domains_locked(). 
1307 + */ 1308 + static void update_partition_sd_lb(struct cpuset *cs, int old_prs) 1309 + { 1310 + int new_prs = cs->partition_root_state; 1311 + bool new_lb = (new_prs != PRS_ISOLATED); 1312 + bool rebuild_domains = (new_prs > 0) || (old_prs > 0); 1313 + 1314 + if (new_lb != !!is_sched_load_balance(cs)) { 1315 + rebuild_domains = true; 1316 + if (new_lb) 1317 + set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1318 + else 1319 + clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1320 + } 1321 + 1322 + if (rebuild_domains) 1323 + rebuild_sched_domains_locked(); 1324 + } 1325 + 1280 1326 /** 1281 1327 * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset 1282 1328 * @cs: The cpuset that requests change in partition root state ··· 1382 1336 return is_partition_invalid(parent) 1383 1337 ? PERR_INVPARENT : PERR_NOTPART; 1384 1338 } 1385 - if ((newmask && cpumask_empty(newmask)) || 1386 - (!newmask && cpumask_empty(cs->cpus_allowed))) 1339 + if (!newmask && cpumask_empty(cs->cpus_allowed)) 1387 1340 return PERR_CPUSEMPTY; 1388 1341 1389 1342 /* ··· 1449 1404 adding = cpumask_andnot(tmp->addmask, tmp->addmask, 1450 1405 parent->subparts_cpus); 1451 1406 /* 1407 + * Empty cpumask is not allowed 1408 + */ 1409 + if (cpumask_empty(newmask)) { 1410 + part_error = PERR_CPUSEMPTY; 1411 + /* 1452 1412 * Make partition invalid if parent's effective_cpus could 1453 1413 * become empty and there are tasks in the parent. 1454 1414 */ 1455 - if (adding && 1415 + } else if (adding && 1456 1416 cpumask_subset(parent->effective_cpus, tmp->addmask) && 1457 1417 !cpumask_intersects(tmp->delmask, cpu_active_mask) && 1458 1418 partition_is_populated(parent, cs)) { ··· 1530 1480 1531 1481 /* 1532 1482 * Transitioning between invalid to valid or vice versa may require 1533 - * changing CS_CPU_EXCLUSIVE and CS_SCHED_LOAD_BALANCE. 1483 + * changing CS_CPU_EXCLUSIVE. 
1534 1484 */ 1535 1485 if (old_prs != new_prs) { 1536 - if (is_prs_invalid(old_prs) && !is_cpu_exclusive(cs) && 1537 - (update_flag(CS_CPU_EXCLUSIVE, cs, 1) < 0)) 1538 - return PERR_NOTEXCL; 1539 - if (is_prs_invalid(new_prs) && is_cpu_exclusive(cs)) 1540 - update_flag(CS_CPU_EXCLUSIVE, cs, 0); 1486 + int err = update_partition_exclusive(cs, new_prs); 1487 + 1488 + if (err) 1489 + return err; 1541 1490 } 1542 1491 1543 1492 /* ··· 1569 1520 1570 1521 spin_unlock_irq(&callback_lock); 1571 1522 1572 - if (adding || deleting) 1523 + if (adding || deleting) { 1573 1524 update_tasks_cpumask(parent, tmp->addmask); 1525 + if (parent->child_ecpus_count) 1526 + update_sibling_cpumasks(parent, cs, tmp); 1527 + } 1574 1528 1575 1529 /* 1576 - * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary. 1577 - * rebuild_sched_domains_locked() may be called. 1530 + * For partcmd_update without newmask, it is being called from 1531 + * cpuset_hotplug_workfn() where cpus_read_lock() wasn't taken. 1532 + * Update the load balance flag and scheduling domain if 1533 + * cpus_read_trylock() is successful. 
1578 1534 */ 1579 - if (old_prs != new_prs) { 1580 - if (old_prs == PRS_ISOLATED) 1581 - update_flag(CS_SCHED_LOAD_BALANCE, cs, 1); 1582 - else if (new_prs == PRS_ISOLATED) 1583 - update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 1535 + if ((cmd == partcmd_update) && !newmask && cpus_read_trylock()) { 1536 + update_partition_sd_lb(cs, old_prs); 1537 + cpus_read_unlock(); 1584 1538 } 1539 + 1585 1540 notify_partition_change(cs, old_prs); 1586 1541 return 0; 1587 1542 } 1543 + 1544 + /* 1545 + * update_cpumasks_hier() flags 1546 + */ 1547 + #define HIER_CHECKALL 0x01 /* Check all cpusets with no skipping */ 1548 + #define HIER_NO_SD_REBUILD 0x02 /* Don't rebuild sched domains */ 1588 1549 1589 1550 /* 1590 1551 * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree ··· 1610 1551 * Called with cpuset_mutex held 1611 1552 */ 1612 1553 static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, 1613 - bool force) 1554 + int flags) 1614 1555 { 1615 1556 struct cpuset *cp; 1616 1557 struct cgroup_subsys_state *pos_css; ··· 1647 1588 } 1648 1589 1649 1590 /* 1650 - * Skip the whole subtree if the cpumask remains the same 1651 - * and has no partition root state and force flag not set. 1591 + * Skip the whole subtree if 1592 + * 1) the cpumask remains the same, 1593 + * 2) has no partition root state, 1594 + * 3) HIER_CHECKALL flag not set, and 1595 + * 4) for v2 load balance state same as its parent. 
1652 1596 */ 1653 - if (!cp->partition_root_state && !force && 1654 - cpumask_equal(tmp->new_cpus, cp->effective_cpus)) { 1597 + if (!cp->partition_root_state && !(flags & HIER_CHECKALL) && 1598 + cpumask_equal(tmp->new_cpus, cp->effective_cpus) && 1599 + (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || 1600 + (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) { 1655 1601 pos_css = css_rightmost_descendant(pos_css); 1656 1602 continue; 1657 1603 } ··· 1740 1676 update_tasks_cpumask(cp, tmp->new_cpus); 1741 1677 1742 1678 /* 1679 + * On default hierarchy, inherit the CS_SCHED_LOAD_BALANCE 1680 + * from parent if current cpuset isn't a valid partition root 1681 + * and their load balance states differ. 1682 + */ 1683 + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && 1684 + !is_partition_valid(cp) && 1685 + (is_sched_load_balance(parent) != is_sched_load_balance(cp))) { 1686 + if (is_sched_load_balance(parent)) 1687 + set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); 1688 + else 1689 + clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags); 1690 + } 1691 + 1692 + /* 1743 1693 * On legacy hierarchy, if the effective cpumask of any non- 1744 1694 * empty cpuset is changed, we need to rebuild sched domains. 1745 1695 * On default hierarchy, the cpuset needs to be a partition ··· 1770 1692 } 1771 1693 rcu_read_unlock(); 1772 1694 1773 - if (need_rebuild_sched_domains) 1695 + if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD)) 1774 1696 rebuild_sched_domains_locked(); 1775 1697 } 1776 1698 ··· 1794 1716 * to use the right effective_cpus value. 1795 1717 * 1796 1718 * The update_cpumasks_hier() function may sleep. So we have to 1797 - * release the RCU read lock before calling it. 1719 + * release the RCU read lock before calling it. HIER_NO_SD_REBUILD 1720 + * flag is used to suppress rebuild of sched domains as the callers 1721 + * will take care of that. 
1798 1722 */ 1799 1723 rcu_read_lock(); 1800 1724 cpuset_for_each_child(sibling, pos_css, parent) { ··· 1808 1728 continue; 1809 1729 1810 1730 rcu_read_unlock(); 1811 - update_cpumasks_hier(sibling, tmp, false); 1731 + update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD); 1812 1732 rcu_read_lock(); 1813 1733 css_put(&sibling->css); 1814 1734 } ··· 1827 1747 int retval; 1828 1748 struct tmpmasks tmp; 1829 1749 bool invalidate = false; 1750 + int old_prs = cs->partition_root_state; 1830 1751 1831 1752 /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ 1832 1753 if (cs == &top_cpuset) ··· 1855 1774 if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) 1856 1775 return 0; 1857 1776 1858 - #ifdef CONFIG_CPUMASK_OFFSTACK 1859 - /* 1860 - * Use the cpumasks in trialcs for tmpmasks when they are pointers 1861 - * to allocated cpumasks. 1862 - * 1863 - * Note that update_parent_subparts_cpumask() uses only addmask & 1864 - * delmask, but not new_cpus. 1865 - */ 1866 - tmp.addmask = trialcs->subparts_cpus; 1867 - tmp.delmask = trialcs->effective_cpus; 1868 - tmp.new_cpus = NULL; 1869 - #endif 1777 + if (alloc_cpumasks(NULL, &tmp)) 1778 + return -ENOMEM; 1870 1779 1871 1780 retval = validate_change(cs, trialcs); 1872 1781 ··· 1885 1814 retval = 0; 1886 1815 } 1887 1816 if (retval < 0) 1888 - return retval; 1817 + goto out_free; 1889 1818 1890 1819 if (cs->partition_root_state) { 1891 1820 if (invalidate) ··· 1920 1849 } 1921 1850 spin_unlock_irq(&callback_lock); 1922 1851 1923 - #ifdef CONFIG_CPUMASK_OFFSTACK 1924 - /* Now trialcs->cpus_allowed is available */ 1925 - tmp.new_cpus = trialcs->cpus_allowed; 1926 - #endif 1927 - 1928 1852 /* effective_cpus will be updated here */ 1929 - update_cpumasks_hier(cs, &tmp, false); 1853 + update_cpumasks_hier(cs, &tmp, 0); 1930 1854 1931 1855 if (cs->partition_root_state) { 1932 1856 struct cpuset *parent = parent_cs(cs); ··· 1932 1866 */ 1933 1867 if (parent->child_ecpus_count) 1934 1868 
update_sibling_cpumasks(parent, cs, &tmp); 1869 + 1870 + /* Update CS_SCHED_LOAD_BALANCE and/or sched_domains */ 1871 + update_partition_sd_lb(cs, old_prs); 1935 1872 } 1873 + out_free: 1874 + free_cpumasks(NULL, &tmp); 1936 1875 return 0; 1937 1876 } 1938 1877 ··· 2313 2242 static int update_prstate(struct cpuset *cs, int new_prs) 2314 2243 { 2315 2244 int err = PERR_NONE, old_prs = cs->partition_root_state; 2316 - bool sched_domain_rebuilt = false; 2317 2245 struct cpuset *parent = parent_cs(cs); 2318 2246 struct tmpmasks tmpmask; 2319 2247 ··· 2331 2261 if (alloc_cpumasks(NULL, &tmpmask)) 2332 2262 return -ENOMEM; 2333 2263 2264 + err = update_partition_exclusive(cs, new_prs); 2265 + if (err) 2266 + goto out; 2267 + 2334 2268 if (!old_prs) { 2335 2269 /* 2336 - * Turning on partition root requires setting the 2337 - * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed 2338 - * cannot be empty. 2270 + * cpus_allowed cannot be empty. 2339 2271 */ 2340 2272 if (cpumask_empty(cs->cpus_allowed)) { 2341 2273 err = PERR_CPUSEMPTY; 2342 2274 goto out; 2343 2275 } 2344 2276 2345 - err = update_flag(CS_CPU_EXCLUSIVE, cs, 1); 2346 - if (err) { 2347 - err = PERR_NOTEXCL; 2348 - goto out; 2349 - } 2350 - 2351 2277 err = update_parent_subparts_cpumask(cs, partcmd_enable, 2352 2278 NULL, &tmpmask); 2353 - if (err) { 2354 - update_flag(CS_CPU_EXCLUSIVE, cs, 0); 2355 - goto out; 2356 - } 2357 - 2358 - if (new_prs == PRS_ISOLATED) { 2359 - /* 2360 - * Disable the load balance flag should not return an 2361 - * error unless the system is running out of memory. 2362 - */ 2363 - update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); 2364 - sched_domain_rebuilt = true; 2365 - } 2366 2279 } else if (old_prs && new_prs) { 2367 2280 /* 2368 2281 * A change in load balance state only, no change in cpumasks. 
2369 2282 */ 2370 - update_flag(CS_SCHED_LOAD_BALANCE, cs, (new_prs != PRS_ISOLATED)); 2371 - sched_domain_rebuilt = true; 2372 - goto out; /* Sched domain is rebuilt in update_flag() */ 2283 + ; 2373 2284 } else { 2374 2285 /* 2375 2286 * Switching back to member is always allowed even if it ··· 2369 2318 compute_effective_cpumask(cs->effective_cpus, cs, parent); 2370 2319 spin_unlock_irq(&callback_lock); 2371 2320 } 2372 - 2373 - /* Turning off CS_CPU_EXCLUSIVE will not return error */ 2374 - update_flag(CS_CPU_EXCLUSIVE, cs, 0); 2375 - 2376 - if (!is_sched_load_balance(cs)) { 2377 - /* Make sure load balance is on */ 2378 - update_flag(CS_SCHED_LOAD_BALANCE, cs, 1); 2379 - sched_domain_rebuilt = true; 2380 - } 2381 2321 } 2382 - 2383 - update_tasks_cpumask(parent, tmpmask.new_cpus); 2384 - 2385 - if (parent->child_ecpus_count) 2386 - update_sibling_cpumasks(parent, cs, &tmpmask); 2387 - 2388 - if (!sched_domain_rebuilt) 2389 - rebuild_sched_domains_locked(); 2390 2322 out: 2391 2323 /* 2392 - * Make partition invalid if an error happen 2324 + * Make partition invalid & disable CS_CPU_EXCLUSIVE if an error 2325 + * happens. 2393 2326 */ 2394 - if (err) 2327 + if (err) { 2395 2328 new_prs = -new_prs; 2329 + update_partition_exclusive(cs, new_prs); 2330 + } 2331 + 2396 2332 spin_lock_irq(&callback_lock); 2397 2333 cs->partition_root_state = new_prs; 2398 2334 WRITE_ONCE(cs->prs_err, err); 2399 2335 spin_unlock_irq(&callback_lock); 2336 + 2400 2337 /* 2401 2338 * Update child cpusets, if present. 2402 2339 * Force update if switching back to member. 2403 2340 */ 2404 2341 if (!list_empty(&cs->css.children)) 2405 - update_cpumasks_hier(cs, &tmpmask, !new_prs); 2342 + update_cpumasks_hier(cs, &tmpmask, !new_prs ? 
HIER_CHECKALL : 0); 2343 + 2344 + /* Update sched domains and load balance flag */ 2345 + update_partition_sd_lb(cs, old_prs); 2406 2346 2407 2347 notify_partition_change(cs, old_prs); 2408 2348 free_cpumasks(NULL, &tmpmask); ··· 2529 2487 struct cgroup_subsys_state *css; 2530 2488 struct cpuset *cs, *oldcs; 2531 2489 struct task_struct *task; 2490 + bool cpus_updated, mems_updated; 2532 2491 int ret; 2533 2492 2534 2493 /* used later by cpuset_attach() */ ··· 2544 2501 if (ret) 2545 2502 goto out_unlock; 2546 2503 2504 + cpus_updated = !cpumask_equal(cs->effective_cpus, oldcs->effective_cpus); 2505 + mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); 2506 + 2547 2507 cgroup_taskset_for_each(task, css, tset) { 2548 2508 ret = task_can_attach(task); 2549 2509 if (ret) 2550 2510 goto out_unlock; 2551 - ret = security_task_setscheduler(task); 2552 - if (ret) 2553 - goto out_unlock; 2511 + 2512 + /* 2513 + * Skip rights over task check in v2 when nothing changes, 2514 + * migration permission derives from hierarchy ownership in 2515 + * cgroup_procs_write_permission()). 
2516 + */ 2517 + if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) || 2518 + (cpus_updated || mems_updated)) { 2519 + ret = security_task_setscheduler(task); 2520 + if (ret) 2521 + goto out_unlock; 2522 + } 2554 2523 2555 2524 if (dl_task(task)) { 2556 2525 cs->nr_migrate_dl_tasks++; ··· 3277 3222 cs->use_parent_ecpus = true; 3278 3223 parent->child_ecpus_count++; 3279 3224 } 3225 + 3226 + /* 3227 + * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated 3228 + */ 3229 + if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && 3230 + !is_sched_load_balance(parent)) 3231 + clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 3232 + 3280 3233 spin_unlock_irq(&callback_lock); 3281 3234 3282 3235 if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) ··· 3584 3521 is_empty = cpumask_empty(cs->cpus_allowed) || 3585 3522 nodes_empty(cs->mems_allowed); 3586 3523 3587 - mutex_unlock(&cpuset_mutex); 3588 - 3589 3524 /* 3590 3525 * Move tasks to the nearest ancestor with execution resources, 3591 3526 * This is full cgroup operation which will also call back into 3592 3527 * cpuset. Should be done outside any lock. 3593 3528 */ 3594 - if (is_empty) 3529 + if (is_empty) { 3530 + mutex_unlock(&cpuset_mutex); 3595 3531 remove_tasks_in_empty_cpuset(cs); 3596 - 3597 - mutex_lock(&cpuset_mutex); 3532 + mutex_lock(&cpuset_mutex); 3533 + } 3598 3534 } 3599 3535 3600 3536 static void ··· 3753 3691 3754 3692 /** 3755 3693 * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset 3694 + * @work: unused 3756 3695 * 3757 3696 * This function is called after either CPU or memory configuration has 3758 3697 * changed and updates cpuset accordingly. The top_cpuset is always ··· 4136 4073 4137 4074 /** 4138 4075 * cpuset_spread_node() - On which node to begin search for a page 4076 + * @rotor: round robin rotor 4139 4077 * 4140 4078 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for 4141 4079 * tasks in a cpuset with is_spread_page or is_spread_slab set),
+27 -28
kernel/cgroup/misc.c
··· 14 14 #include <linux/misc_cgroup.h> 15 15 16 16 #define MAX_STR "max" 17 - #define MAX_NUM ULONG_MAX 17 + #define MAX_NUM U64_MAX 18 18 19 19 /* Miscellaneous res name, keep it in sync with enum misc_res_type */ 20 20 static const char *const misc_res_name[] = { ··· 37 37 * more than the actual capacity. We are using Limits resource distribution 38 38 * model of cgroup for miscellaneous controller. 39 39 */ 40 - static unsigned long misc_res_capacity[MISC_CG_RES_TYPES]; 40 + static u64 misc_res_capacity[MISC_CG_RES_TYPES]; 41 41 42 42 /** 43 43 * parent_misc() - Get the parent of the passed misc cgroup. ··· 74 74 * Context: Any context. 75 75 * Return: Current total usage of the resource. 76 76 */ 77 - unsigned long misc_cg_res_total_usage(enum misc_res_type type) 77 + u64 misc_cg_res_total_usage(enum misc_res_type type) 78 78 { 79 79 if (valid_type(type)) 80 - return atomic_long_read(&root_cg.res[type].usage); 80 + return atomic64_read(&root_cg.res[type].usage); 81 81 82 82 return 0; 83 83 } ··· 95 95 * * %0 - Successfully registered the capacity. 96 96 * * %-EINVAL - If @type is invalid. 97 97 */ 98 - int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity) 98 + int misc_cg_set_capacity(enum misc_res_type type, u64 capacity) 99 99 { 100 100 if (!valid_type(type)) 101 101 return -EINVAL; ··· 114 114 * Context: Any context. 115 115 */ 116 116 static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg, 117 - unsigned long amount) 117 + u64 amount) 118 118 { 119 - WARN_ONCE(atomic_long_add_negative(-amount, &cg->res[type].usage), 119 + WARN_ONCE(atomic64_add_negative(-amount, &cg->res[type].usage), 120 120 "misc cgroup resource %s became less than 0", 121 121 misc_res_name[type]); 122 122 } ··· 137 137 * * -EBUSY - If max limit will be crossed or total usage will be more than the 138 138 * capacity. 
139 139 */ 140 - int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, 141 - unsigned long amount) 140 + int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount) 142 141 { 143 142 struct misc_cg *i, *j; 144 143 int ret; 145 144 struct misc_res *res; 146 - int new_usage; 145 + u64 new_usage; 147 146 148 147 if (!(valid_type(type) && cg && READ_ONCE(misc_res_capacity[type]))) 149 148 return -EINVAL; ··· 153 154 for (i = cg; i; i = parent_misc(i)) { 154 155 res = &i->res[type]; 155 156 156 - new_usage = atomic_long_add_return(amount, &res->usage); 157 + new_usage = atomic64_add_return(amount, &res->usage); 157 158 if (new_usage > READ_ONCE(res->max) || 158 159 new_usage > READ_ONCE(misc_res_capacity[type])) { 159 160 ret = -EBUSY; ··· 164 165 165 166 err_charge: 166 167 for (j = i; j; j = parent_misc(j)) { 167 - atomic_long_inc(&j->res[type].events); 168 + atomic64_inc(&j->res[type].events); 168 169 cgroup_file_notify(&j->events_file); 169 170 } 170 171 ··· 183 184 * 184 185 * Context: Any context. 185 186 */ 186 - void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, 187 - unsigned long amount) 187 + void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount) 188 188 { 189 189 struct misc_cg *i; 190 190 ··· 207 209 { 208 210 int i; 209 211 struct misc_cg *cg = css_misc(seq_css(sf)); 210 - unsigned long max; 212 + u64 max; 211 213 212 214 for (i = 0; i < MISC_CG_RES_TYPES; i++) { 213 215 if (READ_ONCE(misc_res_capacity[i])) { ··· 215 217 if (max == MAX_NUM) 216 218 seq_printf(sf, "%s max\n", misc_res_name[i]); 217 219 else 218 - seq_printf(sf, "%s %lu\n", misc_res_name[i], 220 + seq_printf(sf, "%s %llu\n", misc_res_name[i], 219 221 max); 220 222 } 221 223 } ··· 239 241 * Return: 240 242 * * >= 0 - Number of bytes processed in the input. 241 243 * * -EINVAL - If buf is not valid. 242 - * * -ERANGE - If number is bigger than the unsigned long capacity. 
244 + * * -ERANGE - If number is bigger than the u64 capacity. 243 245 */ 244 246 static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf, 245 247 size_t nbytes, loff_t off) 246 248 { 247 249 struct misc_cg *cg; 248 - unsigned long max; 250 + u64 max; 249 251 int ret = 0, i; 250 252 enum misc_res_type type = MISC_CG_RES_TYPES; 251 253 char *token; ··· 269 271 if (!strcmp(MAX_STR, buf)) { 270 272 max = MAX_NUM; 271 273 } else { 272 - ret = kstrtoul(buf, 0, &max); 274 + ret = kstrtou64(buf, 0, &max); 273 275 if (ret) 274 276 return ret; 275 277 } ··· 295 297 static int misc_cg_current_show(struct seq_file *sf, void *v) 296 298 { 297 299 int i; 298 - unsigned long usage; 300 + u64 usage; 299 301 struct misc_cg *cg = css_misc(seq_css(sf)); 300 302 301 303 for (i = 0; i < MISC_CG_RES_TYPES; i++) { 302 - usage = atomic_long_read(&cg->res[i].usage); 304 + usage = atomic64_read(&cg->res[i].usage); 303 305 if (READ_ONCE(misc_res_capacity[i]) || usage) 304 - seq_printf(sf, "%s %lu\n", misc_res_name[i], usage); 306 + seq_printf(sf, "%s %llu\n", misc_res_name[i], usage); 305 307 } 306 308 307 309 return 0; ··· 320 322 static int misc_cg_capacity_show(struct seq_file *sf, void *v) 321 323 { 322 324 int i; 323 - unsigned long cap; 325 + u64 cap; 324 326 325 327 for (i = 0; i < MISC_CG_RES_TYPES; i++) { 326 328 cap = READ_ONCE(misc_res_capacity[i]); 327 329 if (cap) 328 - seq_printf(sf, "%s %lu\n", misc_res_name[i], cap); 330 + seq_printf(sf, "%s %llu\n", misc_res_name[i], cap); 329 331 } 330 332 331 333 return 0; ··· 334 336 static int misc_events_show(struct seq_file *sf, void *v) 335 337 { 336 338 struct misc_cg *cg = css_misc(seq_css(sf)); 337 - unsigned long events, i; 339 + u64 events; 340 + int i; 338 341 339 342 for (i = 0; i < MISC_CG_RES_TYPES; i++) { 340 - events = atomic_long_read(&cg->res[i].events); 343 + events = atomic64_read(&cg->res[i].events); 341 344 if (READ_ONCE(misc_res_capacity[i]) || events) 342 - seq_printf(sf, "%s.max %lu\n", 
misc_res_name[i], events); 345 + seq_printf(sf, "%s.max %llu\n", misc_res_name[i], events); 343 346 } 344 347 return 0; 345 348 } ··· 396 397 397 398 for (i = 0; i < MISC_CG_RES_TYPES; i++) { 398 399 WRITE_ONCE(cg->res[i].max, MAX_NUM); 399 - atomic_long_set(&cg->res[i].usage, 0); 400 + atomic64_set(&cg->res[i].usage, 0); 400 401 } 401 402 402 403 return &cg->css;
-6
kernel/cgroup/namespace.c
··· 149 149 .install = cgroupns_install, 150 150 .owner = cgroupns_owner, 151 151 }; 152 - 153 - static __init int cgroup_namespaces_init(void) 154 - { 155 - return 0; 156 - } 157 - subsys_initcall(cgroup_namespaces_init);
+10 -2
kernel/cgroup/rstat.c
··· 344 344 { 345 345 struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); 346 346 struct cgroup *parent = cgroup_parent(cgrp); 347 + struct cgroup_rstat_cpu *prstatc; 347 348 struct cgroup_base_stat delta; 348 349 unsigned seq; 349 350 ··· 358 357 delta = rstatc->bstat; 359 358 } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); 360 359 361 - /* propagate percpu delta to global */ 360 + /* propagate per-cpu delta to cgroup and per-cpu global statistics */ 362 361 cgroup_base_stat_sub(&delta, &rstatc->last_bstat); 363 362 cgroup_base_stat_add(&cgrp->bstat, &delta); 364 363 cgroup_base_stat_add(&rstatc->last_bstat, &delta); 364 + cgroup_base_stat_add(&rstatc->subtree_bstat, &delta); 365 365 366 - /* propagate global delta to parent (unless that's root) */ 366 + /* propagate cgroup and per-cpu global delta to parent (unless that's root) */ 367 367 if (cgroup_parent(parent)) { 368 368 delta = cgrp->bstat; 369 369 cgroup_base_stat_sub(&delta, &cgrp->last_bstat); 370 370 cgroup_base_stat_add(&parent->bstat, &delta); 371 371 cgroup_base_stat_add(&cgrp->last_bstat, &delta); 372 + 373 + delta = rstatc->subtree_bstat; 374 + prstatc = cgroup_rstat_cpu(parent, cpu); 375 + cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat); 376 + cgroup_base_stat_add(&prstatc->subtree_bstat, &delta); 377 + cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta); 372 378 } 373 379 } 374 380
+1
tools/testing/selftests/cgroup/.gitignore
··· 5 5 test_kmem 6 6 test_kill 7 7 test_cpu 8 + test_cpuset 8 9 test_zswap 9 10 wait_inotify
+2
tools/testing/selftests/cgroup/Makefile
··· 12 12 TEST_GEN_PROGS += test_freezer 13 13 TEST_GEN_PROGS += test_kill 14 14 TEST_GEN_PROGS += test_cpu 15 + TEST_GEN_PROGS += test_cpuset 15 16 TEST_GEN_PROGS += test_zswap 16 17 17 18 LOCAL_HDRS += $(selfdir)/clone3/clone3_selftests.h $(selfdir)/pidfd/pidfd.h ··· 25 24 $(OUTPUT)/test_freezer: cgroup_util.c 26 25 $(OUTPUT)/test_kill: cgroup_util.c 27 26 $(OUTPUT)/test_cpu: cgroup_util.c 27 + $(OUTPUT)/test_cpuset: cgroup_util.c 28 28 $(OUTPUT)/test_zswap: cgroup_util.c
+2
tools/testing/selftests/cgroup/cgroup_util.c
··· 286 286 { 287 287 int ret; 288 288 289 + if (!cgroup) 290 + return 0; 289 291 retry: 290 292 ret = rmdir(cgroup); 291 293 if (ret && errno == EBUSY) {
+2
tools/testing/selftests/cgroup/cgroup_util.h
··· 11 11 #define USEC_PER_SEC 1000000L 12 12 #define NSEC_PER_SEC 1000000000L 13 13 14 + #define TEST_UID 65534 /* usually nobody, any !root is fine */ 15 + 14 16 /* 15 17 * Checks if two given values differ by less than err% of their sum. 16 18 */
+1 -1
tools/testing/selftests/cgroup/test_core.c
··· 683 683 */ 684 684 static int test_cgcore_lesser_euid_open(const char *root) 685 685 { 686 - const uid_t test_euid = 65534; /* usually nobody, any !root is fine */ 686 + const uid_t test_euid = TEST_UID; 687 687 int ret = KSFT_FAIL; 688 688 char *cg_test_a = NULL, *cg_test_b = NULL; 689 689 char *cg_test_a_procs = NULL, *cg_test_b_procs = NULL;
+275
tools/testing/selftests/cgroup/test_cpuset.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #include <linux/limits.h> 4 + #include <signal.h> 5 + 6 + #include "../kselftest.h" 7 + #include "cgroup_util.h" 8 + 9 + static int idle_process_fn(const char *cgroup, void *arg) 10 + { 11 + (void)pause(); 12 + return 0; 13 + } 14 + 15 + static int do_migration_fn(const char *cgroup, void *arg) 16 + { 17 + int object_pid = (int)(size_t)arg; 18 + 19 + if (setuid(TEST_UID)) 20 + return EXIT_FAILURE; 21 + 22 + // XXX checking /proc/$pid/cgroup would be quicker than wait 23 + if (cg_enter(cgroup, object_pid) || 24 + cg_wait_for_proc_count(cgroup, 1)) 25 + return EXIT_FAILURE; 26 + 27 + return EXIT_SUCCESS; 28 + } 29 + 30 + static int do_controller_fn(const char *cgroup, void *arg) 31 + { 32 + const char *child = cgroup; 33 + const char *parent = arg; 34 + 35 + if (setuid(TEST_UID)) 36 + return EXIT_FAILURE; 37 + 38 + if (!cg_read_strstr(child, "cgroup.controllers", "cpuset")) 39 + return EXIT_FAILURE; 40 + 41 + if (cg_write(parent, "cgroup.subtree_control", "+cpuset")) 42 + return EXIT_FAILURE; 43 + 44 + if (cg_read_strstr(child, "cgroup.controllers", "cpuset")) 45 + return EXIT_FAILURE; 46 + 47 + if (cg_write(parent, "cgroup.subtree_control", "-cpuset")) 48 + return EXIT_FAILURE; 49 + 50 + if (!cg_read_strstr(child, "cgroup.controllers", "cpuset")) 51 + return EXIT_FAILURE; 52 + 53 + return EXIT_SUCCESS; 54 + } 55 + 56 + /* 57 + * Migrate a process between two sibling cgroups. 58 + * The success should only depend on the parent cgroup permissions and not the 59 + * migrated process itself (cpuset controller is in place because it uses 60 + * security_task_setscheduler() in cgroup v1). 61 + * 62 + * Deliberately don't set cpuset.cpus in children to avoid defining migration 63 + * permissions between two different cpusets. 
64 + */ 65 + static int test_cpuset_perms_object(const char *root, bool allow) 66 + { 67 + char *parent = NULL, *child_src = NULL, *child_dst = NULL; 68 + char *parent_procs = NULL, *child_src_procs = NULL, *child_dst_procs = NULL; 69 + const uid_t test_euid = TEST_UID; 70 + int object_pid = 0; 71 + int ret = KSFT_FAIL; 72 + 73 + parent = cg_name(root, "cpuset_test_0"); 74 + if (!parent) 75 + goto cleanup; 76 + parent_procs = cg_name(parent, "cgroup.procs"); 77 + if (!parent_procs) 78 + goto cleanup; 79 + if (cg_create(parent)) 80 + goto cleanup; 81 + 82 + child_src = cg_name(parent, "cpuset_test_1"); 83 + if (!child_src) 84 + goto cleanup; 85 + child_src_procs = cg_name(child_src, "cgroup.procs"); 86 + if (!child_src_procs) 87 + goto cleanup; 88 + if (cg_create(child_src)) 89 + goto cleanup; 90 + 91 + child_dst = cg_name(parent, "cpuset_test_2"); 92 + if (!child_dst) 93 + goto cleanup; 94 + child_dst_procs = cg_name(child_dst, "cgroup.procs"); 95 + if (!child_dst_procs) 96 + goto cleanup; 97 + if (cg_create(child_dst)) 98 + goto cleanup; 99 + 100 + if (cg_write(parent, "cgroup.subtree_control", "+cpuset")) 101 + goto cleanup; 102 + 103 + if (cg_read_strstr(child_src, "cgroup.controllers", "cpuset") || 104 + cg_read_strstr(child_dst, "cgroup.controllers", "cpuset")) 105 + goto cleanup; 106 + 107 + /* Enable permissions along src->dst tree path */ 108 + if (chown(child_src_procs, test_euid, -1) || 109 + chown(child_dst_procs, test_euid, -1)) 110 + goto cleanup; 111 + 112 + if (allow && chown(parent_procs, test_euid, -1)) 113 + goto cleanup; 114 + 115 + /* Fork a privileged child as a test object */ 116 + object_pid = cg_run_nowait(child_src, idle_process_fn, NULL); 117 + if (object_pid < 0) 118 + goto cleanup; 119 + 120 + /* Carry out migration in a child process that can drop all privileges 121 + * (including capabilities), the main process must remain privileged for 122 + * cleanup. 
123 + * Child process's cgroup is irrelevant but we place it into child_dst 124 + * as a hacky way to pass information about migration target to the child. 125 + */ 126 + if (allow ^ (cg_run(child_dst, do_migration_fn, (void *)(size_t)object_pid) == EXIT_SUCCESS)) 127 + goto cleanup; 128 + 129 + ret = KSFT_PASS; 130 + 131 + cleanup: 132 + if (object_pid > 0) { 133 + (void)kill(object_pid, SIGTERM); 134 + (void)clone_reap(object_pid, WEXITED); 135 + } 136 + 137 + cg_destroy(child_dst); 138 + free(child_dst_procs); 139 + free(child_dst); 140 + 141 + cg_destroy(child_src); 142 + free(child_src_procs); 143 + free(child_src); 144 + 145 + cg_destroy(parent); 146 + free(parent_procs); 147 + free(parent); 148 + 149 + return ret; 150 + } 151 + 152 + static int test_cpuset_perms_object_allow(const char *root) 153 + { 154 + return test_cpuset_perms_object(root, true); 155 + } 156 + 157 + static int test_cpuset_perms_object_deny(const char *root) 158 + { 159 + return test_cpuset_perms_object(root, false); 160 + } 161 + 162 + /* 163 + * Migrate a process between parent and child implicitly. 164 + * Implicit migration happens when a controller is enabled/disabled. 
165 + * 166 + */ 167 + static int test_cpuset_perms_subtree(const char *root) 168 + { 169 + char *parent = NULL, *child = NULL; 170 + char *parent_procs = NULL, *parent_subctl = NULL, *child_procs = NULL; 171 + const uid_t test_euid = TEST_UID; 172 + int object_pid = 0; 173 + int ret = KSFT_FAIL; 174 + 175 + parent = cg_name(root, "cpuset_test_0"); 176 + if (!parent) 177 + goto cleanup; 178 + parent_procs = cg_name(parent, "cgroup.procs"); 179 + if (!parent_procs) 180 + goto cleanup; 181 + parent_subctl = cg_name(parent, "cgroup.subtree_control"); 182 + if (!parent_subctl) 183 + goto cleanup; 184 + if (cg_create(parent)) 185 + goto cleanup; 186 + 187 + child = cg_name(parent, "cpuset_test_1"); 188 + if (!child) 189 + goto cleanup; 190 + child_procs = cg_name(child, "cgroup.procs"); 191 + if (!child_procs) 192 + goto cleanup; 193 + if (cg_create(child)) 194 + goto cleanup; 195 + 196 + /* Enable permissions as in a delegated subtree */ 197 + if (chown(parent_procs, test_euid, -1) || 198 + chown(parent_subctl, test_euid, -1) || 199 + chown(child_procs, test_euid, -1)) 200 + goto cleanup; 201 + 202 + /* Put a privileged child in the subtree and modify controller state 203 + * from an unprivileged process, the main process remains privileged 204 + * for cleanup. 205 + * The unprivileged child runs in subtree too to avoid parent and 206 + * internal-node constraint violation. 
207 + */ 208 + object_pid = cg_run_nowait(child, idle_process_fn, NULL); 209 + if (object_pid < 0) 210 + goto cleanup; 211 + 212 + if (cg_run(child, do_controller_fn, parent) != EXIT_SUCCESS) 213 + goto cleanup; 214 + 215 + ret = KSFT_PASS; 216 + 217 + cleanup: 218 + if (object_pid > 0) { 219 + (void)kill(object_pid, SIGTERM); 220 + (void)clone_reap(object_pid, WEXITED); 221 + } 222 + 223 + cg_destroy(child); 224 + free(child_procs); 225 + free(child); 226 + 227 + cg_destroy(parent); 228 + free(parent_subctl); 229 + free(parent_procs); 230 + free(parent); 231 + 232 + return ret; 233 + } 234 + 235 + 236 + #define T(x) { x, #x } 237 + struct cpuset_test { 238 + int (*fn)(const char *root); 239 + const char *name; 240 + } tests[] = { 241 + T(test_cpuset_perms_object_allow), 242 + T(test_cpuset_perms_object_deny), 243 + T(test_cpuset_perms_subtree), 244 + }; 245 + #undef T 246 + 247 + int main(int argc, char *argv[]) 248 + { 249 + char root[PATH_MAX]; 250 + int i, ret = EXIT_SUCCESS; 251 + 252 + if (cg_find_unified_root(root, sizeof(root))) 253 + ksft_exit_skip("cgroup v2 isn't mounted\n"); 254 + 255 + if (cg_read_strstr(root, "cgroup.subtree_control", "cpuset")) 256 + if (cg_write(root, "cgroup.subtree_control", "+cpuset")) 257 + ksft_exit_skip("Failed to set cpuset controller\n"); 258 + 259 + for (i = 0; i < ARRAY_SIZE(tests); i++) { 260 + switch (tests[i].fn(root)) { 261 + case KSFT_PASS: 262 + ksft_test_result_pass("%s\n", tests[i].name); 263 + break; 264 + case KSFT_SKIP: 265 + ksft_test_result_skip("%s\n", tests[i].name); 266 + break; 267 + default: 268 + ret = EXIT_FAILURE; 269 + ksft_test_result_fail("%s\n", tests[i].name); 270 + break; 271 + } 272 + } 273 + 274 + return ret; 275 + }
+1 -1
tools/testing/selftests/cgroup/test_cpuset_prs.sh
··· 10 10 skip_test() { 11 11 echo "$1" 12 12 echo "Test SKIPPED" 13 - exit 0 13 + exit 4 # ksft_skip 14 14 } 15 15 16 16 [[ $(id -u) -eq 0 ]] || skip_test "Test must be run as root!"