Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'for-3.17' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup changes from Tejun Heo:
"Mostly changes to get the v2 interface ready. The core features are
mostly ready now and I think it's reasonable to expect to drop the
devel mask in one or two devel cycles at least for a subset of
controllers.

- cgroup added a controller dependency mechanism so that block cgroup
can depend on memory cgroup. This will be used to finally support
IO provisioning on the writeback traffic, which is currently being
implemented.

- The v2 interface now uses a separate table so that the interface
files for the new interface are explicitly declared in one place.
Each controller will explicitly review and add the files for the
new interface.

- cpuset is getting ready for hierarchical behavior, in a similar
style to the other controllers, so that an ancestor's configuration
change doesn't irreversibly change the descendants' configurations
and processes aren't silently migrated when a CPU or node goes
down.

All the changes are to the new interface and no behavior changed for
the multiple hierarchies"
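The controller dependency mechanism mentioned above boils down to a fixed-point computation over subsystem bitmasks (mirroring the kernel's cgroup_refresh_child_subsys_mask()). The following is a user-space sketch, not kernel code: the subsystem ids, the `depends_on` table, and the harness are illustrative assumptions.

```c
#include <assert.h>
#include <stdbool.h>

#define NR_SUBSYS 3
enum { io_id, memory_id, pids_id };	/* hypothetical subsystem ids */

/* per-subsystem dependency masks; blkcg-style: io depends on memory */
static const unsigned int depends_on[NR_SUBSYS] = {
	[io_id]     = 1u << memory_id,
	[memory_id] = 0,
	[pids_id]   = 0,
};

/*
 * Compute the effective child_subsys_mask from the configured
 * subtree_control mask: iterate to a fixed point, pulling in the
 * depends_on mask of every enabled subsystem, then clamp to the
 * subsystems actually available (available_mask).
 */
static unsigned int refresh_child_subsys_mask(unsigned int subtree_control,
					      unsigned int available_mask)
{
	unsigned int cur = subtree_control;

	while (true) {
		unsigned int next_mask = cur;
		int ssid;

		for (ssid = 0; ssid < NR_SUBSYS; ssid++)
			if (cur & (1u << ssid))
				next_mask |= depends_on[ssid];

		/* depended-upon subsystems may be unavailable */
		next_mask &= available_mask;

		if (next_mask == cur)
			return cur;
		cur = next_mask;
	}
}
```

Enabling io alone pulls memory in when it is available; when memory is bound elsewhere, the closure simply drops it.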

* 'for-3.17' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (29 commits)
cpuset: fix the WARN_ON() in update_nodemasks_hier()
cgroup: initialize cgrp_dfl_root_inhibit_ss_mask from !->dfl_files test
cgroup: make CFTYPE_ONLY_ON_DFL and CFTYPE_NO_ internal to cgroup core
cgroup: distinguish the default and legacy hierarchies when handling cftypes
cgroup: replace cgroup_add_cftypes() with cgroup_add_legacy_cftypes()
cgroup: rename cgroup_subsys->base_cftypes to ->legacy_cftypes
cgroup: split cgroup_base_files[] into cgroup_{dfl|legacy}_base_files[]
cpuset: export effective masks to userspace
cpuset: allow writing offlined masks to cpuset.cpus/mems
cpuset: enable onlined cpu/node in effective masks
cpuset: refactor cpuset_hotplug_update_tasks()
cpuset: make cs->{cpus, mems}_allowed as user-configured masks
cpuset: apply cs->effective_{cpus,mems}
cpuset: initialize top_cpuset's configured masks at mount
cpuset: use effective cpumask to build sched domains
cpuset: inherit ancestor's masks if effective_{cpus, mems} becomes empty
cpuset: update cs->effective_{cpus, mems} when config changes
cpuset: update cpuset->effective_{cpus,mems} at hotplug
cpuset: add cs->effective_cpus and cs->effective_mems
cgroup: clean up sane_behavior handling
...

+814 -444
+14 -0
Documentation/cgroups/cgroups.txt
···
 while the caller holds cgroup_mutex and it is ensured that either
 attach() or cancel_attach() will be called in future.
 
+void css_reset(struct cgroup_subsys_state *css)
+(cgroup_mutex held by caller)
+
+An optional operation which should restore @css's configuration to the
+initial state.  This is currently only used on the unified hierarchy
+when a subsystem is disabled on a cgroup through
+"cgroup.subtree_control" but should remain enabled because other
+subsystems depend on it.  cgroup core makes such a css invisible by
+removing the associated interface files and invokes this callback so
+that the hidden subsystem can return to the initial neutral state.
+This prevents unexpected resource control from a hidden css and
+ensures that the configuration is in the initial state when it is made
+visible again later.
+
 void cancel_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
 (cgroup_mutex held by caller)
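The css_reset() contract documented above (return a hidden controller to its neutral state so it exerts no control) can be mocked in a few lines. This is a user-space sketch; the struct, its fields, and the function name are hypothetical, not the kernel's types.

```c
#include <assert.h>

/* hypothetical controller state; not the kernel's struct */
struct mock_css {
	unsigned long limit;		/* user-configured resource limit */
	unsigned long default_limit;	/* the vanilla, no-control value */
};

/*
 * What a ->css_reset() callback is expected to do: drop all
 * user-visible configuration back to defaults so a css that has been
 * hidden (but kept alive for dependents) controls nothing.
 */
static void mock_css_reset(struct mock_css *css)
{
	css->limit = css->default_limit;
}
```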
+29 -6
Documentation/cgroups/unified-hierarchy.txt
···
 
   mount -t cgroup -o __DEVEL__sane_behavior cgroup $MOUNT_POINT
 
-All controllers which are not bound to other hierarchies are
-automatically bound to unified hierarchy and show up at the root of
-it.  Controllers which are enabled only in the root of unified
-hierarchy can be bound to other hierarchies at any time.  This allows
-mixing unified hierarchy with the traditional multiple hierarchies in
-a fully backward compatible way.
+All controllers which support the unified hierarchy and are not bound
+to other hierarchies are automatically bound to unified hierarchy and
+show up at the root of it.  Controllers which are enabled only in the
+root of unified hierarchy can be bound to other hierarchies.  This
+allows mixing unified hierarchy with the traditional multiple
+hierarchies in a fully backward compatible way.
+
+For development purposes, the following boot parameter makes all
+controllers to appear on the unified hierarchy whether supported or
+not.
+
+  cgroup__DEVEL__legacy_files_on_dfl
+
+A controller can be moved across hierarchies only after the controller
+is no longer referenced in its current hierarchy.  Because per-cgroup
+controller states are destroyed asynchronously and controllers may
+have lingering references, a controller may not show up immediately on
+the unified hierarchy after the final umount of the previous
+hierarchy.  Similarly, a controller should be fully disabled to be
+moved out of the unified hierarchy and it may take some time for the
+disabled controller to become available for other hierarchies;
+furthermore, due to dependencies among controllers, other controllers
+may need to be disabled too.
+
+While useful for development and manual configurations, dynamically
+moving controllers between the unified and other hierarchies is
+strongly discouraged for production use.  It is recommended to decide
+the hierarchies and controller associations before starting using the
+controllers.
 
 
 2-2. cgroup.subtree_control
+11 -2
block/blk-cgroup.c
···
 	.css_offline = blkcg_css_offline,
 	.css_free = blkcg_css_free,
 	.can_attach = blkcg_can_attach,
-	.base_cftypes = blkcg_files,
+	.legacy_cftypes = blkcg_files,
+#ifdef CONFIG_MEMCG
+	/*
+	 * This ensures that, if available, memcg is automatically enabled
+	 * together on the default hierarchy so that the owner cgroup can
+	 * be retrieved from writeback pages.
+	 */
+	.depends_on = 1 << memory_cgrp_id,
+#endif
 };
 EXPORT_SYMBOL_GPL(blkio_cgrp_subsys);
 
···
 
 	/* everything is in place, add intf files for the new policy */
 	if (pol->cftypes)
-		WARN_ON(cgroup_add_cftypes(&blkio_cgrp_subsys, pol->cftypes));
+		WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys,
+						  pol->cftypes));
 	ret = 0;
 out_unlock:
 	mutex_unlock(&blkcg_pol_mutex);
+3 -3
block/blk-throttle.c
···
 	int rw;
 
 	/*
-	 * If sane_hierarchy is enabled, we switch to properly hierarchical
+	 * If on the default hierarchy, we switch to properly hierarchical
 	 * behavior where limits on a given throtl_grp are applied to the
 	 * whole subtree rather than just the group itself.  e.g. If 16M
 	 * read_bps limit is set on the root group, the whole system can't
 	 * exceed 16M for the device.
 	 *
-	 * If sane_hierarchy is not enabled, the broken flat hierarchy
+	 * If not on the default hierarchy, the broken flat hierarchy
 	 * behavior is retained where all throtl_grps are treated as if
 	 * they're all separate root groups right below throtl_data.
 	 * Limits of a group don't interact with limits of other groups
···
 	 */
 	parent_sq = &td->service_queue;
 
-	if (cgroup_sane_behavior(blkg->blkcg->css.cgroup) && blkg->parent)
+	if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent)
 		parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
 
 	throtl_service_queue_init(&tg->service_queue, parent_sq);
+85 -80
include/linux/cgroup.h
···
 	struct kernfs_node *kn;		/* cgroup kernfs entry */
 	struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */
 
-	/* the bitmask of subsystems enabled on the child cgroups */
+	/*
+	 * The bitmask of subsystems enabled on the child cgroups.
+	 * ->subtree_control is the one configured through
+	 * "cgroup.subtree_control" while ->child_subsys_mask is the
+	 * effective one which may have more subsystems enabled.
+	 * Controller knobs are made available iff it's enabled in
+	 * ->subtree_control.
+	 */
+	unsigned int subtree_control;
 	unsigned int child_subsys_mask;
 
 	/* Private pointers for each registered subsystem */
···
 
 /* cgroup_root->flags */
 enum {
-	/*
-	 * Unfortunately, cgroup core and various controllers are riddled
-	 * with idiosyncrasies and pointless options.  The following flag,
-	 * when set, will force sane behavior - some options are forced on,
-	 * others are disallowed, and some controllers will change their
-	 * hierarchical or other behaviors.
-	 *
-	 * The set of behaviors affected by this flag are still being
-	 * determined and developed and the mount option for this flag is
-	 * prefixed with __DEVEL__.  The prefix will be dropped once we
-	 * reach the point where all behaviors are compatible with the
-	 * planned unified hierarchy, which will automatically turn on this
-	 * flag.
-	 *
-	 * The followings are the behaviors currently affected this flag.
-	 *
-	 * - Mount options "noprefix", "xattr", "clone_children",
-	 *   "release_agent" and "name" are disallowed.
-	 *
-	 * - When mounting an existing superblock, mount options should
-	 *   match.
-	 *
-	 * - Remount is disallowed.
-	 *
-	 * - rename(2) is disallowed.
-	 *
-	 * - "tasks" is removed.  Everything should be at process
-	 *   granularity.  Use "cgroup.procs" instead.
-	 *
-	 * - "cgroup.procs" is not sorted.  pids will be unique unless they
-	 *   got recycled inbetween reads.
-	 *
-	 * - "release_agent" and "notify_on_release" are removed.
-	 *   Replacement notification mechanism will be implemented.
-	 *
-	 * - "cgroup.clone_children" is removed.
-	 *
-	 * - "cgroup.subtree_populated" is available.  Its value is 0 if
-	 *   the cgroup and its descendants contain no task; otherwise, 1.
-	 *   The file also generates kernfs notification which can be
-	 *   monitored through poll and [di]notify when the value of the
-	 *   file changes.
-	 *
-	 * - If mount is requested with sane_behavior but without any
-	 *   subsystem, the default unified hierarchy is mounted.
-	 *
-	 * - cpuset: tasks will be kept in empty cpusets when hotplug happens
-	 *   and take masks of ancestors with non-empty cpus/mems, instead of
-	 *   being moved to an ancestor.
-	 *
-	 * - cpuset: a task can be moved into an empty cpuset, and again it
-	 *   takes masks of ancestors.
-	 *
-	 * - memcg: use_hierarchy is on by default and the cgroup file for
-	 *   the flag is not created.
-	 *
-	 * - blkcg: blk-throttle becomes properly hierarchical.
-	 *
-	 * - debug: disallowed on the default hierarchy.
-	 */
-	CGRP_ROOT_SANE_BEHAVIOR = (1 << 0),
-
+	CGRP_ROOT_SANE_BEHAVIOR = (1 << 0), /* __DEVEL__sane_behavior specified */
 	CGRP_ROOT_NOPREFIX = (1 << 1), /* mounted subsystems have no named prefix */
 	CGRP_ROOT_XATTR = (1 << 2), /* supports extended attributes */
-
-	/* mount options live below bit 16 */
-	CGRP_ROOT_OPTION_MASK = (1 << 16) - 1,
 };
 
 /*
···
 enum {
 	CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */
 	CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */
-	CFTYPE_INSANE = (1 << 2), /* don't create if sane_behavior */
 	CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */
-	CFTYPE_ONLY_ON_DFL = (1 << 4), /* only on default hierarchy */
+
+	/* internal flags, do not use outside cgroup core proper */
+	__CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */
+	__CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */
 };
 
 #define MAX_CFTYPE_NAME 64
···
 extern struct cgroup_root cgrp_dfl_root;
 extern struct css_set init_css_set;
 
+/**
+ * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
+ * @cgrp: the cgroup of interest
+ *
+ * The default hierarchy is the v2 interface of cgroup and this function
+ * can be used to test whether a cgroup is on the default hierarchy for
+ * cases where a subsystem should behave differently depending on the
+ * interface version.
+ *
+ * The set of behaviors which change on the default hierarchy are still
+ * being determined and the mount option is prefixed with __DEVEL__.
+ *
+ * List of changed behaviors:
+ *
+ * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
+ *   and "name" are disallowed.
+ *
+ * - When mounting an existing superblock, mount options should match.
+ *
+ * - Remount is disallowed.
+ *
+ * - rename(2) is disallowed.
+ *
+ * - "tasks" is removed.  Everything should be at process granularity.  Use
+ *   "cgroup.procs" instead.
+ *
+ * - "cgroup.procs" is not sorted.  pids will be unique unless they got
+ *   recycled in between reads.
+ *
+ * - "release_agent" and "notify_on_release" are removed.  Replacement
+ *   notification mechanism will be implemented.
+ *
+ * - "cgroup.clone_children" is removed.
+ *
+ * - "cgroup.subtree_populated" is available.  Its value is 0 if the cgroup
+ *   and its descendants contain no task; otherwise, 1.  The file also
+ *   generates kernfs notification which can be monitored through poll and
+ *   [di]notify when the value of the file changes.
+ *
+ * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
+ *   take masks of ancestors with non-empty cpus/mems, instead of being
+ *   moved to an ancestor.
+ *
+ * - cpuset: a task can be moved into an empty cpuset, and again it takes
+ *   masks of ancestors.
+ *
+ * - memcg: use_hierarchy is on by default and the cgroup file for the flag
+ *   is not created.
+ *
+ * - blkcg: blk-throttle becomes properly hierarchical.
+ *
+ * - debug: disallowed on the default hierarchy.
+ */
 static inline bool cgroup_on_dfl(const struct cgroup *cgrp)
 {
 	return cgrp->root == &cgrp_dfl_root;
-}
-
-/*
- * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details.  This
- * function can be called as long as @cgrp is accessible.
- */
-static inline bool cgroup_sane_behavior(const struct cgroup *cgrp)
-{
-	return cgrp->root->flags & CGRP_ROOT_SANE_BEHAVIOR;
 }
 
 /* no synchronization, the result can only be used as a hint */
···
 
 char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen);
 
-int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
+int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
+int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts);
 int cgroup_rm_cftypes(struct cftype *cfts);
 
 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
···
 	int (*css_online)(struct cgroup_subsys_state *css);
 	void (*css_offline)(struct cgroup_subsys_state *css);
 	void (*css_free)(struct cgroup_subsys_state *css);
+	void (*css_reset)(struct cgroup_subsys_state *css);
 
 	int (*can_attach)(struct cgroup_subsys_state *css,
 			  struct cgroup_taskset *tset);
···
 	 */
 	struct list_head cfts;
 
-	/* base cftypes, automatically registered with subsys itself */
-	struct cftype *base_cftypes;
+	/*
+	 * Base cftypes which are automatically registered.  The two can
+	 * point to the same array.
+	 */
+	struct cftype *dfl_cftypes;	/* for the default hierarchy */
+	struct cftype *legacy_cftypes;	/* for the legacy hierarchies */
+
+	/*
+	 * A subsystem may depend on other subsystems.  When such subsystem
+	 * is enabled on a cgroup, the depended-upon subsystems are enabled
+	 * together if available.  Subsystems enabled due to dependency are
+	 * not visible to userland until explicitly enabled.  The following
+	 * specifies the mask of subsystems that this one depends on.
+	 */
+	unsigned int depends_on;
 };
 
 #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
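The declaration pattern the cgroup.h hunks introduce (separate `dfl_cftypes`/`legacy_cftypes` arrays that may alias, plus a `depends_on` mask) might be used by a controller roughly as below. Everything here is a user-space mock for illustration: the struct layouts, the `io_files` array, and the subsystem ids are assumptions, not the kernel's definitions.

```c
#include <assert.h>

/* stripped-down mirrors of the kernel structs, for illustration only */
struct cftype {
	const char *name;	/* "" terminates the array */
};

struct cgroup_subsys {
	struct cftype *dfl_cftypes;	/* files for the default hierarchy */
	struct cftype *legacy_cftypes;	/* files for legacy hierarchies */
	unsigned int depends_on;	/* mask of required subsystems */
};

enum { memory_cgrp_id, io_cgrp_id };	/* hypothetical subsystem ids */

static struct cftype io_files[] = {
	{ .name = "weight" },
	{ .name = "" },			/* zero-length name terminates */
};

/* one array may serve both interfaces, as the comment in the diff allows */
static struct cgroup_subsys io_subsys = {
	.dfl_cftypes	= io_files,
	.legacy_cftypes	= io_files,
	.depends_on	= 1 << memory_cgrp_id,
};
```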
+326 -143
kernel/cgroup.c
··· 149 149 */ 150 150 static bool cgrp_dfl_root_visible; 151 151 152 + /* 153 + * Set by the boot param of the same name and makes subsystems with NULL 154 + * ->dfl_files to use ->legacy_files on the default hierarchy. 155 + */ 156 + static bool cgroup_legacy_files_on_dfl; 157 + 152 158 /* some controllers are not supported in the default hierarchy */ 153 - static const unsigned int cgrp_dfl_root_inhibit_ss_mask = 0 154 - #ifdef CONFIG_CGROUP_DEBUG 155 - | (1 << debug_cgrp_id) 156 - #endif 157 - ; 159 + static unsigned int cgrp_dfl_root_inhibit_ss_mask; 158 160 159 161 /* The list of hierarchy roots */ 160 162 ··· 182 180 */ 183 181 static int need_forkexit_callback __read_mostly; 184 182 185 - static struct cftype cgroup_base_files[]; 183 + static struct cftype cgroup_dfl_base_files[]; 184 + static struct cftype cgroup_legacy_base_files[]; 186 185 187 186 static void cgroup_put(struct cgroup *cgrp); 188 187 static int rebind_subsystems(struct cgroup_root *dst_root, 189 188 unsigned int ss_mask); 190 189 static int cgroup_destroy_locked(struct cgroup *cgrp); 191 - static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss); 190 + static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, 191 + bool visible); 192 192 static void css_release(struct percpu_ref *ref); 193 193 static void kill_css(struct cgroup_subsys_state *css); 194 194 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], ··· 1041 1037 } 1042 1038 1043 1039 /** 1040 + * cgroup_refresh_child_subsys_mask - update child_subsys_mask 1041 + * @cgrp: the target cgroup 1042 + * 1043 + * On the default hierarchy, a subsystem may request other subsystems to be 1044 + * enabled together through its ->depends_on mask. In such cases, more 1045 + * subsystems than specified in "cgroup.subtree_control" may be enabled. 
1046 + * 1047 + * This function determines which subsystems need to be enabled given the 1048 + * current @cgrp->subtree_control and records it in 1049 + * @cgrp->child_subsys_mask. The resulting mask is always a superset of 1050 + * @cgrp->subtree_control and follows the usual hierarchy rules. 1051 + */ 1052 + static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) 1053 + { 1054 + struct cgroup *parent = cgroup_parent(cgrp); 1055 + unsigned int cur_ss_mask = cgrp->subtree_control; 1056 + struct cgroup_subsys *ss; 1057 + int ssid; 1058 + 1059 + lockdep_assert_held(&cgroup_mutex); 1060 + 1061 + if (!cgroup_on_dfl(cgrp)) { 1062 + cgrp->child_subsys_mask = cur_ss_mask; 1063 + return; 1064 + } 1065 + 1066 + while (true) { 1067 + unsigned int new_ss_mask = cur_ss_mask; 1068 + 1069 + for_each_subsys(ss, ssid) 1070 + if (cur_ss_mask & (1 << ssid)) 1071 + new_ss_mask |= ss->depends_on; 1072 + 1073 + /* 1074 + * Mask out subsystems which aren't available. This can 1075 + * happen only if some depended-upon subsystems were bound 1076 + * to non-default hierarchies. 
1077 + */ 1078 + if (parent) 1079 + new_ss_mask &= parent->child_subsys_mask; 1080 + else 1081 + new_ss_mask &= cgrp->root->subsys_mask; 1082 + 1083 + if (new_ss_mask == cur_ss_mask) 1084 + break; 1085 + cur_ss_mask = new_ss_mask; 1086 + } 1087 + 1088 + cgrp->child_subsys_mask = cur_ss_mask; 1089 + } 1090 + 1091 + /** 1044 1092 * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods 1045 1093 * @kn: the kernfs_node being serviced 1046 1094 * ··· 1264 1208 up_write(&css_set_rwsem); 1265 1209 1266 1210 src_root->subsys_mask &= ~(1 << ssid); 1267 - src_root->cgrp.child_subsys_mask &= ~(1 << ssid); 1211 + src_root->cgrp.subtree_control &= ~(1 << ssid); 1212 + cgroup_refresh_child_subsys_mask(&src_root->cgrp); 1268 1213 1269 1214 /* default hierarchy doesn't enable controllers by default */ 1270 1215 dst_root->subsys_mask |= 1 << ssid; 1271 - if (dst_root != &cgrp_dfl_root) 1272 - dst_root->cgrp.child_subsys_mask |= 1 << ssid; 1216 + if (dst_root != &cgrp_dfl_root) { 1217 + dst_root->cgrp.subtree_control |= 1 << ssid; 1218 + cgroup_refresh_child_subsys_mask(&dst_root->cgrp); 1219 + } 1273 1220 1274 1221 if (ss->bind) 1275 1222 ss->bind(css); ··· 1292 1233 for_each_subsys(ss, ssid) 1293 1234 if (root->subsys_mask & (1 << ssid)) 1294 1235 seq_printf(seq, ",%s", ss->name); 1295 - if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) 1296 - seq_puts(seq, ",sane_behavior"); 1297 1236 if (root->flags & CGRP_ROOT_NOPREFIX) 1298 1237 seq_puts(seq, ",noprefix"); 1299 1238 if (root->flags & CGRP_ROOT_XATTR) ··· 1325 1268 bool all_ss = false, one_ss = false; 1326 1269 unsigned int mask = -1U; 1327 1270 struct cgroup_subsys *ss; 1271 + int nr_opts = 0; 1328 1272 int i; 1329 1273 1330 1274 #ifdef CONFIG_CPUSETS ··· 1335 1277 memset(opts, 0, sizeof(*opts)); 1336 1278 1337 1279 while ((token = strsep(&o, ",")) != NULL) { 1280 + nr_opts++; 1281 + 1338 1282 if (!*token) 1339 1283 return -EINVAL; 1340 1284 if (!strcmp(token, "none")) { ··· 1421 1361 return -ENOENT; 1422 1362 } 1423 1363 
1424 - /* Consistency checks */ 1425 - 1426 1364 if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1427 1365 pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); 1428 - 1429 - if ((opts->flags & (CGRP_ROOT_NOPREFIX | CGRP_ROOT_XATTR)) || 1430 - opts->cpuset_clone_children || opts->release_agent || 1431 - opts->name) { 1432 - pr_err("sane_behavior: noprefix, xattr, clone_children, release_agent and name are not allowed\n"); 1366 + if (nr_opts != 1) { 1367 + pr_err("sane_behavior: no other mount options allowed\n"); 1433 1368 return -EINVAL; 1434 1369 } 1435 - } else { 1436 - /* 1437 - * If the 'all' option was specified select all the 1438 - * subsystems, otherwise if 'none', 'name=' and a subsystem 1439 - * name options were not specified, let's default to 'all' 1440 - */ 1441 - if (all_ss || (!one_ss && !opts->none && !opts->name)) 1442 - for_each_subsys(ss, i) 1443 - if (!ss->disabled) 1444 - opts->subsys_mask |= (1 << i); 1445 - 1446 - /* 1447 - * We either have to specify by name or by subsystems. (So 1448 - * all empty hierarchies must have a name). 1449 - */ 1450 - if (!opts->subsys_mask && !opts->name) 1451 - return -EINVAL; 1370 + return 0; 1452 1371 } 1372 + 1373 + /* 1374 + * If the 'all' option was specified select all the subsystems, 1375 + * otherwise if 'none', 'name=' and a subsystem name options were 1376 + * not specified, let's default to 'all' 1377 + */ 1378 + if (all_ss || (!one_ss && !opts->none && !opts->name)) 1379 + for_each_subsys(ss, i) 1380 + if (!ss->disabled) 1381 + opts->subsys_mask |= (1 << i); 1382 + 1383 + /* 1384 + * We either have to specify by name or by subsystems. (So all 1385 + * empty hierarchies must have a name). 
1386 + */ 1387 + if (!opts->subsys_mask && !opts->name) 1388 + return -EINVAL; 1453 1389 1454 1390 /* 1455 1391 * Option noprefix was introduced just for backward compatibility ··· 1454 1398 */ 1455 1399 if ((opts->flags & CGRP_ROOT_NOPREFIX) && (opts->subsys_mask & mask)) 1456 1400 return -EINVAL; 1457 - 1458 1401 1459 1402 /* Can't specify "none" and some subsystems */ 1460 1403 if (opts->subsys_mask && opts->none) ··· 1469 1414 struct cgroup_sb_opts opts; 1470 1415 unsigned int added_mask, removed_mask; 1471 1416 1472 - if (root->flags & CGRP_ROOT_SANE_BEHAVIOR) { 1473 - pr_err("sane_behavior: remount is not allowed\n"); 1417 + if (root == &cgrp_dfl_root) { 1418 + pr_err("remount is not allowed\n"); 1474 1419 return -EINVAL; 1475 1420 } 1476 1421 ··· 1489 1434 removed_mask = root->subsys_mask & ~opts.subsys_mask; 1490 1435 1491 1436 /* Don't allow flags or name to change at remount */ 1492 - if (((opts.flags ^ root->flags) & CGRP_ROOT_OPTION_MASK) || 1437 + if ((opts.flags ^ root->flags) || 1493 1438 (opts.name && strcmp(opts.name, root->name))) { 1494 1439 pr_err("option or name mismatch, new: 0x%x \"%s\", old: 0x%x \"%s\"\n", 1495 - opts.flags & CGRP_ROOT_OPTION_MASK, opts.name ?: "", 1496 - root->flags & CGRP_ROOT_OPTION_MASK, root->name); 1440 + opts.flags, opts.name ?: "", root->flags, root->name); 1497 1441 ret = -EINVAL; 1498 1442 goto out_unlock; 1499 1443 } ··· 1617 1563 { 1618 1564 LIST_HEAD(tmp_links); 1619 1565 struct cgroup *root_cgrp = &root->cgrp; 1566 + struct cftype *base_files; 1620 1567 struct css_set *cset; 1621 1568 int i, ret; 1622 1569 ··· 1655 1600 } 1656 1601 root_cgrp->kn = root->kf_root->kn; 1657 1602 1658 - ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); 1603 + if (root == &cgrp_dfl_root) 1604 + base_files = cgroup_dfl_base_files; 1605 + else 1606 + base_files = cgroup_legacy_base_files; 1607 + 1608 + ret = cgroup_addrm_files(root_cgrp, base_files, true); 1659 1609 if (ret) 1660 1610 goto destroy_root; 1661 1611 ··· 1732 
1672 goto out_unlock; 1733 1673 1734 1674 /* look for a matching existing root */ 1735 - if (!opts.subsys_mask && !opts.none && !opts.name) { 1675 + if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) { 1736 1676 cgrp_dfl_root_visible = true; 1737 1677 root = &cgrp_dfl_root; 1738 1678 cgroup_get(&root->cgrp); ··· 1790 1730 goto out_unlock; 1791 1731 } 1792 1732 1793 - if ((root->flags ^ opts.flags) & CGRP_ROOT_OPTION_MASK) { 1794 - if ((root->flags | opts.flags) & CGRP_ROOT_SANE_BEHAVIOR) { 1795 - pr_err("sane_behavior: new mount options should match the existing superblock\n"); 1796 - ret = -EINVAL; 1797 - goto out_unlock; 1798 - } else { 1799 - pr_warn("new mount options do not match the existing superblock, will be ignored\n"); 1800 - } 1801 - } 1733 + if (root->flags ^ opts.flags) 1734 + pr_warn("new mount options do not match the existing superblock, will be ignored\n"); 1802 1735 1803 1736 /* 1804 1737 * We want to reuse @root whose lifetime is governed by its ··· 2510 2457 2511 2458 static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) 2512 2459 { 2513 - struct cgroup *cgrp = seq_css(seq)->cgroup; 2514 - 2515 - seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); 2460 + seq_puts(seq, "0\n"); 2516 2461 return 0; 2517 2462 } 2518 2463 ··· 2547 2496 { 2548 2497 struct cgroup *cgrp = seq_css(seq)->cgroup; 2549 2498 2550 - cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->child_subsys_mask); 2499 + cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control); 2551 2500 return 0; 2552 2501 } 2553 2502 ··· 2556 2505 { 2557 2506 struct cgroup *cgrp = seq_css(seq)->cgroup; 2558 2507 2559 - cgroup_print_ss_mask(seq, cgrp->child_subsys_mask); 2508 + cgroup_print_ss_mask(seq, cgrp->subtree_control); 2560 2509 return 0; 2561 2510 } 2562 2511 ··· 2662 2611 loff_t off) 2663 2612 { 2664 2613 unsigned int enable = 0, disable = 0; 2614 + unsigned int css_enable, css_disable, old_ctrl, new_ctrl; 2665 2615 struct cgroup *cgrp, *child; 2666 2616 struct cgroup_subsys *ss; 
2667 2617 char *tok; ··· 2702 2650 2703 2651 for_each_subsys(ss, ssid) { 2704 2652 if (enable & (1 << ssid)) { 2705 - if (cgrp->child_subsys_mask & (1 << ssid)) { 2653 + if (cgrp->subtree_control & (1 << ssid)) { 2706 2654 enable &= ~(1 << ssid); 2707 2655 continue; 2708 2656 } 2657 + 2658 + /* unavailable or not enabled on the parent? */ 2659 + if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || 2660 + (cgroup_parent(cgrp) && 2661 + !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) { 2662 + ret = -ENOENT; 2663 + goto out_unlock; 2664 + } 2665 + 2666 + /* 2667 + * @ss is already enabled through dependency and 2668 + * we'll just make it visible. Skip draining. 2669 + */ 2670 + if (cgrp->child_subsys_mask & (1 << ssid)) 2671 + continue; 2709 2672 2710 2673 /* 2711 2674 * Because css offlining is asynchronous, userland ··· 2744 2677 2745 2678 return restart_syscall(); 2746 2679 } 2747 - 2748 - /* unavailable or not enabled on the parent? */ 2749 - if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || 2750 - (cgroup_parent(cgrp) && 2751 - !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ssid)))) { 2752 - ret = -ENOENT; 2753 - goto out_unlock; 2754 - } 2755 2680 } else if (disable & (1 << ssid)) { 2756 - if (!(cgrp->child_subsys_mask & (1 << ssid))) { 2681 + if (!(cgrp->subtree_control & (1 << ssid))) { 2757 2682 disable &= ~(1 << ssid); 2758 2683 continue; 2759 2684 } 2760 2685 2761 2686 /* a child has it enabled? */ 2762 2687 cgroup_for_each_live_child(child, cgrp) { 2763 - if (child->child_subsys_mask & (1 << ssid)) { 2688 + if (child->subtree_control & (1 << ssid)) { 2764 2689 ret = -EBUSY; 2765 2690 goto out_unlock; 2766 2691 } ··· 2766 2707 } 2767 2708 2768 2709 /* 2769 - * Except for the root, child_subsys_mask must be zero for a cgroup 2710 + * Except for the root, subtree_control must be zero for a cgroup 2770 2711 * with tasks so that child cgroups don't compete against tasks. 
2771 2712 */ 2772 2713 if (enable && cgroup_parent(cgrp) && !list_empty(&cgrp->cset_links)) { ··· 2775 2716 } 2776 2717 2777 2718 /* 2778 - * Create csses for enables and update child_subsys_mask. This 2779 - * changes cgroup_e_css() results which in turn makes the 2780 - * subsequent cgroup_update_dfl_csses() associate all tasks in the 2781 - * subtree to the updated csses. 2719 + * Update subsys masks and calculate what needs to be done. More 2720 + * subsystems than specified may need to be enabled or disabled 2721 + * depending on subsystem dependencies. 2722 + */ 2723 + cgrp->subtree_control |= enable; 2724 + cgrp->subtree_control &= ~disable; 2725 + 2726 + old_ctrl = cgrp->child_subsys_mask; 2727 + cgroup_refresh_child_subsys_mask(cgrp); 2728 + new_ctrl = cgrp->child_subsys_mask; 2729 + 2730 + css_enable = ~old_ctrl & new_ctrl; 2731 + css_disable = old_ctrl & ~new_ctrl; 2732 + enable |= css_enable; 2733 + disable |= css_disable; 2734 + 2735 + /* 2736 + * Create new csses or make the existing ones visible. A css is 2737 + * created invisible if it's being implicitly enabled through 2738 + * dependency. An invisible css is made visible when the userland 2739 + * explicitly enables it. 2782 2740 */ 2783 2741 for_each_subsys(ss, ssid) { 2784 2742 if (!(enable & (1 << ssid))) 2785 2743 continue; 2786 2744 2787 2745 cgroup_for_each_live_child(child, cgrp) { 2788 - ret = create_css(child, ss); 2746 + if (css_enable & (1 << ssid)) 2747 + ret = create_css(child, ss, 2748 + cgrp->subtree_control & (1 << ssid)); 2749 + else 2750 + ret = cgroup_populate_dir(child, 1 << ssid); 2789 2751 if (ret) 2790 2752 goto err_undo_css; 2791 2753 } 2792 2754 } 2793 2755 2794 - cgrp->child_subsys_mask |= enable; 2795 - cgrp->child_subsys_mask &= ~disable; 2796 - 2756 + /* 2757 + * At this point, cgroup_e_css() results reflect the new csses 2758 + * making the following cgroup_update_dfl_csses() properly update 2759 + * css associations of all tasks in the subtree. 
2760 + */ 2797 2761 ret = cgroup_update_dfl_csses(cgrp); 2798 2762 if (ret) 2799 2763 goto err_undo_css; 2800 2764 2801 - /* all tasks are now migrated away from the old csses, kill them */ 2765 + /* 2766 + * All tasks are migrated out of disabled csses. Kill or hide 2767 + * them. A css is hidden when the userland requests it to be 2768 + * disabled while other subsystems are still depending on it. The 2769 + * css must not actively control resources and be in the vanilla 2770 + * state if it's made visible again later. Controllers which may 2771 + * be depended upon should provide ->css_reset() for this purpose. 2772 + */ 2802 2773 for_each_subsys(ss, ssid) { 2803 2774 if (!(disable & (1 << ssid))) 2804 2775 continue; 2805 2776 2806 - cgroup_for_each_live_child(child, cgrp) 2807 - kill_css(cgroup_css(child, ss)); 2777 + cgroup_for_each_live_child(child, cgrp) { 2778 + struct cgroup_subsys_state *css = cgroup_css(child, ss); 2779 + 2780 + if (css_disable & (1 << ssid)) { 2781 + kill_css(css); 2782 + } else { 2783 + cgroup_clear_dir(child, 1 << ssid); 2784 + if (ss->css_reset) 2785 + ss->css_reset(css); 2786 + } 2787 + } 2808 2788 } 2809 2789 2810 2790 kernfs_activate(cgrp->kn); ··· 2853 2755 return ret ?: nbytes; 2854 2756 2855 2757 err_undo_css: 2856 - cgrp->child_subsys_mask &= ~enable; 2857 - cgrp->child_subsys_mask |= disable; 2758 + cgrp->subtree_control &= ~enable; 2759 + cgrp->subtree_control |= disable; 2760 + cgroup_refresh_child_subsys_mask(cgrp); 2858 2761 2859 2762 for_each_subsys(ss, ssid) { 2860 2763 if (!(enable & (1 << ssid))) ··· 2863 2764 2864 2765 cgroup_for_each_live_child(child, cgrp) { 2865 2766 struct cgroup_subsys_state *css = cgroup_css(child, ss); 2866 - if (css) 2767 + 2768 + if (!css) 2769 + continue; 2770 + 2771 + if (css_enable & (1 << ssid)) 2867 2772 kill_css(css); 2773 + else 2774 + cgroup_clear_dir(child, 1 << ssid); 2868 2775 } 2869 2776 } 2870 2777 goto out_unlock; ··· 2983 2878 2984 2879 /* 2985 2880 * This isn't a proper 
migration and its usefulness is very 2986 - * limited. Disallow if sane_behavior. 2881 + * limited. Disallow on the default hierarchy. 2987 2882 */ 2988 - if (cgroup_sane_behavior(cgrp)) 2883 + if (cgroup_on_dfl(cgrp)) 2989 2884 return -EPERM; 2990 2885 2991 2886 /* ··· 3069 2964 3070 2965 for (cft = cfts; cft->name[0] != '\0'; cft++) { 3071 2966 /* does cft->flags tell us to skip this file on @cgrp? */ 3072 - if ((cft->flags & CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) 2967 + if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp)) 3073 2968 continue; 3074 - if ((cft->flags & CFTYPE_INSANE) && cgroup_sane_behavior(cgrp)) 2969 + if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp)) 3075 2970 continue; 3076 2971 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgroup_parent(cgrp)) 3077 2972 continue; ··· 3129 3024 kfree(cft->kf_ops); 3130 3025 cft->kf_ops = NULL; 3131 3026 cft->ss = NULL; 3027 + 3028 + /* revert flags set by cgroup core while adding @cfts */ 3029 + cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL); 3132 3030 } 3133 3031 } 3134 3032 ··· 3217 3109 * function currently returns 0 as long as @cfts registration is successful 3218 3110 * even if some file creation attempts on existing cgroups fail. 3219 3111 */ 3220 - int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 3112 + static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 3221 3113 { 3222 3114 int ret; 3223 3115 ··· 3240 3132 3241 3133 mutex_unlock(&cgroup_mutex); 3242 3134 return ret; 3135 + } 3136 + 3137 + /** 3138 + * cgroup_add_dfl_cftypes - add an array of cftypes for default hierarchy 3139 + * @ss: target cgroup subsystem 3140 + * @cfts: zero-length name terminated array of cftypes 3141 + * 3142 + * Similar to cgroup_add_cftypes() but the added files are only used for 3143 + * the default hierarchy. 
3144 + */ 3145 + int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 3146 + { 3147 + struct cftype *cft; 3148 + 3149 + for (cft = cfts; cft && cft->name[0] != '\0'; cft++) 3150 + cft->flags |= __CFTYPE_ONLY_ON_DFL; 3151 + return cgroup_add_cftypes(ss, cfts); 3152 + } 3153 + 3154 + /** 3155 + * cgroup_add_legacy_cftypes - add an array of cftypes for legacy hierarchies 3156 + * @ss: target cgroup subsystem 3157 + * @cfts: zero-length name terminated array of cftypes 3158 + * 3159 + * Similar to cgroup_add_cftypes() but the added files are only used for 3160 + * the legacy hierarchies. 3161 + */ 3162 + int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 3163 + { 3164 + struct cftype *cft; 3165 + 3166 + for (cft = cfts; cft && cft->name[0] != '\0'; cft++) 3167 + cft->flags |= __CFTYPE_NOT_ON_DFL; 3168 + return cgroup_add_cftypes(ss, cfts); 3243 3169 } 3244 3170 3245 3171 /** ··· 3841 3699 * 3842 3700 * All this extra complexity was caused by the original implementation 3843 3701 * committing to an entirely unnecessary property. In the long term, we 3844 - * want to do away with it. Explicitly scramble sort order if 3845 - * sane_behavior so that no such expectation exists in the new interface. 3702 + * want to do away with it. Explicitly scramble sort order if on the 3703 + * default hierarchy so that no such expectation exists in the new 3704 + * interface. 3846 3705 * 3847 3706 * Scrambling is done by swapping every two consecutive bits, which is 3848 3707 * non-identity one-to-one mapping which disturbs sort order sufficiently. 
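The bit-swap described above is easy to demonstrate. The hunk elides pid_fry() itself, so the helper below is an illustrative reconstruction of "swap every two consecutive bits", not the kernel's exact code.

```c
#include <assert.h>
#include <stdint.h>

/*
 * Sketch of "swap every two consecutive bits": even-numbered bits move
 * up one position and odd-numbered bits move down one. The mapping is
 * one-to-one and self-inverse, and it disturbs numeric sort order,
 * which is what the scrambling comment above asks for. pid_fry() is
 * elided from this hunk, so this is a reconstruction for illustration.
 */
static uint32_t swap_adjacent_bits(uint32_t v)
{
	uint32_t even = v & 0x55555555u;	/* bits 0, 2, 4, ... */
	uint32_t odd  = v & 0xAAAAAAAAu;	/* bits 1, 3, 5, ... */

	return (even << 1) | (odd >> 1);
}
```

Because the mapping is self-inverse, applying it twice returns the original pid, which is all the pidlist code needs.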
··· 3858 3715 3859 3716 static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) 3860 3717 { 3861 - if (cgroup_sane_behavior(cgrp)) 3718 + if (cgroup_on_dfl(cgrp)) 3862 3719 return pid_fry(pid); 3863 3720 else 3864 3721 return pid; ··· 3961 3818 css_task_iter_end(&it); 3962 3819 length = n; 3963 3820 /* now sort & (if procs) strip out duplicates */ 3964 - if (cgroup_sane_behavior(cgrp)) 3821 + if (cgroup_on_dfl(cgrp)) 3965 3822 sort(array, length, sizeof(pid_t), fried_cmppid, NULL); 3966 3823 else 3967 3824 sort(array, length, sizeof(pid_t), cmppid, NULL); ··· 4183 4040 return 0; 4184 4041 } 4185 4042 4186 - static struct cftype cgroup_base_files[] = { 4043 + /* cgroup core interface files for the default hierarchy */ 4044 + static struct cftype cgroup_dfl_base_files[] = { 4045 + { 4046 + .name = "cgroup.procs", 4047 + .seq_start = cgroup_pidlist_start, 4048 + .seq_next = cgroup_pidlist_next, 4049 + .seq_stop = cgroup_pidlist_stop, 4050 + .seq_show = cgroup_pidlist_show, 4051 + .private = CGROUP_FILE_PROCS, 4052 + .write = cgroup_procs_write, 4053 + .mode = S_IRUGO | S_IWUSR, 4054 + }, 4055 + { 4056 + .name = "cgroup.controllers", 4057 + .flags = CFTYPE_ONLY_ON_ROOT, 4058 + .seq_show = cgroup_root_controllers_show, 4059 + }, 4060 + { 4061 + .name = "cgroup.controllers", 4062 + .flags = CFTYPE_NOT_ON_ROOT, 4063 + .seq_show = cgroup_controllers_show, 4064 + }, 4065 + { 4066 + .name = "cgroup.subtree_control", 4067 + .seq_show = cgroup_subtree_control_show, 4068 + .write = cgroup_subtree_control_write, 4069 + }, 4070 + { 4071 + .name = "cgroup.populated", 4072 + .flags = CFTYPE_NOT_ON_ROOT, 4073 + .seq_show = cgroup_populated_show, 4074 + }, 4075 + { } /* terminate */ 4076 + }; 4077 + 4078 + /* cgroup core interface files for the legacy hierarchies */ 4079 + static struct cftype cgroup_legacy_base_files[] = { 4187 4080 { 4188 4081 .name = "cgroup.procs", 4189 4082 .seq_start = cgroup_pidlist_start, ··· 4232 4053 }, 4233 4054 { 4234 4055 .name = 
"cgroup.clone_children", 4235 - .flags = CFTYPE_INSANE, 4236 4056 .read_u64 = cgroup_clone_children_read, 4237 4057 .write_u64 = cgroup_clone_children_write, 4238 4058 }, ··· 4241 4063 .seq_show = cgroup_sane_behavior_show, 4242 4064 }, 4243 4065 { 4244 - .name = "cgroup.controllers", 4245 - .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_ONLY_ON_ROOT, 4246 - .seq_show = cgroup_root_controllers_show, 4247 - }, 4248 - { 4249 - .name = "cgroup.controllers", 4250 - .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, 4251 - .seq_show = cgroup_controllers_show, 4252 - }, 4253 - { 4254 - .name = "cgroup.subtree_control", 4255 - .flags = CFTYPE_ONLY_ON_DFL, 4256 - .seq_show = cgroup_subtree_control_show, 4257 - .write = cgroup_subtree_control_write, 4258 - }, 4259 - { 4260 - .name = "cgroup.populated", 4261 - .flags = CFTYPE_ONLY_ON_DFL | CFTYPE_NOT_ON_ROOT, 4262 - .seq_show = cgroup_populated_show, 4263 - }, 4264 - 4265 - /* 4266 - * Historical crazy stuff. These don't have "cgroup." prefix and 4267 - * don't exist if sane_behavior. If you're depending on these, be 4268 - * prepared to be burned. 
4269 - */ 4270 - { 4271 4066 .name = "tasks", 4272 - .flags = CFTYPE_INSANE, /* use "procs" instead */ 4273 4067 .seq_start = cgroup_pidlist_start, 4274 4068 .seq_next = cgroup_pidlist_next, 4275 4069 .seq_stop = cgroup_pidlist_stop, ··· 4252 4102 }, 4253 4103 { 4254 4104 .name = "notify_on_release", 4255 - .flags = CFTYPE_INSANE, 4256 4105 .read_u64 = cgroup_read_notify_on_release, 4257 4106 .write_u64 = cgroup_write_notify_on_release, 4258 4107 }, 4259 4108 { 4260 4109 .name = "release_agent", 4261 - .flags = CFTYPE_INSANE | CFTYPE_ONLY_ON_ROOT, 4110 + .flags = CFTYPE_ONLY_ON_ROOT, 4262 4111 .seq_show = cgroup_release_agent_show, 4263 4112 .write = cgroup_release_agent_write, 4264 4113 .max_write_len = PATH_MAX - 1, ··· 4465 4316 * create_css - create a cgroup_subsys_state 4466 4317 * @cgrp: the cgroup new css will be associated with 4467 4318 * @ss: the subsys of new css 4319 + * @visible: whether to create control knobs for the new css or not 4468 4320 * 4469 4321 * Create a new css associated with @cgrp - @ss pair. On success, the new 4470 - * css is online and installed in @cgrp with all interface files created. 4471 - * Returns 0 on success, -errno on failure. 4322 + * css is online and installed in @cgrp with all interface files created if 4323 + * @visible. Returns 0 on success, -errno on failure. 
4472 4324 */ 4473 - static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss) 4325 + static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, 4326 + bool visible) 4474 4327 { 4475 4328 struct cgroup *parent = cgroup_parent(cgrp); 4476 4329 struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); ··· 4496 4345 goto err_free_percpu_ref; 4497 4346 css->id = err; 4498 4347 4499 - err = cgroup_populate_dir(cgrp, 1 << ss->id); 4500 - if (err) 4501 - goto err_free_id; 4348 + if (visible) { 4349 + err = cgroup_populate_dir(cgrp, 1 << ss->id); 4350 + if (err) 4351 + goto err_free_id; 4352 + } 4502 4353 4503 4354 /* @css is ready to be brought online now, make it visible */ 4504 4355 list_add_tail_rcu(&css->sibling, &parent_css->children); ··· 4540 4387 struct cgroup_root *root; 4541 4388 struct cgroup_subsys *ss; 4542 4389 struct kernfs_node *kn; 4390 + struct cftype *base_files; 4543 4391 int ssid, ret; 4544 4392 4545 4393 parent = cgroup_kn_lock_live(parent_kn); ··· 4611 4457 if (ret) 4612 4458 goto out_destroy; 4613 4459 4614 - ret = cgroup_addrm_files(cgrp, cgroup_base_files, true); 4460 + if (cgroup_on_dfl(cgrp)) 4461 + base_files = cgroup_dfl_base_files; 4462 + else 4463 + base_files = cgroup_legacy_base_files; 4464 + 4465 + ret = cgroup_addrm_files(cgrp, base_files, true); 4615 4466 if (ret) 4616 4467 goto out_destroy; 4617 4468 4618 4469 /* let's create and online css's */ 4619 4470 for_each_subsys(ss, ssid) { 4620 4471 if (parent->child_subsys_mask & (1 << ssid)) { 4621 - ret = create_css(cgrp, ss); 4472 + ret = create_css(cgrp, ss, 4473 + parent->subtree_control & (1 << ssid)); 4622 4474 if (ret) 4623 4475 goto out_destroy; 4624 4476 } ··· 4632 4472 4633 4473 /* 4634 4474 * On the default hierarchy, a child doesn't automatically inherit 4635 - * child_subsys_mask from the parent. Each is configured manually. 4475 + * subtree_control from the parent. Each is configured manually. 
4636 4476 */ 4637 - if (!cgroup_on_dfl(cgrp)) 4638 - cgrp->child_subsys_mask = parent->child_subsys_mask; 4477 + if (!cgroup_on_dfl(cgrp)) { 4478 + cgrp->subtree_control = parent->subtree_control; 4479 + cgroup_refresh_child_subsys_mask(cgrp); 4480 + } 4639 4481 4640 4482 kernfs_activate(kn); 4641 4483 ··· 4900 4738 */ 4901 4739 int __init cgroup_init_early(void) 4902 4740 { 4903 - static struct cgroup_sb_opts __initdata opts = 4904 - { .flags = CGRP_ROOT_SANE_BEHAVIOR }; 4741 + static struct cgroup_sb_opts __initdata opts; 4905 4742 struct cgroup_subsys *ss; 4906 4743 int i; 4907 4744 ··· 4938 4777 unsigned long key; 4939 4778 int ssid, err; 4940 4779 4941 - BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); 4780 + BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); 4781 + BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); 4942 4782 4943 4783 mutex_lock(&cgroup_mutex); 4944 4784 ··· 4971 4809 * disabled flag and cftype registration needs kmalloc, 4972 4810 * both of which aren't available during early_init. 
4973 4811 */ 4974 - if (!ss->disabled) { 4975 - cgrp_dfl_root.subsys_mask |= 1 << ss->id; 4976 - WARN_ON(cgroup_add_cftypes(ss, ss->base_cftypes)); 4812 + if (ss->disabled) 4813 + continue; 4814 + 4815 + cgrp_dfl_root.subsys_mask |= 1 << ss->id; 4816 + 4817 + if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes) 4818 + ss->dfl_cftypes = ss->legacy_cftypes; 4819 + 4820 + if (!ss->dfl_cftypes) 4821 + cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id; 4822 + 4823 + if (ss->dfl_cftypes == ss->legacy_cftypes) { 4824 + WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); 4825 + } else { 4826 + WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); 4827 + WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); 4977 4828 } 4978 4829 } 4979 4830 ··· 5382 5207 } 5383 5208 __setup("cgroup_disable=", cgroup_disable); 5384 5209 5210 + static int __init cgroup_set_legacy_files_on_dfl(char *str) 5211 + { 5212 + printk("cgroup: using legacy files on the default hierarchy\n"); 5213 + cgroup_legacy_files_on_dfl = true; 5214 + return 0; 5215 + } 5216 + __setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl); 5217 + 5385 5218 /** 5386 5219 * css_tryget_online_from_dir - get corresponding css from a cgroup dentry 5387 5220 * @dentry: directory dentry of interest ··· 5584 5401 struct cgroup_subsys debug_cgrp_subsys = { 5585 5402 .css_alloc = debug_css_alloc, 5586 5403 .css_free = debug_css_free, 5587 - .base_cftypes = debug_files, 5404 + .legacy_cftypes = debug_files, 5588 5405 }; 5589 5406 #endif /* CONFIG_CGROUP_DEBUG */
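The control-mask bookkeeping that cgroup_subtree_control_write() performs above condenses to a few lines of bit arithmetic. The sketch below is stand-alone: the dependency rule (bit 0x4 pulling in bit 0x2) is a hypothetical stand-in for cgroup_refresh_child_subsys_mask(), and the enable/disable widening and css creation are omitted.

```c
#include <assert.h>

/*
 * Sketch of the mask bookkeeping in cgroup_subtree_control_write():
 * subtree_control holds what userland asked for; child_subsys_mask is
 * derived from it by pulling in controller dependencies. Diffing the
 * derived mask before and after tells us which csses must really be
 * created (css_enable) or killed (css_disable); controllers that are
 * merely implicitly enabled or hidden keep their csses. The dependency
 * rule here (0x4 implies 0x2, e.g. blkio depending on memory) is a
 * hypothetical stand-in for cgroup_refresh_child_subsys_mask().
 */
struct ctrl_masks {
	unsigned int subtree_control;	/* what userland enabled */
	unsigned int child_subsys_mask;	/* + dependencies */
};

static unsigned int refresh_child_subsys_mask(unsigned int subtree_control)
{
	unsigned int mask = subtree_control;

	if (mask & 0x4)
		mask |= 0x2;
	return mask;
}

static void apply_control(struct ctrl_masks *m, unsigned int enable,
			  unsigned int disable, unsigned int *css_enable,
			  unsigned int *css_disable)
{
	unsigned int old_ctrl, new_ctrl;

	m->subtree_control |= enable;
	m->subtree_control &= ~disable;

	old_ctrl = m->child_subsys_mask;
	m->child_subsys_mask = refresh_child_subsys_mask(m->subtree_control);
	new_ctrl = m->child_subsys_mask;

	*css_enable = ~old_ctrl & new_ctrl;	/* csses to create */
	*css_disable = old_ctrl & ~new_ctrl;	/* csses to kill */
}

/* returns 0 when an enable/disable round trip behaves as expected */
static int ctrl_masks_selfcheck(void)
{
	struct ctrl_masks m = { 0, 0 };
	unsigned int css_enable, css_disable;

	apply_control(&m, 0x4, 0, &css_enable, &css_disable);
	if (m.child_subsys_mask != 0x6 || css_enable != 0x6 || css_disable)
		return 1;

	apply_control(&m, 0, 0x4, &css_enable, &css_disable);
	if (m.child_subsys_mask != 0 || css_disable != 0x6 || css_enable)
		return 2;

	return 0;
}
```

Enabling only 0x4 implicitly creates the 0x2 css as well; disabling 0x4 later kills both, which mirrors why the patch tracks css_enable/css_disable separately from the userland-requested masks.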
+1 -1
kernel/cgroup_freezer.c
··· 480 480 .css_free = freezer_css_free, 481 481 .attach = freezer_attach, 482 482 .fork = freezer_fork, 483 - .base_cftypes = files, 483 + .legacy_cftypes = files, 484 484 };
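The cftype flag checks added earlier — skip __CFTYPE_ONLY_ON_DFL files off the default hierarchy, skip __CFTYPE_NOT_ON_DFL files on it, plus the existing root-only checks — amount to a small predicate. A condensed sketch, with hypothetical flag values standing in for the kernel's:

```c
#include <assert.h>

/* hypothetical flag values for illustration */
#define ONLY_ON_DFL	0x1	/* ~ __CFTYPE_ONLY_ON_DFL */
#define NOT_ON_DFL	0x2	/* ~ __CFTYPE_NOT_ON_DFL */
#define ONLY_ON_ROOT	0x4	/* ~ CFTYPE_ONLY_ON_ROOT */
#define NOT_ON_ROOT	0x8	/* ~ CFTYPE_NOT_ON_ROOT */

/*
 * Condensed form of the "does cft->flags tell us to skip this file"
 * checks in cgroup_addrm_files(): returns 1 if the file should be
 * created on this cgroup, 0 if it should be skipped.
 */
static int cftype_visible(unsigned int flags, int on_dfl, int is_root)
{
	if ((flags & ONLY_ON_DFL) && !on_dfl)
		return 0;
	if ((flags & NOT_ON_DFL) && on_dfl)
		return 0;
	if ((flags & ONLY_ON_ROOT) && !is_root)
		return 0;
	if ((flags & NOT_ON_ROOT) && is_root)
		return 0;
	return 1;
}
```

This is why cgroup_add_dfl_cftypes() and cgroup_add_legacy_cftypes() only need to tag the arrays with one flag each: the per-cgroup visibility decision is made later, at file creation time.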
+306 -194
kernel/cpuset.c
··· 76 76 struct cgroup_subsys_state css; 77 77 78 78 unsigned long flags; /* "unsigned long" so bitops work */ 79 - cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ 80 - nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ 79 + 80 + /* 81 + * On default hierarchy: 82 + * 83 + * The user-configured masks can only be changed by writing to 84 + * cpuset.cpus and cpuset.mems, and won't be limited by the 85 + * parent masks. 86 + * 87 + * The effective masks are the real masks that apply to the tasks 88 + * in the cpuset. They may be changed if the configured masks are 89 + * changed or hotplug happens. 90 + * 91 + * effective_mask == configured_mask & parent's effective_mask, 92 + * and if it ends up empty, it will inherit the parent's mask. 93 + * 94 + * 95 + * On legacy hierarchy: 96 + * 97 + * The user-configured masks are always the same as the effective masks. 98 + */ 99 + 100 + /* user-configured CPUs and Memory Nodes allowed to tasks */ 101 + cpumask_var_t cpus_allowed; 102 + nodemask_t mems_allowed; 103 + 104 + /* effective CPUs and Memory Nodes allowed to tasks */ 105 + cpumask_var_t effective_cpus; 106 + nodemask_t effective_mems; 81 107 82 108 /* 83 109 * This is old Memory Nodes tasks took on.
··· 333 307 */ 334 308 static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) 335 309 { 336 - while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) 310 + while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) 337 311 cs = parent_cs(cs); 338 - cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); 312 + cpumask_and(pmask, cs->effective_cpus, cpu_online_mask); 339 313 } 340 314 341 315 /* ··· 351 325 */ 352 326 static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) 353 327 { 354 - while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) 328 + while (!nodes_intersects(cs->effective_mems, node_states[N_MEMORY])) 355 329 cs = parent_cs(cs); 356 - nodes_and(*pmask, cs->mems_allowed, node_states[N_MEMORY]); 330 + nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]); 357 331 } 358 332 359 333 /* ··· 402 376 if (!trial) 403 377 return NULL; 404 378 405 - if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) { 406 - kfree(trial); 407 - return NULL; 408 - } 409 - cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); 379 + if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) 380 + goto free_cs; 381 + if (!alloc_cpumask_var(&trial->effective_cpus, GFP_KERNEL)) 382 + goto free_cpus; 410 383 384 + cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); 385 + cpumask_copy(trial->effective_cpus, cs->effective_cpus); 411 386 return trial; 387 + 388 + free_cpus: 389 + free_cpumask_var(trial->cpus_allowed); 390 + free_cs: 391 + kfree(trial); 392 + return NULL; 412 393 } 413 394 414 395 /** ··· 424 391 */ 425 392 static void free_trial_cpuset(struct cpuset *trial) 426 393 { 394 + free_cpumask_var(trial->effective_cpus); 427 395 free_cpumask_var(trial->cpus_allowed); 428 396 kfree(trial); 429 397 } ··· 470 436 471 437 par = parent_cs(cur); 472 438 473 - /* We must be a subset of our parent cpuset */ 439 + /* On legacy hierarchy, we must be a subset of our parent cpuset.
*/ 474 440 ret = -EACCES; 475 - if (!is_cpuset_subset(trial, par)) 441 + if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par)) 476 442 goto out; 477 443 478 444 /* ··· 514 480 #ifdef CONFIG_SMP 515 481 /* 516 482 * Helper routine for generate_sched_domains(). 517 - * Do cpusets a, b have overlapping cpus_allowed masks? 483 + * Do cpusets a, b have overlapping effective cpus_allowed masks? 518 484 */ 519 485 static int cpusets_overlap(struct cpuset *a, struct cpuset *b) 520 486 { 521 - return cpumask_intersects(a->cpus_allowed, b->cpus_allowed); 487 + return cpumask_intersects(a->effective_cpus, b->effective_cpus); 522 488 } 523 489 524 490 static void ··· 635 601 *dattr = SD_ATTR_INIT; 636 602 update_domain_attr_tree(dattr, &top_cpuset); 637 603 } 638 - cpumask_copy(doms[0], top_cpuset.cpus_allowed); 604 + cpumask_copy(doms[0], top_cpuset.effective_cpus); 639 605 640 606 goto done; 641 607 } ··· 739 705 struct cpuset *b = csa[j]; 740 706 741 707 if (apn == b->pn) { 742 - cpumask_or(dp, dp, b->cpus_allowed); 708 + cpumask_or(dp, dp, b->effective_cpus); 743 709 if (dattr) 744 710 update_domain_attr_tree(dattr + nslot, b); 745 711 ··· 791 757 * passing doms with offlined cpu to partition_sched_domains(). 792 758 * Anyways, hotplug work item will rebuild sched domains. 793 759 */ 794 - if (!cpumask_equal(top_cpuset.cpus_allowed, cpu_active_mask)) 760 + if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) 795 761 goto out; 796 762 797 763 /* Generate domain masks and attrs */ ··· 815 781 mutex_unlock(&cpuset_mutex); 816 782 } 817 783 818 - /* 819 - * effective_cpumask_cpuset - return nearest ancestor with non-empty cpus 820 - * @cs: the cpuset in interest 821 - * 822 - * A cpuset's effective cpumask is the cpumask of the nearest ancestor 823 - * with non-empty cpus. We use effective cpumask whenever: 824 - * - we update tasks' cpus_allowed. 
(they take on the ancestor's cpumask 825 - * if the cpuset they reside in has no cpus) 826 - * - we want to retrieve task_cs(tsk)'s cpus_allowed. 827 - * 828 - * Called with cpuset_mutex held. cpuset_cpus_allowed_fallback() is an 829 - * exception. See comments there. 830 - */ 831 - static struct cpuset *effective_cpumask_cpuset(struct cpuset *cs) 832 - { 833 - while (cpumask_empty(cs->cpus_allowed)) 834 - cs = parent_cs(cs); 835 - return cs; 836 - } 837 - 838 - /* 839 - * effective_nodemask_cpuset - return nearest ancestor with non-empty mems 840 - * @cs: the cpuset in interest 841 - * 842 - * A cpuset's effective nodemask is the nodemask of the nearest ancestor 843 - * with non-empty memss. We use effective nodemask whenever: 844 - * - we update tasks' mems_allowed. (they take on the ancestor's nodemask 845 - * if the cpuset they reside in has no mems) 846 - * - we want to retrieve task_cs(tsk)'s mems_allowed. 847 - * 848 - * Called with cpuset_mutex held. 849 - */ 850 - static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) 851 - { 852 - while (nodes_empty(cs->mems_allowed)) 853 - cs = parent_cs(cs); 854 - return cs; 855 - } 856 - 857 784 /** 858 785 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. 859 786 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed ··· 825 830 */ 826 831 static void update_tasks_cpumask(struct cpuset *cs) 827 832 { 828 - struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); 829 833 struct css_task_iter it; 830 834 struct task_struct *task; 831 835 832 836 css_task_iter_start(&cs->css, &it); 833 837 while ((task = css_task_iter_next(&it))) 834 - set_cpus_allowed_ptr(task, cpus_cs->cpus_allowed); 838 + set_cpus_allowed_ptr(task, cs->effective_cpus); 835 839 css_task_iter_end(&it); 836 840 } 837 841 838 842 /* 839 - * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. 
840 - * @root_cs: the root cpuset of the hierarchy 841 - * @update_root: update root cpuset or not? 843 + * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree 844 + * @cs: the cpuset to consider 845 + * @new_cpus: temp variable for calculating new effective_cpus 842 846 * 843 - * This will update cpumasks of tasks in @root_cs and all other empty cpusets 844 - * which take on cpumask of @root_cs. 847 + * When the configured cpumask is changed, the effective cpumasks of this cpuset 848 + * and all its descendants need to be updated. 849 + * 850 + * On legacy hierarchy, effective_cpus will be the same as cpus_allowed. 845 851 * 846 852 * Called with cpuset_mutex held 847 853 */ 848 - static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root) 854 + static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) 849 855 { 850 856 struct cpuset *cp; 851 857 struct cgroup_subsys_state *pos_css; 858 + bool need_rebuild_sched_domains = false; 852 859 853 860 rcu_read_lock(); 854 - cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { 855 - if (cp == root_cs) { 856 - if (!update_root) 857 - continue; 858 - } else { 859 - /* skip the whole subtree if @cp have some CPU */ 860 - if (!cpumask_empty(cp->cpus_allowed)) { 861 - pos_css = css_rightmost_descendant(pos_css); 862 - continue; 863 - } 861 + cpuset_for_each_descendant_pre(cp, pos_css, cs) { 862 + struct cpuset *parent = parent_cs(cp); 863 + 864 + cpumask_and(new_cpus, cp->cpus_allowed, parent->effective_cpus); 865 + 866 + /* 867 + * If it becomes empty, inherit the effective mask of the 868 + * parent, which is guaranteed to have some CPUs. 869 + */ 870 + if (cpumask_empty(new_cpus)) 871 + cpumask_copy(new_cpus, parent->effective_cpus); 872 + 873 + /* Skip the whole subtree if the cpumask remains the same.
*/ 874 + if (cpumask_equal(new_cpus, cp->effective_cpus)) { 875 + pos_css = css_rightmost_descendant(pos_css); 876 + continue; 864 877 } 878 + 865 879 if (!css_tryget_online(&cp->css)) 866 880 continue; 867 881 rcu_read_unlock(); 868 882 883 + mutex_lock(&callback_mutex); 884 + cpumask_copy(cp->effective_cpus, new_cpus); 885 + mutex_unlock(&callback_mutex); 886 + 887 + WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && 888 + !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); 889 + 869 890 update_tasks_cpumask(cp); 891 + 892 + /* 893 + * If the effective cpumask of any non-empty cpuset is changed, 894 + * we need to rebuild sched domains. 895 + */ 896 + if (!cpumask_empty(cp->cpus_allowed) && 897 + is_sched_load_balance(cp)) 898 + need_rebuild_sched_domains = true; 870 899 871 900 rcu_read_lock(); 872 901 css_put(&cp->css); 873 902 } 874 903 rcu_read_unlock(); 904 + 905 + if (need_rebuild_sched_domains) 906 + rebuild_sched_domains_locked(); 875 907 } 876 908 877 909 /** ··· 911 889 const char *buf) 912 890 { 913 891 int retval; 914 - int is_load_balanced; 915 892 916 893 /* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */ 917 894 if (cs == &top_cpuset) ··· 929 908 if (retval < 0) 930 909 return retval; 931 910 932 - if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) 911 + if (!cpumask_subset(trialcs->cpus_allowed, 912 + top_cpuset.cpus_allowed)) 933 913 return -EINVAL; 934 914 } 935 915 ··· 942 920 if (retval < 0) 943 921 return retval; 944 922 945 - is_load_balanced = is_sched_load_balance(trialcs); 946 - 947 923 mutex_lock(&callback_mutex); 948 924 cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); 949 925 mutex_unlock(&callback_mutex); 950 926 951 - update_tasks_cpumask_hier(cs, true); 952 - 953 - if (is_load_balanced) 954 - rebuild_sched_domains_locked(); 927 + /* use trialcs->cpus_allowed as a temp variable */ 928 + update_cpumasks_hier(cs, trialcs->cpus_allowed); 955 929 return 0; 956 930 } 957 931 ··· 969 951 const nodemask_t *to) 
970 952 { 971 953 struct task_struct *tsk = current; 972 - struct cpuset *mems_cs; 973 954 974 955 tsk->mems_allowed = *to; 975 956 976 957 do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); 977 958 978 959 rcu_read_lock(); 979 - mems_cs = effective_nodemask_cpuset(task_cs(tsk)); 980 - guarantee_online_mems(mems_cs, &tsk->mems_allowed); 960 + guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed); 981 961 rcu_read_unlock(); 982 962 } 983 963 ··· 1044 1028 static void update_tasks_nodemask(struct cpuset *cs) 1045 1029 { 1046 1030 static nodemask_t newmems; /* protected by cpuset_mutex */ 1047 - struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1048 1031 struct css_task_iter it; 1049 1032 struct task_struct *task; 1050 1033 1051 1034 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1052 1035 1053 - guarantee_online_mems(mems_cs, &newmems); 1036 + guarantee_online_mems(cs, &newmems); 1054 1037 1055 1038 /* 1056 1039 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't ··· 1092 1077 } 1093 1078 1094 1079 /* 1095 - * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. 1096 - * @cs: the root cpuset of the hierarchy 1097 - * @update_root: update the root cpuset or not? 1080 + * update_nodemasks_hier - Update effective nodemasks and tasks in the subtree 1081 + * @cs: the cpuset to consider 1082 + * @new_mems: a temp variable for calculating new effective_mems 1098 1083 * 1099 - * This will update nodemasks of tasks in @root_cs and all other empty cpusets 1100 - * which take on nodemask of @root_cs. 1084 + * When the configured nodemask is changed, the effective nodemasks of this cpuset 1085 + * and all its descendants need to be updated. 1086 + * 1087 + * On legacy hierarchy, effective_mems will be the same as mems_allowed.
1101 1088 * 1102 1089 * Called with cpuset_mutex held 1103 1090 */ 1104 - static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root) 1091 + static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) 1105 1092 { 1106 1093 struct cpuset *cp; 1107 1094 struct cgroup_subsys_state *pos_css; 1108 1095 1109 1096 rcu_read_lock(); 1110 - cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { 1111 - if (cp == root_cs) { 1112 - if (!update_root) 1113 - continue; 1114 - } else { 1115 - /* skip the whole subtree if @cp have some CPU */ 1116 - if (!nodes_empty(cp->mems_allowed)) { 1117 - pos_css = css_rightmost_descendant(pos_css); 1118 - continue; 1119 - } 1097 + cpuset_for_each_descendant_pre(cp, pos_css, cs) { 1098 + struct cpuset *parent = parent_cs(cp); 1099 + 1100 + nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems); 1101 + 1102 + /* 1103 + * If it becomes empty, inherit the effective mask of the 1104 + * parent, which is guaranteed to have some MEMs. 1105 + */ 1106 + if (nodes_empty(*new_mems)) 1107 + *new_mems = parent->effective_mems; 1108 + 1109 + /* Skip the whole subtree if the nodemask remains the same. 
*/ 1110 + if (nodes_equal(*new_mems, cp->effective_mems)) { 1111 + pos_css = css_rightmost_descendant(pos_css); 1112 + continue; 1120 1113 } 1114 + 1121 1115 if (!css_tryget_online(&cp->css)) 1122 1116 continue; 1123 1117 rcu_read_unlock(); 1118 + 1119 + mutex_lock(&callback_mutex); 1120 + cp->effective_mems = *new_mems; 1121 + mutex_unlock(&callback_mutex); 1122 + 1123 + WARN_ON(!cgroup_on_dfl(cp->css.cgroup) && 1124 + !nodes_equal(cp->mems_allowed, cp->effective_mems)); 1124 1125 1125 1126 update_tasks_nodemask(cp); 1126 1127 ··· 1187 1156 goto done; 1188 1157 1189 1158 if (!nodes_subset(trialcs->mems_allowed, 1190 - node_states[N_MEMORY])) { 1191 - retval = -EINVAL; 1159 + top_cpuset.mems_allowed)) { 1160 + retval = -EINVAL; 1192 1161 goto done; 1193 1162 } 1194 1163 } ··· 1205 1174 cs->mems_allowed = trialcs->mems_allowed; 1206 1175 mutex_unlock(&callback_mutex); 1207 1176 1208 - update_tasks_nodemask_hier(cs, true); 1177 + /* use trialcs->mems_allowed as a temp variable */ 1178 + update_nodemasks_hier(cs, &cs->mems_allowed); 1209 1179 done: 1210 1180 return retval; 1211 1181 } ··· 1421 1389 1422 1390 mutex_lock(&cpuset_mutex); 1423 1391 1424 - /* 1425 - * We allow to move tasks into an empty cpuset if sane_behavior 1426 - * flag is set. 
1427 - */ 1392 + /* allow moving tasks into an empty cpuset if on default hierarchy */ 1428 1393 ret = -ENOSPC; 1429 - if (!cgroup_sane_behavior(css->cgroup) && 1394 + if (!cgroup_on_dfl(css->cgroup) && 1430 1395 (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) 1431 1396 goto out_unlock; 1432 1397 ··· 1481 1452 struct task_struct *leader = cgroup_taskset_first(tset); 1482 1453 struct cpuset *cs = css_cs(css); 1483 1454 struct cpuset *oldcs = cpuset_attach_old_cs; 1484 - struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); 1485 - struct cpuset *mems_cs = effective_nodemask_cpuset(cs); 1486 1455 1487 1456 mutex_lock(&cpuset_mutex); 1488 1457 ··· 1488 1461 if (cs == &top_cpuset) 1489 1462 cpumask_copy(cpus_attach, cpu_possible_mask); 1490 1463 else 1491 - guarantee_online_cpus(cpus_cs, cpus_attach); 1464 + guarantee_online_cpus(cs, cpus_attach); 1492 1465 1493 - guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); 1466 + guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 1494 1467 1495 1468 cgroup_taskset_for_each(task, tset) { 1496 1469 /* ··· 1507 1480 * Change mm, possibly for multiple threads in a threadgroup. This is 1508 1481 * expensive and may sleep. 1509 1482 */ 1510 - cpuset_attach_nodemask_to = cs->mems_allowed; 1483 + cpuset_attach_nodemask_to = cs->effective_mems; 1511 1484 mm = get_task_mm(leader); 1512 1485 if (mm) { 1513 - struct cpuset *mems_oldcs = effective_nodemask_cpuset(oldcs); 1514 - 1515 1486 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); 1516 1487 1517 1488 /* ··· 1520 1495 * mm from. 
1521 1496 */ 1522 1497 if (is_memory_migrate(cs)) { 1523 - cpuset_migrate_mm(mm, &mems_oldcs->old_mems_allowed, 1498 + cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, 1524 1499 &cpuset_attach_nodemask_to); 1525 1500 } 1526 1501 mmput(mm); ··· 1541 1516 FILE_MEMORY_MIGRATE, 1542 1517 FILE_CPULIST, 1543 1518 FILE_MEMLIST, 1519 + FILE_EFFECTIVE_CPULIST, 1520 + FILE_EFFECTIVE_MEMLIST, 1544 1521 FILE_CPU_EXCLUSIVE, 1545 1522 FILE_MEM_EXCLUSIVE, 1546 1523 FILE_MEM_HARDWALL, ··· 1721 1694 case FILE_MEMLIST: 1722 1695 s += nodelist_scnprintf(s, count, cs->mems_allowed); 1723 1696 break; 1697 + case FILE_EFFECTIVE_CPULIST: 1698 + s += cpulist_scnprintf(s, count, cs->effective_cpus); 1699 + break; 1700 + case FILE_EFFECTIVE_MEMLIST: 1701 + s += nodelist_scnprintf(s, count, cs->effective_mems); 1702 + break; 1724 1703 default: 1725 1704 ret = -EINVAL; 1726 1705 goto out_unlock; ··· 1809 1776 .write = cpuset_write_resmask, 1810 1777 .max_write_len = (100U + 6 * MAX_NUMNODES), 1811 1778 .private = FILE_MEMLIST, 1779 + }, 1780 + 1781 + { 1782 + .name = "effective_cpus", 1783 + .seq_show = cpuset_common_seq_show, 1784 + .private = FILE_EFFECTIVE_CPULIST, 1785 + }, 1786 + 1787 + { 1788 + .name = "effective_mems", 1789 + .seq_show = cpuset_common_seq_show, 1790 + .private = FILE_EFFECTIVE_MEMLIST, 1812 1791 }, 1813 1792 1814 1793 { ··· 1914 1869 cs = kzalloc(sizeof(*cs), GFP_KERNEL); 1915 1870 if (!cs) 1916 1871 return ERR_PTR(-ENOMEM); 1917 - if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { 1918 - kfree(cs); 1919 - return ERR_PTR(-ENOMEM); 1920 - } 1872 + if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) 1873 + goto free_cs; 1874 + if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL)) 1875 + goto free_cpus; 1921 1876 1922 1877 set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); 1923 1878 cpumask_clear(cs->cpus_allowed); 1924 1879 nodes_clear(cs->mems_allowed); 1880 + cpumask_clear(cs->effective_cpus); 1881 + nodes_clear(cs->effective_mems); 1925 1882 
 	fmeter_init(&cs->fmeter);
 	cs->relax_domain_level = -1;
 
 	return &cs->css;
+
+free_cpus:
+	free_cpumask_var(cs->cpus_allowed);
+free_cs:
+	kfree(cs);
+	return ERR_PTR(-ENOMEM);
 }
 
 static int cpuset_css_online(struct cgroup_subsys_state *css)
···
 	set_bit(CS_SPREAD_SLAB, &cs->flags);
 
 	cpuset_inc();
+
+	mutex_lock(&callback_mutex);
+	if (cgroup_on_dfl(cs->css.cgroup)) {
+		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
+		cs->effective_mems = parent->effective_mems;
+	}
+	mutex_unlock(&callback_mutex);
 
 	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
 		goto out_unlock;
···
 {
 	struct cpuset *cs = css_cs(css);
 
+	free_cpumask_var(cs->effective_cpus);
 	free_cpumask_var(cs->cpus_allowed);
 	kfree(cs);
 }
 
+static void cpuset_bind(struct cgroup_subsys_state *root_css)
+{
+	mutex_lock(&cpuset_mutex);
+	mutex_lock(&callback_mutex);
+
+	if (cgroup_on_dfl(root_css->cgroup)) {
+		cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
+		top_cpuset.mems_allowed = node_possible_map;
+	} else {
+		cpumask_copy(top_cpuset.cpus_allowed,
+			     top_cpuset.effective_cpus);
+		top_cpuset.mems_allowed = top_cpuset.effective_mems;
+	}
+
+	mutex_unlock(&callback_mutex);
+	mutex_unlock(&cpuset_mutex);
+}
+
 struct cgroup_subsys cpuset_cgrp_subsys = {
-	.css_alloc	= cpuset_css_alloc,
-	.css_online	= cpuset_css_online,
-	.css_offline	= cpuset_css_offline,
-	.css_free	= cpuset_css_free,
-	.can_attach	= cpuset_can_attach,
-	.cancel_attach	= cpuset_cancel_attach,
-	.attach		= cpuset_attach,
-	.base_cftypes	= files,
-	.early_init	= 1,
+	.css_alloc	= cpuset_css_alloc,
+	.css_online	= cpuset_css_online,
+	.css_offline	= cpuset_css_offline,
+	.css_free	= cpuset_css_free,
+	.can_attach	= cpuset_can_attach,
+	.cancel_attach	= cpuset_cancel_attach,
+	.attach		= cpuset_attach,
+	.bind		= cpuset_bind,
+	.legacy_cftypes	= files,
+	.early_init	= 1,
 };
 
 /**
···
 
 	if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL))
 		BUG();
+	if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
+		BUG();
 
 	cpumask_setall(top_cpuset.cpus_allowed);
 	nodes_setall(top_cpuset.mems_allowed);
+	cpumask_setall(top_cpuset.effective_cpus);
+	nodes_setall(top_cpuset.effective_mems);
 
 	fmeter_init(&top_cpuset.fmeter);
 	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
···
 	}
 }
 
+static void
+hotplug_update_tasks_legacy(struct cpuset *cs,
+			    struct cpumask *new_cpus, nodemask_t *new_mems,
+			    bool cpus_updated, bool mems_updated)
+{
+	bool is_empty;
+
+	mutex_lock(&callback_mutex);
+	cpumask_copy(cs->cpus_allowed, new_cpus);
+	cpumask_copy(cs->effective_cpus, new_cpus);
+	cs->mems_allowed = *new_mems;
+	cs->effective_mems = *new_mems;
+	mutex_unlock(&callback_mutex);
+
+	/*
+	 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
+	 * as the tasks will be migrated to an ancestor.
+	 */
+	if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
+		update_tasks_cpumask(cs);
+	if (mems_updated && !nodes_empty(cs->mems_allowed))
+		update_tasks_nodemask(cs);
+
+	is_empty = cpumask_empty(cs->cpus_allowed) ||
+		   nodes_empty(cs->mems_allowed);
+
+	mutex_unlock(&cpuset_mutex);
+
+	/*
+	 * Move tasks to the nearest ancestor with execution resources.
+	 * This is a full cgroup operation which will also call back into
+	 * cpuset.  Should be done outside any lock.
+	 */
+	if (is_empty)
+		remove_tasks_in_empty_cpuset(cs);
+
+	mutex_lock(&cpuset_mutex);
+}
+
+static void
+hotplug_update_tasks(struct cpuset *cs,
+		     struct cpumask *new_cpus, nodemask_t *new_mems,
+		     bool cpus_updated, bool mems_updated)
+{
+	if (cpumask_empty(new_cpus))
+		cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
+	if (nodes_empty(*new_mems))
+		*new_mems = parent_cs(cs)->effective_mems;
+
+	mutex_lock(&callback_mutex);
+	cpumask_copy(cs->effective_cpus, new_cpus);
+	cs->effective_mems = *new_mems;
+	mutex_unlock(&callback_mutex);
+
+	if (cpus_updated)
+		update_tasks_cpumask(cs);
+	if (mems_updated)
+		update_tasks_nodemask(cs);
+}
+
 /**
  * cpuset_hotplug_update_tasks - update tasks in a cpuset for hotunplug
  * @cs: cpuset in interest
···
  */
 static void cpuset_hotplug_update_tasks(struct cpuset *cs)
 {
-	static cpumask_t off_cpus;
-	static nodemask_t off_mems;
-	bool is_empty;
-	bool sane = cgroup_sane_behavior(cs->css.cgroup);
-
+	static cpumask_t new_cpus;
+	static nodemask_t new_mems;
+	bool cpus_updated;
+	bool mems_updated;
 retry:
 	wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
···
 		goto retry;
 	}
 
-	cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
-	nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
+	cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
+	nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
 
-	mutex_lock(&callback_mutex);
-	cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
-	mutex_unlock(&callback_mutex);
+	cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
+	mems_updated = !nodes_equal(new_mems, cs->effective_mems);
 
-	/*
-	 * If sane_behavior flag is set, we need to update tasks' cpumask
-	 * for empty cpuset to take on ancestor's cpumask. Otherwise, don't
-	 * call update_tasks_cpumask() if the cpuset becomes empty, as
-	 * the tasks in it will be migrated to an ancestor.
-	 */
-	if ((sane && cpumask_empty(cs->cpus_allowed)) ||
-	    (!cpumask_empty(&off_cpus) && !cpumask_empty(cs->cpus_allowed)))
-		update_tasks_cpumask(cs);
-
-	mutex_lock(&callback_mutex);
-	nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
-	mutex_unlock(&callback_mutex);
-
-	/*
-	 * If sane_behavior flag is set, we need to update tasks' nodemask
-	 * for empty cpuset to take on ancestor's nodemask. Otherwise, don't
-	 * call update_tasks_nodemask() if the cpuset becomes empty, as
-	 * the tasks in it will be migrated to an ancestor.
-	 */
-	if ((sane && nodes_empty(cs->mems_allowed)) ||
-	    (!nodes_empty(off_mems) && !nodes_empty(cs->mems_allowed)))
-		update_tasks_nodemask(cs);
-
-	is_empty = cpumask_empty(cs->cpus_allowed) ||
-		   nodes_empty(cs->mems_allowed);
+	if (cgroup_on_dfl(cs->css.cgroup))
+		hotplug_update_tasks(cs, &new_cpus, &new_mems,
+				     cpus_updated, mems_updated);
+	else
+		hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
+					    cpus_updated, mems_updated);
 
 	mutex_unlock(&cpuset_mutex);
-
-	/*
-	 * If sane_behavior flag is set, we'll keep tasks in empty cpusets.
-	 *
-	 * Otherwise move tasks to the nearest ancestor with execution
-	 * resources.  This is a full cgroup operation which will
-	 * also call back into cpuset.  Should be done outside any lock.
-	 */
-	if (!sane && is_empty)
-		remove_tasks_in_empty_cpuset(cs);
 }
 
 /**
···
 	static cpumask_t new_cpus;
 	static nodemask_t new_mems;
 	bool cpus_updated, mems_updated;
+	bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
 
 	mutex_lock(&cpuset_mutex);
···
 	cpumask_copy(&new_cpus, cpu_active_mask);
 	new_mems = node_states[N_MEMORY];
 
-	cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
-	mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
+	cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
+	mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
 
 	/* synchronize cpus_allowed to cpu_active_mask */
 	if (cpus_updated) {
 		mutex_lock(&callback_mutex);
-		cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+		if (!on_dfl)
+			cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
+		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
 		mutex_unlock(&callback_mutex);
 		/* we don't mess with cpumasks of tasks in top_cpuset */
 	}
···
 	/* synchronize mems_allowed to N_MEMORY */
 	if (mems_updated) {
 		mutex_lock(&callback_mutex);
-		top_cpuset.mems_allowed = new_mems;
+		if (!on_dfl)
+			top_cpuset.mems_allowed = new_mems;
+		top_cpuset.effective_mems = new_mems;
 		mutex_unlock(&callback_mutex);
 		update_tasks_nodemask(&top_cpuset);
 	}
···
 	top_cpuset.mems_allowed = node_states[N_MEMORY];
 	top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
 
+	cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
+	top_cpuset.effective_mems = node_states[N_MEMORY];
+
 	register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
 }
···
 
 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 {
-	struct cpuset *cpus_cs;
-
 	mutex_lock(&callback_mutex);
 	rcu_read_lock();
-	cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
-	guarantee_online_cpus(cpus_cs, pmask);
+	guarantee_online_cpus(task_cs(tsk), pmask);
 	rcu_read_unlock();
 	mutex_unlock(&callback_mutex);
 }
 
 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 {
-	struct cpuset *cpus_cs;
-
 	rcu_read_lock();
-	cpus_cs = effective_cpumask_cpuset(task_cs(tsk));
-	do_set_cpus_allowed(tsk, cpus_cs->cpus_allowed);
+	do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus);
 	rcu_read_unlock();
 
 	/*
···
 nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
 {
-	struct cpuset *mems_cs;
 	nodemask_t mask;
 
 	mutex_lock(&callback_mutex);
 	rcu_read_lock();
-	mems_cs = effective_nodemask_cpuset(task_cs(tsk));
-	guarantee_online_mems(mems_cs, &mask);
+	guarantee_online_mems(task_cs(tsk), &mask);
 	rcu_read_unlock();
 	mutex_unlock(&callback_mutex);
+1 -1
kernel/sched/core.c
···
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
 	.exit		= cpu_cgroup_exit,
-	.base_cftypes	= cpu_files,
+	.legacy_cftypes	= cpu_files,
 	.early_init	= 1,
 };
+1 -1
kernel/sched/cpuacct.c
···
 struct cgroup_subsys cpuacct_cgrp_subsys = {
 	.css_alloc	= cpuacct_css_alloc,
 	.css_free	= cpuacct_css_free,
-	.base_cftypes	= files,
+	.legacy_cftypes	= files,
 	.early_init	= 1,
 };
+2 -3
mm/hugetlb_cgroup.c
···
 	cft = &h->cgroup_files[4];
 	memset(cft, 0, sizeof(*cft));
 
-	WARN_ON(cgroup_add_cftypes(&hugetlb_cgrp_subsys, h->cgroup_files));
-
-	return;
+	WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
+					  h->cgroup_files));
 }
 
 void __init hugetlb_cgroup_file_init(void)
+31 -6
mm/memcontrol.c
···
 	},
 	{
 		.name = "use_hierarchy",
-		.flags = CFTYPE_INSANE,
 		.write_u64 = mem_cgroup_hierarchy_write,
 		.read_u64 = mem_cgroup_hierarchy_read,
 	},
···
 
 	memcg_destroy_kmem(memcg);
 	__mem_cgroup_free(memcg);
+}
+
+/**
+ * mem_cgroup_css_reset - reset the states of a mem_cgroup
+ * @css: the target css
+ *
+ * Reset the states of the mem_cgroup associated with @css.  This is
+ * invoked when the userland requests disabling on the default hierarchy
+ * but the memcg is pinned through dependency.  The memcg should stop
+ * applying policies and should revert to the vanilla state as it may be
+ * made visible again.
+ *
+ * The current implementation only resets the essential configurations.
+ * This needs to be expanded to cover all the visible parts.
+ */
+static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	mem_cgroup_resize_limit(memcg, ULLONG_MAX);
+	mem_cgroup_resize_memsw_limit(memcg, ULLONG_MAX);
+	memcg_update_kmem_limit(memcg, ULLONG_MAX);
+	res_counter_set_soft_limit(&memcg->res, ULLONG_MAX);
 }
 
 #ifdef CONFIG_MMU
···
 
 /*
  * Cgroup retains root cgroups across [un]mount cycles making it necessary
- * to verify sane_behavior flag on each mount attempt.
+ * to verify whether we're attached to the default hierarchy on each mount
+ * attempt.
  */
 static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
 {
 	/*
-	 * use_hierarchy is forced with sane_behavior.  cgroup core
+	 * use_hierarchy is forced on the default hierarchy.  cgroup core
 	 * guarantees that @root doesn't have any children, so turning it
 	 * on for the root memcg is enough.
 	 */
-	if (cgroup_sane_behavior(root_css->cgroup))
+	if (cgroup_on_dfl(root_css->cgroup))
 		mem_cgroup_from_css(root_css)->use_hierarchy = true;
 }
···
 	.css_online = mem_cgroup_css_online,
 	.css_offline = mem_cgroup_css_offline,
 	.css_free = mem_cgroup_css_free,
+	.css_reset = mem_cgroup_css_reset,
 	.can_attach = mem_cgroup_can_attach,
 	.cancel_attach = mem_cgroup_cancel_attach,
 	.attach = mem_cgroup_move_task,
 	.bind = mem_cgroup_bind,
-	.base_cftypes = mem_cgroup_files,
+	.legacy_cftypes = mem_cgroup_files,
 	.early_init = 0,
 };
···
 
 static void __init memsw_file_init(void)
 {
-	WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, memsw_cgroup_files));
+	WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
+					  memsw_cgroup_files));
 }
 
 static void __init enable_swap_cgroup(void)
+1 -1
net/core/netclassid_cgroup.c
···
 	.css_online	= cgrp_css_online,
 	.css_free	= cgrp_css_free,
 	.attach		= cgrp_attach,
-	.base_cftypes	= ss_files,
+	.legacy_cftypes	= ss_files,
 };
+1 -1
net/core/netprio_cgroup.c
···
 	.css_online	= cgrp_css_online,
 	.css_free	= cgrp_css_free,
 	.attach		= net_prio_attach,
-	.base_cftypes	= ss_files,
+	.legacy_cftypes	= ss_files,
 };
 
 static int netprio_device_event(struct notifier_block *unused,
+1 -1
net/ipv4/tcp_memcontrol.c
···
 
 static int __init tcp_memcontrol_init(void)
 {
-	WARN_ON(cgroup_add_cftypes(&memory_cgrp_subsys, tcp_files));
+	WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, tcp_files));
 	return 0;
 }
 __initcall(tcp_memcontrol_init);
+1 -1
security/device_cgroup.c
···
 	.css_free = devcgroup_css_free,
 	.css_online = devcgroup_online,
 	.css_offline = devcgroup_offline,
-	.base_cftypes = dev_cgroup_files,
+	.legacy_cftypes = dev_cgroup_files,
 };
 
 /**