Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-3.3' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

* 'for-3.3' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (21 commits)
cgroup: fix to allow mounting a hierarchy by name
cgroup: move assignment out of condition in cgroup_attach_proc()
cgroup: Remove task_lock() from cgroup_post_fork()
cgroup: add sparse annotation to cgroup_iter_start() and cgroup_iter_end()
cgroup: mark cgroup_rmdir_waitq and cgroup_attach_proc() as static
cgroup: only need to check oldcgrp==newgrp once
cgroup: remove redundant get/put of task struct
cgroup: remove redundant get/put of old css_set from migrate
cgroup: Remove unnecessary task_lock before fetching css_set on migration
cgroup: Drop task_lock(parent) on cgroup_fork()
cgroups: remove redundant get/put of css_set from css_set_check_fetched()
resource cgroups: remove bogus cast
cgroup: kill subsys->can_attach_task(), pre_attach() and attach_task()
cgroup, cpuset: don't use ss->pre_attach()
cgroup: don't use subsys->can_attach_task() or ->attach_task()
cgroup: introduce cgroup_taskset and use it in subsys->can_attach(), cancel_attach() and attach()
cgroup: improve old cgroup handling in cgroup_attach_proc()
cgroup: always lock threadgroup during migration
threadgroup: extend threadgroup_lock() to cover exit and exec
threadgroup: rename signal->threadgroup_fork_lock to ->group_rwsem
...

Fix up conflict in kernel/cgroup.c due to commit e0197aae59e5: "cgroups:
fix a css_set not found bug in cgroup_attach_proc" that already
mentioned that the bug is fixed (differently) in Tejun's cgroup
patchset. This one, in other words.

+471 -350
+21 -30
Documentation/cgroups/cgroups.txt
··· 594 594 called multiple times against a cgroup. 595 595 596 596 int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 597 - struct task_struct *task) 597 + struct cgroup_taskset *tset) 598 598 (cgroup_mutex held by caller) 599 599 600 - Called prior to moving a task into a cgroup; if the subsystem 601 - returns an error, this will abort the attach operation. If a NULL 602 - task is passed, then a successful result indicates that *any* 603 - unspecified task can be moved into the cgroup. Note that this isn't 604 - called on a fork. If this method returns 0 (success) then this should 605 - remain valid while the caller holds cgroup_mutex and it is ensured that either 600 + Called prior to moving one or more tasks into a cgroup; if the 601 + subsystem returns an error, this will abort the attach operation. 602 + @tset contains the tasks to be attached and is guaranteed to have at 603 + least one task in it. 604 + 605 + If there are multiple tasks in the taskset, then: 606 + - it's guaranteed that all are from the same thread group 607 + - @tset contains all tasks from the thread group whether or not 608 + they're switching cgroups 609 + - the first task is the leader 610 + 611 + Each @tset entry also contains the task's old cgroup and tasks which 612 + aren't switching cgroup can be skipped easily using the 613 + cgroup_taskset_for_each() iterator. Note that this isn't called on a 614 + fork. If this method returns 0 (success) then this should remain valid 615 + while the caller holds cgroup_mutex and it is ensured that either 606 616 attach() or cancel_attach() will be called in future. 607 617 608 - int can_attach_task(struct cgroup *cgrp, struct task_struct *tsk); 609 - (cgroup_mutex held by caller) 610 - 611 - As can_attach, but for operations that must be run once per task to be 612 - attached (possibly many when using cgroup_attach_proc). Called after 613 - can_attach. 
614 - 615 618 void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 616 - struct task_struct *task, bool threadgroup) 619 + struct cgroup_taskset *tset) 617 620 (cgroup_mutex held by caller) 618 621 619 622 Called when a task attach operation has failed after can_attach() has succeeded. 620 623 A subsystem whose can_attach() has some side-effects should provide this 621 624 function, so that the subsystem can implement a rollback. If not, not necessary. 622 625 This will be called only about subsystems whose can_attach() operation have 623 - succeeded. 624 - 625 - void pre_attach(struct cgroup *cgrp); 626 - (cgroup_mutex held by caller) 627 - 628 - For any non-per-thread attachment work that needs to happen before 629 - attach_task. Needed by cpuset. 626 + succeeded. The parameters are identical to can_attach(). 630 627 631 628 void attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 632 - struct cgroup *old_cgrp, struct task_struct *task) 629 + struct cgroup_taskset *tset) 633 630 (cgroup_mutex held by caller) 634 631 635 632 Called after the task has been attached to the cgroup, to allow any 636 633 post-attachment activity that requires memory allocations or blocking. 637 - 638 - void attach_task(struct cgroup *cgrp, struct task_struct *tsk); 639 - (cgroup_mutex held by caller) 640 - 641 - As attach, but for operations that must be run once per task to be attached, 642 - like can_attach_task. Called before attach. Currently does not support any 643 - subsystem that might need the old_cgrp for every thread in the group. 634 + The parameters are identical to can_attach(). 644 635 645 636 void fork(struct cgroup_subsy *ss, struct task_struct *task) 646 637
+28 -17
block/blk-cgroup.c
··· 30 30 31 31 static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *, 32 32 struct cgroup *); 33 - static int blkiocg_can_attach_task(struct cgroup *, struct task_struct *); 34 - static void blkiocg_attach_task(struct cgroup *, struct task_struct *); 33 + static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *, 34 + struct cgroup_taskset *); 35 + static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *, 36 + struct cgroup_taskset *); 35 37 static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); 36 38 static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); 37 39 ··· 46 44 struct cgroup_subsys blkio_subsys = { 47 45 .name = "blkio", 48 46 .create = blkiocg_create, 49 - .can_attach_task = blkiocg_can_attach_task, 50 - .attach_task = blkiocg_attach_task, 47 + .can_attach = blkiocg_can_attach, 48 + .attach = blkiocg_attach, 51 49 .destroy = blkiocg_destroy, 52 50 .populate = blkiocg_populate, 53 51 #ifdef CONFIG_BLK_CGROUP ··· 1628 1626 * of the main cic data structures. For now we allow a task to change 1629 1627 * its cgroup only if it's the only owner of its ioc. 
1630 1628 */ 1631 - static int blkiocg_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1629 + static int blkiocg_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 1630 + struct cgroup_taskset *tset) 1632 1631 { 1632 + struct task_struct *task; 1633 1633 struct io_context *ioc; 1634 1634 int ret = 0; 1635 1635 1636 1636 /* task_lock() is needed to avoid races with exit_io_context() */ 1637 - task_lock(tsk); 1638 - ioc = tsk->io_context; 1639 - if (ioc && atomic_read(&ioc->nr_tasks) > 1) 1640 - ret = -EINVAL; 1641 - task_unlock(tsk); 1642 - 1637 + cgroup_taskset_for_each(task, cgrp, tset) { 1638 + task_lock(task); 1639 + ioc = task->io_context; 1640 + if (ioc && atomic_read(&ioc->nr_tasks) > 1) 1641 + ret = -EINVAL; 1642 + task_unlock(task); 1643 + if (ret) 1644 + break; 1645 + } 1643 1646 return ret; 1644 1647 } 1645 1648 1646 - static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1649 + static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 1650 + struct cgroup_taskset *tset) 1647 1651 { 1652 + struct task_struct *task; 1648 1653 struct io_context *ioc; 1649 1654 1650 - task_lock(tsk); 1651 - ioc = tsk->io_context; 1652 - if (ioc) 1653 - ioc->cgroup_changed = 1; 1654 - task_unlock(tsk); 1655 + cgroup_taskset_for_each(task, cgrp, tset) { 1656 + task_lock(task); 1657 + ioc = task->io_context; 1658 + if (ioc) 1659 + ioc->cgroup_changed = 1; 1660 + task_unlock(task); 1661 + } 1655 1662 } 1656 1663 1657 1664 void blkio_policy_register(struct blkio_policy_type *blkiop)
+25 -6
include/linux/cgroup.h
··· 457 457 void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css); 458 458 459 459 /* 460 + * Control Group taskset, used to pass around set of tasks to cgroup_subsys 461 + * methods. 462 + */ 463 + struct cgroup_taskset; 464 + struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); 465 + struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); 466 + struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset); 467 + int cgroup_taskset_size(struct cgroup_taskset *tset); 468 + 469 + /** 470 + * cgroup_taskset_for_each - iterate cgroup_taskset 471 + * @task: the loop cursor 472 + * @skip_cgrp: skip if task's cgroup matches this, %NULL to iterate through all 473 + * @tset: taskset to iterate 474 + */ 475 + #define cgroup_taskset_for_each(task, skip_cgrp, tset) \ 476 + for ((task) = cgroup_taskset_first((tset)); (task); \ 477 + (task) = cgroup_taskset_next((tset))) \ 478 + if (!(skip_cgrp) || \ 479 + cgroup_taskset_cur_cgroup((tset)) != (skip_cgrp)) 480 + 481 + /* 460 482 * Control Group subsystem type. 
461 483 * See Documentation/cgroups/cgroups.txt for details 462 484 */ ··· 489 467 int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); 490 468 void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); 491 469 int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, 492 - struct task_struct *tsk); 493 - int (*can_attach_task)(struct cgroup *cgrp, struct task_struct *tsk); 470 + struct cgroup_taskset *tset); 494 471 void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, 495 - struct task_struct *tsk); 496 - void (*pre_attach)(struct cgroup *cgrp); 497 - void (*attach_task)(struct cgroup *cgrp, struct task_struct *tsk); 472 + struct cgroup_taskset *tset); 498 473 void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, 499 - struct cgroup *old_cgrp, struct task_struct *tsk); 474 + struct cgroup_taskset *tset); 500 475 void (*fork)(struct cgroup_subsys *ss, struct task_struct *task); 501 476 void (*exit)(struct cgroup_subsys *ss, struct cgroup *cgrp, 502 477 struct cgroup *old_cgrp, struct task_struct *task);
+4 -5
include/linux/init_task.h
··· 23 23 extern struct fs_struct init_fs; 24 24 25 25 #ifdef CONFIG_CGROUPS 26 - #define INIT_THREADGROUP_FORK_LOCK(sig) \ 27 - .threadgroup_fork_lock = \ 28 - __RWSEM_INITIALIZER(sig.threadgroup_fork_lock), 26 + #define INIT_GROUP_RWSEM(sig) \ 27 + .group_rwsem = __RWSEM_INITIALIZER(sig.group_rwsem), 29 28 #else 30 - #define INIT_THREADGROUP_FORK_LOCK(sig) 29 + #define INIT_GROUP_RWSEM(sig) 31 30 #endif 32 31 33 32 #define INIT_SIGNALS(sig) { \ ··· 45 46 }, \ 46 47 .cred_guard_mutex = \ 47 48 __MUTEX_INITIALIZER(sig.cred_guard_mutex), \ 48 - INIT_THREADGROUP_FORK_LOCK(sig) \ 49 + INIT_GROUP_RWSEM(sig) \ 49 50 } 50 51 51 52 extern struct nsproxy init_nsproxy;
+54 -19
include/linux/sched.h
··· 637 637 #endif 638 638 #ifdef CONFIG_CGROUPS 639 639 /* 640 - * The threadgroup_fork_lock prevents threads from forking with 641 - * CLONE_THREAD while held for writing. Use this for fork-sensitive 642 - * threadgroup-wide operations. It's taken for reading in fork.c in 643 - * copy_process(). 644 - * Currently only needed write-side by cgroups. 640 + * group_rwsem prevents new tasks from entering the threadgroup and 641 + * member tasks from exiting,a more specifically, setting of 642 + * PF_EXITING. fork and exit paths are protected with this rwsem 643 + * using threadgroup_change_begin/end(). Users which require 644 + * threadgroup to remain stable should use threadgroup_[un]lock() 645 + * which also takes care of exec path. Currently, cgroup is the 646 + * only user. 645 647 */ 646 - struct rw_semaphore threadgroup_fork_lock; 648 + struct rw_semaphore group_rwsem; 647 649 #endif 648 650 649 651 int oom_adj; /* OOM kill score adjustment (bit shift) */ ··· 2396 2394 spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); 2397 2395 } 2398 2396 2399 - /* See the declaration of threadgroup_fork_lock in signal_struct. */ 2400 2397 #ifdef CONFIG_CGROUPS 2401 - static inline void threadgroup_fork_read_lock(struct task_struct *tsk) 2398 + static inline void threadgroup_change_begin(struct task_struct *tsk) 2402 2399 { 2403 - down_read(&tsk->signal->threadgroup_fork_lock); 2400 + down_read(&tsk->signal->group_rwsem); 2404 2401 } 2405 - static inline void threadgroup_fork_read_unlock(struct task_struct *tsk) 2402 + static inline void threadgroup_change_end(struct task_struct *tsk) 2406 2403 { 2407 - up_read(&tsk->signal->threadgroup_fork_lock); 2404 + up_read(&tsk->signal->group_rwsem); 2408 2405 } 2409 - static inline void threadgroup_fork_write_lock(struct task_struct *tsk) 2406 + 2407 + /** 2408 + * threadgroup_lock - lock threadgroup 2409 + * @tsk: member task of the threadgroup to lock 2410 + * 2411 + * Lock the threadgroup @tsk belongs to. 
No new task is allowed to enter 2412 + * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or 2413 + * perform exec. This is useful for cases where the threadgroup needs to 2414 + * stay stable across blockable operations. 2415 + * 2416 + * fork and exit paths explicitly call threadgroup_change_{begin|end}() for 2417 + * synchronization. While held, no new task will be added to threadgroup 2418 + * and no existing live task will have its PF_EXITING set. 2419 + * 2420 + * During exec, a task goes and puts its thread group through unusual 2421 + * changes. After de-threading, exclusive access is assumed to resources 2422 + * which are usually shared by tasks in the same group - e.g. sighand may 2423 + * be replaced with a new one. Also, the exec'ing task takes over group 2424 + * leader role including its pid. Exclude these changes while locked by 2425 + * grabbing cred_guard_mutex which is used to synchronize exec path. 2426 + */ 2427 + static inline void threadgroup_lock(struct task_struct *tsk) 2410 2428 { 2411 - down_write(&tsk->signal->threadgroup_fork_lock); 2429 + /* 2430 + * exec uses exit for de-threading nesting group_rwsem inside 2431 + * cred_guard_mutex. Grab cred_guard_mutex first. 2432 + */ 2433 + mutex_lock(&tsk->signal->cred_guard_mutex); 2434 + down_write(&tsk->signal->group_rwsem); 2412 2435 } 2413 - static inline void threadgroup_fork_write_unlock(struct task_struct *tsk) 2436 + 2437 + /** 2438 + * threadgroup_unlock - unlock threadgroup 2439 + * @tsk: member task of the threadgroup to unlock 2440 + * 2441 + * Reverse threadgroup_lock(). 
2442 + */ 2443 + static inline void threadgroup_unlock(struct task_struct *tsk) 2414 2444 { 2415 - up_write(&tsk->signal->threadgroup_fork_lock); 2445 + up_write(&tsk->signal->group_rwsem); 2446 + mutex_unlock(&tsk->signal->cred_guard_mutex); 2416 2447 } 2417 2448 #else 2418 - static inline void threadgroup_fork_read_lock(struct task_struct *tsk) {} 2419 - static inline void threadgroup_fork_read_unlock(struct task_struct *tsk) {} 2420 - static inline void threadgroup_fork_write_lock(struct task_struct *tsk) {} 2421 - static inline void threadgroup_fork_write_unlock(struct task_struct *tsk) {} 2449 + static inline void threadgroup_change_begin(struct task_struct *tsk) {} 2450 + static inline void threadgroup_change_end(struct task_struct *tsk) {} 2451 + static inline void threadgroup_lock(struct task_struct *tsk) {} 2452 + static inline void threadgroup_unlock(struct task_struct *tsk) {} 2422 2453 #endif 2423 2454 2424 2455 #ifndef __HAVE_THREAD_FUNCTIONS
+231 -170
kernel/cgroup.c
··· 63 63 64 64 #include <linux/atomic.h> 65 65 66 + /* 67 + * cgroup_mutex is the master lock. Any modification to cgroup or its 68 + * hierarchy must be performed while holding it. 69 + * 70 + * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify 71 + * cgroupfs_root of any cgroup hierarchy - subsys list, flags, 72 + * release_agent_path and so on. Modifying requires both cgroup_mutex and 73 + * cgroup_root_mutex. Readers can acquire either of the two. This is to 74 + * break the following locking order cycle. 75 + * 76 + * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem 77 + * B. namespace_sem -> cgroup_mutex 78 + * 79 + * B happens only through cgroup_show_options() and using cgroup_root_mutex 80 + * breaks it. 81 + */ 66 82 static DEFINE_MUTEX(cgroup_mutex); 83 + static DEFINE_MUTEX(cgroup_root_mutex); 67 84 68 85 /* 69 86 * Generate an array of cgroup subsystem pointers. At boot time, this is ··· 938 921 * 939 922 * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; 940 923 */ 941 - DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); 924 + static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); 942 925 943 926 static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) 944 927 { ··· 970 953 int i; 971 954 972 955 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 956 + BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); 973 957 974 958 removed_bits = root->actual_subsys_bits & ~final_bits; 975 959 added_bits = final_bits & ~root->actual_subsys_bits; ··· 1061 1043 struct cgroupfs_root *root = dentry->d_sb->s_fs_info; 1062 1044 struct cgroup_subsys *ss; 1063 1045 1064 - mutex_lock(&cgroup_mutex); 1046 + mutex_lock(&cgroup_root_mutex); 1065 1047 for_each_subsys(root, ss) 1066 1048 seq_printf(seq, ",%s", ss->name); 1067 1049 if (test_bit(ROOT_NOPREFIX, &root->flags)) ··· 1072 1054 seq_puts(seq, ",clone_children"); 1073 1055 if (strlen(root->name)) 1074 1056 seq_printf(seq, ",name=%s", root->name); 1075 - 
mutex_unlock(&cgroup_mutex); 1057 + mutex_unlock(&cgroup_root_mutex); 1076 1058 return 0; 1077 1059 } 1078 1060 ··· 1193 1175 1194 1176 /* 1195 1177 * If the 'all' option was specified select all the subsystems, 1196 - * otherwise 'all, 'none' and a subsystem name options were not 1197 - * specified, let's default to 'all' 1178 + * otherwise if 'none', 'name=' and a subsystem name options 1179 + * were not specified, let's default to 'all' 1198 1180 */ 1199 - if (all_ss || (!all_ss && !one_ss && !opts->none)) { 1181 + if (all_ss || (!one_ss && !opts->none && !opts->name)) { 1200 1182 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1201 1183 struct cgroup_subsys *ss = subsys[i]; 1202 1184 if (ss == NULL) ··· 1287 1269 1288 1270 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1289 1271 mutex_lock(&cgroup_mutex); 1272 + mutex_lock(&cgroup_root_mutex); 1290 1273 1291 1274 /* See what subsystems are wanted */ 1292 1275 ret = parse_cgroupfs_options(data, &opts); ··· 1316 1297 out_unlock: 1317 1298 kfree(opts.release_agent); 1318 1299 kfree(opts.name); 1300 + mutex_unlock(&cgroup_root_mutex); 1319 1301 mutex_unlock(&cgroup_mutex); 1320 1302 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1321 1303 return ret; ··· 1501 1481 int ret = 0; 1502 1482 struct super_block *sb; 1503 1483 struct cgroupfs_root *new_root; 1484 + struct inode *inode; 1504 1485 1505 1486 /* First find the desired set of subsystems */ 1506 1487 mutex_lock(&cgroup_mutex); ··· 1535 1514 /* We used the new root structure, so this is a new hierarchy */ 1536 1515 struct list_head tmp_cg_links; 1537 1516 struct cgroup *root_cgrp = &root->top_cgroup; 1538 - struct inode *inode; 1539 1517 struct cgroupfs_root *existing_root; 1540 1518 const struct cred *cred; 1541 1519 int i; ··· 1548 1528 1549 1529 mutex_lock(&inode->i_mutex); 1550 1530 mutex_lock(&cgroup_mutex); 1531 + mutex_lock(&cgroup_root_mutex); 1551 1532 1552 - if (strlen(root->name)) { 1553 - /* Check for name clashes with existing mounts */ 1554 - 
for_each_active_root(existing_root) { 1555 - if (!strcmp(existing_root->name, root->name)) { 1556 - ret = -EBUSY; 1557 - mutex_unlock(&cgroup_mutex); 1558 - mutex_unlock(&inode->i_mutex); 1559 - goto drop_new_super; 1560 - } 1561 - } 1562 - } 1533 + /* Check for name clashes with existing mounts */ 1534 + ret = -EBUSY; 1535 + if (strlen(root->name)) 1536 + for_each_active_root(existing_root) 1537 + if (!strcmp(existing_root->name, root->name)) 1538 + goto unlock_drop; 1563 1539 1564 1540 /* 1565 1541 * We're accessing css_set_count without locking ··· 1565 1549 * have some link structures left over 1566 1550 */ 1567 1551 ret = allocate_cg_links(css_set_count, &tmp_cg_links); 1568 - if (ret) { 1569 - mutex_unlock(&cgroup_mutex); 1570 - mutex_unlock(&inode->i_mutex); 1571 - goto drop_new_super; 1572 - } 1552 + if (ret) 1553 + goto unlock_drop; 1573 1554 1574 1555 ret = rebind_subsystems(root, root->subsys_bits); 1575 1556 if (ret == -EBUSY) { 1576 - mutex_unlock(&cgroup_mutex); 1577 - mutex_unlock(&inode->i_mutex); 1578 1557 free_cg_links(&tmp_cg_links); 1579 - goto drop_new_super; 1558 + goto unlock_drop; 1580 1559 } 1581 1560 /* 1582 1561 * There must be no failure case after here, since rebinding ··· 1610 1599 cred = override_creds(&init_cred); 1611 1600 cgroup_populate_dir(root_cgrp); 1612 1601 revert_creds(cred); 1602 + mutex_unlock(&cgroup_root_mutex); 1613 1603 mutex_unlock(&cgroup_mutex); 1614 1604 mutex_unlock(&inode->i_mutex); 1615 1605 } else { ··· 1627 1615 kfree(opts.name); 1628 1616 return dget(sb->s_root); 1629 1617 1618 + unlock_drop: 1619 + mutex_unlock(&cgroup_root_mutex); 1620 + mutex_unlock(&cgroup_mutex); 1621 + mutex_unlock(&inode->i_mutex); 1630 1622 drop_new_super: 1631 1623 deactivate_locked_super(sb); 1632 1624 drop_modules: ··· 1655 1639 BUG_ON(!list_empty(&cgrp->sibling)); 1656 1640 1657 1641 mutex_lock(&cgroup_mutex); 1642 + mutex_lock(&cgroup_root_mutex); 1658 1643 1659 1644 /* Rebind all subsystems back to the default hierarchy */ 1660 
1645 ret = rebind_subsystems(root, 0); ··· 1681 1664 root_count--; 1682 1665 } 1683 1666 1667 + mutex_unlock(&cgroup_root_mutex); 1684 1668 mutex_unlock(&cgroup_mutex); 1685 1669 1686 1670 kill_litter_super(sb); ··· 1758 1740 EXPORT_SYMBOL_GPL(cgroup_path); 1759 1741 1760 1742 /* 1743 + * Control Group taskset 1744 + */ 1745 + struct task_and_cgroup { 1746 + struct task_struct *task; 1747 + struct cgroup *cgrp; 1748 + }; 1749 + 1750 + struct cgroup_taskset { 1751 + struct task_and_cgroup single; 1752 + struct flex_array *tc_array; 1753 + int tc_array_len; 1754 + int idx; 1755 + struct cgroup *cur_cgrp; 1756 + }; 1757 + 1758 + /** 1759 + * cgroup_taskset_first - reset taskset and return the first task 1760 + * @tset: taskset of interest 1761 + * 1762 + * @tset iteration is initialized and the first task is returned. 1763 + */ 1764 + struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) 1765 + { 1766 + if (tset->tc_array) { 1767 + tset->idx = 0; 1768 + return cgroup_taskset_next(tset); 1769 + } else { 1770 + tset->cur_cgrp = tset->single.cgrp; 1771 + return tset->single.task; 1772 + } 1773 + } 1774 + EXPORT_SYMBOL_GPL(cgroup_taskset_first); 1775 + 1776 + /** 1777 + * cgroup_taskset_next - iterate to the next task in taskset 1778 + * @tset: taskset of interest 1779 + * 1780 + * Return the next task in @tset. Iteration must have been initialized 1781 + * with cgroup_taskset_first(). 
1782 + */ 1783 + struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) 1784 + { 1785 + struct task_and_cgroup *tc; 1786 + 1787 + if (!tset->tc_array || tset->idx >= tset->tc_array_len) 1788 + return NULL; 1789 + 1790 + tc = flex_array_get(tset->tc_array, tset->idx++); 1791 + tset->cur_cgrp = tc->cgrp; 1792 + return tc->task; 1793 + } 1794 + EXPORT_SYMBOL_GPL(cgroup_taskset_next); 1795 + 1796 + /** 1797 + * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task 1798 + * @tset: taskset of interest 1799 + * 1800 + * Return the cgroup for the current (last returned) task of @tset. This 1801 + * function must be preceded by either cgroup_taskset_first() or 1802 + * cgroup_taskset_next(). 1803 + */ 1804 + struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset) 1805 + { 1806 + return tset->cur_cgrp; 1807 + } 1808 + EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup); 1809 + 1810 + /** 1811 + * cgroup_taskset_size - return the number of tasks in taskset 1812 + * @tset: taskset of interest 1813 + */ 1814 + int cgroup_taskset_size(struct cgroup_taskset *tset) 1815 + { 1816 + return tset->tc_array ? tset->tc_array_len : 1; 1817 + } 1818 + EXPORT_SYMBOL_GPL(cgroup_taskset_size); 1819 + 1820 + 1821 + /* 1761 1822 * cgroup_task_migrate - move a task from one cgroup to another. 1762 1823 * 1763 1824 * 'guarantee' is set if the caller promises that a new css_set for the task 1764 1825 * will already exist. If not set, this function might sleep, and can fail with 1765 - * -ENOMEM. Otherwise, it can only fail with -ESRCH. 1826 + * -ENOMEM. Must be called with cgroup_mutex and threadgroup locked. 1766 1827 */ 1767 1828 static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1768 1829 struct task_struct *tsk, bool guarantee) ··· 1850 1753 struct css_set *newcg; 1851 1754 1852 1755 /* 1853 - * get old css_set. 
we need to take task_lock and refcount it, because 1854 - * an exiting task can change its css_set to init_css_set and drop its 1855 - * old one without taking cgroup_mutex. 1756 + * We are synchronized through threadgroup_lock() against PF_EXITING 1757 + * setting such that we can't race against cgroup_exit() changing the 1758 + * css_set to init_css_set and dropping the old one. 1856 1759 */ 1857 - task_lock(tsk); 1760 + WARN_ON_ONCE(tsk->flags & PF_EXITING); 1858 1761 oldcg = tsk->cgroups; 1859 - get_css_set(oldcg); 1860 - task_unlock(tsk); 1861 1762 1862 1763 /* locate or allocate a new css_set for this task. */ 1863 1764 if (guarantee) { ··· 1870 1775 might_sleep(); 1871 1776 /* find_css_set will give us newcg already referenced. */ 1872 1777 newcg = find_css_set(oldcg, cgrp); 1873 - if (!newcg) { 1874 - put_css_set(oldcg); 1778 + if (!newcg) 1875 1779 return -ENOMEM; 1876 - } 1877 1780 } 1878 - put_css_set(oldcg); 1879 1781 1880 - /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */ 1881 1782 task_lock(tsk); 1882 - if (tsk->flags & PF_EXITING) { 1883 - task_unlock(tsk); 1884 - put_css_set(newcg); 1885 - return -ESRCH; 1886 - } 1887 1783 rcu_assign_pointer(tsk->cgroups, newcg); 1888 1784 task_unlock(tsk); 1889 1785 ··· 1900 1814 * @cgrp: the cgroup the task is attaching to 1901 1815 * @tsk: the task to be attached 1902 1816 * 1903 - * Call holding cgroup_mutex. May take task_lock of 1904 - * the task 'tsk' during call. 1817 + * Call with cgroup_mutex and threadgroup locked. May take task_lock of 1818 + * @tsk during call. 
1905 1819 */ 1906 1820 int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1907 1821 { ··· 1909 1823 struct cgroup_subsys *ss, *failed_ss = NULL; 1910 1824 struct cgroup *oldcgrp; 1911 1825 struct cgroupfs_root *root = cgrp->root; 1826 + struct cgroup_taskset tset = { }; 1827 + 1828 + /* @tsk either already exited or can't exit until the end */ 1829 + if (tsk->flags & PF_EXITING) 1830 + return -ESRCH; 1912 1831 1913 1832 /* Nothing to do if the task is already in that cgroup */ 1914 1833 oldcgrp = task_cgroup_from_root(tsk, root); 1915 1834 if (cgrp == oldcgrp) 1916 1835 return 0; 1917 1836 1837 + tset.single.task = tsk; 1838 + tset.single.cgrp = oldcgrp; 1839 + 1918 1840 for_each_subsys(root, ss) { 1919 1841 if (ss->can_attach) { 1920 - retval = ss->can_attach(ss, cgrp, tsk); 1842 + retval = ss->can_attach(ss, cgrp, &tset); 1921 1843 if (retval) { 1922 1844 /* 1923 1845 * Remember on which subsystem the can_attach() ··· 1937 1843 goto out; 1938 1844 } 1939 1845 } 1940 - if (ss->can_attach_task) { 1941 - retval = ss->can_attach_task(cgrp, tsk); 1942 - if (retval) { 1943 - failed_ss = ss; 1944 - goto out; 1945 - } 1946 - } 1947 1846 } 1948 1847 1949 1848 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false); ··· 1944 1857 goto out; 1945 1858 1946 1859 for_each_subsys(root, ss) { 1947 - if (ss->pre_attach) 1948 - ss->pre_attach(cgrp); 1949 - if (ss->attach_task) 1950 - ss->attach_task(cgrp, tsk); 1951 1860 if (ss->attach) 1952 - ss->attach(ss, cgrp, oldcgrp, tsk); 1861 + ss->attach(ss, cgrp, &tset); 1953 1862 } 1954 1863 1955 1864 synchronize_rcu(); ··· 1967 1884 */ 1968 1885 break; 1969 1886 if (ss->cancel_attach) 1970 - ss->cancel_attach(ss, cgrp, tsk); 1887 + ss->cancel_attach(ss, cgrp, &tset); 1971 1888 } 1972 1889 } 1973 1890 return retval; ··· 2018 1935 2019 1936 read_lock(&css_set_lock); 2020 1937 newcg = find_existing_css_set(cg, cgrp, template); 2021 - if (newcg) 2022 - get_css_set(newcg); 2023 1938 read_unlock(&css_set_lock); 2024 1939 
2025 1940 /* doesn't exist at all? */ 2026 1941 if (!newcg) 2027 1942 return false; 2028 1943 /* see if it's already in the list */ 2029 - list_for_each_entry(cg_entry, newcg_list, links) { 2030 - if (cg_entry->cg == newcg) { 2031 - put_css_set(newcg); 1944 + list_for_each_entry(cg_entry, newcg_list, links) 1945 + if (cg_entry->cg == newcg) 2032 1946 return true; 2033 - } 2034 - } 2035 1947 2036 1948 /* not found */ 2037 - put_css_set(newcg); 2038 1949 return false; 2039 1950 } 2040 1951 ··· 2062 1985 * @cgrp: the cgroup to attach to 2063 1986 * @leader: the threadgroup leader task_struct of the group to be attached 2064 1987 * 2065 - * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will 2066 - * take task_lock of each thread in leader's threadgroup individually in turn. 1988 + * Call holding cgroup_mutex and the group_rwsem of the leader. Will take 1989 + * task_lock of each thread in leader's threadgroup individually in turn. 2067 1990 */ 2068 - int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) 1991 + static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) 2069 1992 { 2070 1993 int retval, i, group_size; 2071 1994 struct cgroup_subsys *ss, *failed_ss = NULL; 2072 - bool cancel_failed_ss = false; 2073 1995 /* guaranteed to be initialized later, but the compiler needs this */ 2074 - struct cgroup *oldcgrp = NULL; 2075 1996 struct css_set *oldcg; 2076 1997 struct cgroupfs_root *root = cgrp->root; 2077 1998 /* threadgroup list cursor and array */ 2078 1999 struct task_struct *tsk; 2000 + struct task_and_cgroup *tc; 2079 2001 struct flex_array *group; 2002 + struct cgroup_taskset tset = { }; 2080 2003 /* 2081 2004 * we need to make sure we have css_sets for all the tasks we're 2082 2005 * going to move -before- we actually start moving them, so that in ··· 2089 2012 * step 0: in order to do expensive, possibly blocking operations for 2090 2013 * every thread, we cannot iterate the thread group list, 
since it needs 2091 2014 * rcu or tasklist locked. instead, build an array of all threads in the 2092 - * group - threadgroup_fork_lock prevents new threads from appearing, 2093 - * and if threads exit, this will just be an over-estimate. 2015 + * group - group_rwsem prevents new threads from appearing, and if 2016 + * threads exit, this will just be an over-estimate. 2094 2017 */ 2095 2018 group_size = get_nr_threads(leader); 2096 2019 /* flex_array supports very large thread-groups better than kmalloc. */ 2097 - group = flex_array_alloc(sizeof(struct task_struct *), group_size, 2098 - GFP_KERNEL); 2020 + group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); 2099 2021 if (!group) 2100 2022 return -ENOMEM; 2101 2023 /* pre-allocate to guarantee space while iterating in rcu read-side. */ ··· 2116 2040 retval = -EAGAIN; 2117 2041 goto out_free_group_list; 2118 2042 } 2119 - /* take a reference on each task in the group to go in the array. */ 2043 + 2120 2044 tsk = leader; 2121 2045 i = 0; 2122 2046 do { 2047 + struct task_and_cgroup ent; 2048 + 2049 + /* @tsk either already exited or can't exit until the end */ 2050 + if (tsk->flags & PF_EXITING) 2051 + continue; 2052 + 2123 2053 /* as per above, nr_threads may decrease, but not increase. */ 2124 2054 BUG_ON(i >= group_size); 2125 - get_task_struct(tsk); 2126 2055 /* 2127 2056 * saying GFP_ATOMIC has no effect here because we did prealloc 2128 2057 * earlier, but it's good form to communicate our expectations. 2129 2058 */ 2130 - retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); 2059 + ent.task = tsk; 2060 + ent.cgrp = task_cgroup_from_root(tsk, root); 2061 + /* nothing to do if this task is already in the cgroup */ 2062 + if (ent.cgrp == cgrp) 2063 + continue; 2064 + retval = flex_array_put(group, i, &ent, GFP_ATOMIC); 2131 2065 BUG_ON(retval != 0); 2132 2066 i++; 2133 2067 } while_each_thread(leader, tsk); 2134 2068 /* remember the number of threads in the array for later. 
*/ 2135 2069 group_size = i; 2070 + tset.tc_array = group; 2071 + tset.tc_array_len = group_size; 2136 2072 read_unlock(&tasklist_lock); 2073 + 2074 + /* methods shouldn't be called if no task is actually migrating */ 2075 + retval = 0; 2076 + if (!group_size) 2077 + goto out_free_group_list; 2137 2078 2138 2079 /* 2139 2080 * step 1: check that we can legitimately attach to the cgroup. 2140 2081 */ 2141 2082 for_each_subsys(root, ss) { 2142 2083 if (ss->can_attach) { 2143 - retval = ss->can_attach(ss, cgrp, leader); 2084 + retval = ss->can_attach(ss, cgrp, &tset); 2144 2085 if (retval) { 2145 2086 failed_ss = ss; 2146 2087 goto out_cancel_attach; 2147 - } 2148 - } 2149 - /* a callback to be run on every thread in the threadgroup. */ 2150 - if (ss->can_attach_task) { 2151 - /* run on each task in the threadgroup. */ 2152 - for (i = 0; i < group_size; i++) { 2153 - tsk = flex_array_get_ptr(group, i); 2154 - retval = ss->can_attach_task(cgrp, tsk); 2155 - if (retval) { 2156 - failed_ss = ss; 2157 - cancel_failed_ss = true; 2158 - goto out_cancel_attach; 2159 - } 2160 2088 } 2161 2089 } 2162 2090 } ··· 2171 2091 */ 2172 2092 INIT_LIST_HEAD(&newcg_list); 2173 2093 for (i = 0; i < group_size; i++) { 2174 - tsk = flex_array_get_ptr(group, i); 2175 - /* nothing to do if this task is already in the cgroup */ 2176 - oldcgrp = task_cgroup_from_root(tsk, root); 2177 - if (cgrp == oldcgrp) 2178 - continue; 2179 - /* get old css_set pointer */ 2180 - task_lock(tsk); 2181 - oldcg = tsk->cgroups; 2182 - get_css_set(oldcg); 2183 - task_unlock(tsk); 2184 - /* see if the new one for us is already in the list? */ 2185 - if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) { 2186 - /* was already there, nothing to do. */ 2187 - put_css_set(oldcg); 2188 - } else { 2189 - /* we don't already have it. get new one. 
*/ 2094 + tc = flex_array_get(group, i); 2095 + oldcg = tc->task->cgroups; 2096 + 2097 + /* if we don't already have it in the list get a new one */ 2098 + if (!css_set_check_fetched(cgrp, tc->task, oldcg, 2099 + &newcg_list)) { 2190 2100 retval = css_set_prefetch(cgrp, oldcg, &newcg_list); 2191 - put_css_set(oldcg); 2192 2101 if (retval) 2193 2102 goto out_list_teardown; 2194 2103 } 2195 2104 } 2196 2105 2197 2106 /* 2198 - * step 3: now that we're guaranteed success wrt the css_sets, proceed 2199 - * to move all tasks to the new cgroup, calling ss->attach_task for each 2200 - * one along the way. there are no failure cases after here, so this is 2201 - * the commit point. 2107 + * step 3: now that we're guaranteed success wrt the css_sets, 2108 + * proceed to move all tasks to the new cgroup. There are no 2109 + * failure cases after here, so this is the commit point. 2202 2110 */ 2203 - for_each_subsys(root, ss) { 2204 - if (ss->pre_attach) 2205 - ss->pre_attach(cgrp); 2206 - } 2207 2111 for (i = 0; i < group_size; i++) { 2208 - tsk = flex_array_get_ptr(group, i); 2209 - /* leave current thread as it is if it's already there */ 2210 - oldcgrp = task_cgroup_from_root(tsk, root); 2211 - if (cgrp == oldcgrp) 2212 - continue; 2213 - /* if the thread is PF_EXITING, it can just get skipped. */ 2214 - retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); 2215 - if (retval == 0) { 2216 - /* attach each task to each subsystem */ 2217 - for_each_subsys(root, ss) { 2218 - if (ss->attach_task) 2219 - ss->attach_task(cgrp, tsk); 2220 - } 2221 - } else { 2222 - BUG_ON(retval != -ESRCH); 2223 - } 2112 + tc = flex_array_get(group, i); 2113 + retval = cgroup_task_migrate(cgrp, tc->cgrp, tc->task, true); 2114 + BUG_ON(retval); 2224 2115 } 2225 2116 /* nothing is sensitive to fork() after this point. */ 2226 2117 2227 2118 /* 2228 - * step 4: do expensive, non-thread-specific subsystem callbacks. 
2229 - * TODO: if ever a subsystem needs to know the oldcgrp for each task 2230 - * being moved, this call will need to be reworked to communicate that. 2119 + * step 4: do subsystem attach callbacks. 2231 2120 */ 2232 2121 for_each_subsys(root, ss) { 2233 2122 if (ss->attach) 2234 - ss->attach(ss, cgrp, oldcgrp, leader); 2123 + ss->attach(ss, cgrp, &tset); 2235 2124 } 2236 2125 2237 2126 /* ··· 2220 2171 /* same deal as in cgroup_attach_task */ 2221 2172 if (retval) { 2222 2173 for_each_subsys(root, ss) { 2223 - if (ss == failed_ss) { 2224 - if (cancel_failed_ss && ss->cancel_attach) 2225 - ss->cancel_attach(ss, cgrp, leader); 2174 + if (ss == failed_ss) 2226 2175 break; 2227 - } 2228 2176 if (ss->cancel_attach) 2229 - ss->cancel_attach(ss, cgrp, leader); 2177 + ss->cancel_attach(ss, cgrp, &tset); 2230 2178 } 2231 - } 2232 - /* clean up the array of referenced threads in the group. */ 2233 - for (i = 0; i < group_size; i++) { 2234 - tsk = flex_array_get_ptr(group, i); 2235 - put_task_struct(tsk); 2236 2179 } 2237 2180 out_free_group_list: 2238 2181 flex_array_free(group); ··· 2233 2192 2234 2193 /* 2235 2194 * Find the task_struct of the task to attach by vpid and pass it along to the 2236 - * function to attach either it or all tasks in its threadgroup. Will take 2237 - * cgroup_mutex; may take task_lock of task. 2195 + * function to attach either it or all tasks in its threadgroup. Will lock 2196 + * cgroup_mutex and threadgroup; may take task_lock of task. 2238 2197 */ 2239 2198 static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2240 2199 { ··· 2261 2220 * detect it later. 
2262 2221 */ 2263 2222 tsk = tsk->group_leader; 2264 - } else if (tsk->flags & PF_EXITING) { 2265 - /* optimization for the single-task-only case */ 2266 - rcu_read_unlock(); 2267 - cgroup_unlock(); 2268 - return -ESRCH; 2269 2223 } 2270 - 2271 2224 /* 2272 2225 * even if we're attaching all tasks in the thread group, we 2273 2226 * only need to check permissions on one of them. ··· 2284 2249 get_task_struct(tsk); 2285 2250 } 2286 2251 2287 - if (threadgroup) { 2288 - threadgroup_fork_write_lock(tsk); 2252 + threadgroup_lock(tsk); 2253 + 2254 + if (threadgroup) 2289 2255 ret = cgroup_attach_proc(cgrp, tsk); 2290 - threadgroup_fork_write_unlock(tsk); 2291 - } else { 2256 + else 2292 2257 ret = cgroup_attach_task(cgrp, tsk); 2293 - } 2258 + 2259 + threadgroup_unlock(tsk); 2260 + 2294 2261 put_task_struct(tsk); 2295 2262 cgroup_unlock(); 2296 2263 return ret; ··· 2343 2306 return -EINVAL; 2344 2307 if (!cgroup_lock_live_group(cgrp)) 2345 2308 return -ENODEV; 2309 + mutex_lock(&cgroup_root_mutex); 2346 2310 strcpy(cgrp->root->release_agent_path, buffer); 2311 + mutex_unlock(&cgroup_root_mutex); 2347 2312 cgroup_unlock(); 2348 2313 return 0; 2349 2314 } ··· 2828 2789 } 2829 2790 2830 2791 void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 2792 + __acquires(css_set_lock) 2831 2793 { 2832 2794 /* 2833 2795 * The first time anyone tries to iterate across a cgroup, ··· 2868 2828 } 2869 2829 2870 2830 void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) 2831 + __releases(css_set_lock) 2871 2832 { 2872 2833 read_unlock(&css_set_lock); 2873 2834 } ··· 4532 4491 * 4533 4492 * A pointer to the shared css_set was automatically copied in 4534 4493 * fork.c by dup_task_struct(). However, we ignore that copy, since 4535 - * it was not made under the protection of RCU or cgroup_mutex, so 4536 - * might no longer be a valid cgroup pointer. 
cgroup_attach_task() might 4537 - * have already changed current->cgroups, allowing the previously 4538 - * referenced cgroup group to be removed and freed. 4494 + * it was not made under the protection of RCU, cgroup_mutex or 4495 + * threadgroup_change_begin(), so it might no longer be a valid 4496 + * cgroup pointer. cgroup_attach_task() might have already changed 4497 + * current->cgroups, allowing the previously referenced cgroup 4498 + * group to be removed and freed. 4499 + * 4500 + * Outside the pointer validity we also need to process the css_set 4501 + * inheritance between threadgoup_change_begin() and 4502 + * threadgoup_change_end(), this way there is no leak in any process 4503 + * wide migration performed by cgroup_attach_proc() that could otherwise 4504 + * miss a thread because it is too early or too late in the fork stage. 4539 4505 * 4540 4506 * At the point that cgroup_fork() is called, 'current' is the parent 4541 4507 * task, and the passed argument 'child' points to the child task. 4542 4508 */ 4543 4509 void cgroup_fork(struct task_struct *child) 4544 4510 { 4545 - task_lock(current); 4511 + /* 4512 + * We don't need to task_lock() current because current->cgroups 4513 + * can't be changed concurrently here. The parent obviously hasn't 4514 + * exited and called cgroup_exit(), and we are synchronized against 4515 + * cgroup migration through threadgroup_change_begin(). 
4516 + */ 4546 4517 child->cgroups = current->cgroups; 4547 4518 get_css_set(child->cgroups); 4548 - task_unlock(current); 4549 4519 INIT_LIST_HEAD(&child->cg_list); 4550 4520 } 4551 4521 ··· 4598 4546 { 4599 4547 if (use_task_css_set_links) { 4600 4548 write_lock(&css_set_lock); 4601 - task_lock(child); 4602 - if (list_empty(&child->cg_list)) 4549 + if (list_empty(&child->cg_list)) { 4550 + /* 4551 + * It's safe to use child->cgroups without task_lock() 4552 + * here because we are protected through 4553 + * threadgroup_change_begin() against concurrent 4554 + * css_set change in cgroup_task_migrate(). Also 4555 + * the task can't exit at that point until 4556 + * wake_up_new_task() is called, so we are protected 4557 + * against cgroup_exit() setting child->cgroup to 4558 + * init_css_set. 4559 + */ 4603 4560 list_add(&child->cg_list, &child->cgroups->tasks); 4604 - task_unlock(child); 4561 + } 4605 4562 write_unlock(&css_set_lock); 4606 4563 } 4607 4564 }
+5 -11
kernel/cgroup_freezer.c
··· 166 166 */ 167 167 static int freezer_can_attach(struct cgroup_subsys *ss, 168 168 struct cgroup *new_cgroup, 169 - struct task_struct *task) 169 + struct cgroup_taskset *tset) 170 170 { 171 171 struct freezer *freezer; 172 + struct task_struct *task; 172 173 173 174 /* 174 175 * Anything frozen can't move or be moved to/from. 175 176 */ 177 + cgroup_taskset_for_each(task, new_cgroup, tset) 178 + if (cgroup_freezing(task)) 179 + return -EBUSY; 176 180 177 181 freezer = cgroup_freezer(new_cgroup); 178 182 if (freezer->state != CGROUP_THAWED) 179 183 return -EBUSY; 180 184 181 185 return 0; 182 - } 183 - 184 - static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 185 - { 186 - return cgroup_freezing(tsk) ? -EBUSY : 0; 187 186 } 188 187 189 188 static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) ··· 380 381 .populate = freezer_populate, 381 382 .subsys_id = freezer_subsys_id, 382 383 .can_attach = freezer_can_attach, 383 - .can_attach_task = freezer_can_attach_task, 384 - .pre_attach = NULL, 385 - .attach_task = NULL, 386 - .attach = NULL, 387 384 .fork = freezer_fork, 388 - .exit = NULL, 389 385 };
+49 -58
kernel/cpuset.c
··· 1389 1389 return val; 1390 1390 } 1391 1391 1392 - /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1393 - static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, 1394 - struct task_struct *tsk) 1395 - { 1396 - struct cpuset *cs = cgroup_cs(cont); 1397 - 1398 - if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1399 - return -ENOSPC; 1400 - 1401 - /* 1402 - * Kthreads bound to specific cpus cannot be moved to a new cpuset; we 1403 - * cannot change their cpu affinity and isolating such threads by their 1404 - * set of allowed nodes is unnecessary. Thus, cpusets are not 1405 - * applicable for such threads. This prevents checking for success of 1406 - * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may 1407 - * be changed. 1408 - */ 1409 - if (tsk->flags & PF_THREAD_BOUND) 1410 - return -EINVAL; 1411 - 1412 - return 0; 1413 - } 1414 - 1415 - static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task) 1416 - { 1417 - return security_task_setscheduler(task); 1418 - } 1419 - 1420 1392 /* 1421 1393 * Protected by cgroup_lock. The nodemasks must be stored globally because 1422 - * dynamically allocating them is not allowed in pre_attach, and they must 1423 - * persist among pre_attach, attach_task, and attach. 1394 + * dynamically allocating them is not allowed in can_attach, and they must 1395 + * persist until attach. 1424 1396 */ 1425 1397 static cpumask_var_t cpus_attach; 1426 1398 static nodemask_t cpuset_attach_nodemask_from; 1427 1399 static nodemask_t cpuset_attach_nodemask_to; 1428 1400 1429 - /* Set-up work for before attaching each task. 
*/ 1430 - static void cpuset_pre_attach(struct cgroup *cont) 1401 + /* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ 1402 + static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 1403 + struct cgroup_taskset *tset) 1431 1404 { 1432 - struct cpuset *cs = cgroup_cs(cont); 1405 + struct cpuset *cs = cgroup_cs(cgrp); 1406 + struct task_struct *task; 1407 + int ret; 1433 1408 1409 + if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1410 + return -ENOSPC; 1411 + 1412 + cgroup_taskset_for_each(task, cgrp, tset) { 1413 + /* 1414 + * Kthreads bound to specific cpus cannot be moved to a new 1415 + * cpuset; we cannot change their cpu affinity and 1416 + * isolating such threads by their set of allowed nodes is 1417 + * unnecessary. Thus, cpusets are not applicable for such 1418 + * threads. This prevents checking for success of 1419 + * set_cpus_allowed_ptr() on all attached tasks before 1420 + * cpus_allowed may be changed. 1421 + */ 1422 + if (task->flags & PF_THREAD_BOUND) 1423 + return -EINVAL; 1424 + if ((ret = security_task_setscheduler(task))) 1425 + return ret; 1426 + } 1427 + 1428 + /* prepare for attach */ 1434 1429 if (cs == &top_cpuset) 1435 1430 cpumask_copy(cpus_attach, cpu_possible_mask); 1436 1431 else 1437 1432 guarantee_online_cpus(cs, cpus_attach); 1438 1433 1439 1434 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 1435 + 1436 + return 0; 1440 1437 } 1441 1438 1442 - /* Per-thread attachment work. */ 1443 - static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk) 1444 - { 1445 - int err; 1446 - struct cpuset *cs = cgroup_cs(cont); 1447 - 1448 - /* 1449 - * can_attach beforehand should guarantee that this doesn't fail. 
1450 - * TODO: have a better way to handle failure here 1451 - */ 1452 - err = set_cpus_allowed_ptr(tsk, cpus_attach); 1453 - WARN_ON_ONCE(err); 1454 - 1455 - cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to); 1456 - cpuset_update_task_spread_flag(cs, tsk); 1457 - } 1458 - 1459 - static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, 1460 - struct cgroup *oldcont, struct task_struct *tsk) 1439 + static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 1440 + struct cgroup_taskset *tset) 1461 1441 { 1462 1442 struct mm_struct *mm; 1463 - struct cpuset *cs = cgroup_cs(cont); 1464 - struct cpuset *oldcs = cgroup_cs(oldcont); 1443 + struct task_struct *task; 1444 + struct task_struct *leader = cgroup_taskset_first(tset); 1445 + struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); 1446 + struct cpuset *cs = cgroup_cs(cgrp); 1447 + struct cpuset *oldcs = cgroup_cs(oldcgrp); 1448 + 1449 + cgroup_taskset_for_each(task, cgrp, tset) { 1450 + /* 1451 + * can_attach beforehand should guarantee that this doesn't 1452 + * fail. TODO: have a better way to handle failure here 1453 + */ 1454 + WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach)); 1455 + 1456 + cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to); 1457 + cpuset_update_task_spread_flag(cs, task); 1458 + } 1465 1459 1466 1460 /* 1467 1461 * Change mm, possibly for multiple threads in a threadgroup. 
This is ··· 1463 1469 */ 1464 1470 cpuset_attach_nodemask_from = oldcs->mems_allowed; 1465 1471 cpuset_attach_nodemask_to = cs->mems_allowed; 1466 - mm = get_task_mm(tsk); 1472 + mm = get_task_mm(leader); 1467 1473 if (mm) { 1468 1474 mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); 1469 1475 if (is_memory_migrate(cs)) ··· 1919 1925 .create = cpuset_create, 1920 1926 .destroy = cpuset_destroy, 1921 1927 .can_attach = cpuset_can_attach, 1922 - .can_attach_task = cpuset_can_attach_task, 1923 - .pre_attach = cpuset_pre_attach, 1924 - .attach_task = cpuset_attach_task, 1925 1928 .attach = cpuset_attach, 1926 1929 .populate = cpuset_populate, 1927 1930 .post_clone = cpuset_post_clone,
+8 -5
kernel/events/core.c
··· 6941 6941 return 0; 6942 6942 } 6943 6943 6944 - static void 6945 - perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task) 6944 + static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 6945 + struct cgroup_taskset *tset) 6946 6946 { 6947 - task_function_call(task, __perf_cgroup_move, task); 6947 + struct task_struct *task; 6948 + 6949 + cgroup_taskset_for_each(task, cgrp, tset) 6950 + task_function_call(task, __perf_cgroup_move, task); 6948 6951 } 6949 6952 6950 6953 static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, ··· 6961 6958 if (!(task->flags & PF_EXITING)) 6962 6959 return; 6963 6960 6964 - perf_cgroup_attach_task(cgrp, task); 6961 + task_function_call(task, __perf_cgroup_move, task); 6965 6962 } 6966 6963 6967 6964 struct cgroup_subsys perf_subsys = { ··· 6970 6967 .create = perf_cgroup_create, 6971 6968 .destroy = perf_cgroup_destroy, 6972 6969 .exit = perf_cgroup_exit, 6973 - .attach_task = perf_cgroup_attach_task, 6970 + .attach = perf_cgroup_attach, 6974 6971 }; 6975 6972 #endif /* CONFIG_CGROUP_PERF */
+4 -4
kernel/fork.c
··· 972 972 sched_autogroup_fork(sig); 973 973 974 974 #ifdef CONFIG_CGROUPS 975 - init_rwsem(&sig->threadgroup_fork_lock); 975 + init_rwsem(&sig->group_rwsem); 976 976 #endif 977 977 978 978 sig->oom_adj = current->signal->oom_adj; ··· 1153 1153 p->io_context = NULL; 1154 1154 p->audit_context = NULL; 1155 1155 if (clone_flags & CLONE_THREAD) 1156 - threadgroup_fork_read_lock(current); 1156 + threadgroup_change_begin(current); 1157 1157 cgroup_fork(p); 1158 1158 #ifdef CONFIG_NUMA 1159 1159 p->mempolicy = mpol_dup(p->mempolicy); ··· 1368 1368 proc_fork_connector(p); 1369 1369 cgroup_post_fork(p); 1370 1370 if (clone_flags & CLONE_THREAD) 1371 - threadgroup_fork_read_unlock(current); 1371 + threadgroup_change_end(current); 1372 1372 perf_event_fork(p); 1373 1373 return p; 1374 1374 ··· 1403 1403 bad_fork_cleanup_cgroup: 1404 1404 #endif 1405 1405 if (clone_flags & CLONE_THREAD) 1406 - threadgroup_fork_read_unlock(current); 1406 + threadgroup_change_end(current); 1407 1407 cgroup_exit(p, cgroup_callbacks_done); 1408 1408 delayacct_tsk_free(p); 1409 1409 module_put(task_thread_info(p)->exec_domain->module);
+1 -2
kernel/res_counter.c
··· 159 159 return 0; 160 160 } 161 161 162 - /* FIXME - make memparse() take const char* args */ 163 - *res = memparse((char *)buf, &end); 162 + *res = memparse(buf, &end); 164 163 if (*end != '\0') 165 164 return -EINVAL; 166 165
+19 -12
kernel/sched/core.c
··· 7563 7563 sched_destroy_group(tg); 7564 7564 } 7565 7565 7566 - static int 7567 - cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 7566 + static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 7567 + struct cgroup_taskset *tset) 7568 7568 { 7569 + struct task_struct *task; 7570 + 7571 + cgroup_taskset_for_each(task, cgrp, tset) { 7569 7572 #ifdef CONFIG_RT_GROUP_SCHED 7570 - if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) 7571 - return -EINVAL; 7573 + if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) 7574 + return -EINVAL; 7572 7575 #else 7573 - /* We don't support RT-tasks being in separate groups */ 7574 - if (tsk->sched_class != &fair_sched_class) 7575 - return -EINVAL; 7576 + /* We don't support RT-tasks being in separate groups */ 7577 + if (task->sched_class != &fair_sched_class) 7578 + return -EINVAL; 7576 7579 #endif 7580 + } 7577 7581 return 0; 7578 7582 } 7579 7583 7580 - static void 7581 - cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 7584 + static void cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 7585 + struct cgroup_taskset *tset) 7582 7586 { 7583 - sched_move_task(tsk); 7587 + struct task_struct *task; 7588 + 7589 + cgroup_taskset_for_each(task, cgrp, tset) 7590 + sched_move_task(task); 7584 7591 } 7585 7592 7586 7593 static void ··· 7922 7915 .name = "cpu", 7923 7916 .create = cpu_cgroup_create, 7924 7917 .destroy = cpu_cgroup_destroy, 7925 - .can_attach_task = cpu_cgroup_can_attach_task, 7926 - .attach_task = cpu_cgroup_attach_task, 7918 + .can_attach = cpu_cgroup_can_attach, 7919 + .attach = cpu_cgroup_attach, 7927 7920 .exit = cpu_cgroup_exit, 7928 7921 .populate = cpu_cgroup_populate, 7929 7922 .subsys_id = cpu_cgroup_subsys_id,
+10
kernel/signal.c
··· 2355 2355 int group_stop = 0; 2356 2356 sigset_t unblocked; 2357 2357 2358 + /* 2359 + * @tsk is about to have PF_EXITING set - lock out users which 2360 + * expect stable threadgroup. 2361 + */ 2362 + threadgroup_change_begin(tsk); 2363 + 2358 2364 if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) { 2359 2365 tsk->flags |= PF_EXITING; 2366 + threadgroup_change_end(tsk); 2360 2367 return; 2361 2368 } 2362 2369 ··· 2373 2366 * see wants_signal(), do_signal_stop(). 2374 2367 */ 2375 2368 tsk->flags |= PF_EXITING; 2369 + 2370 + threadgroup_change_end(tsk); 2371 + 2376 2372 if (!signal_pending(tsk)) 2377 2373 goto out; 2378 2374
+8 -8
mm/memcontrol.c
··· 5391 5391 5392 5392 static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5393 5393 struct cgroup *cgroup, 5394 - struct task_struct *p) 5394 + struct cgroup_taskset *tset) 5395 5395 { 5396 + struct task_struct *p = cgroup_taskset_first(tset); 5396 5397 int ret = 0; 5397 5398 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); 5398 5399 ··· 5431 5430 5432 5431 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5433 5432 struct cgroup *cgroup, 5434 - struct task_struct *p) 5433 + struct cgroup_taskset *tset) 5435 5434 { 5436 5435 mem_cgroup_clear_mc(); 5437 5436 } ··· 5548 5547 5549 5548 static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5550 5549 struct cgroup *cont, 5551 - struct cgroup *old_cont, 5552 - struct task_struct *p) 5550 + struct cgroup_taskset *tset) 5553 5551 { 5552 + struct task_struct *p = cgroup_taskset_first(tset); 5554 5553 struct mm_struct *mm = get_task_mm(p); 5555 5554 5556 5555 if (mm) { ··· 5565 5564 #else /* !CONFIG_MMU */ 5566 5565 static int mem_cgroup_can_attach(struct cgroup_subsys *ss, 5567 5566 struct cgroup *cgroup, 5568 - struct task_struct *p) 5567 + struct cgroup_taskset *tset) 5569 5568 { 5570 5569 return 0; 5571 5570 } 5572 5571 static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, 5573 5572 struct cgroup *cgroup, 5574 - struct task_struct *p) 5573 + struct cgroup_taskset *tset) 5575 5574 { 5576 5575 } 5577 5576 static void mem_cgroup_move_task(struct cgroup_subsys *ss, 5578 5577 struct cgroup *cont, 5579 - struct cgroup *old_cont, 5580 - struct task_struct *p) 5578 + struct cgroup_taskset *tset) 5581 5579 { 5582 5580 } 5583 5581 #endif
+4 -3
security/device_cgroup.c
··· 62 62 struct cgroup_subsys devices_subsys; 63 63 64 64 static int devcgroup_can_attach(struct cgroup_subsys *ss, 65 - struct cgroup *new_cgroup, struct task_struct *task) 65 + struct cgroup *new_cgrp, struct cgroup_taskset *set) 66 66 { 67 - if (current != task && !capable(CAP_SYS_ADMIN)) 68 - return -EPERM; 67 + struct task_struct *task = cgroup_taskset_first(set); 69 68 69 + if (current != task && !capable(CAP_SYS_ADMIN)) 70 + return -EPERM; 70 71 return 0; 71 72 } 72 73