Linux kernel mirror (for testing): https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
Tags: kernel, os, linux

sched/cpuset: Bring back cpuset_mutex

Turns out percpu_cpuset_rwsem - commit 1243dc518c9d ("cgroup/cpuset:
Convert cpuset_mutex to percpu_rwsem") - wasn't such a brilliant idea,
as it has been reported to cause slowdowns in workloads that need to
change cpuset configuration frequently and it is also not implementing
priority inheritance (which causes troubles with realtime workloads).

Convert percpu_cpuset_rwsem back to regular cpuset_mutex. Also grab it
only for SCHED_DEADLINE tasks (other policies don't care about stable
cpusets anyway).

Signed-off-by: Juri Lelli <juri.lelli@redhat.com>
Reviewed-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

Authored by Juri Lelli and committed by Tejun Heo
Commit: 111cd11b — Parent: ad3a557d

+99 -90
+4 -4
include/linux/cpuset.h
··· 71 71 extern void cpuset_force_rebuild(void); 72 72 extern void cpuset_update_active_cpus(void); 73 73 extern void cpuset_wait_for_hotplug(void); 74 - extern void cpuset_read_lock(void); 75 - extern void cpuset_read_unlock(void); 74 + extern void cpuset_lock(void); 75 + extern void cpuset_unlock(void); 76 76 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); 77 77 extern bool cpuset_cpus_allowed_fallback(struct task_struct *p); 78 78 extern nodemask_t cpuset_mems_allowed(struct task_struct *p); ··· 189 189 190 190 static inline void cpuset_wait_for_hotplug(void) { } 191 191 192 - static inline void cpuset_read_lock(void) { } 193 - static inline void cpuset_read_unlock(void) { } 192 + static inline void cpuset_lock(void) { } 193 + static inline void cpuset_unlock(void) { } 194 194 195 195 static inline void cpuset_cpus_allowed(struct task_struct *p, 196 196 struct cpumask *mask)
+80 -79
kernel/cgroup/cpuset.c
··· 366 366 if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) 367 367 368 368 /* 369 - * There are two global locks guarding cpuset structures - cpuset_rwsem and 369 + * There are two global locks guarding cpuset structures - cpuset_mutex and 370 370 * callback_lock. We also require taking task_lock() when dereferencing a 371 371 * task's cpuset pointer. See "The task_lock() exception", at the end of this 372 - * comment. The cpuset code uses only cpuset_rwsem write lock. Other 373 - * kernel subsystems can use cpuset_read_lock()/cpuset_read_unlock() to 374 - * prevent change to cpuset structures. 372 + * comment. The cpuset code uses only cpuset_mutex. Other kernel subsystems 373 + * can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset 374 + * structures. Note that cpuset_mutex needs to be a mutex as it is used in 375 + * paths that rely on priority inheritance (e.g. scheduler - on RT) for 376 + * correctness. 375 377 * 376 378 * A task must hold both locks to modify cpusets. If a task holds 377 - * cpuset_rwsem, it blocks others wanting that rwsem, ensuring that it 378 - * is the only task able to also acquire callback_lock and be able to 379 - * modify cpusets. It can perform various checks on the cpuset structure 380 - * first, knowing nothing will change. It can also allocate memory while 381 - * just holding cpuset_rwsem. While it is performing these checks, various 382 - * callback routines can briefly acquire callback_lock to query cpusets. 383 - * Once it is ready to make the changes, it takes callback_lock, blocking 384 - * everyone else. 379 + * cpuset_mutex, it blocks others, ensuring that it is the only task able to 380 + * also acquire callback_lock and be able to modify cpusets. It can perform 381 + * various checks on the cpuset structure first, knowing nothing will change. 382 + * It can also allocate memory while just holding cpuset_mutex. 
While it is 383 + * performing these checks, various callback routines can briefly acquire 384 + * callback_lock to query cpusets. Once it is ready to make the changes, it 385 + * takes callback_lock, blocking everyone else. 385 386 * 386 387 * Calls to the kernel memory allocator can not be made while holding 387 388 * callback_lock, as that would risk double tripping on callback_lock ··· 404 403 * guidelines for accessing subsystem state in kernel/cgroup.c 405 404 */ 406 405 407 - DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem); 406 + static DEFINE_MUTEX(cpuset_mutex); 408 407 409 - void cpuset_read_lock(void) 408 + void cpuset_lock(void) 410 409 { 411 - percpu_down_read(&cpuset_rwsem); 410 + mutex_lock(&cpuset_mutex); 412 411 } 413 412 414 - void cpuset_read_unlock(void) 413 + void cpuset_unlock(void) 415 414 { 416 - percpu_up_read(&cpuset_rwsem); 415 + mutex_unlock(&cpuset_mutex); 417 416 } 418 417 419 418 static DEFINE_SPINLOCK(callback_lock); ··· 497 496 * One way or another, we guarantee to return some non-empty subset 498 497 * of cpu_online_mask. 499 498 * 500 - * Call with callback_lock or cpuset_rwsem held. 499 + * Call with callback_lock or cpuset_mutex held. 501 500 */ 502 501 static void guarantee_online_cpus(struct task_struct *tsk, 503 502 struct cpumask *pmask) ··· 539 538 * One way or another, we guarantee to return some non-empty subset 540 539 * of node_states[N_MEMORY]. 541 540 * 542 - * Call with callback_lock or cpuset_rwsem held. 541 + * Call with callback_lock or cpuset_mutex held. 543 542 */ 544 543 static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) 545 544 { ··· 551 550 /* 552 551 * update task's spread flag if cpuset's page/slab spread flag is set 553 552 * 554 - * Call with callback_lock or cpuset_rwsem held. The check can be skipped 553 + * Call with callback_lock or cpuset_mutex held. The check can be skipped 555 554 * if on default hierarchy. 
556 555 */ 557 556 static void cpuset_update_task_spread_flags(struct cpuset *cs, ··· 576 575 * 577 576 * One cpuset is a subset of another if all its allowed CPUs and 578 577 * Memory Nodes are a subset of the other, and its exclusive flags 579 - * are only set if the other's are set. Call holding cpuset_rwsem. 578 + * are only set if the other's are set. Call holding cpuset_mutex. 580 579 */ 581 580 582 581 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) ··· 714 713 * If we replaced the flag and mask values of the current cpuset 715 714 * (cur) with those values in the trial cpuset (trial), would 716 715 * our various subset and exclusive rules still be valid? Presumes 717 - * cpuset_rwsem held. 716 + * cpuset_mutex held. 718 717 * 719 718 * 'cur' is the address of an actual, in-use cpuset. Operations 720 719 * such as list traversal that depend on the actual address of the ··· 830 829 rcu_read_unlock(); 831 830 } 832 831 833 - /* Must be called with cpuset_rwsem held. */ 832 + /* Must be called with cpuset_mutex held. */ 834 833 static inline int nr_cpusets(void) 835 834 { 836 835 /* jump label reference count + the top-level cpuset */ ··· 856 855 * domains when operating in the severe memory shortage situations 857 856 * that could cause allocation failures below. 858 857 * 859 - * Must be called with cpuset_rwsem held. 858 + * Must be called with cpuset_mutex held. 860 859 * 861 860 * The three key local variables below are: 862 861 * cp - cpuset pointer, used (together with pos_css) to perform a ··· 1085 1084 struct cpuset *cs = NULL; 1086 1085 struct cgroup_subsys_state *pos_css; 1087 1086 1088 - percpu_rwsem_assert_held(&cpuset_rwsem); 1087 + lockdep_assert_held(&cpuset_mutex); 1089 1088 lockdep_assert_cpus_held(); 1090 1089 lockdep_assert_held(&sched_domains_mutex); 1091 1090 ··· 1135 1134 * 'cpus' is removed, then call this routine to rebuild the 1136 1135 * scheduler's dynamic sched domains. 
1137 1136 * 1138 - * Call with cpuset_rwsem held. Takes cpus_read_lock(). 1137 + * Call with cpuset_mutex held. Takes cpus_read_lock(). 1139 1138 */ 1140 1139 static void rebuild_sched_domains_locked(void) 1141 1140 { ··· 1146 1145 int ndoms; 1147 1146 1148 1147 lockdep_assert_cpus_held(); 1149 - percpu_rwsem_assert_held(&cpuset_rwsem); 1148 + lockdep_assert_held(&cpuset_mutex); 1150 1149 1151 1150 /* 1152 1151 * If we have raced with CPU hotplug, return early to avoid ··· 1197 1196 void rebuild_sched_domains(void) 1198 1197 { 1199 1198 cpus_read_lock(); 1200 - percpu_down_write(&cpuset_rwsem); 1199 + mutex_lock(&cpuset_mutex); 1201 1200 rebuild_sched_domains_locked(); 1202 - percpu_up_write(&cpuset_rwsem); 1201 + mutex_unlock(&cpuset_mutex); 1203 1202 cpus_read_unlock(); 1204 1203 } 1205 1204 ··· 1209 1208 * @new_cpus: the temp variable for the new effective_cpus mask 1210 1209 * 1211 1210 * Iterate through each task of @cs updating its cpus_allowed to the 1212 - * effective cpuset's. As this function is called with cpuset_rwsem held, 1211 + * effective cpuset's. As this function is called with cpuset_mutex held, 1213 1212 * cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask() 1214 1213 * is used instead of effective_cpus to make sure all offline CPUs are also 1215 1214 * included as hotplug code won't update cpumasks for tasks in top_cpuset. ··· 1323 1322 int old_prs, new_prs; 1324 1323 int part_error = PERR_NONE; /* Partition error? */ 1325 1324 1326 - percpu_rwsem_assert_held(&cpuset_rwsem); 1325 + lockdep_assert_held(&cpuset_mutex); 1327 1326 1328 1327 /* 1329 1328 * The parent must be a partition root. ··· 1546 1545 * 1547 1546 * On legacy hierarchy, effective_cpus will be the same with cpu_allowed. 
1548 1547 * 1549 - * Called with cpuset_rwsem held 1548 + * Called with cpuset_mutex held 1550 1549 */ 1551 1550 static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, 1552 1551 bool force) ··· 1706 1705 struct cpuset *sibling; 1707 1706 struct cgroup_subsys_state *pos_css; 1708 1707 1709 - percpu_rwsem_assert_held(&cpuset_rwsem); 1708 + lockdep_assert_held(&cpuset_mutex); 1710 1709 1711 1710 /* 1712 1711 * Check all its siblings and call update_cpumasks_hier() ··· 1956 1955 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 1957 1956 * 1958 1957 * Iterate through each task of @cs updating its mems_allowed to the 1959 - * effective cpuset's. As this function is called with cpuset_rwsem held, 1958 + * effective cpuset's. As this function is called with cpuset_mutex held, 1960 1959 * cpuset membership stays stable. 1961 1960 */ 1962 1961 static void update_tasks_nodemask(struct cpuset *cs) 1963 1962 { 1964 - static nodemask_t newmems; /* protected by cpuset_rwsem */ 1963 + static nodemask_t newmems; /* protected by cpuset_mutex */ 1965 1964 struct css_task_iter it; 1966 1965 struct task_struct *task; 1967 1966 ··· 1974 1973 * take while holding tasklist_lock. Forks can happen - the 1975 1974 * mpol_dup() cpuset_being_rebound check will catch such forks, 1976 1975 * and rebind their vma mempolicies too. Because we still hold 1977 - * the global cpuset_rwsem, we know that no other rebind effort 1976 + * the global cpuset_mutex, we know that no other rebind effort 1978 1977 * will be contending for the global variable cpuset_being_rebound. 1979 1978 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1980 1979 * is idempotent. Also migrate pages in each mm to new nodes. ··· 2020 2019 * 2021 2020 * On legacy hierarchy, effective_mems will be the same with mems_allowed. 
2022 2021 * 2023 - * Called with cpuset_rwsem held 2022 + * Called with cpuset_mutex held 2024 2023 */ 2025 2024 static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) 2026 2025 { ··· 2073 2072 * mempolicies and if the cpuset is marked 'memory_migrate', 2074 2073 * migrate the tasks pages to the new memory. 2075 2074 * 2076 - * Call with cpuset_rwsem held. May take callback_lock during call. 2075 + * Call with cpuset_mutex held. May take callback_lock during call. 2077 2076 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 2078 2077 * lock each such tasks mm->mmap_lock, scan its vma's and rebind 2079 2078 * their mempolicies to the cpusets new mems_allowed. ··· 2165 2164 * @cs: the cpuset in which each task's spread flags needs to be changed 2166 2165 * 2167 2166 * Iterate through each task of @cs updating its spread flags. As this 2168 - * function is called with cpuset_rwsem held, cpuset membership stays 2167 + * function is called with cpuset_mutex held, cpuset membership stays 2169 2168 * stable. 2170 2169 */ 2171 2170 static void update_tasks_flags(struct cpuset *cs) ··· 2185 2184 * cs: the cpuset to update 2186 2185 * turning_on: whether the flag is being set or cleared 2187 2186 * 2188 - * Call with cpuset_rwsem held. 2187 + * Call with cpuset_mutex held. 2189 2188 */ 2190 2189 2191 2190 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, ··· 2235 2234 * @new_prs: new partition root state 2236 2235 * Return: 0 if successful, != 0 if error 2237 2236 * 2238 - * Call with cpuset_rwsem held. 2237 + * Call with cpuset_mutex held. 
2239 2238 */ 2240 2239 static int update_prstate(struct cpuset *cs, int new_prs) 2241 2240 { ··· 2473 2472 return 0; 2474 2473 } 2475 2474 2476 - /* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */ 2475 + /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ 2477 2476 static int cpuset_can_attach(struct cgroup_taskset *tset) 2478 2477 { 2479 2478 struct cgroup_subsys_state *css; ··· 2485 2484 cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); 2486 2485 cs = css_cs(css); 2487 2486 2488 - percpu_down_write(&cpuset_rwsem); 2487 + mutex_lock(&cpuset_mutex); 2489 2488 2490 2489 /* Check to see if task is allowed in the cpuset */ 2491 2490 ret = cpuset_can_attach_check(cs); ··· 2507 2506 */ 2508 2507 cs->attach_in_progress++; 2509 2508 out_unlock: 2510 - percpu_up_write(&cpuset_rwsem); 2509 + mutex_unlock(&cpuset_mutex); 2511 2510 return ret; 2512 2511 } 2513 2512 ··· 2519 2518 cgroup_taskset_first(tset, &css); 2520 2519 cs = css_cs(css); 2521 2520 2522 - percpu_down_write(&cpuset_rwsem); 2521 + mutex_lock(&cpuset_mutex); 2523 2522 cs->attach_in_progress--; 2524 2523 if (!cs->attach_in_progress) 2525 2524 wake_up(&cpuset_attach_wq); 2526 - percpu_up_write(&cpuset_rwsem); 2525 + mutex_unlock(&cpuset_mutex); 2527 2526 } 2528 2527 2529 2528 /* 2530 - * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach_task() 2529 + * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task() 2531 2530 * but we can't allocate it dynamically there. Define it global and 2532 2531 * allocate from cpuset_init(). 
2533 2532 */ ··· 2536 2535 2537 2536 static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) 2538 2537 { 2539 - percpu_rwsem_assert_held(&cpuset_rwsem); 2538 + lockdep_assert_held(&cpuset_mutex); 2540 2539 2541 2540 if (cs != &top_cpuset) 2542 2541 guarantee_online_cpus(task, cpus_attach); ··· 2566 2565 cs = css_cs(css); 2567 2566 2568 2567 lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ 2569 - percpu_down_write(&cpuset_rwsem); 2568 + mutex_lock(&cpuset_mutex); 2570 2569 cpus_updated = !cpumask_equal(cs->effective_cpus, 2571 2570 oldcs->effective_cpus); 2572 2571 mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); ··· 2627 2626 if (!cs->attach_in_progress) 2628 2627 wake_up(&cpuset_attach_wq); 2629 2628 2630 - percpu_up_write(&cpuset_rwsem); 2629 + mutex_unlock(&cpuset_mutex); 2631 2630 } 2632 2631 2633 2632 /* The various types of files and directories in a cpuset file system */ ··· 2659 2658 int retval = 0; 2660 2659 2661 2660 cpus_read_lock(); 2662 - percpu_down_write(&cpuset_rwsem); 2661 + mutex_lock(&cpuset_mutex); 2663 2662 if (!is_cpuset_online(cs)) { 2664 2663 retval = -ENODEV; 2665 2664 goto out_unlock; ··· 2695 2694 break; 2696 2695 } 2697 2696 out_unlock: 2698 - percpu_up_write(&cpuset_rwsem); 2697 + mutex_unlock(&cpuset_mutex); 2699 2698 cpus_read_unlock(); 2700 2699 return retval; 2701 2700 } ··· 2708 2707 int retval = -ENODEV; 2709 2708 2710 2709 cpus_read_lock(); 2711 - percpu_down_write(&cpuset_rwsem); 2710 + mutex_lock(&cpuset_mutex); 2712 2711 if (!is_cpuset_online(cs)) 2713 2712 goto out_unlock; 2714 2713 ··· 2721 2720 break; 2722 2721 } 2723 2722 out_unlock: 2724 - percpu_up_write(&cpuset_rwsem); 2723 + mutex_unlock(&cpuset_mutex); 2725 2724 cpus_read_unlock(); 2726 2725 return retval; 2727 2726 } ··· 2754 2753 * operation like this one can lead to a deadlock through kernfs 2755 2754 * active_ref protection. Let's break the protection. 
Losing the 2756 2755 * protection is okay as we check whether @cs is online after 2757 - * grabbing cpuset_rwsem anyway. This only happens on the legacy 2756 + * grabbing cpuset_mutex anyway. This only happens on the legacy 2758 2757 * hierarchies. 2759 2758 */ 2760 2759 css_get(&cs->css); ··· 2762 2761 flush_work(&cpuset_hotplug_work); 2763 2762 2764 2763 cpus_read_lock(); 2765 - percpu_down_write(&cpuset_rwsem); 2764 + mutex_lock(&cpuset_mutex); 2766 2765 if (!is_cpuset_online(cs)) 2767 2766 goto out_unlock; 2768 2767 ··· 2786 2785 2787 2786 free_cpuset(trialcs); 2788 2787 out_unlock: 2789 - percpu_up_write(&cpuset_rwsem); 2788 + mutex_unlock(&cpuset_mutex); 2790 2789 cpus_read_unlock(); 2791 2790 kernfs_unbreak_active_protection(of->kn); 2792 2791 css_put(&cs->css); ··· 2934 2933 2935 2934 css_get(&cs->css); 2936 2935 cpus_read_lock(); 2937 - percpu_down_write(&cpuset_rwsem); 2936 + mutex_lock(&cpuset_mutex); 2938 2937 if (!is_cpuset_online(cs)) 2939 2938 goto out_unlock; 2940 2939 2941 2940 retval = update_prstate(cs, val); 2942 2941 out_unlock: 2943 - percpu_up_write(&cpuset_rwsem); 2942 + mutex_unlock(&cpuset_mutex); 2944 2943 cpus_read_unlock(); 2945 2944 css_put(&cs->css); 2946 2945 return retval ?: nbytes; ··· 3157 3156 return 0; 3158 3157 3159 3158 cpus_read_lock(); 3160 - percpu_down_write(&cpuset_rwsem); 3159 + mutex_lock(&cpuset_mutex); 3161 3160 3162 3161 set_bit(CS_ONLINE, &cs->flags); 3163 3162 if (is_spread_page(parent)) ··· 3208 3207 cpumask_copy(cs->effective_cpus, parent->cpus_allowed); 3209 3208 spin_unlock_irq(&callback_lock); 3210 3209 out_unlock: 3211 - percpu_up_write(&cpuset_rwsem); 3210 + mutex_unlock(&cpuset_mutex); 3212 3211 cpus_read_unlock(); 3213 3212 return 0; 3214 3213 } ··· 3229 3228 struct cpuset *cs = css_cs(css); 3230 3229 3231 3230 cpus_read_lock(); 3232 - percpu_down_write(&cpuset_rwsem); 3231 + mutex_lock(&cpuset_mutex); 3233 3232 3234 3233 if (is_partition_valid(cs)) 3235 3234 update_prstate(cs, 0); ··· 3248 3247 
cpuset_dec(); 3249 3248 clear_bit(CS_ONLINE, &cs->flags); 3250 3249 3251 - percpu_up_write(&cpuset_rwsem); 3250 + mutex_unlock(&cpuset_mutex); 3252 3251 cpus_read_unlock(); 3253 3252 } 3254 3253 ··· 3261 3260 3262 3261 static void cpuset_bind(struct cgroup_subsys_state *root_css) 3263 3262 { 3264 - percpu_down_write(&cpuset_rwsem); 3263 + mutex_lock(&cpuset_mutex); 3265 3264 spin_lock_irq(&callback_lock); 3266 3265 3267 3266 if (is_in_v2_mode()) { ··· 3274 3273 } 3275 3274 3276 3275 spin_unlock_irq(&callback_lock); 3277 - percpu_up_write(&cpuset_rwsem); 3276 + mutex_unlock(&cpuset_mutex); 3278 3277 } 3279 3278 3280 3279 /* ··· 3295 3294 return 0; 3296 3295 3297 3296 lockdep_assert_held(&cgroup_mutex); 3298 - percpu_down_write(&cpuset_rwsem); 3297 + mutex_lock(&cpuset_mutex); 3299 3298 3300 3299 /* Check to see if task is allowed in the cpuset */ 3301 3300 ret = cpuset_can_attach_check(cs); ··· 3316 3315 */ 3317 3316 cs->attach_in_progress++; 3318 3317 out_unlock: 3319 - percpu_up_write(&cpuset_rwsem); 3318 + mutex_unlock(&cpuset_mutex); 3320 3319 return ret; 3321 3320 } 3322 3321 ··· 3332 3331 if (same_cs) 3333 3332 return; 3334 3333 3335 - percpu_down_write(&cpuset_rwsem); 3334 + mutex_lock(&cpuset_mutex); 3336 3335 cs->attach_in_progress--; 3337 3336 if (!cs->attach_in_progress) 3338 3337 wake_up(&cpuset_attach_wq); 3339 - percpu_up_write(&cpuset_rwsem); 3338 + mutex_unlock(&cpuset_mutex); 3340 3339 } 3341 3340 3342 3341 /* ··· 3364 3363 } 3365 3364 3366 3365 /* CLONE_INTO_CGROUP */ 3367 - percpu_down_write(&cpuset_rwsem); 3366 + mutex_lock(&cpuset_mutex); 3368 3367 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 3369 3368 cpuset_attach_task(cs, task); 3370 3369 ··· 3372 3371 if (!cs->attach_in_progress) 3373 3372 wake_up(&cpuset_attach_wq); 3374 3373 3375 - percpu_up_write(&cpuset_rwsem); 3374 + mutex_unlock(&cpuset_mutex); 3376 3375 } 3377 3376 3378 3377 struct cgroup_subsys cpuset_cgrp_subsys = { ··· 3473 3472 is_empty = cpumask_empty(cs->cpus_allowed) 
|| 3474 3473 nodes_empty(cs->mems_allowed); 3475 3474 3476 - percpu_up_write(&cpuset_rwsem); 3475 + mutex_unlock(&cpuset_mutex); 3477 3476 3478 3477 /* 3479 3478 * Move tasks to the nearest ancestor with execution resources, ··· 3483 3482 if (is_empty) 3484 3483 remove_tasks_in_empty_cpuset(cs); 3485 3484 3486 - percpu_down_write(&cpuset_rwsem); 3485 + mutex_lock(&cpuset_mutex); 3487 3486 } 3488 3487 3489 3488 static void ··· 3534 3533 retry: 3535 3534 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); 3536 3535 3537 - percpu_down_write(&cpuset_rwsem); 3536 + mutex_lock(&cpuset_mutex); 3538 3537 3539 3538 /* 3540 3539 * We have raced with task attaching. We wait until attaching 3541 3540 * is finished, so we won't attach a task to an empty cpuset. 3542 3541 */ 3543 3542 if (cs->attach_in_progress) { 3544 - percpu_up_write(&cpuset_rwsem); 3543 + mutex_unlock(&cpuset_mutex); 3545 3544 goto retry; 3546 3545 } 3547 3546 ··· 3638 3637 cpus_updated, mems_updated); 3639 3638 3640 3639 unlock: 3641 - percpu_up_write(&cpuset_rwsem); 3640 + mutex_unlock(&cpuset_mutex); 3642 3641 } 3643 3642 3644 3643 /** ··· 3668 3667 if (on_dfl && !alloc_cpumasks(NULL, &tmp)) 3669 3668 ptmp = &tmp; 3670 3669 3671 - percpu_down_write(&cpuset_rwsem); 3670 + mutex_lock(&cpuset_mutex); 3672 3671 3673 3672 /* fetch the available cpus/mems and find out which changed how */ 3674 3673 cpumask_copy(&new_cpus, cpu_active_mask); ··· 3725 3724 update_tasks_nodemask(&top_cpuset); 3726 3725 } 3727 3726 3728 - percpu_up_write(&cpuset_rwsem); 3727 + mutex_unlock(&cpuset_mutex); 3729 3728 3730 3729 /* if cpus or mems changed, we need to propagate to descendants */ 3731 3730 if (cpus_updated || mems_updated) { ··· 4156 4155 * - Used for /proc/<pid>/cpuset. 
4157 4156 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it 4158 4157 * doesn't really matter if tsk->cpuset changes after we read it, 4159 - * and we take cpuset_rwsem, keeping cpuset_attach() from changing it 4158 + * and we take cpuset_mutex, keeping cpuset_attach() from changing it 4160 4159 * anyway. 4161 4160 */ 4162 4161 int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
+15 -7
kernel/sched/core.c
··· 7590 7590 int reset_on_fork; 7591 7591 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; 7592 7592 struct rq *rq; 7593 + bool cpuset_locked = false; 7593 7594 7594 7595 /* The pi code expects interrupts enabled */ 7595 7596 BUG_ON(pi && in_interrupt()); ··· 7640 7639 return retval; 7641 7640 } 7642 7641 7643 - if (pi) 7644 - cpuset_read_lock(); 7642 + /* 7643 + * SCHED_DEADLINE bandwidth accounting relies on stable cpusets 7644 + * information. 7645 + */ 7646 + if (dl_policy(policy) || dl_policy(p->policy)) { 7647 + cpuset_locked = true; 7648 + cpuset_lock(); 7649 + } 7645 7650 7646 7651 /* 7647 7652 * Make sure no PI-waiters arrive (or leave) while we are ··· 7723 7716 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 7724 7717 policy = oldpolicy = -1; 7725 7718 task_rq_unlock(rq, p, &rf); 7726 - if (pi) 7727 - cpuset_read_unlock(); 7719 + if (cpuset_locked) 7720 + cpuset_unlock(); 7728 7721 goto recheck; 7729 7722 } 7730 7723 ··· 7791 7784 task_rq_unlock(rq, p, &rf); 7792 7785 7793 7786 if (pi) { 7794 - cpuset_read_unlock(); 7787 + if (cpuset_locked) 7788 + cpuset_unlock(); 7795 7789 rt_mutex_adjust_pi(p); 7796 7790 } 7797 7791 ··· 7804 7796 7805 7797 unlock: 7806 7798 task_rq_unlock(rq, p, &rf); 7807 - if (pi) 7808 - cpuset_read_unlock(); 7799 + if (cpuset_locked) 7800 + cpuset_unlock(); 7809 7801 return retval; 7810 7802 } 7811 7803