Linux kernel mirror (for testing): https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
Tags: kernel, os, linux

sched/cpuset: Bring back cpuset_mutex

Turns out percpu_cpuset_rwsem - commit 1243dc518c9d ("cgroup/cpuset:
Convert cpuset_mutex to percpu_rwsem") - wasn't such a brilliant idea,
as it has been reported to cause slowdowns in workloads that need to
change cpuset configuration frequently and it is also not implementing
priority inheritance (which causes troubles with realtime workloads).

Convert percpu_cpuset_rwsem back to regular cpuset_mutex. Also grab it
only for SCHED_DEADLINE tasks (other policies don't care about stable
cpusets anyway).

Signed-off-by: Juri Lelli <juri.lelli@redhat.com>
Reviewed-by: Waiman Long <longman@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>

Authored by Juri Lelli and committed by Tejun Heo
Commit: 111cd11b — Parent: ad3a557d

+99 -90
+4 -4
include/linux/cpuset.h
··· 71 71 extern void cpuset_force_rebuild(void); 72 72 extern void cpuset_update_active_cpus(void); 73 73 extern void cpuset_wait_for_hotplug(void); 74 - extern void cpuset_read_lock(void); 75 - extern void cpuset_read_unlock(void); 74 + extern void cpuset_lock(void); 75 + extern void cpuset_unlock(void); 76 76 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); 77 77 extern bool cpuset_cpus_allowed_fallback(struct task_struct *p); 78 78 extern nodemask_t cpuset_mems_allowed(struct task_struct *p); ··· 189 189 190 190 static inline void cpuset_wait_for_hotplug(void) { } 191 191 192 - static inline void cpuset_read_lock(void) { } 193 - static inline void cpuset_read_unlock(void) { } 192 + static inline void cpuset_lock(void) { } 193 + static inline void cpuset_unlock(void) { } 194 194 195 195 static inline void cpuset_cpus_allowed(struct task_struct *p, 196 196 struct cpumask *mask)
+80 -79
kernel/cgroup/cpuset.c
··· 366 366 if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) 367 367 368 368 /* 369 - * There are two global locks guarding cpuset structures - cpuset_rwsem and 369 + * There are two global locks guarding cpuset structures - cpuset_mutex and 370 370 * callback_lock. We also require taking task_lock() when dereferencing a 371 371 * task's cpuset pointer. See "The task_lock() exception", at the end of this 372 - * comment. The cpuset code uses only cpuset_rwsem write lock. Other 373 - * kernel subsystems can use cpuset_read_lock()/cpuset_read_unlock() to 374 - * prevent change to cpuset structures. 372 + * comment. The cpuset code uses only cpuset_mutex. Other kernel subsystems 373 + * can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset 374 + * structures. Note that cpuset_mutex needs to be a mutex as it is used in 375 + * paths that rely on priority inheritance (e.g. scheduler - on RT) for 376 + * correctness. 375 377 * 376 378 * A task must hold both locks to modify cpusets. If a task holds 377 - * cpuset_rwsem, it blocks others wanting that rwsem, ensuring that it 378 - * is the only task able to also acquire callback_lock and be able to 379 - * modify cpusets. It can perform various checks on the cpuset structure 380 - * first, knowing nothing will change. It can also allocate memory while 381 - * just holding cpuset_rwsem. While it is performing these checks, various 382 - * callback routines can briefly acquire callback_lock to query cpusets. 383 - * Once it is ready to make the changes, it takes callback_lock, blocking 384 - * everyone else. 379 + * cpuset_mutex, it blocks others, ensuring that it is the only task able to 380 + * also acquire callback_lock and be able to modify cpusets. It can perform 381 + * various checks on the cpuset structure first, knowing nothing will change. 382 + * It can also allocate memory while just holding cpuset_mutex. 
While it is 383 + * performing these checks, various callback routines can briefly acquire 384 + * callback_lock to query cpusets. Once it is ready to make the changes, it 385 + * takes callback_lock, blocking everyone else. 385 386 * 386 387 * Calls to the kernel memory allocator can not be made while holding 387 388 * callback_lock, as that would risk double tripping on callback_lock ··· 404 403 * guidelines for accessing subsystem state in kernel/cgroup.c 405 404 */ 406 405 407 - DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem); 406 + static DEFINE_MUTEX(cpuset_mutex); 408 407 409 - void cpuset_read_lock(void) 408 + void cpuset_lock(void) 410 409 { 411 - percpu_down_read(&cpuset_rwsem); 410 + mutex_lock(&cpuset_mutex); 412 411 } 413 412 414 - void cpuset_read_unlock(void) 413 + void cpuset_unlock(void) 415 414 { 416 - percpu_up_read(&cpuset_rwsem); 415 + mutex_unlock(&cpuset_mutex); 417 416 } 418 417 419 418 static DEFINE_SPINLOCK(callback_lock); ··· 497 496 * One way or another, we guarantee to return some non-empty subset 498 497 * of cpu_online_mask. 499 498 * 500 - * Call with callback_lock or cpuset_rwsem held. 499 + * Call with callback_lock or cpuset_mutex held. 501 500 */ 502 501 static void guarantee_online_cpus(struct task_struct *tsk, 503 502 struct cpumask *pmask) ··· 539 538 * One way or another, we guarantee to return some non-empty subset 540 539 * of node_states[N_MEMORY]. 541 540 * 542 - * Call with callback_lock or cpuset_rwsem held. 541 + * Call with callback_lock or cpuset_mutex held. 543 542 */ 544 543 static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) 545 544 { ··· 551 550 /* 552 551 * update task's spread flag if cpuset's page/slab spread flag is set 553 552 * 554 - * Call with callback_lock or cpuset_rwsem held. The check can be skipped 553 + * Call with callback_lock or cpuset_mutex held. The check can be skipped 555 554 * if on default hierarchy. 
556 555 */ 557 556 static void cpuset_update_task_spread_flags(struct cpuset *cs, ··· 576 575 * 577 576 * One cpuset is a subset of another if all its allowed CPUs and 578 577 * Memory Nodes are a subset of the other, and its exclusive flags 579 - * are only set if the other's are set. Call holding cpuset_rwsem. 578 + * are only set if the other's are set. Call holding cpuset_mutex. 580 579 */ 581 580 582 581 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) ··· 714 713 * If we replaced the flag and mask values of the current cpuset 715 714 * (cur) with those values in the trial cpuset (trial), would 716 715 * our various subset and exclusive rules still be valid? Presumes 717 - * cpuset_rwsem held. 716 + * cpuset_mutex held. 718 717 * 719 718 * 'cur' is the address of an actual, in-use cpuset. Operations 720 719 * such as list traversal that depend on the actual address of the ··· 830 829 rcu_read_unlock(); 831 830 } 832 831 833 - /* Must be called with cpuset_rwsem held. */ 832 + /* Must be called with cpuset_mutex held. */ 834 833 static inline int nr_cpusets(void) 835 834 { 836 835 /* jump label reference count + the top-level cpuset */ ··· 856 855 * domains when operating in the severe memory shortage situations 857 856 * that could cause allocation failures below. 858 857 * 859 - * Must be called with cpuset_rwsem held. 858 + * Must be called with cpuset_mutex held. 860 859 * 861 860 * The three key local variables below are: 862 861 * cp - cpuset pointer, used (together with pos_css) to perform a ··· 1085 1084 struct cpuset *cs = NULL; 1086 1085 struct cgroup_subsys_state *pos_css; 1087 1086 1088 - percpu_rwsem_assert_held(&cpuset_rwsem); 1087 + lockdep_assert_held(&cpuset_mutex); 1089 1088 lockdep_assert_cpus_held(); 1090 1089 lockdep_assert_held(&sched_domains_mutex); 1091 1090 ··· 1135 1134 * 'cpus' is removed, then call this routine to rebuild the 1136 1135 * scheduler's dynamic sched domains. 
1137 1136 * 1138 - * Call with cpuset_rwsem held. Takes cpus_read_lock(). 1137 + * Call with cpuset_mutex held. Takes cpus_read_lock(). 1139 1138 */ 1140 1139 static void rebuild_sched_domains_locked(void) 1141 1140 { ··· 1146 1145 int ndoms; 1147 1146 1148 1147 lockdep_assert_cpus_held(); 1149 - percpu_rwsem_assert_held(&cpuset_rwsem); 1148 + lockdep_assert_held(&cpuset_mutex); 1150 1149 1151 1150 /* 1152 1151 * If we have raced with CPU hotplug, return early to avoid ··· 1197 1196 void rebuild_sched_domains(void) 1198 1197 { 1199 1198 cpus_read_lock(); 1200 - percpu_down_write(&cpuset_rwsem); 1199 + mutex_lock(&cpuset_mutex); 1201 1200 rebuild_sched_domains_locked(); 1202 - percpu_up_write(&cpuset_rwsem); 1201 + mutex_unlock(&cpuset_mutex); 1203 1202 cpus_read_unlock(); 1204 1203 } 1205 1204 ··· 1209 1208 * @new_cpus: the temp variable for the new effective_cpus mask 1210 1209 * 1211 1210 * Iterate through each task of @cs updating its cpus_allowed to the 1212 - * effective cpuset's. As this function is called with cpuset_rwsem held, 1211 + * effective cpuset's. As this function is called with cpuset_mutex held, 1213 1212 * cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask() 1214 1213 * is used instead of effective_cpus to make sure all offline CPUs are also 1215 1214 * included as hotplug code won't update cpumasks for tasks in top_cpuset. ··· 1323 1322 int old_prs, new_prs; 1324 1323 int part_error = PERR_NONE; /* Partition error? */ 1325 1324 1326 - percpu_rwsem_assert_held(&cpuset_rwsem); 1325 + lockdep_assert_held(&cpuset_mutex); 1327 1326 1328 1327 /* 1329 1328 * The parent must be a partition root. ··· 1546 1545 * 1547 1546 * On legacy hierarchy, effective_cpus will be the same with cpu_allowed. 
1548 1547 * 1549 - * Called with cpuset_rwsem held 1548 + * Called with cpuset_mutex held 1550 1549 */ 1551 1550 static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp, 1552 1551 bool force) ··· 1706 1705 struct cpuset *sibling; 1707 1706 struct cgroup_subsys_state *pos_css; 1708 1707 1709 - percpu_rwsem_assert_held(&cpuset_rwsem); 1708 + lockdep_assert_held(&cpuset_mutex); 1710 1709 1711 1710 /* 1712 1711 * Check all its siblings and call update_cpumasks_hier() ··· 1956 1955 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed 1957 1956 * 1958 1957 * Iterate through each task of @cs updating its mems_allowed to the 1959 - * effective cpuset's. As this function is called with cpuset_rwsem held, 1958 + * effective cpuset's. As this function is called with cpuset_mutex held, 1960 1959 * cpuset membership stays stable. 1961 1960 */ 1962 1961 static void update_tasks_nodemask(struct cpuset *cs) 1963 1962 { 1964 - static nodemask_t newmems; /* protected by cpuset_rwsem */ 1963 + static nodemask_t newmems; /* protected by cpuset_mutex */ 1965 1964 struct css_task_iter it; 1966 1965 struct task_struct *task; 1967 1966 ··· 1974 1973 * take while holding tasklist_lock. Forks can happen - the 1975 1974 * mpol_dup() cpuset_being_rebound check will catch such forks, 1976 1975 * and rebind their vma mempolicies too. Because we still hold 1977 - * the global cpuset_rwsem, we know that no other rebind effort 1976 + * the global cpuset_mutex, we know that no other rebind effort 1978 1977 * will be contending for the global variable cpuset_being_rebound. 1979 1978 * It's ok if we rebind the same mm twice; mpol_rebind_mm() 1980 1979 * is idempotent. Also migrate pages in each mm to new nodes. ··· 2020 2019 * 2021 2020 * On legacy hierarchy, effective_mems will be the same with mems_allowed. 
2022 2021 * 2023 - * Called with cpuset_rwsem held 2022 + * Called with cpuset_mutex held 2024 2023 */ 2025 2024 static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) 2026 2025 { ··· 2073 2072 * mempolicies and if the cpuset is marked 'memory_migrate', 2074 2073 * migrate the tasks pages to the new memory. 2075 2074 * 2076 - * Call with cpuset_rwsem held. May take callback_lock during call. 2075 + * Call with cpuset_mutex held. May take callback_lock during call. 2077 2076 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, 2078 2077 * lock each such tasks mm->mmap_lock, scan its vma's and rebind 2079 2078 * their mempolicies to the cpusets new mems_allowed. ··· 2165 2164 * @cs: the cpuset in which each task's spread flags needs to be changed 2166 2165 * 2167 2166 * Iterate through each task of @cs updating its spread flags. As this 2168 - * function is called with cpuset_rwsem held, cpuset membership stays 2167 + * function is called with cpuset_mutex held, cpuset membership stays 2169 2168 * stable. 2170 2169 */ 2171 2170 static void update_tasks_flags(struct cpuset *cs) ··· 2185 2184 * cs: the cpuset to update 2186 2185 * turning_on: whether the flag is being set or cleared 2187 2186 * 2188 - * Call with cpuset_rwsem held. 2187 + * Call with cpuset_mutex held. 2189 2188 */ 2190 2189 2191 2190 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, ··· 2235 2234 * @new_prs: new partition root state 2236 2235 * Return: 0 if successful, != 0 if error 2237 2236 * 2238 - * Call with cpuset_rwsem held. 2237 + * Call with cpuset_mutex held. 
2239 2238 */ 2240 2239 static int update_prstate(struct cpuset *cs, int new_prs) 2241 2240 { ··· 2473 2472 return 0; 2474 2473 } 2475 2474 2476 - /* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */ 2475 + /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ 2477 2476 static int cpuset_can_attach(struct cgroup_taskset *tset) 2478 2477 { 2479 2478 struct cgroup_subsys_state *css; ··· 2485 2484 cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); 2486 2485 cs = css_cs(css); 2487 2486 2488 - percpu_down_write(&cpuset_rwsem); 2487 + mutex_lock(&cpuset_mutex); 2489 2488 2490 2489 /* Check to see if task is allowed in the cpuset */ 2491 2490 ret = cpuset_can_attach_check(cs); ··· 2507 2506 */ 2508 2507 cs->attach_in_progress++; 2509 2508 out_unlock: 2510 - percpu_up_write(&cpuset_rwsem); 2509 + mutex_unlock(&cpuset_mutex); 2511 2510 return ret; 2512 2511 } 2513 2512 ··· 2519 2518 cgroup_taskset_first(tset, &css); 2520 2519 cs = css_cs(css); 2521 2520 2522 - percpu_down_write(&cpuset_rwsem); 2521 + mutex_lock(&cpuset_mutex); 2523 2522 cs->attach_in_progress--; 2524 2523 if (!cs->attach_in_progress) 2525 2524 wake_up(&cpuset_attach_wq); 2526 - percpu_up_write(&cpuset_rwsem); 2525 + mutex_unlock(&cpuset_mutex); 2527 2526 } 2528 2527 2529 2528 /* 2530 - * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach_task() 2529 + * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task() 2531 2530 * but we can't allocate it dynamically there. Define it global and 2532 2531 * allocate from cpuset_init(). 
2533 2532 */ ··· 2536 2535 2537 2536 static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task) 2538 2537 { 2539 - percpu_rwsem_assert_held(&cpuset_rwsem); 2538 + lockdep_assert_held(&cpuset_mutex); 2540 2539 2541 2540 if (cs != &top_cpuset) 2542 2541 guarantee_online_cpus(task, cpus_attach); ··· 2566 2565 cs = css_cs(css); 2567 2566 2568 2567 lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */ 2569 - percpu_down_write(&cpuset_rwsem); 2568 + mutex_lock(&cpuset_mutex); 2570 2569 cpus_updated = !cpumask_equal(cs->effective_cpus, 2571 2570 oldcs->effective_cpus); 2572 2571 mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems); ··· 2627 2626 if (!cs->attach_in_progress) 2628 2627 wake_up(&cpuset_attach_wq); 2629 2628 2630 - percpu_up_write(&cpuset_rwsem); 2629 + mutex_unlock(&cpuset_mutex); 2631 2630 } 2632 2631 2633 2632 /* The various types of files and directories in a cpuset file system */ ··· 2659 2658 int retval = 0; 2660 2659 2661 2660 cpus_read_lock(); 2662 - percpu_down_write(&cpuset_rwsem); 2661 + mutex_lock(&cpuset_mutex); 2663 2662 if (!is_cpuset_online(cs)) { 2664 2663 retval = -ENODEV; 2665 2664 goto out_unlock; ··· 2695 2694 break; 2696 2695 } 2697 2696 out_unlock: 2698 - percpu_up_write(&cpuset_rwsem); 2697 + mutex_unlock(&cpuset_mutex); 2699 2698 cpus_read_unlock(); 2700 2699 return retval; 2701 2700 } ··· 2708 2707 int retval = -ENODEV; 2709 2708 2710 2709 cpus_read_lock(); 2711 - percpu_down_write(&cpuset_rwsem); 2710 + mutex_lock(&cpuset_mutex); 2712 2711 if (!is_cpuset_online(cs)) 2713 2712 goto out_unlock; 2714 2713 ··· 2721 2720 break; 2722 2721 } 2723 2722 out_unlock: 2724 - percpu_up_write(&cpuset_rwsem); 2723 + mutex_unlock(&cpuset_mutex); 2725 2724 cpus_read_unlock(); 2726 2725 return retval; 2727 2726 } ··· 2754 2753 * operation like this one can lead to a deadlock through kernfs 2755 2754 * active_ref protection. Let's break the protection. 
Losing the 2756 2755 * protection is okay as we check whether @cs is online after 2757 - * grabbing cpuset_rwsem anyway. This only happens on the legacy 2756 + * grabbing cpuset_mutex anyway. This only happens on the legacy 2758 2757 * hierarchies. 2759 2758 */ 2760 2759 css_get(&cs->css); ··· 2762 2761 flush_work(&cpuset_hotplug_work); 2763 2762 2764 2763 cpus_read_lock(); 2765 - percpu_down_write(&cpuset_rwsem); 2764 + mutex_lock(&cpuset_mutex); 2766 2765 if (!is_cpuset_online(cs)) 2767 2766 goto out_unlock; 2768 2767 ··· 2786 2785 2787 2786 free_cpuset(trialcs); 2788 2787 out_unlock: 2789 - percpu_up_write(&cpuset_rwsem); 2788 + mutex_unlock(&cpuset_mutex); 2790 2789 cpus_read_unlock(); 2791 2790 kernfs_unbreak_active_protection(of->kn); 2792 2791 css_put(&cs->css); ··· 2934 2933 2935 2934 css_get(&cs->css); 2936 2935 cpus_read_lock(); 2937 - percpu_down_write(&cpuset_rwsem); 2936 + mutex_lock(&cpuset_mutex); 2938 2937 if (!is_cpuset_online(cs)) 2939 2938 goto out_unlock; 2940 2939 2941 2940 retval = update_prstate(cs, val); 2942 2941 out_unlock: 2943 - percpu_up_write(&cpuset_rwsem); 2942 + mutex_unlock(&cpuset_mutex); 2944 2943 cpus_read_unlock(); 2945 2944 css_put(&cs->css); 2946 2945 return retval ?: nbytes; ··· 3157 3156 return 0; 3158 3157 3159 3158 cpus_read_lock(); 3160 - percpu_down_write(&cpuset_rwsem); 3159 + mutex_lock(&cpuset_mutex); 3161 3160 3162 3161 set_bit(CS_ONLINE, &cs->flags); 3163 3162 if (is_spread_page(parent)) ··· 3208 3207 cpumask_copy(cs->effective_cpus, parent->cpus_allowed); 3209 3208 spin_unlock_irq(&callback_lock); 3210 3209 out_unlock: 3211 - percpu_up_write(&cpuset_rwsem); 3210 + mutex_unlock(&cpuset_mutex); 3212 3211 cpus_read_unlock(); 3213 3212 return 0; 3214 3213 } ··· 3229 3228 struct cpuset *cs = css_cs(css); 3230 3229 3231 3230 cpus_read_lock(); 3232 - percpu_down_write(&cpuset_rwsem); 3231 + mutex_lock(&cpuset_mutex); 3233 3232 3234 3233 if (is_partition_valid(cs)) 3235 3234 update_prstate(cs, 0); ··· 3248 3247 
cpuset_dec(); 3249 3248 clear_bit(CS_ONLINE, &cs->flags); 3250 3249 3251 - percpu_up_write(&cpuset_rwsem); 3250 + mutex_unlock(&cpuset_mutex); 3252 3251 cpus_read_unlock(); 3253 3252 } 3254 3253 ··· 3261 3260 3262 3261 static void cpuset_bind(struct cgroup_subsys_state *root_css) 3263 3262 { 3264 - percpu_down_write(&cpuset_rwsem); 3263 + mutex_lock(&cpuset_mutex); 3265 3264 spin_lock_irq(&callback_lock); 3266 3265 3267 3266 if (is_in_v2_mode()) { ··· 3274 3273 } 3275 3274 3276 3275 spin_unlock_irq(&callback_lock); 3277 - percpu_up_write(&cpuset_rwsem); 3276 + mutex_unlock(&cpuset_mutex); 3278 3277 } 3279 3278 3280 3279 /* ··· 3295 3294 return 0; 3296 3295 3297 3296 lockdep_assert_held(&cgroup_mutex); 3298 - percpu_down_write(&cpuset_rwsem); 3297 + mutex_lock(&cpuset_mutex); 3299 3298 3300 3299 /* Check to see if task is allowed in the cpuset */ 3301 3300 ret = cpuset_can_attach_check(cs); ··· 3316 3315 */ 3317 3316 cs->attach_in_progress++; 3318 3317 out_unlock: 3319 - percpu_up_write(&cpuset_rwsem); 3318 + mutex_unlock(&cpuset_mutex); 3320 3319 return ret; 3321 3320 } 3322 3321 ··· 3332 3331 if (same_cs) 3333 3332 return; 3334 3333 3335 - percpu_down_write(&cpuset_rwsem); 3334 + mutex_lock(&cpuset_mutex); 3336 3335 cs->attach_in_progress--; 3337 3336 if (!cs->attach_in_progress) 3338 3337 wake_up(&cpuset_attach_wq); 3339 - percpu_up_write(&cpuset_rwsem); 3338 + mutex_unlock(&cpuset_mutex); 3340 3339 } 3341 3340 3342 3341 /* ··· 3364 3363 } 3365 3364 3366 3365 /* CLONE_INTO_CGROUP */ 3367 - percpu_down_write(&cpuset_rwsem); 3366 + mutex_lock(&cpuset_mutex); 3368 3367 guarantee_online_mems(cs, &cpuset_attach_nodemask_to); 3369 3368 cpuset_attach_task(cs, task); 3370 3369 ··· 3372 3371 if (!cs->attach_in_progress) 3373 3372 wake_up(&cpuset_attach_wq); 3374 3373 3375 - percpu_up_write(&cpuset_rwsem); 3374 + mutex_unlock(&cpuset_mutex); 3376 3375 } 3377 3376 3378 3377 struct cgroup_subsys cpuset_cgrp_subsys = { ··· 3473 3472 is_empty = cpumask_empty(cs->cpus_allowed) 
|| 3474 3473 nodes_empty(cs->mems_allowed); 3475 3474 3476 - percpu_up_write(&cpuset_rwsem); 3475 + mutex_unlock(&cpuset_mutex); 3477 3476 3478 3477 /* 3479 3478 * Move tasks to the nearest ancestor with execution resources, ··· 3483 3482 if (is_empty) 3484 3483 remove_tasks_in_empty_cpuset(cs); 3485 3484 3486 - percpu_down_write(&cpuset_rwsem); 3485 + mutex_lock(&cpuset_mutex); 3487 3486 } 3488 3487 3489 3488 static void ··· 3534 3533 retry: 3535 3534 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); 3536 3535 3537 - percpu_down_write(&cpuset_rwsem); 3536 + mutex_lock(&cpuset_mutex); 3538 3537 3539 3538 /* 3540 3539 * We have raced with task attaching. We wait until attaching 3541 3540 * is finished, so we won't attach a task to an empty cpuset. 3542 3541 */ 3543 3542 if (cs->attach_in_progress) { 3544 - percpu_up_write(&cpuset_rwsem); 3543 + mutex_unlock(&cpuset_mutex); 3545 3544 goto retry; 3546 3545 } 3547 3546 ··· 3638 3637 cpus_updated, mems_updated); 3639 3638 3640 3639 unlock: 3641 - percpu_up_write(&cpuset_rwsem); 3640 + mutex_unlock(&cpuset_mutex); 3642 3641 } 3643 3642 3644 3643 /** ··· 3668 3667 if (on_dfl && !alloc_cpumasks(NULL, &tmp)) 3669 3668 ptmp = &tmp; 3670 3669 3671 - percpu_down_write(&cpuset_rwsem); 3670 + mutex_lock(&cpuset_mutex); 3672 3671 3673 3672 /* fetch the available cpus/mems and find out which changed how */ 3674 3673 cpumask_copy(&new_cpus, cpu_active_mask); ··· 3725 3724 update_tasks_nodemask(&top_cpuset); 3726 3725 } 3727 3726 3728 - percpu_up_write(&cpuset_rwsem); 3727 + mutex_unlock(&cpuset_mutex); 3729 3728 3730 3729 /* if cpus or mems changed, we need to propagate to descendants */ 3731 3730 if (cpus_updated || mems_updated) { ··· 4156 4155 * - Used for /proc/<pid>/cpuset. 
4157 4156 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it 4158 4157 * doesn't really matter if tsk->cpuset changes after we read it, 4159 - * and we take cpuset_rwsem, keeping cpuset_attach() from changing it 4158 + * and we take cpuset_mutex, keeping cpuset_attach() from changing it 4160 4159 * anyway. 4161 4160 */ 4162 4161 int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
+15 -7
kernel/sched/core.c
··· 7590 7590 int reset_on_fork; 7591 7591 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; 7592 7592 struct rq *rq; 7593 + bool cpuset_locked = false; 7593 7594 7594 7595 /* The pi code expects interrupts enabled */ 7595 7596 BUG_ON(pi && in_interrupt()); ··· 7640 7639 return retval; 7641 7640 } 7642 7641 7643 - if (pi) 7644 - cpuset_read_lock(); 7642 + /* 7643 + * SCHED_DEADLINE bandwidth accounting relies on stable cpusets 7644 + * information. 7645 + */ 7646 + if (dl_policy(policy) || dl_policy(p->policy)) { 7647 + cpuset_locked = true; 7648 + cpuset_lock(); 7649 + } 7645 7650 7646 7651 /* 7647 7652 * Make sure no PI-waiters arrive (or leave) while we are ··· 7723 7716 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 7724 7717 policy = oldpolicy = -1; 7725 7718 task_rq_unlock(rq, p, &rf); 7726 - if (pi) 7727 - cpuset_read_unlock(); 7719 + if (cpuset_locked) 7720 + cpuset_unlock(); 7728 7721 goto recheck; 7729 7722 } 7730 7723 ··· 7791 7784 task_rq_unlock(rq, p, &rf); 7792 7785 7793 7786 if (pi) { 7794 - cpuset_read_unlock(); 7787 + if (cpuset_locked) 7788 + cpuset_unlock(); 7795 7789 rt_mutex_adjust_pi(p); 7796 7790 } 7797 7791 ··· 7804 7796 7805 7797 unlock: 7806 7798 task_rq_unlock(rq, p, &rf); 7807 - if (pi) 7808 - cpuset_read_unlock(); 7799 + if (cpuset_locked) 7800 + cpuset_unlock(); 7809 7801 return retval; 7810 7802 } 7811 7803