Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

rtmutex: Turn the plist into an rb-tree

Turn the pi-chains from plist to rb-tree, in the rt_mutex code,
and provide a proper comparison function for -deadline and
-priority tasks.

This is done mainly because:
- classical prio field of the plist is just an int, which might
not be enough for representing a deadline;
- manipulating such a list would become O(nr_deadline_tasks),
which might be too much, as the number of -deadline tasks increases.

Therefore, an rb-tree is used, and tasks are queued in it according
to the following logic:
- among two -priority (i.e., SCHED_BATCH/OTHER/RR/FIFO) tasks, the
one with the higher (lower, actually!) prio wins;
- among a -priority and a -deadline task, the latter always wins;
- among two -deadline tasks, the one with the earliest deadline
wins.

Queueing and dequeueing functions are changed accordingly, for both
the list of a task's pi-waiters and the list of tasks blocked on
a pi-lock.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Dario Faggioli <raistlin@linux.it>
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
Signed-off-again-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-10-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

authored by

Peter Zijlstra and committed by
Ingo Molnar
fb00aca4 af6ace76

+157 -65
+10
include/linux/init_task.h
··· 11 11 #include <linux/user_namespace.h> 12 12 #include <linux/securebits.h> 13 13 #include <linux/seqlock.h> 14 + #include <linux/rbtree.h> 14 15 #include <net/net_namespace.h> 15 16 #include <linux/sched/rt.h> 16 17 ··· 155 154 156 155 #define INIT_TASK_COMM "swapper" 157 156 157 + #ifdef CONFIG_RT_MUTEXES 158 + # define INIT_RT_MUTEXES(tsk) \ 159 + .pi_waiters = RB_ROOT, \ 160 + .pi_waiters_leftmost = NULL, 161 + #else 162 + # define INIT_RT_MUTEXES(tsk) 163 + #endif 164 + 158 165 /* 159 166 * INIT_TASK is used to set up the first task table, touch at 160 167 * your own risk!. Base=0, limit=0x1fffff (=2MB) ··· 230 221 INIT_TRACE_RECURSION \ 231 222 INIT_TASK_RCU_PREEMPT(tsk) \ 232 223 INIT_CPUSET_SEQ(tsk) \ 224 + INIT_RT_MUTEXES(tsk) \ 233 225 INIT_VTIME(tsk) \ 234 226 } 235 227
+6 -12
include/linux/rtmutex.h
··· 13 13 #define __LINUX_RT_MUTEX_H 14 14 15 15 #include <linux/linkage.h> 16 - #include <linux/plist.h> 16 + #include <linux/rbtree.h> 17 17 #include <linux/spinlock_types.h> 18 18 19 19 extern int max_lock_depth; /* for sysctl */ ··· 22 22 * The rt_mutex structure 23 23 * 24 24 * @wait_lock: spinlock to protect the structure 25 - * @wait_list: pilist head to enqueue waiters in priority order 25 + * @waiters: rbtree root to enqueue waiters in priority order 26 + * @waiters_leftmost: top waiter 26 27 * @owner: the mutex owner 27 28 */ 28 29 struct rt_mutex { 29 30 raw_spinlock_t wait_lock; 30 - struct plist_head wait_list; 31 + struct rb_root waiters; 32 + struct rb_node *waiters_leftmost; 31 33 struct task_struct *owner; 32 34 #ifdef CONFIG_DEBUG_RT_MUTEXES 33 35 int save_state; ··· 68 66 69 67 #define __RT_MUTEX_INITIALIZER(mutexname) \ 70 68 { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ 71 - , .wait_list = PLIST_HEAD_INIT(mutexname.wait_list) \ 69 + , .waiters = RB_ROOT \ 72 70 , .owner = NULL \ 73 71 __DEBUG_RT_MUTEX_INITIALIZER(mutexname)} 74 72 ··· 99 97 extern int rt_mutex_trylock(struct rt_mutex *lock); 100 98 101 99 extern void rt_mutex_unlock(struct rt_mutex *lock); 102 - 103 - #ifdef CONFIG_RT_MUTEXES 104 - # define INIT_RT_MUTEXES(tsk) \ 105 - .pi_waiters = PLIST_HEAD_INIT(tsk.pi_waiters), \ 106 - INIT_RT_MUTEX_DEBUG(tsk) 107 - #else 108 - # define INIT_RT_MUTEXES(tsk) 109 - #endif 110 100 111 101 #endif
+3 -1
include/linux/sched.h
··· 16 16 #include <linux/types.h> 17 17 #include <linux/timex.h> 18 18 #include <linux/jiffies.h> 19 + #include <linux/plist.h> 19 20 #include <linux/rbtree.h> 20 21 #include <linux/thread_info.h> 21 22 #include <linux/cpumask.h> ··· 1355 1354 1356 1355 #ifdef CONFIG_RT_MUTEXES 1357 1356 /* PI waiters blocked on a rt_mutex held by this task */ 1358 - struct plist_head pi_waiters; 1357 + struct rb_root pi_waiters; 1358 + struct rb_node *pi_waiters_leftmost; 1359 1359 /* Deadlock detection and priority inheritance handling */ 1360 1360 struct rt_mutex_waiter *pi_blocked_on; 1361 1361 #endif
+2 -1
kernel/fork.c
··· 1087 1087 { 1088 1088 raw_spin_lock_init(&p->pi_lock); 1089 1089 #ifdef CONFIG_RT_MUTEXES 1090 - plist_head_init(&p->pi_waiters); 1090 + p->pi_waiters = RB_ROOT; 1091 + p->pi_waiters_leftmost = NULL; 1091 1092 p->pi_blocked_on = NULL; 1092 1093 #endif 1093 1094 }
+2
kernel/futex.c
··· 2316 2316 * code while we sleep on uaddr. 2317 2317 */ 2318 2318 debug_rt_mutex_init_waiter(&rt_waiter); 2319 + RB_CLEAR_NODE(&rt_waiter.pi_tree_entry); 2320 + RB_CLEAR_NODE(&rt_waiter.tree_entry); 2319 2321 rt_waiter.task = NULL; 2320 2322 2321 2323 ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
+2 -6
kernel/locking/rtmutex-debug.c
··· 24 24 #include <linux/kallsyms.h> 25 25 #include <linux/syscalls.h> 26 26 #include <linux/interrupt.h> 27 - #include <linux/plist.h> 27 + #include <linux/rbtree.h> 28 28 #include <linux/fs.h> 29 29 #include <linux/debug_locks.h> 30 30 ··· 57 57 58 58 void rt_mutex_debug_task_free(struct task_struct *task) 59 59 { 60 - DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters)); 60 + DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters)); 61 61 DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); 62 62 } 63 63 ··· 154 154 void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) 155 155 { 156 156 memset(waiter, 0x11, sizeof(*waiter)); 157 - plist_node_init(&waiter->list_entry, MAX_PRIO); 158 - plist_node_init(&waiter->pi_list_entry, MAX_PRIO); 159 157 waiter->deadlock_task_pid = NULL; 160 158 } 161 159 162 160 void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) 163 161 { 164 162 put_pid(waiter->deadlock_task_pid); 165 - DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry)); 166 - DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 167 163 memset(waiter, 0x22, sizeof(*waiter)); 168 164 } 169 165
+121 -30
kernel/locking/rtmutex.c
··· 14 14 #include <linux/export.h> 15 15 #include <linux/sched.h> 16 16 #include <linux/sched/rt.h> 17 + #include <linux/sched/deadline.h> 17 18 #include <linux/timer.h> 18 19 19 20 #include "rtmutex_common.h" ··· 92 91 } 93 92 #endif 94 93 94 + static inline int 95 + rt_mutex_waiter_less(struct rt_mutex_waiter *left, 96 + struct rt_mutex_waiter *right) 97 + { 98 + if (left->task->prio < right->task->prio) 99 + return 1; 100 + 101 + /* 102 + * If both tasks are dl_task(), we check their deadlines. 103 + */ 104 + if (dl_prio(left->task->prio) && dl_prio(right->task->prio)) 105 + return (left->task->dl.deadline < right->task->dl.deadline); 106 + 107 + return 0; 108 + } 109 + 110 + static void 111 + rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) 112 + { 113 + struct rb_node **link = &lock->waiters.rb_node; 114 + struct rb_node *parent = NULL; 115 + struct rt_mutex_waiter *entry; 116 + int leftmost = 1; 117 + 118 + while (*link) { 119 + parent = *link; 120 + entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry); 121 + if (rt_mutex_waiter_less(waiter, entry)) { 122 + link = &parent->rb_left; 123 + } else { 124 + link = &parent->rb_right; 125 + leftmost = 0; 126 + } 127 + } 128 + 129 + if (leftmost) 130 + lock->waiters_leftmost = &waiter->tree_entry; 131 + 132 + rb_link_node(&waiter->tree_entry, parent, link); 133 + rb_insert_color(&waiter->tree_entry, &lock->waiters); 134 + } 135 + 136 + static void 137 + rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) 138 + { 139 + if (RB_EMPTY_NODE(&waiter->tree_entry)) 140 + return; 141 + 142 + if (lock->waiters_leftmost == &waiter->tree_entry) 143 + lock->waiters_leftmost = rb_next(&waiter->tree_entry); 144 + 145 + rb_erase(&waiter->tree_entry, &lock->waiters); 146 + RB_CLEAR_NODE(&waiter->tree_entry); 147 + } 148 + 149 + static void 150 + rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) 151 + { 152 + struct rb_node **link = 
&task->pi_waiters.rb_node; 153 + struct rb_node *parent = NULL; 154 + struct rt_mutex_waiter *entry; 155 + int leftmost = 1; 156 + 157 + while (*link) { 158 + parent = *link; 159 + entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry); 160 + if (rt_mutex_waiter_less(waiter, entry)) { 161 + link = &parent->rb_left; 162 + } else { 163 + link = &parent->rb_right; 164 + leftmost = 0; 165 + } 166 + } 167 + 168 + if (leftmost) 169 + task->pi_waiters_leftmost = &waiter->pi_tree_entry; 170 + 171 + rb_link_node(&waiter->pi_tree_entry, parent, link); 172 + rb_insert_color(&waiter->pi_tree_entry, &task->pi_waiters); 173 + } 174 + 175 + static void 176 + rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) 177 + { 178 + if (RB_EMPTY_NODE(&waiter->pi_tree_entry)) 179 + return; 180 + 181 + if (task->pi_waiters_leftmost == &waiter->pi_tree_entry) 182 + task->pi_waiters_leftmost = rb_next(&waiter->pi_tree_entry); 183 + 184 + rb_erase(&waiter->pi_tree_entry, &task->pi_waiters); 185 + RB_CLEAR_NODE(&waiter->pi_tree_entry); 186 + } 187 + 95 188 /* 96 - * Calculate task priority from the waiter list priority 189 + * Calculate task priority from the waiter tree priority 97 190 * 98 - * Return task->normal_prio when the waiter list is empty or when 191 + * Return task->normal_prio when the waiter tree is empty or when 99 192 * the waiter is not allowed to do priority boosting 100 193 */ 101 194 int rt_mutex_getprio(struct task_struct *task) ··· 197 102 if (likely(!task_has_pi_waiters(task))) 198 103 return task->normal_prio; 199 104 200 - return min(task_top_pi_waiter(task)->pi_list_entry.prio, 105 + return min(task_top_pi_waiter(task)->task->prio, 201 106 task->normal_prio); 202 107 } 203 108 ··· 328 233 * When deadlock detection is off then we check, if further 329 234 * priority adjustment is necessary. 
330 235 */ 331 - if (!detect_deadlock && waiter->list_entry.prio == task->prio) 236 + if (!detect_deadlock && waiter->task->prio == task->prio) 332 237 goto out_unlock_pi; 333 238 334 239 lock = waiter->lock; ··· 349 254 top_waiter = rt_mutex_top_waiter(lock); 350 255 351 256 /* Requeue the waiter */ 352 - plist_del(&waiter->list_entry, &lock->wait_list); 353 - waiter->list_entry.prio = task->prio; 354 - plist_add(&waiter->list_entry, &lock->wait_list); 257 + rt_mutex_dequeue(lock, waiter); 258 + waiter->task->prio = task->prio; 259 + rt_mutex_enqueue(lock, waiter); 355 260 356 261 /* Release the task */ 357 262 raw_spin_unlock_irqrestore(&task->pi_lock, flags); ··· 375 280 376 281 if (waiter == rt_mutex_top_waiter(lock)) { 377 282 /* Boost the owner */ 378 - plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); 379 - waiter->pi_list_entry.prio = waiter->list_entry.prio; 380 - plist_add(&waiter->pi_list_entry, &task->pi_waiters); 283 + rt_mutex_dequeue_pi(task, top_waiter); 284 + rt_mutex_enqueue_pi(task, waiter); 381 285 __rt_mutex_adjust_prio(task); 382 286 383 287 } else if (top_waiter == waiter) { 384 288 /* Deboost the owner */ 385 - plist_del(&waiter->pi_list_entry, &task->pi_waiters); 289 + rt_mutex_dequeue_pi(task, waiter); 386 290 waiter = rt_mutex_top_waiter(lock); 387 - waiter->pi_list_entry.prio = waiter->list_entry.prio; 388 - plist_add(&waiter->pi_list_entry, &task->pi_waiters); 291 + rt_mutex_enqueue_pi(task, waiter); 389 292 __rt_mutex_adjust_prio(task); 390 293 } 391 294 ··· 448 355 * 3) it is top waiter 449 356 */ 450 357 if (rt_mutex_has_waiters(lock)) { 451 - if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { 358 + if (task->prio >= rt_mutex_top_waiter(lock)->task->prio) { 452 359 if (!waiter || waiter != rt_mutex_top_waiter(lock)) 453 360 return 0; 454 361 } ··· 462 369 463 370 /* remove the queued waiter. 
*/ 464 371 if (waiter) { 465 - plist_del(&waiter->list_entry, &lock->wait_list); 372 + rt_mutex_dequeue(lock, waiter); 466 373 task->pi_blocked_on = NULL; 467 374 } 468 375 ··· 472 379 */ 473 380 if (rt_mutex_has_waiters(lock)) { 474 381 top = rt_mutex_top_waiter(lock); 475 - top->pi_list_entry.prio = top->list_entry.prio; 476 - plist_add(&top->pi_list_entry, &task->pi_waiters); 382 + rt_mutex_enqueue_pi(task, top); 477 383 } 478 384 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 479 385 } ··· 508 416 __rt_mutex_adjust_prio(task); 509 417 waiter->task = task; 510 418 waiter->lock = lock; 511 - plist_node_init(&waiter->list_entry, task->prio); 512 - plist_node_init(&waiter->pi_list_entry, task->prio); 513 419 514 420 /* Get the top priority waiter on the lock */ 515 421 if (rt_mutex_has_waiters(lock)) 516 422 top_waiter = rt_mutex_top_waiter(lock); 517 - plist_add(&waiter->list_entry, &lock->wait_list); 423 + rt_mutex_enqueue(lock, waiter); 518 424 519 425 task->pi_blocked_on = waiter; 520 426 ··· 523 433 524 434 if (waiter == rt_mutex_top_waiter(lock)) { 525 435 raw_spin_lock_irqsave(&owner->pi_lock, flags); 526 - plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); 527 - plist_add(&waiter->pi_list_entry, &owner->pi_waiters); 436 + rt_mutex_dequeue_pi(owner, top_waiter); 437 + rt_mutex_enqueue_pi(owner, waiter); 528 438 529 439 __rt_mutex_adjust_prio(owner); 530 440 if (owner->pi_blocked_on) ··· 576 486 * boosted mode and go back to normal after releasing 577 487 * lock->wait_lock. 
578 488 */ 579 - plist_del(&waiter->pi_list_entry, &current->pi_waiters); 489 + rt_mutex_dequeue_pi(current, waiter); 580 490 581 491 rt_mutex_set_owner(lock, NULL); 582 492 ··· 600 510 int chain_walk = 0; 601 511 602 512 raw_spin_lock_irqsave(&current->pi_lock, flags); 603 - plist_del(&waiter->list_entry, &lock->wait_list); 513 + rt_mutex_dequeue(lock, waiter); 604 514 current->pi_blocked_on = NULL; 605 515 raw_spin_unlock_irqrestore(&current->pi_lock, flags); 606 516 ··· 611 521 612 522 raw_spin_lock_irqsave(&owner->pi_lock, flags); 613 523 614 - plist_del(&waiter->pi_list_entry, &owner->pi_waiters); 524 + rt_mutex_dequeue_pi(owner, waiter); 615 525 616 526 if (rt_mutex_has_waiters(lock)) { 617 527 struct rt_mutex_waiter *next; 618 528 619 529 next = rt_mutex_top_waiter(lock); 620 - plist_add(&next->pi_list_entry, &owner->pi_waiters); 530 + rt_mutex_enqueue_pi(owner, next); 621 531 } 622 532 __rt_mutex_adjust_prio(owner); 623 533 ··· 626 536 627 537 raw_spin_unlock_irqrestore(&owner->pi_lock, flags); 628 538 } 629 - 630 - WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 631 539 632 540 if (!chain_walk) 633 541 return; ··· 653 565 raw_spin_lock_irqsave(&task->pi_lock, flags); 654 566 655 567 waiter = task->pi_blocked_on; 656 - if (!waiter || waiter->list_entry.prio == task->prio) { 568 + if (!waiter || waiter->task->prio == task->prio) { 657 569 raw_spin_unlock_irqrestore(&task->pi_lock, flags); 658 570 return; 659 571 } ··· 726 638 int ret = 0; 727 639 728 640 debug_rt_mutex_init_waiter(&waiter); 641 + RB_CLEAR_NODE(&waiter.pi_tree_entry); 642 + RB_CLEAR_NODE(&waiter.tree_entry); 729 643 730 644 raw_spin_lock(&lock->wait_lock); 731 645 ··· 994 904 { 995 905 lock->owner = NULL; 996 906 raw_spin_lock_init(&lock->wait_lock); 997 - plist_head_init(&lock->wait_list); 907 + lock->waiters = RB_ROOT; 908 + lock->waiters_leftmost = NULL; 998 909 999 910 debug_rt_mutex_init(lock, name); 1000 911 }
+11 -11
kernel/locking/rtmutex_common.h
··· 40 40 * This is the control structure for tasks blocked on a rt_mutex, 41 41 * which is allocated on the kernel stack on of the blocked task. 42 42 * 43 - * @list_entry: pi node to enqueue into the mutex waiters list 44 - * @pi_list_entry: pi node to enqueue into the mutex owner waiters list 43 + * @tree_entry: pi node to enqueue into the mutex waiters tree 44 + * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree 45 45 * @task: task reference to the blocked task 46 46 */ 47 47 struct rt_mutex_waiter { 48 - struct plist_node list_entry; 49 - struct plist_node pi_list_entry; 48 + struct rb_node tree_entry; 49 + struct rb_node pi_tree_entry; 50 50 struct task_struct *task; 51 51 struct rt_mutex *lock; 52 52 #ifdef CONFIG_DEBUG_RT_MUTEXES ··· 57 57 }; 58 58 59 59 /* 60 - * Various helpers to access the waiters-plist: 60 + * Various helpers to access the waiters-tree: 61 61 */ 62 62 static inline int rt_mutex_has_waiters(struct rt_mutex *lock) 63 63 { 64 - return !plist_head_empty(&lock->wait_list); 64 + return !RB_EMPTY_ROOT(&lock->waiters); 65 65 } 66 66 67 67 static inline struct rt_mutex_waiter * ··· 69 69 { 70 70 struct rt_mutex_waiter *w; 71 71 72 - w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, 73 - list_entry); 72 + w = rb_entry(lock->waiters_leftmost, struct rt_mutex_waiter, 73 + tree_entry); 74 74 BUG_ON(w->lock != lock); 75 75 76 76 return w; ··· 78 78 79 79 static inline int task_has_pi_waiters(struct task_struct *p) 80 80 { 81 - return !plist_head_empty(&p->pi_waiters); 81 + return !RB_EMPTY_ROOT(&p->pi_waiters); 82 82 } 83 83 84 84 static inline struct rt_mutex_waiter * 85 85 task_top_pi_waiter(struct task_struct *p) 86 86 { 87 - return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter, 88 - pi_list_entry); 87 + return rb_entry(p->pi_waiters_leftmost, struct rt_mutex_waiter, 88 + pi_tree_entry); 89 89 } 90 90 91 91 /*
-4
kernel/sched/core.c
··· 6635 6635 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6636 6636 #endif 6637 6637 6638 - #ifdef CONFIG_RT_MUTEXES 6639 - plist_head_init(&init_task.pi_waiters); 6640 - #endif 6641 - 6642 6638 /* 6643 6639 * The boot idle thread does lazy MMU switching as well: 6644 6640 */