sched/deadline: Fix priority inheritance with multiple scheduling classes

Glenn reported that "an application [he developed produces] a BUG in
deadline.c when a SCHED_DEADLINE task contends with CFS tasks on nested
PTHREAD_PRIO_INHERIT mutexes. I believe the bug is triggered when a CFS
task that was boosted by a SCHED_DEADLINE task boosts another CFS task
(nested priority inheritance).

------------[ cut here ]------------
kernel BUG at kernel/sched/deadline.c:1462!
invalid opcode: 0000 [#1] PREEMPT SMP
CPU: 12 PID: 19171 Comm: dl_boost_bug Tainted: ...
Hardware name: ...
RIP: 0010:enqueue_task_dl+0x335/0x910
Code: ...
RSP: 0018:ffffc9000c2bbc68 EFLAGS: 00010002
RAX: 0000000000000009 RBX: ffff888c0af94c00 RCX: ffffffff81e12500
RDX: 000000000000002e RSI: ffff888c0af94c00 RDI: ffff888c10b22600
RBP: ffffc9000c2bbd08 R08: 0000000000000009 R09: 0000000000000078
R10: ffffffff81e12440 R11: ffffffff81e1236c R12: ffff888bc8932600
R13: ffff888c0af94eb8 R14: ffff888c10b22600 R15: ffff888bc8932600
FS: 00007fa58ac55700(0000) GS:ffff888c10b00000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007fa58b523230 CR3: 0000000bf44ab003 CR4: 00000000007606e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
PKRU: 55555554
Call Trace:
? intel_pstate_update_util_hwp+0x13/0x170
rt_mutex_setprio+0x1cc/0x4b0
task_blocks_on_rt_mutex+0x225/0x260
rt_spin_lock_slowlock_locked+0xab/0x2d0
rt_spin_lock_slowlock+0x50/0x80
hrtimer_grab_expiry_lock+0x20/0x30
hrtimer_cancel+0x13/0x30
do_nanosleep+0xa0/0x150
hrtimer_nanosleep+0xe1/0x230
? __hrtimer_init_sleeper+0x60/0x60
__x64_sys_nanosleep+0x8d/0xa0
do_syscall_64+0x4a/0x100
entry_SYSCALL_64_after_hwframe+0x49/0xbe
RIP: 0033:0x7fa58b52330d
...
---[ end trace 0000000000000002 ]---

He also provided a simple reproducer creating the situation below:

The execution order of the locking steps is the following
(N1 and N2 are non-deadline tasks; D1 is a deadline task; M1 and M2
are mutexes enabled with priority inheritance.)

Time moves forward as this timeline goes down:

 N1                     N2                     D1
 |                      |                      |
 |                      |                      |
 Lock(M1)               |                      |
 |                      |                      |
 |                      Lock(M2)               |
 |                      |                      |
 |                      |                      Lock(M2)
 |                      |                      |
 |                      Lock(M1)               |
 |                      (!!bug triggered!)     |
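
Glenn's actual reproducer is not included here; a minimal sketch of a
program forcing this locking order could look as follows. Everything in
it is illustrative: thread/mutex names mirror the diagram, the
sleep()-based sequencing and the DEADLINE parameters are made up, the
local struct dl_sched_attr is our own copy of the sched_setattr() ABI
(glibc does not export one), sched_setattr() needs root, and only
pre-fix kernels are expected to crash.

	#define _GNU_SOURCE
	#include <pthread.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/types.h>

	#ifndef SCHED_DEADLINE
	#define SCHED_DEADLINE	6
	#endif

	/* Minimal local copy of the sched_setattr() ABI. */
	struct dl_sched_attr {
		__u32 size;
		__u32 sched_policy;
		__u64 sched_flags;
		__s32 sched_nice;
		__u32 sched_priority;
		__u64 sched_runtime;
		__u64 sched_deadline;
		__u64 sched_period;
	};

	static pthread_mutex_t m1, m2;

	static void *n1_fn(void *arg)	/* N1: plain CFS */
	{
		pthread_mutex_lock(&m1);	/* t=0: N1 takes M1 */
		sleep(4);
		pthread_mutex_unlock(&m1);
		return NULL;
	}

	static void *n2_fn(void *arg)	/* N2: plain CFS */
	{
		sleep(1);
		pthread_mutex_lock(&m2);	/* t=1: N2 takes M2 */
		sleep(2);
		pthread_mutex_lock(&m1);	/* t=3: N2, now DL-boosted by D1,
						 * blocks on M1 and boosts N1:
						 * nested PI, bug on pre-fix kernels */
		pthread_mutex_unlock(&m1);
		pthread_mutex_unlock(&m2);
		return NULL;
	}

	static void *d1_fn(void *arg)	/* D1: SCHED_DEADLINE */
	{
		struct dl_sched_attr attr = {
			.size		= sizeof(attr),
			.sched_policy	= SCHED_DEADLINE,
			.sched_runtime	=  10 * 1000 * 1000,	/*  10ms */
			.sched_deadline	= 100 * 1000 * 1000,	/* 100ms */
			.sched_period	= 100 * 1000 * 1000,	/* 100ms */
		};

		if (syscall(SYS_sched_setattr, 0, &attr, 0))
			perror("sched_setattr");	/* needs root */
		sleep(2);
		pthread_mutex_lock(&m2);	/* t=2: D1 blocks on M2, boosts N2 */
		pthread_mutex_unlock(&m2);
		return NULL;
	}

	int main(void)
	{
		pthread_mutexattr_t ma;
		pthread_t n1, n2, d1;

		pthread_mutexattr_init(&ma);
		pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_INHERIT);
		pthread_mutex_init(&m1, &ma);
		pthread_mutex_init(&m2, &ma);

		pthread_create(&n1, NULL, n1_fn, NULL);
		pthread_create(&n2, NULL, n2_fn, NULL);
		pthread_create(&d1, NULL, d1_fn, NULL);
		pthread_join(n1, NULL);
		pthread_join(n2, NULL);
		pthread_join(d1, NULL);
		return 0;
	}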

Daniel reported a similar situation as well, by just letting ksoftirqd
run with DEADLINE (and eventually block on a mutex).

The problem is that boosted entities (Priority Inheritance) use the
static DEADLINE parameters of the top priority waiter. However, the top
waiter can be a non-DEADLINE entity that is itself currently boosted by
a DEADLINE entity from a different lock chain (i.e., nested priority
chains involving entities of non-DEADLINE classes). In this case, the
top waiter's static DEADLINE parameters can still be zero (they are
initialized to 0 at fork()) and replenish_dl_entity() hits a BUG().
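
Concretely, with the locking order above: when N2 (top waiter of N1,
statically CFS) drags N1 into the DEADLINE class, the pre-fix enqueue
path picks parameters like this (annotated excerpt of the removed lines
from the diff below):

	/* enqueue_task_dl(), pre-fix: dl_prio(N2->normal_prio) is false
	 * because N2 is statically CFS, so pi_se keeps pointing at N1's
	 * own, never-initialized entity. (Using &pi_task->dl would not
	 * help either: N2's static parameters are zero as well, only D1
	 * carries valid ones.)
	 */
	struct task_struct *pi_task = rt_mutex_get_top_task(p);
	struct sched_dl_entity *pi_se = &p->dl;

	if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
		pi_se = &pi_task->dl;
		...
	}

	/* ... and replenish_dl_entity(dl_se, pi_se) starts with: */
	BUG_ON(pi_se->dl_runtime <= 0);		/* 0 <= 0 -> BUG */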

Fix this by keeping track of the original donor and using its parameters
when a task is boosted.
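
For the nested case this works because the donor pointer is copied from
the donor's own pi_se, so a chain of boosts always collapses to the one
entity holding valid static parameters (annotated sketch of the new
logic; see rt_mutex_setprio() and pi_of() in the diff below):

	/* rt_mutex_setprio(): N1, boosted through N2, inherits N2's
	 * donor in one step, i.e. N1->dl.pi_se == N2->dl.pi_se == &D1->dl.
	 */
	p->dl.pi_se = pi_task->dl.pi_se;

	/* consumers now read parameters through the donor: */
	BUG_ON(pi_of(dl_se)->dl_runtime <= 0);	/* D1's runtime > 0: OK */

The dl_boosted flag becomes redundant: is_dl_boosted() is simply
pi_se != dl_se, and deboosting resets pi_se back to the entity itself.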

Reported-by: Glenn Elliott <glenn@aurora.tech>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: Juri Lelli <juri.lelli@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Tested-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Link: https://lkml.kernel.org/r/20201117061432.517340-1-juri.lelli@redhat.com

Changed files (+68 -50):

include/linux/sched.h (+9 -1)

···
 	 * overruns.
 	 */
 	unsigned int			dl_throttled      : 1;
-	unsigned int			dl_boosted        : 1;
 	unsigned int			dl_yielded        : 1;
 	unsigned int			dl_non_contending : 1;
 	unsigned int			dl_overrun	  : 1;
···
 	 * time.
 	 */
 	struct hrtimer			inactive_timer;
+
+#ifdef CONFIG_RT_MUTEXES
+	/*
+	 * Priority Inheritance. When a DEADLINE scheduling entity is boosted
+	 * pi_se points to the donor, otherwise points to the dl_se it belongs
+	 * to (the original one/itself).
+	 */
+	struct sched_dl_entity *pi_se;
+#endif
 };
 
 #ifdef CONFIG_UCLAMP_TASK
kernel/sched/core.c (+6 -5)

···
 		if (!dl_prio(p->normal_prio) ||
 		    (pi_task && dl_prio(pi_task->prio) &&
 		     dl_entity_preempt(&pi_task->dl, &p->dl))) {
-			p->dl.dl_boosted = 1;
+			p->dl.pi_se = pi_task->dl.pi_se;
 			queue_flag |= ENQUEUE_REPLENISH;
-		} else
-			p->dl.dl_boosted = 0;
+		} else {
+			p->dl.pi_se = &p->dl;
+		}
 		p->sched_class = &dl_sched_class;
 	} else if (rt_prio(prio)) {
 		if (dl_prio(oldprio))
-			p->dl.dl_boosted = 0;
+			p->dl.pi_se = &p->dl;
 		if (oldprio < prio)
 			queue_flag |= ENQUEUE_HEAD;
 		p->sched_class = &rt_sched_class;
 	} else {
 		if (dl_prio(oldprio))
-			p->dl.dl_boosted = 0;
+			p->dl.pi_se = &p->dl;
 		if (rt_prio(oldprio))
 			p->rt.timeout = 0;
 		p->sched_class = &fair_sched_class;
kernel/sched/deadline.c (+53 -44)

···
 	return !RB_EMPTY_NODE(&dl_se->rb_node);
 }
 
+#ifdef CONFIG_RT_MUTEXES
+static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
+{
+	return dl_se->pi_se;
+}
+
+static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
+{
+	return pi_of(dl_se) != dl_se;
+}
+#else
+static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se)
+{
+	return dl_se;
+}
+
+static inline bool is_dl_boosted(struct sched_dl_entity *dl_se)
+{
+	return false;
+}
+#endif
+
 #ifdef CONFIG_SMP
 static inline struct dl_bw *dl_bw_of(int i)
 {
···
 	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
 	struct rq *rq = rq_of_dl_rq(dl_rq);
 
-	WARN_ON(dl_se->dl_boosted);
+	WARN_ON(is_dl_boosted(dl_se));
 	WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
 
 	/*
···
 * could happen are, typically, a entity voluntarily trying to overcome its
 * runtime, or it just underestimated it during sched_setattr().
 */
-static void replenish_dl_entity(struct sched_dl_entity *dl_se,
-				struct sched_dl_entity *pi_se)
+static void replenish_dl_entity(struct sched_dl_entity *dl_se)
 {
 	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
 	struct rq *rq = rq_of_dl_rq(dl_rq);
 
-	BUG_ON(pi_se->dl_runtime <= 0);
+	BUG_ON(pi_of(dl_se)->dl_runtime <= 0);
 
 	/*
 	 * This could be the case for a !-dl task that is boosted.
 	 * Just go with full inherited parameters.
 	 */
 	if (dl_se->dl_deadline == 0) {
-		dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
-		dl_se->runtime = pi_se->dl_runtime;
+		dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
+		dl_se->runtime = pi_of(dl_se)->dl_runtime;
 	}
 
 	if (dl_se->dl_yielded && dl_se->runtime > 0)
···
 	 * arbitrary large.
 	 */
 	while (dl_se->runtime <= 0) {
-		dl_se->deadline += pi_se->dl_period;
-		dl_se->runtime += pi_se->dl_runtime;
+		dl_se->deadline += pi_of(dl_se)->dl_period;
+		dl_se->runtime += pi_of(dl_se)->dl_runtime;
 	}
 
 	/*
···
 	 */
 	if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
 		printk_deferred_once("sched: DL replenish lagged too much\n");
-		dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
-		dl_se->runtime = pi_se->dl_runtime;
+		dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
+		dl_se->runtime = pi_of(dl_se)->dl_runtime;
 	}
 
 	if (dl_se->dl_yielded)
···
 * task with deadline equal to period this is the same of using
 * dl_period instead of dl_deadline in the equation above.
 */
-static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
-			       struct sched_dl_entity *pi_se, u64 t)
+static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t)
 {
 	u64 left, right;
···
 	 * of anything below microseconds resolution is actually fiction
 	 * (but still we want to give the user that illusion >;).
 	 */
-	left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
+	left = (pi_of(dl_se)->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
 	right = ((dl_se->deadline - t) >> DL_SCALE) *
-		(pi_se->dl_runtime >> DL_SCALE);
+		(pi_of(dl_se)->dl_runtime >> DL_SCALE);
 
 	return dl_time_before(right, left);
 }
···
 * Please refer to the comments update_dl_revised_wakeup() function to find
 * more about the Revised CBS rule.
 */
-static void update_dl_entity(struct sched_dl_entity *dl_se,
-			     struct sched_dl_entity *pi_se)
+static void update_dl_entity(struct sched_dl_entity *dl_se)
 {
 	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
 	struct rq *rq = rq_of_dl_rq(dl_rq);
 
 	if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
-	    dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
+	    dl_entity_overflow(dl_se, rq_clock(rq))) {
 
 		if (unlikely(!dl_is_implicit(dl_se) &&
 			     !dl_time_before(dl_se->deadline, rq_clock(rq)) &&
-			     !dl_se->dl_boosted)){
+			     !is_dl_boosted(dl_se))) {
 			update_dl_revised_wakeup(dl_se, rq);
 			return;
 		}
 
-		dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
-		dl_se->runtime = pi_se->dl_runtime;
+		dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline;
+		dl_se->runtime = pi_of(dl_se)->dl_runtime;
 	}
 }
···
 	 * The task might have been boosted by someone else and might be in the
 	 * boosting/deboosting path, its not throttled.
 	 */
-	if (dl_se->dl_boosted)
+	if (is_dl_boosted(dl_se))
 		goto unlock;
 
 	/*
···
 	 * but do not enqueue -- wait for our wakeup to do that.
 	 */
 	if (!task_on_rq_queued(p)) {
-		replenish_dl_entity(dl_se, dl_se);
+		replenish_dl_entity(dl_se);
 		goto unlock;
 	}
···
 
 	if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
 	    dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
-		if (unlikely(dl_se->dl_boosted || !start_dl_timer(p)))
+		if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p)))
 			return;
 		dl_se->dl_throttled = 1;
 		if (dl_se->runtime > 0)
···
 			dl_se->dl_overrun = 1;
 
 		__dequeue_task_dl(rq, curr, 0);
-		if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
+		if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr)))
 			enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
 
 		if (!is_leftmost(curr, &rq->dl))
···
 }
 
 static void
-enqueue_dl_entity(struct sched_dl_entity *dl_se,
-		  struct sched_dl_entity *pi_se, int flags)
+enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags)
 {
 	BUG_ON(on_dl_rq(dl_se));
···
 	 */
 	if (flags & ENQUEUE_WAKEUP) {
 		task_contending(dl_se, flags);
-		update_dl_entity(dl_se, pi_se);
+		update_dl_entity(dl_se);
 	} else if (flags & ENQUEUE_REPLENISH) {
-		replenish_dl_entity(dl_se, pi_se);
+		replenish_dl_entity(dl_se);
 	} else if ((flags & ENQUEUE_RESTORE) &&
 		  dl_time_before(dl_se->deadline,
 				 rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
···
 
 static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 {
-	struct task_struct *pi_task = rt_mutex_get_top_task(p);
-	struct sched_dl_entity *pi_se = &p->dl;
-
-	/*
-	 * Use the scheduling parameters of the top pi-waiter task if:
-	 * - we have a top pi-waiter which is a SCHED_DEADLINE task AND
-	 * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is
-	 *   smaller than our deadline OR we are a !SCHED_DEADLINE task getting
-	 *   boosted due to a SCHED_DEADLINE pi-waiter).
-	 * Otherwise we keep our runtime and deadline.
-	 */
-	if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
-		pi_se = &pi_task->dl;
+	if (is_dl_boosted(&p->dl)) {
 		/*
 		 * Because of delays in the detection of the overrun of a
 		 * thread's runtime, it might be the case that a thread
···
 		 * the throttle.
 		 */
 		p->dl.dl_throttled = 0;
-		BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
+		BUG_ON(!is_dl_boosted(&p->dl) || flags != ENQUEUE_REPLENISH);
 		return;
 	}
···
 		return;
 	}
 
-	enqueue_dl_entity(&p->dl, pi_se, flags);
+	enqueue_dl_entity(&p->dl, flags);
 
 	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
 		enqueue_pushable_dl_task(rq, p);
···
 	dl_se->dl_bw			= 0;
 	dl_se->dl_density		= 0;
 
-	dl_se->dl_boosted		= 0;
 	dl_se->dl_throttled		= 0;
 	dl_se->dl_yielded		= 0;
 	dl_se->dl_non_contending	= 0;
 	dl_se->dl_overrun		= 0;
+
+#ifdef CONFIG_RT_MUTEXES
+	dl_se->pi_se			= dl_se;
+#endif
 }
 
 bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)