Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

iocost: protect iocg->abs_vdebt with iocg->waitq.lock

abs_vdebt is an atomic64_t which tracks how much over budget a given cgroup
is and controls the activation of use_delay mechanism. Once a cgroup goes
over budget from forced IOs, it has to pay it back with its future budget.
The progress guarantee on debt paying comes from the iocg being active -
active iocgs are processed by the periodic timer, which ensures that as time
passes the debts dissipate and the iocg returns to normal operation.

However, both iocg activation and vdebt handling are asynchronous and a
sequence like the following may happen.

1. The iocg is in the process of being deactivated by the periodic timer.

2. A bio enters ioc_rqos_throttle(), calls iocg_activate() which returns
without doing anything because it still sees that the iocg is already active.

3. The iocg is deactivated.

4. The bio from #2 is over budget but needs to be forced. It increases
abs_vdebt and goes over the threshold and enables use_delay.

5. IO control is enabled for the iocg's subtree and now IOs are attributed
to the descendant cgroups and the iocg itself no longer issues IOs.

This leaves the iocg with a stuck abs_vdebt - it has debt but is inactive and
there are no further IOs which can activate it. This can end up unduly
punishing all the descendant cgroups.

The usual throttling path has the same issue - the iocg must be active while
throttled to ensure that a future event will wake it up - and solves the
problem by synchronizing the throttling path with a spinlock. abs_vdebt
handling is another form of overage handling and shares a lot of
characteristics including the fact that it isn't in the hottest path.

This patch fixes the above and other possible races by strictly
synchronizing abs_vdebt and use_delay handling with iocg->waitq.lock.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Vlad Dmitriev <vvd@fb.com>
Cc: stable@vger.kernel.org # v5.4+
Fixes: e1518f63f246 ("blk-iocost: Don't let merges push vtime into the future")
Signed-off-by: Jens Axboe <axboe@kernel.dk>

authored by

Tejun Heo and committed by
Jens Axboe
0b80f986 10c70d95

+77 -47
+71 -46
block/blk-iocost.c
··· 466 466 */ 467 467 atomic64_t vtime; 468 468 atomic64_t done_vtime; 469 - atomic64_t abs_vdebt; 469 + u64 abs_vdebt; 470 470 u64 last_vtime; 471 471 472 472 /* ··· 1142 1142 struct iocg_wake_ctx ctx = { .iocg = iocg }; 1143 1143 u64 margin_ns = (u64)(ioc->period_us * 1144 1144 WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC; 1145 - u64 abs_vdebt, vdebt, vshortage, expires, oexpires; 1145 + u64 vdebt, vshortage, expires, oexpires; 1146 1146 s64 vbudget; 1147 1147 u32 hw_inuse; 1148 1148 ··· 1152 1152 vbudget = now->vnow - atomic64_read(&iocg->vtime); 1153 1153 1154 1154 /* pay off debt */ 1155 - abs_vdebt = atomic64_read(&iocg->abs_vdebt); 1156 - vdebt = abs_cost_to_cost(abs_vdebt, hw_inuse); 1155 + vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); 1157 1156 if (vdebt && vbudget > 0) { 1158 1157 u64 delta = min_t(u64, vbudget, vdebt); 1159 1158 u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse), 1160 - abs_vdebt); 1159 + iocg->abs_vdebt); 1161 1160 1162 1161 atomic64_add(delta, &iocg->vtime); 1163 1162 atomic64_add(delta, &iocg->done_vtime); 1164 - atomic64_sub(abs_delta, &iocg->abs_vdebt); 1165 - if (WARN_ON_ONCE(atomic64_read(&iocg->abs_vdebt) < 0)) 1166 - atomic64_set(&iocg->abs_vdebt, 0); 1163 + iocg->abs_vdebt -= abs_delta; 1167 1164 } 1168 1165 1169 1166 /* ··· 1216 1219 u64 expires, oexpires; 1217 1220 u32 hw_inuse; 1218 1221 1222 + lockdep_assert_held(&iocg->waitq.lock); 1223 + 1219 1224 /* debt-adjust vtime */ 1220 1225 current_hweight(iocg, NULL, &hw_inuse); 1221 - vtime += abs_cost_to_cost(atomic64_read(&iocg->abs_vdebt), hw_inuse); 1226 + vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); 1222 1227 1223 - /* clear or maintain depending on the overage */ 1224 - if (time_before_eq64(vtime, now->vnow)) { 1228 + /* 1229 + * Clear or maintain depending on the overage. Non-zero vdebt is what 1230 + * guarantees that @iocg is online and future iocg_kick_delay() will 1231 + * clear use_delay. Don't leave it on when there's no vdebt. 
1232 + */ 1233 + if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) { 1225 1234 blkcg_clear_delay(blkg); 1226 1235 return false; 1227 1236 } ··· 1261 1258 { 1262 1259 struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer); 1263 1260 struct ioc_now now; 1261 + unsigned long flags; 1264 1262 1263 + spin_lock_irqsave(&iocg->waitq.lock, flags); 1265 1264 ioc_now(iocg->ioc, &now); 1266 1265 iocg_kick_delay(iocg, &now, 0); 1266 + spin_unlock_irqrestore(&iocg->waitq.lock, flags); 1267 1267 1268 1268 return HRTIMER_NORESTART; 1269 1269 } ··· 1374 1368 * should have woken up in the last period and expire idle iocgs. 1375 1369 */ 1376 1370 list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { 1377 - if (!waitqueue_active(&iocg->waitq) && 1378 - !atomic64_read(&iocg->abs_vdebt) && !iocg_is_idle(iocg)) 1371 + if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && 1372 + !iocg_is_idle(iocg)) 1379 1373 continue; 1380 1374 1381 1375 spin_lock(&iocg->waitq.lock); 1382 1376 1383 - if (waitqueue_active(&iocg->waitq) || 1384 - atomic64_read(&iocg->abs_vdebt)) { 1377 + if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) { 1385 1378 /* might be oversleeping vtime / hweight changes, kick */ 1386 1379 iocg_kick_waitq(iocg, &now); 1387 1380 iocg_kick_delay(iocg, &now, 0); ··· 1723 1718 * tests are racy but the races aren't systemic - we only miss once 1724 1719 * in a while which is fine. 1725 1720 */ 1726 - if (!waitqueue_active(&iocg->waitq) && 1727 - !atomic64_read(&iocg->abs_vdebt) && 1721 + if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && 1728 1722 time_before_eq64(vtime + cost, now.vnow)) { 1729 1723 iocg_commit_bio(iocg, bio, cost); 1730 1724 return; 1731 1725 } 1732 1726 1733 1727 /* 1734 - * We're over budget. If @bio has to be issued regardless, 1735 - * remember the abs_cost instead of advancing vtime. 1736 - * iocg_kick_waitq() will pay off the debt before waking more IOs. 
1728 + * We activated above but w/o any synchronization. Deactivation is 1729 + * synchronized with waitq.lock and we won't get deactivated as long 1730 + * as we're waiting or has debt, so we're good if we're activated 1731 + * here. In the unlikely case that we aren't, just issue the IO. 1732 + */ 1733 + spin_lock_irq(&iocg->waitq.lock); 1734 + 1735 + if (unlikely(list_empty(&iocg->active_list))) { 1736 + spin_unlock_irq(&iocg->waitq.lock); 1737 + iocg_commit_bio(iocg, bio, cost); 1738 + return; 1739 + } 1740 + 1741 + /* 1742 + * We're over budget. If @bio has to be issued regardless, remember 1743 + * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay 1744 + * off the debt before waking more IOs. 1745 + * 1737 1746 * This way, the debt is continuously paid off each period with the 1738 - * actual budget available to the cgroup. If we just wound vtime, 1739 - * we would incorrectly use the current hw_inuse for the entire 1740 - * amount which, for example, can lead to the cgroup staying 1741 - * blocked for a long time even with substantially raised hw_inuse. 1747 + * actual budget available to the cgroup. If we just wound vtime, we 1748 + * would incorrectly use the current hw_inuse for the entire amount 1749 + * which, for example, can lead to the cgroup staying blocked for a 1750 + * long time even with substantially raised hw_inuse. 1751 + * 1752 + * An iocg with vdebt should stay online so that the timer can keep 1753 + * deducting its vdebt and [de]activate use_delay mechanism 1754 + * accordingly. We don't want to race against the timer trying to 1755 + * clear them and leave @iocg inactive w/ dangling use_delay heavily 1756 + * penalizing the cgroup and its descendants. 
1742 1757 */ 1743 1758 if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) { 1744 - atomic64_add(abs_cost, &iocg->abs_vdebt); 1759 + iocg->abs_vdebt += abs_cost; 1745 1760 if (iocg_kick_delay(iocg, &now, cost)) 1746 1761 blkcg_schedule_throttle(rqos->q, 1747 1762 (bio->bi_opf & REQ_SWAP) == REQ_SWAP); 1763 + spin_unlock_irq(&iocg->waitq.lock); 1748 1764 return; 1749 1765 } 1750 1766 ··· 1782 1756 * All waiters are on iocg->waitq and the wait states are 1783 1757 * synchronized using waitq.lock. 1784 1758 */ 1785 - spin_lock_irq(&iocg->waitq.lock); 1786 - 1787 - /* 1788 - * We activated above but w/o any synchronization. Deactivation is 1789 - * synchronized with waitq.lock and we won't get deactivated as 1790 - * long as we're waiting, so we're good if we're activated here. 1791 - * In the unlikely case that we are deactivated, just issue the IO. 1792 - */ 1793 - if (unlikely(list_empty(&iocg->active_list))) { 1794 - spin_unlock_irq(&iocg->waitq.lock); 1795 - iocg_commit_bio(iocg, bio, cost); 1796 - return; 1797 - } 1798 - 1799 1759 init_waitqueue_func_entry(&wait.wait, iocg_wake_fn); 1800 1760 wait.wait.private = current; 1801 1761 wait.bio = bio; ··· 1813 1801 struct ioc_now now; 1814 1802 u32 hw_inuse; 1815 1803 u64 abs_cost, cost; 1804 + unsigned long flags; 1816 1805 1817 1806 /* bypass if disabled or for root cgroup */ 1818 1807 if (!ioc->enabled || !iocg->level) ··· 1833 1820 iocg->cursor = bio_end; 1834 1821 1835 1822 /* 1836 - * Charge if there's enough vtime budget and the existing request 1837 - * has cost assigned. Otherwise, account it as debt. See debt 1838 - * handling in ioc_rqos_throttle() for details. 1823 + * Charge if there's enough vtime budget and the existing request has 1824 + * cost assigned. 
1839 1825 */ 1840 1826 if (rq->bio && rq->bio->bi_iocost_cost && 1841 - time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) 1827 + time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) { 1842 1828 iocg_commit_bio(iocg, bio, cost); 1843 - else 1844 - atomic64_add(abs_cost, &iocg->abs_vdebt); 1829 + return; 1830 + } 1831 + 1832 + /* 1833 + * Otherwise, account it as debt if @iocg is online, which it should 1834 + * be for the vast majority of cases. See debt handling in 1835 + * ioc_rqos_throttle() for details. 1836 + */ 1837 + spin_lock_irqsave(&iocg->waitq.lock, flags); 1838 + if (likely(!list_empty(&iocg->active_list))) { 1839 + iocg->abs_vdebt += abs_cost; 1840 + iocg_kick_delay(iocg, &now, cost); 1841 + } else { 1842 + iocg_commit_bio(iocg, bio, cost); 1843 + } 1844 + spin_unlock_irqrestore(&iocg->waitq.lock, flags); 1845 1845 } 1846 1846 1847 1847 static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio) ··· 2024 1998 iocg->ioc = ioc; 2025 1999 atomic64_set(&iocg->vtime, now.vnow); 2026 2000 atomic64_set(&iocg->done_vtime, now.vnow); 2027 - atomic64_set(&iocg->abs_vdebt, 0); 2028 2001 atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); 2029 2002 INIT_LIST_HEAD(&iocg->active_list); 2030 2003 iocg->hweight_active = HWEIGHT_WHOLE;
+6 -1
tools/cgroup/iocost_monitor.py
··· 159 159 else: 160 160 self.inflight_pct = 0 161 161 162 - self.debt_ms = iocg.abs_vdebt.counter.value_() / VTIME_PER_USEC / 1000 162 + # vdebt used to be an atomic64_t and is now u64, support both 163 + try: 164 + self.debt_ms = iocg.abs_vdebt.counter.value_() / VTIME_PER_USEC / 1000 165 + except: 166 + self.debt_ms = iocg.abs_vdebt.value_() / VTIME_PER_USEC / 1000 167 + 163 168 self.use_delay = blkg.use_delay.counter.value_() 164 169 self.delay_ms = blkg.delay_nsec.counter.value_() / 1_000_000 165 170