sched/wait: Break up long wake list walk

We encountered workloads that have very long wake up lists on large
systems. A waker takes a long time to traverse the entire wake list and
execute all the wake functions.

We saw page wait lists that are up to 3700+ entries long in tests of
large 4- and 8-socket systems. It took 0.8 sec to traverse such a list
during wake up. Any other CPU that contends for the list spin lock will
spin for a long time. This is a result of the NUMA balancing migration
of hot pages that are shared by many threads.

Multiple waking CPUs are queued up behind the lock, and the last one
queued has to wait until all the CPUs ahead of it have completed their
wakeups.

The page wait list is traversed with interrupts disabled, which caused
various problems. This was the original cause that triggered the NMI
watchdog timer in: https://patchwork.kernel.org/patch/9800303/ . Only
extending the NMI watchdog timer there helped.

This patch bookmarks the waker's scan position in the wake list and
breaks up the wake up walk, to allow other CPUs to access the list
before the waker resumes its walk down the rest of the wait list. It
lowers the interrupt and rescheduling latency.
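
To make the idea concrete, here is a minimal userspace sketch of the
bookmark technique, assuming a pthread mutex in place of the wait-queue
spinlock and a printf in place of the wake function; the names (struct
waiter, wake_all_chunked, WALK_BREAK_CNT) are illustrative only and are
not the kernel's API. The real change is in the diff below. The waker
walks at most a fixed number of entries per lock hold, parks a dummy
bookmark node where it stopped, drops the lock so other CPUs can get at
the list, and then resumes from the bookmark:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define WALK_BREAK_CNT 64	/* wake at most this many entries per lock hold */

struct waiter {
	struct waiter *next, *prev;
	bool is_bookmark;	/* dummy node marking where a walk stopped */
	int id;
};

static struct waiter head = { &head, &head, false, -1 };	/* circular list head */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void list_add_before(struct waiter *node, struct waiter *pos)
{
	node->prev = pos->prev;
	node->next = pos;
	pos->prev->next = node;
	pos->prev = node;
}

static void list_del_entry(struct waiter *node)
{
	node->prev->next = node->next;
	node->next->prev = node->prev;
}

/* "Wake" (print and unlink) every waiter, dropping the lock every WALK_BREAK_CNT entries. */
static void wake_all_chunked(void)
{
	struct waiter bookmark = { .is_bookmark = true };
	bool parked = false;

	do {
		struct waiter *curr, *next;
		int cnt = 0;

		pthread_mutex_lock(&lock);
		if (parked) {
			/* Resume right after where the previous pass stopped. */
			curr = bookmark.next;
			list_del_entry(&bookmark);
			parked = false;
		} else {
			curr = head.next;
		}

		for (; curr != &head; curr = next) {
			next = curr->next;
			if (curr->is_bookmark)
				continue;	/* skip bookmarks parked by other wakers */

			printf("waking waiter %d\n", curr->id);	/* stand-in for curr->func() */
			list_del_entry(curr);
			free(curr);

			if (++cnt > WALK_BREAK_CNT && next != &head) {
				/* Park the bookmark and release the lock so others can get in. */
				list_add_before(&bookmark, next);
				parked = true;
				break;
			}
		}
		pthread_mutex_unlock(&lock);
	} while (parked);
}

int main(void)
{
	for (int i = 0; i < 200; i++) {
		struct waiter *w = calloc(1, sizeof(*w));
		w->id = i;
		list_add_before(w, &head);	/* enqueue at the tail */
	}
	wake_all_chunked();
	return 0;
}

Using a dummy list node instead of remembering a pointer to the next
waiter matters because a real waiter can be removed from the queue while
the lock is dropped; the bookmark is owned by the waker and cannot go
away under it.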

This patch also provides a performance boost when combined with the next
patch, which breaks up the page wakeup list walk. We saw a 22% improvement
in the will-it-scale file pread2 test on a Xeon Phi system running 256 threads.

[ v2: Merged in Linus' changes to remove the bookmark_wake_function, and
simplify access to flags. ]

Reported-by: Kan Liang <kan.liang@intel.com>
Tested-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Authored by Tim Chen, committed by Linus Torvalds
2554db91 46c1e79f
 include/linux/wait.h |  1 +
 kernel/sched/wait.c  | 78 ++++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 64 insertions(+), 15 deletions(-)

diff --git a/include/linux/wait.h b/include/linux/wait.h
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -18,6 +18,7 @@
 /* wait_queue_entry::flags */
 #define WQ_FLAG_EXCLUSIVE	0x01
 #define WQ_FLAG_WOKEN		0x02
+#define WQ_FLAG_BOOKMARK	0x04
 
 /*
  * A single wait-queue entry structure:
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -53,6 +53,12 @@
 }
 EXPORT_SYMBOL(remove_wait_queue);
 
+/*
+ * Scan threshold to break wait queue walk.
+ * This allows a waker to take a break from holding the
+ * wait queue lock during the wait queue walk.
+ */
+#define WAITQUEUE_WALK_BREAK_CNT 64
 
 /*
  * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
@@ -63,18 +69,67 @@
  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
-			int nr_exclusive, int wake_flags, void *key)
+static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
+			int nr_exclusive, int wake_flags, void *key,
+			wait_queue_entry_t *bookmark)
 {
 	wait_queue_entry_t *curr, *next;
+	int cnt = 0;
 
-	list_for_each_entry_safe(curr, next, &wq_head->head, entry) {
+	if (bookmark && (bookmark->flags & WQ_FLAG_BOOKMARK)) {
+		curr = list_next_entry(bookmark, entry);
+
+		list_del(&bookmark->entry);
+		bookmark->flags = 0;
+	} else
+		curr = list_first_entry(&wq_head->head, wait_queue_entry_t, entry);
+
+	if (&curr->entry == &wq_head->head)
+		return nr_exclusive;
+
+	list_for_each_entry_safe_from(curr, next, &wq_head->head, entry) {
 		unsigned flags = curr->flags;
-		int ret = curr->func(curr, mode, wake_flags, key);
+		int ret;
+
+		if (flags & WQ_FLAG_BOOKMARK)
+			continue;
+
+		ret = curr->func(curr, mode, wake_flags, key);
 		if (ret < 0)
 			break;
 		if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
 			break;
+
+		if (bookmark && (++cnt > WAITQUEUE_WALK_BREAK_CNT) &&
+				(&next->entry != &wq_head->head)) {
+			bookmark->flags = WQ_FLAG_BOOKMARK;
+			list_add_tail(&bookmark->entry, &next->entry);
+			break;
+		}
+	}
+	return nr_exclusive;
+}
+
+static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
+			int nr_exclusive, int wake_flags, void *key)
+{
+	unsigned long flags;
+	wait_queue_entry_t bookmark;
+
+	bookmark.flags = 0;
+	bookmark.private = NULL;
+	bookmark.func = NULL;
+	INIT_LIST_HEAD(&bookmark.entry);
+
+	spin_lock_irqsave(&wq_head->lock, flags);
+	nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark);
+	spin_unlock_irqrestore(&wq_head->lock, flags);
+
+	while (bookmark.flags & WQ_FLAG_BOOKMARK) {
+		spin_lock_irqsave(&wq_head->lock, flags);
+		nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
+						wake_flags, key, &bookmark);
+		spin_unlock_irqrestore(&wq_head->lock, flags);
 	}
 }
 
@@ -91,11 +146,7 @@
 void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
 			int nr_exclusive, void *key)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&wq_head->lock, flags);
-	__wake_up_common(wq_head, mode, nr_exclusive, 0, key);
-	spin_unlock_irqrestore(&wq_head->lock, flags);
+	__wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
 }
 EXPORT_SYMBOL(__wake_up);
 
@@ -104,13 +155,13 @@
  */
 void __wake_up_locked(struct wait_queue_head *wq_head, unsigned int mode, int nr)
 {
-	__wake_up_common(wq_head, mode, nr, 0, NULL);
+	__wake_up_common(wq_head, mode, nr, 0, NULL, NULL);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked);
 
 void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key)
 {
-	__wake_up_common(wq_head, mode, 1, 0, key);
+	__wake_up_common(wq_head, mode, 1, 0, key, NULL);
 }
 EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 
@@ -134,7 +185,6 @@
 void __wake_up_sync_key(struct wait_queue_head *wq_head, unsigned int mode,
 			int nr_exclusive, void *key)
 {
-	unsigned long flags;
 	int wake_flags = 1; /* XXX WF_SYNC */
 
 	if (unlikely(!wq_head))
@@ -143,9 +193,7 @@
 	if (unlikely(nr_exclusive != 1))
 		wake_flags = 0;
 
-	spin_lock_irqsave(&wq_head->lock, flags);
-	__wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key);
-	spin_unlock_irqrestore(&wq_head->lock, flags);
+	__wake_up_common_lock(wq_head, mode, nr_exclusive, wake_flags, key);
 }
 EXPORT_SYMBOL_GPL(__wake_up_sync_key);