Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

virtio_balloon: fix deadlock on OOM

fill_balloon doing memory allocations under balloon_lock
can cause a deadlock when leak_balloon is called from
virtballoon_oom_notify and tries to take same lock.

To fix, split page allocation and enqueue and do allocations outside the lock.

Here's a detailed analysis of the deadlock by Tetsuo Handa:

In leak_balloon(), mutex_lock(&vb->balloon_lock) is called in order to
serialize against fill_balloon(). But in fill_balloon(),
alloc_page(GFP_HIGHUSER[_MOVABLE] | __GFP_NOMEMALLOC | __GFP_NORETRY) is
called with vb->balloon_lock mutex held. Since GFP_HIGHUSER[_MOVABLE]
implies __GFP_DIRECT_RECLAIM | __GFP_IO | __GFP_FS, even though __GFP_NORETRY
is specified, this allocation attempt might indirectly depend on somebody
else's __GFP_DIRECT_RECLAIM memory allocation. And such indirect
__GFP_DIRECT_RECLAIM memory allocation might call leak_balloon() via
virtballoon_oom_notify() via blocking_notifier_call_chain() callback via
out_of_memory() when it reached __alloc_pages_may_oom() and held oom_lock
mutex. Since vb->balloon_lock mutex is already held by fill_balloon(), it
will cause OOM lockup.

  Thread1                                       Thread2
    fill_balloon()
      takes a balloon_lock
      balloon_page_enqueue()
        alloc_page(GFP_HIGHUSER_MOVABLE)
          direct reclaim (__GFP_FS context)       takes a fs lock
            waits for that fs lock                  alloc_page(GFP_NOFS)
                                                      __alloc_pages_may_oom()
                                                        takes the oom_lock
                                                          out_of_memory()
                                                            blocking_notifier_call_chain()
                                                              leak_balloon()
                                                                tries to take that balloon_lock and deadlocks

Reported-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Wei Wang <wei.w.wang@intel.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

+74 -13
+19 -5
drivers/virtio/virtio_balloon.c
···
143 143
144 144	 static unsigned fill_balloon(struct virtio_balloon *vb, size_t num)
145 145	 {
146     -	struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info;
    146	 	unsigned num_allocated_pages;
    147	+	unsigned num_pfns;
    148	+	struct page *page;
    149	+	LIST_HEAD(pages);
148 150
149 151	 	/* We can only do one array worth at a time. */
150 152	 	num = min(num, ARRAY_SIZE(vb->pfns));
151 153
152     -	mutex_lock(&vb->balloon_lock);
153     -	for (vb->num_pfns = 0; vb->num_pfns < num;
154     -	     vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
155     -		struct page *page = balloon_page_enqueue(vb_dev_info);
    154	+	for (num_pfns = 0; num_pfns < num;
    155	+	     num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE) {
    156	+		struct page *page = balloon_page_alloc();
156 157
157 158	 		if (!page) {
158 159	 			dev_info_ratelimited(&vb->vdev->dev,
···
163 162	 			msleep(200);
164 163	 			break;
165 164	 		}
    165	+
    166	+		balloon_page_push(&pages, page);
    167	+	}
    168	+
    169	+	mutex_lock(&vb->balloon_lock);
    170	+
    171	+	vb->num_pfns = 0;
    172	+
    173	+	while ((page = balloon_page_pop(&pages))) {
    174	+		balloon_page_enqueue(&vb->vb_dev_info, page);
    175	+
    176	+		vb->num_pfns += VIRTIO_BALLOON_PAGES_PER_PAGE;
    177	+
166 178	 		set_page_pfns(vb, vb->pfns + vb->num_pfns, page);
167 179	 		vb->num_pages += VIRTIO_BALLOON_PAGES_PER_PAGE;
168 180	 		if (!virtio_has_feature(vb->vdev,
+34 -1
include/linux/balloon_compaction.h
···
 50  50	 #include <linux/gfp.h>
 51  51	 #include <linux/err.h>
 52  52	 #include <linux/fs.h>
     53	+#include <linux/list.h>
 53  54
 54  55	 /*
 55  56	  * Balloon device information descriptor.
···
 68  67	 	struct inode *inode;
 69  68	 };
 70  69
 71      -extern struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info);
     70	+extern struct page *balloon_page_alloc(void);
     71	+extern void balloon_page_enqueue(struct balloon_dev_info *b_dev_info,
     72	+				 struct page *page);
 72  73	 extern struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info);
 73  74
 74  75	 static inline void balloon_devinfo_init(struct balloon_dev_info *balloon)
···
196 193	 }
197 194
198 195	 #endif /* CONFIG_BALLOON_COMPACTION */
    196	+
    197	+/*
    198	+ * balloon_page_push - insert a page into a page list.
    199	+ * @head : pointer to list
    200	+ * @page : page to be added
    201	+ *
    202	+ * Caller must ensure the page is private and protect the list.
    203	+ */
    204	+static inline void balloon_page_push(struct list_head *pages, struct page *page)
    205	+{
    206	+	list_add(&page->lru, pages);
    207	+}
    208	+
    209	+/*
    210	+ * balloon_page_pop - remove a page from a page list.
    211	+ * @head : pointer to list
    212	+ * @page : page to be added
    213	+ *
    214	+ * Caller must ensure the page is private and protect the list.
    215	+ */
    216	+static inline struct page *balloon_page_pop(struct list_head *pages)
    217	+{
    218	+	struct page *page = list_first_entry_or_null(pages, struct page, lru);
    219	+
    220	+	if (!page)
    221	+		return NULL;
    222	+
    223	+	list_del(&page->lru);
    224	+	return page;
    225	+}
199 226	 #endif /* _LINUX_BALLOON_COMPACTION_H */
+21 -7
mm/balloon_compaction.c
···
 11  11	 #include <linux/balloon_compaction.h>
 12  12
 13  13	 /*
     14	+ * balloon_page_alloc - allocates a new page for insertion into the balloon
     15	+ *			page list.
     16	+ *
     17	+ * Driver must call it to properly allocate a new enlisted balloon page.
     18	+ * Driver must call balloon_page_enqueue before definitively removing it from
     19	+ * the guest system. This function returns the page address for the recently
     20	+ * allocated page or NULL in the case we fail to allocate a new page this turn.
     21	+ */
     22	+struct page *balloon_page_alloc(void)
     23	+{
     24	+	struct page *page = alloc_page(balloon_mapping_gfp_mask() |
     25	+				       __GFP_NOMEMALLOC | __GFP_NORETRY);
     26	+	return page;
     27	+}
     28	+EXPORT_SYMBOL_GPL(balloon_page_alloc);
     29	+
     30	+/*
 14  31	  * balloon_page_enqueue - allocates a new page and inserts it into the balloon
 15  32	  *			  page list.
 16  33	  * @b_dev_info: balloon device descriptor where we will insert a new page to
     34	+ * @page: new page to enqueue - allocated using balloon_page_alloc.
 17  35	  *
 18      - * Driver must call it to properly allocate a new enlisted balloon page
     36	+ * Driver must call it to properly enqueue a new allocated balloon page
 19  37	  * before definitively removing it from the guest system.
 20  38	  * This function returns the page address for the recently enqueued page or
 21  39	  * NULL in the case we fail to allocate a new page this turn.
 22  40	  */
 23      -struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info)
     41	+void balloon_page_enqueue(struct balloon_dev_info *b_dev_info,
     42	+			  struct page *page)
 24  43	 {
 25  44	 	unsigned long flags;
 26      -	struct page *page = alloc_page(balloon_mapping_gfp_mask() |
 27      -				       __GFP_NOMEMALLOC | __GFP_NORETRY);
 28      -	if (!page)
 29      -		return NULL;
 30  45
 31  46	 	/*
 32  47	 	 * Block others from accessing the 'page' when we get around to
···
 54  39	 	__count_vm_event(BALLOON_INFLATE);
 55  40	 	spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
 56  41	 	unlock_page(page);
 57      -	return page;
 58  42	 }
 59  43	 EXPORT_SYMBOL_GPL(balloon_page_enqueue);
 60  44