Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm, hotplug: fix concurrent memory hot-add deadlock

There's a deadlock when concurrently hot-adding memory through the probe
interface and switching a memory block from offline to online.

When hot-adding memory via the probe interface, add_memory() first calls
mem_hotplug_begin(), and device_lock() is taken later when registering
the newly initialized memory block. This creates a lock dependency of (1)
mem_hotplug.lock, then (2) dev->mutex.

When switching a memory block from offline to online, dev->mutex is first
grabbed in device_online() by the write(2) that triggers the transition,
and then online_pages() takes mem_hotplug.lock through
mem_hotplug_begin().

This creates a lock inversion between mem_hotplug.lock and dev->mutex.
Vitaly reports that this deadlock can happen when a kworker handling a
probe event races with systemd-udevd switching a memory block's state.

This patch requires the state transition to take mem_hotplug_begin()
before dev->mutex. Hot-adding memory via the probe interface creates a
memory block while holding mem_hotplug_begin(), so there is no way to take
dev->mutex first in this case.

online_pages() and offline_pages() are only called when transitioning
memory block state. We now require that mem_hotplug_begin() is taken
before calling them -- this requires exporting mem_hotplug_begin() and
mem_hotplug_done() to generic code. In all hot-add and hot-remove cases,
mem_hotplug_begin() is called prior to device_online(). This is all that
is needed to avoid the deadlock.

Signed-off-by: David Rientjes <rientjes@google.com>
Reported-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Tested-by: Vitaly Kuznetsov <vkuznets@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>
Cc: "K. Y. Srinivasan" <kys@microsoft.com>
Cc: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Tang Chen <tangchen@cn.fujitsu.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Zhang Zhen <zhenzhang.zhang@huawei.com>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Wang Nan <wangnan0@huawei.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

authored by

David Rientjes and committed by
Linus Torvalds
30467e0b 17e0db82

+30 -28
+12 -7
drivers/base/memory.c
··· 219 219 /* 220 220 * MEMORY_HOTPLUG depends on SPARSEMEM in mm/Kconfig, so it is 221 221 * OK to have direct references to sparsemem variables in here. 222 + * Must already be protected by mem_hotplug_begin(). 222 223 */ 223 224 static int 224 225 memory_block_action(unsigned long phys_index, unsigned long action, int online_type) ··· 287 286 if (mem->online_type < 0) 288 287 mem->online_type = MMOP_ONLINE_KEEP; 289 288 289 + /* Already under protection of mem_hotplug_begin() */ 290 290 ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); 291 291 292 292 /* clear online_type */ ··· 330 328 goto err; 331 329 } 332 330 331 + /* 332 + * Memory hotplug needs to hold mem_hotplug_begin() for probe to find 333 + * the correct memory block to online before doing device_online(dev), 334 + * which will take dev->mutex. Take the lock early to prevent an 335 + * inversion, memory_subsys_online() callbacks will be implemented by 336 + * assuming it's already protected. 337 + */ 338 + mem_hotplug_begin(); 339 + 333 340 switch (online_type) { 334 341 case MMOP_ONLINE_KERNEL: 335 342 case MMOP_ONLINE_MOVABLE: 336 343 case MMOP_ONLINE_KEEP: 337 - /* 338 - * mem->online_type is not protected so there can be a 339 - * race here. However, when racing online, the first 340 - * will succeed and the second will just return as the 341 - * block will already be online. The online type 342 - * could be either one, but that is expected. 343 - */ 344 344 mem->online_type = online_type; 345 345 ret = device_online(&mem->dev); 346 346 break; ··· 353 349 ret = -EINVAL; /* should never happen */ 354 350 } 355 351 352 + mem_hotplug_done(); 356 353 err: 357 354 unlock_device_hotplug(); 358 355
+6
include/linux/memory_hotplug.h
··· 192 192 void get_online_mems(void); 193 193 void put_online_mems(void); 194 194 195 + void mem_hotplug_begin(void); 196 + void mem_hotplug_done(void); 197 + 195 198 #else /* ! CONFIG_MEMORY_HOTPLUG */ 196 199 /* 197 200 * Stub functions for when hotplug is off ··· 233 230 234 231 static inline void get_online_mems(void) {} 235 232 static inline void put_online_mems(void) {} 233 + 234 + static inline void mem_hotplug_begin(void) {} 235 + static inline void mem_hotplug_done(void) {} 236 236 237 237 #endif /* ! CONFIG_MEMORY_HOTPLUG */ 238 238
+12 -21
mm/memory_hotplug.c
··· 104 104 105 105 } 106 106 107 - static void mem_hotplug_begin(void) 107 + void mem_hotplug_begin(void) 108 108 { 109 109 mem_hotplug.active_writer = current; 110 110 ··· 119 119 } 120 120 } 121 121 122 - static void mem_hotplug_done(void) 122 + void mem_hotplug_done(void) 123 123 { 124 124 mem_hotplug.active_writer = NULL; 125 125 mutex_unlock(&mem_hotplug.lock); ··· 959 959 } 960 960 961 961 962 + /* Must be protected by mem_hotplug_begin() */ 962 963 int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) 963 964 { 964 965 unsigned long flags; ··· 970 969 int ret; 971 970 struct memory_notify arg; 972 971 973 - mem_hotplug_begin(); 974 972 /* 975 973 * This doesn't need a lock to do pfn_to_page(). 976 974 * The section can't be removed here because of the ··· 977 977 */ 978 978 zone = page_zone(pfn_to_page(pfn)); 979 979 980 - ret = -EINVAL; 981 980 if ((zone_idx(zone) > ZONE_NORMAL || 982 981 online_type == MMOP_ONLINE_MOVABLE) && 983 982 !can_online_high_movable(zone)) 984 - goto out; 983 + return -EINVAL; 985 984 986 985 if (online_type == MMOP_ONLINE_KERNEL && 987 986 zone_idx(zone) == ZONE_MOVABLE) { 988 987 if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) 989 - goto out; 988 + return -EINVAL; 990 989 } 991 990 if (online_type == MMOP_ONLINE_MOVABLE && 992 991 zone_idx(zone) == ZONE_MOVABLE - 1) { 993 992 if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) 994 - goto out; 993 + return -EINVAL; 995 994 } 996 995 997 996 /* Previous code may changed the zone of the pfn range */ ··· 1006 1007 ret = notifier_to_errno(ret); 1007 1008 if (ret) { 1008 1009 memory_notify(MEM_CANCEL_ONLINE, &arg); 1009 - goto out; 1010 + return ret; 1010 1011 } 1011 1012 /* 1012 1013 * If this zone is not populated, then it is not in zonelist. 
··· 1030 1031 (((unsigned long long) pfn + nr_pages) 1031 1032 << PAGE_SHIFT) - 1); 1032 1033 memory_notify(MEM_CANCEL_ONLINE, &arg); 1033 - goto out; 1034 + return ret; 1034 1035 } 1035 1036 1036 1037 zone->present_pages += onlined_pages; ··· 1060 1061 1061 1062 if (onlined_pages) 1062 1063 memory_notify(MEM_ONLINE, &arg); 1063 - out: 1064 - mem_hotplug_done(); 1065 - return ret; 1064 + return 0; 1066 1065 } 1067 1066 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ 1068 1067 ··· 1685 1688 if (!test_pages_in_a_zone(start_pfn, end_pfn)) 1686 1689 return -EINVAL; 1687 1690 1688 - mem_hotplug_begin(); 1689 - 1690 1691 zone = page_zone(pfn_to_page(start_pfn)); 1691 1692 node = zone_to_nid(zone); 1692 1693 nr_pages = end_pfn - start_pfn; 1693 1694 1694 - ret = -EINVAL; 1695 1695 if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) 1696 - goto out; 1696 + return -EINVAL; 1697 1697 1698 1698 /* set above range as isolated */ 1699 1699 ret = start_isolate_page_range(start_pfn, end_pfn, 1700 1700 MIGRATE_MOVABLE, true); 1701 1701 if (ret) 1702 - goto out; 1702 + return ret; 1703 1703 1704 1704 arg.start_pfn = start_pfn; 1705 1705 arg.nr_pages = nr_pages; ··· 1789 1795 writeback_set_ratelimit(); 1790 1796 1791 1797 memory_notify(MEM_OFFLINE, &arg); 1792 - mem_hotplug_done(); 1793 1798 return 0; 1794 1799 1795 1800 failed_removal: ··· 1798 1805 memory_notify(MEM_CANCEL_OFFLINE, &arg); 1799 1806 /* pushback to free area */ 1800 1807 undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); 1801 - 1802 - out: 1803 - mem_hotplug_done(); 1804 1808 return ret; 1805 1809 } 1806 1810 1811 + /* Must be protected by mem_hotplug_begin() */ 1807 1812 int offline_pages(unsigned long start_pfn, unsigned long nr_pages) 1808 1813 { 1809 1814 return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);