Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm/memory_hotplug: support memmap_on_memory when memmap is not aligned to pageblocks

Currently, memmap_on_memory feature is only supported with memory block
sizes that result in vmemmap pages covering full page blocks. This is
because memory onlining/offlining code requires applicable ranges to be
pageblock-aligned, for example, to set the migratetypes properly.

This patch helps to lift that restriction by reserving more pages than
required for vmemmap space. This helps the start address to be
pageblock-aligned with different memory block sizes. Using this facility implies
the kernel will be reserving some pages for every memoryblock. This
allows the memmap on memory feature to be widely useful with different
memory block size values.

For example: with a 64K page size and a 256MiB memory block size, we require 4
pages to map vmemmap pages. To align things correctly we end up adding a
reserve of 28 pages, i.e., for every 4096 pages, 28 pages get reserved.

Link: https://lkml.kernel.org/r/20230808091501.287660-5-aneesh.kumar@linux.ibm.com
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Aneesh Kumar K.V and committed by
Andrew Morton
2d1f649c 85a2b4b0

+113 -19
+12
Documentation/admin-guide/mm/memory-hotplug.rst
··· 433 433 memory in a way that huge pages in bigger 434 434 granularity cannot be formed on hotplugged 435 435 memory. 436 + 437 + With value "force" it could result in memory 438 + wastage due to memmap size limitations. For 439 + example, if the memmap for a memory block 440 + requires 1 MiB, but the pageblock size is 2 441 + MiB, 1 MiB of hotplugged memory will be wasted. 442 + Note that there are still cases where the 443 + feature cannot be enforced: for example, if the 444 + memmap is smaller than a single page, or if the 445 + architecture does not support the forced mode 446 + in all configurations. 447 + 436 448 ``online_policy`` read-write: Set the basic policy used for 437 449 automatic zone selection when onlining memory 438 450 blocks without specifying a target zone.
+101 -19
mm/memory_hotplug.c
··· 41 41 #include "internal.h" 42 42 #include "shuffle.h" 43 43 44 + enum { 45 + MEMMAP_ON_MEMORY_DISABLE = 0, 46 + MEMMAP_ON_MEMORY_ENABLE, 47 + MEMMAP_ON_MEMORY_FORCE, 48 + }; 49 + 50 + static int memmap_mode __read_mostly = MEMMAP_ON_MEMORY_DISABLE; 51 + 52 + static inline unsigned long memory_block_memmap_size(void) 53 + { 54 + return PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page); 55 + } 56 + 57 + static inline unsigned long memory_block_memmap_on_memory_pages(void) 58 + { 59 + unsigned long nr_pages = PFN_UP(memory_block_memmap_size()); 60 + 61 + /* 62 + * In "forced" memmap_on_memory mode, we add extra pages to align the 63 + * vmemmap size to cover full pageblocks. That way, we can add memory 64 + * even if the vmemmap size is not properly aligned, however, we might waste 65 + * memory. 66 + */ 67 + if (memmap_mode == MEMMAP_ON_MEMORY_FORCE) 68 + return pageblock_align(nr_pages); 69 + return nr_pages; 70 + } 71 + 44 72 #ifdef CONFIG_MHP_MEMMAP_ON_MEMORY 45 73 /* 46 74 * memory_hotplug.memmap_on_memory parameter 47 75 */ 48 - static bool memmap_on_memory __ro_after_init; 49 - module_param(memmap_on_memory, bool, 0444); 50 - MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug"); 76 + static int set_memmap_mode(const char *val, const struct kernel_param *kp) 77 + { 78 + int ret, mode; 79 + bool enabled; 80 + 81 + if (sysfs_streq(val, "force") || sysfs_streq(val, "FORCE")) { 82 + mode = MEMMAP_ON_MEMORY_FORCE; 83 + } else { 84 + ret = kstrtobool(val, &enabled); 85 + if (ret < 0) 86 + return ret; 87 + if (enabled) 88 + mode = MEMMAP_ON_MEMORY_ENABLE; 89 + else 90 + mode = MEMMAP_ON_MEMORY_DISABLE; 91 + } 92 + *((int *)kp->arg) = mode; 93 + if (mode == MEMMAP_ON_MEMORY_FORCE) { 94 + unsigned long memmap_pages = memory_block_memmap_on_memory_pages(); 95 + 96 + pr_info_once("Memory hotplug will waste %ld pages in each memory block\n", 97 + memmap_pages - PFN_UP(memory_block_memmap_size())); 98 + } 99 + return 0; 100 + } 101 + 
102 + static int get_memmap_mode(char *buffer, const struct kernel_param *kp) 103 + { 104 + if (*((int *)kp->arg) == MEMMAP_ON_MEMORY_FORCE) 105 + return sprintf(buffer, "force\n"); 106 + return param_get_bool(buffer, kp); 107 + } 108 + 109 + static const struct kernel_param_ops memmap_mode_ops = { 110 + .set = set_memmap_mode, 111 + .get = get_memmap_mode, 112 + }; 113 + module_param_cb(memmap_on_memory, &memmap_mode_ops, &memmap_mode, 0444); 114 + MODULE_PARM_DESC(memmap_on_memory, "Enable memmap on memory for memory hotplug\n" 115 + "With value \"force\" it could result in memory wastage due " 116 + "to memmap size limitations (Y/N/force)"); 51 117 52 118 static inline bool mhp_memmap_on_memory(void) 53 119 { 54 - return memmap_on_memory; 120 + return memmap_mode != MEMMAP_ON_MEMORY_DISABLE; 55 121 } 56 122 #else 57 123 static inline bool mhp_memmap_on_memory(void) ··· 1313 1247 return device_online(&mem->dev); 1314 1248 } 1315 1249 1316 - static inline unsigned long memory_block_memmap_size(void) 1317 - { 1318 - return PHYS_PFN(memory_block_size_bytes()) * sizeof(struct page); 1319 - } 1320 - 1321 1250 #ifndef arch_supports_memmap_on_memory 1322 1251 static inline bool arch_supports_memmap_on_memory(unsigned long vmemmap_size) 1323 1252 { ··· 1328 1267 static bool mhp_supports_memmap_on_memory(unsigned long size) 1329 1268 { 1330 1269 unsigned long vmemmap_size = memory_block_memmap_size(); 1331 - unsigned long remaining_size = size - vmemmap_size; 1270 + unsigned long memmap_pages = memory_block_memmap_on_memory_pages(); 1332 1271 1333 1272 /* 1334 1273 * Besides having arch support and the feature enabled at runtime, we ··· 1356 1295 * altmap as an alternative source of memory, and we do not exactly 1357 1296 * populate a single PMD. 
1358 1297 */ 1359 - return mhp_memmap_on_memory() && 1360 - size == memory_block_size_bytes() && 1361 - IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT)) && 1362 - arch_supports_memmap_on_memory(vmemmap_size); 1298 + if (!mhp_memmap_on_memory() || size != memory_block_size_bytes()) 1299 + return false; 1300 + 1301 + /* 1302 + * Make sure the vmemmap allocation is fully contained 1303 + * so that we always allocate vmemmap memory from altmap area. 1304 + */ 1305 + if (!IS_ALIGNED(vmemmap_size, PAGE_SIZE)) 1306 + return false; 1307 + 1308 + /* 1309 + * start pfn should be pageblock_nr_pages aligned for correctly 1310 + * setting migrate types 1311 + */ 1312 + if (!pageblock_aligned(memmap_pages)) 1313 + return false; 1314 + 1315 + if (memmap_pages == PHYS_PFN(memory_block_size_bytes())) 1316 + /* No effective hotplugged memory doesn't make sense. */ 1317 + return false; 1318 + 1319 + return arch_supports_memmap_on_memory(vmemmap_size); 1363 1320 } 1364 1321 1365 1322 /* ··· 1390 1311 { 1391 1312 struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) }; 1392 1313 enum memblock_flags memblock_flags = MEMBLOCK_NONE; 1393 - struct vmem_altmap mhp_altmap = {}; 1314 + struct vmem_altmap mhp_altmap = { 1315 + .base_pfn = PHYS_PFN(res->start), 1316 + .end_pfn = PHYS_PFN(res->end), 1317 + }; 1394 1318 struct memory_group *group = NULL; 1395 1319 u64 start, size; 1396 1320 bool new_node = false; ··· 1438 1356 */ 1439 1357 if (mhp_flags & MHP_MEMMAP_ON_MEMORY) { 1440 1358 if (mhp_supports_memmap_on_memory(size)) { 1441 - mhp_altmap.free = PHYS_PFN(size); 1442 - mhp_altmap.base_pfn = PHYS_PFN(start); 1359 + mhp_altmap.free = memory_block_memmap_on_memory_pages(); 1443 1360 params.altmap = &mhp_altmap; 1444 1361 } 1445 1362 /* fallback to not using altmap */ ··· 1450 1369 goto error; 1451 1370 1452 1371 /* create memory block devices after memory was added */ 1453 - ret = create_memory_block_devices(start, size, mhp_altmap.alloc, 1454 - group); 1372 + ret = 
create_memory_block_devices(start, size, mhp_altmap.free, group); 1455 1373 if (ret) { 1456 1374 arch_remove_memory(start, size, NULL); 1457 1375 goto error; ··· 2176 2096 * right thing if we used vmem_altmap when hot-adding 2177 2097 * the range. 2178 2098 */ 2099 + mhp_altmap.base_pfn = PHYS_PFN(start); 2100 + mhp_altmap.free = nr_vmemmap_pages; 2179 2101 mhp_altmap.alloc = nr_vmemmap_pages; 2180 2102 altmap = &mhp_altmap; 2181 2103 }