Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

s390/mm: allocate vmemmap pages from self-contained memory range

Allocate the memory map (struct pages array) from the hotplugged memory
range, rather than using system memory. The change addresses the issue
where standby memory, when configured to be much larger than online
memory, could potentially lead to IPL failure due to memory map
allocation from online memory. For example, 16MB of memory map
allocation is needed for a memory block size of 1GB; when standby
memory is configured much larger than online memory, this could lead to
IPL failure.

To address this issue, the solution involves introducing "memmap on
memory" using the vmem_altmap structure on s390. Architectures that
want to implement it should pass the altmap to the vmemmap_populate()
function and its associated callchain. This enhancement is discussed in
commit 4b94ffdc4163 ("x86, mm: introduce vmem_altmap to augment
vmemmap_populate()").

Provide "memmap on memory" support for s390 by passing the altmap in
vmemmap_populate() and its callchain. The allocation path is described
as follows:
* When altmap is NULL in vmemmap_populate(), memory map allocation
occurs using the existing vmemmap_alloc_block_buf().
* When altmap is not NULL in vmemmap_populate(), memory map allocation
still uses vmemmap_alloc_block_buf(), but this function internally
calls altmap_alloc_block_buf().

For deallocation, the process is outlined as follows:
* When altmap is NULL in vmemmap_free(), memory map deallocation happens
through free_pages().
* When altmap is not NULL in vmemmap_free(), memory map deallocation
occurs via vmem_altmap_free().

While memory map allocation is primarily handled through the
self-contained memory map range, there might still be a small amount of
system memory allocation required for vmemmap pagetables. To mitigate
this impact, this feature will be limited to machines with EDAT1
support.

Link: https://lkml.kernel.org/r/20240108132747.3238763-3-sumanthk@linux.ibm.com
Reviewed-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Signed-off-by: Sumanth Korikkar <sumanthk@linux.ibm.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Cc: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

authored by

Sumanth Korikkar and committed by
Andrew Morton
1a65b73a c5f1e2d1

+35 -30
-3
arch/s390/mm/init.c
··· 281 281 unsigned long size_pages = PFN_DOWN(size); 282 282 int rc; 283 283 284 - if (WARN_ON_ONCE(params->altmap)) 285 - return -EINVAL; 286 - 287 284 if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot)) 288 285 return -EINVAL; 289 286
+35 -27
arch/s390/mm/vmem.c
··· 33 33 return memblock_alloc(size, size); 34 34 } 35 35 36 - static void vmem_free_pages(unsigned long addr, int order) 36 + static void vmem_free_pages(unsigned long addr, int order, struct vmem_altmap *altmap) 37 37 { 38 + if (altmap) { 39 + vmem_altmap_free(altmap, 1 << order); 40 + return; 41 + } 38 42 /* We don't expect boot memory to be removed ever. */ 39 43 if (!slab_is_available() || 40 44 WARN_ON_ONCE(PageReserved(virt_to_page((void *)addr)))) ··· 160 156 161 157 /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ 162 158 static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr, 163 - unsigned long end, bool add, bool direct) 159 + unsigned long end, bool add, bool direct, 160 + struct vmem_altmap *altmap) 164 161 { 165 162 unsigned long prot, pages = 0; 166 163 int ret = -ENOMEM; ··· 177 172 if (pte_none(*pte)) 178 173 continue; 179 174 if (!direct) 180 - vmem_free_pages((unsigned long) pfn_to_virt(pte_pfn(*pte)), 0); 175 + vmem_free_pages((unsigned long)pfn_to_virt(pte_pfn(*pte)), get_order(PAGE_SIZE), altmap); 181 176 pte_clear(&init_mm, addr, pte); 182 177 } else if (pte_none(*pte)) { 183 178 if (!direct) { 184 - void *new_page = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); 179 + void *new_page = vmemmap_alloc_block_buf(PAGE_SIZE, NUMA_NO_NODE, altmap); 185 180 186 181 if (!new_page) 187 182 goto out; ··· 218 213 219 214 /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ 220 215 static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, 221 - unsigned long end, bool add, bool direct) 216 + unsigned long end, bool add, bool direct, 217 + struct vmem_altmap *altmap) 222 218 { 223 219 unsigned long next, prot, pages = 0; 224 220 int ret = -ENOMEM; ··· 240 234 if (IS_ALIGNED(addr, PMD_SIZE) && 241 235 IS_ALIGNED(next, PMD_SIZE)) { 242 236 if (!direct) 243 - vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); 237 + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap); 244 238 
pmd_clear(pmd); 245 239 pages++; 246 240 } else if (!direct && vmemmap_unuse_sub_pmd(addr, next)) { 247 - vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE)); 241 + vmem_free_pages(pmd_deref(*pmd), get_order(PMD_SIZE), altmap); 248 242 pmd_clear(pmd); 249 243 } 250 244 continue; ··· 267 261 * page tables since vmemmap_populate gets 268 262 * called for each section separately. 269 263 */ 270 - new_page = vmemmap_alloc_block(PMD_SIZE, NUMA_NO_NODE); 264 + new_page = vmemmap_alloc_block_buf(PMD_SIZE, NUMA_NO_NODE, altmap); 271 265 if (new_page) { 272 266 set_pmd(pmd, __pmd(__pa(new_page) | prot)); 273 267 if (!IS_ALIGNED(addr, PMD_SIZE) || ··· 286 280 vmemmap_use_sub_pmd(addr, next); 287 281 continue; 288 282 } 289 - ret = modify_pte_table(pmd, addr, next, add, direct); 283 + ret = modify_pte_table(pmd, addr, next, add, direct, altmap); 290 284 if (ret) 291 285 goto out; 292 286 if (!add) ··· 308 302 for (i = 0; i < PTRS_PER_PMD; i++, pmd++) 309 303 if (!pmd_none(*pmd)) 310 304 return; 311 - vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER); 305 + vmem_free_pages(pud_deref(*pud), CRST_ALLOC_ORDER, NULL); 312 306 pud_clear(pud); 313 307 } 314 308 315 309 static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, 316 - bool add, bool direct) 310 + bool add, bool direct, struct vmem_altmap *altmap) 317 311 { 318 312 unsigned long next, prot, pages = 0; 319 313 int ret = -ENOMEM; ··· 353 347 } else if (pud_large(*pud)) { 354 348 continue; 355 349 } 356 - ret = modify_pmd_table(pud, addr, next, add, direct); 350 + ret = modify_pmd_table(pud, addr, next, add, direct, altmap); 357 351 if (ret) 358 352 goto out; 359 353 if (!add) ··· 376 370 if (!pud_none(*pud)) 377 371 return; 378 372 } 379 - vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER); 373 + vmem_free_pages(p4d_deref(*p4d), CRST_ALLOC_ORDER, NULL); 380 374 p4d_clear(p4d); 381 375 } 382 376 383 377 static int modify_p4d_table(pgd_t *pgd, unsigned long addr, unsigned long end, 384 - bool add, 
bool direct) 378 + bool add, bool direct, struct vmem_altmap *altmap) 385 379 { 386 380 unsigned long next; 387 381 int ret = -ENOMEM; ··· 400 394 goto out; 401 395 p4d_populate(&init_mm, p4d, pud); 402 396 } 403 - ret = modify_pud_table(p4d, addr, next, add, direct); 397 + ret = modify_pud_table(p4d, addr, next, add, direct, altmap); 404 398 if (ret) 405 399 goto out; 406 400 if (!add) ··· 421 415 if (!p4d_none(*p4d)) 422 416 return; 423 417 } 424 - vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER); 418 + vmem_free_pages(pgd_deref(*pgd), CRST_ALLOC_ORDER, NULL); 425 419 pgd_clear(pgd); 426 420 } 427 421 428 422 static int modify_pagetable(unsigned long start, unsigned long end, bool add, 429 - bool direct) 423 + bool direct, struct vmem_altmap *altmap) 430 424 { 431 425 unsigned long addr, next; 432 426 int ret = -ENOMEM; ··· 451 445 goto out; 452 446 pgd_populate(&init_mm, pgd, p4d); 453 447 } 454 - ret = modify_p4d_table(pgd, addr, next, add, direct); 448 + ret = modify_p4d_table(pgd, addr, next, add, direct, altmap); 455 449 if (ret) 456 450 goto out; 457 451 if (!add) ··· 464 458 return ret; 465 459 } 466 460 467 - static int add_pagetable(unsigned long start, unsigned long end, bool direct) 461 + static int add_pagetable(unsigned long start, unsigned long end, bool direct, 462 + struct vmem_altmap *altmap) 468 463 { 469 - return modify_pagetable(start, end, true, direct); 464 + return modify_pagetable(start, end, true, direct, altmap); 470 465 } 471 466 472 - static int remove_pagetable(unsigned long start, unsigned long end, bool direct) 467 + static int remove_pagetable(unsigned long start, unsigned long end, bool direct, 468 + struct vmem_altmap *altmap) 473 469 { 474 - return modify_pagetable(start, end, false, direct); 470 + return modify_pagetable(start, end, false, direct, altmap); 475 471 } 476 472 477 473 /* ··· 482 474 static int vmem_add_range(unsigned long start, unsigned long size) 483 475 { 484 476 start = (unsigned long)__va(start); 485 - 
return add_pagetable(start, start + size, true); 477 + return add_pagetable(start, start + size, true, NULL); 486 478 } 487 479 488 480 /* ··· 491 483 static void vmem_remove_range(unsigned long start, unsigned long size) 492 484 { 493 485 start = (unsigned long)__va(start); 494 - remove_pagetable(start, start + size, true); 486 + remove_pagetable(start, start + size, true, NULL); 495 487 } 496 488 497 489 /* ··· 504 496 505 497 mutex_lock(&vmem_mutex); 506 498 /* We don't care about the node, just use NUMA_NO_NODE on allocations */ 507 - ret = add_pagetable(start, end, false); 499 + ret = add_pagetable(start, end, false, altmap); 508 500 if (ret) 509 - remove_pagetable(start, end, false); 501 + remove_pagetable(start, end, false, altmap); 510 502 mutex_unlock(&vmem_mutex); 511 503 return ret; 512 504 } ··· 517 509 struct vmem_altmap *altmap) 518 510 { 519 511 mutex_lock(&vmem_mutex); 520 - remove_pagetable(start, end, false); 512 + remove_pagetable(start, end, false, altmap); 521 513 mutex_unlock(&vmem_mutex); 522 514 } 523 515