Merge branch 'x86-kdump-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull kdump fixes from Peter Anvin:
"The kexec/kdump people have found several problems with the support
for loading over 4 GiB that was introduced in this merge cycle. This
is partly due to a number of design problems inherent in the way the
various pieces of kdump fit together (it is pretty horrifically manual
in many places.)

After a *lot* of iterations this is the patchset that was agreed upon,
but of course it is now very late in the cycle. However, because it
changes both the syntax and semantics of the crashkernel option, it
would be desirable to avoid a stable release with the broken
interfaces."

I'm not happy with the timing, since originally the plan was to release
the final 3.9 tomorrow. But apparently I'm doing an -rc8 instead...

* 'x86-kdump-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
kexec: use Crash kernel for Crash kernel low
x86, kdump: Change crashkernel_high/low= to crashkernel=,high/low
x86, kdump: Restore crashkernel= to allocate under 896M
x86, kdump: Set crashkernel_low automatically

Changed files
+180 -28
Documentation
arch
x86
kernel
include
kernel
lib
+20 -3
Documentation/kernel-parameters.txt
··· 596 596 is selected automatically. Check 597 597 Documentation/kdump/kdump.txt for further details. 598 598 599 - crashkernel_low=size[KMG] 600 - [KNL, x86] parts under 4G. 601 - 602 599 crashkernel=range1:size1[,range2:size2,...][@offset] 603 600 [KNL] Same as above, but depends on the memory 604 601 in the running system. The syntax of range is 605 602 start-[end] where start and end are both 606 603 a memory unit (amount[KMG]). See also 607 604 Documentation/kdump/kdump.txt for an example. 605 + 606 + crashkernel=size[KMG],high 607 + [KNL, x86_64] range could be above 4G. Allow kernel 608 + to allocate physical memory region from top, so could 609 + be above 4G if system have more than 4G ram installed. 610 + Otherwise memory region will be allocated below 4G, if 611 + available. 612 + It will be ignored if crashkernel=X is specified. 613 + crashkernel=size[KMG],low 614 + [KNL, x86_64] range under 4G. When crashkernel=X,high 615 + is passed, kernel could allocate physical memory region 616 + above 4G, that cause second kernel crash on system 617 + that require some amount of low memory, e.g. swiotlb 618 + requires at least 64M+32K low memory. Kernel would 619 + try to allocate 72M below 4G automatically. 620 + This one let user to specify own low range under 4G 621 + for second kernel instead. 622 + 0: to disable low allocation. 623 + It will be ignored when crashkernel=X,high is not used 624 + or memory reserved is below 4G. 608 625 609 626 cs89x0_dma= [HW,NET] 610 627 Format: <dma>
+37 -8
arch/x86/kernel/setup.c
··· 507 507 /* 508 508 * Keep the crash kernel below this limit. On 32 bits earlier kernels 509 509 * would limit the kernel to the low 512 MiB due to mapping restrictions. 510 + * On 64bit, old kexec-tools need to under 896MiB. 510 511 */ 511 512 #ifdef CONFIG_X86_32 512 - # define CRASH_KERNEL_ADDR_MAX (512 << 20) 513 + # define CRASH_KERNEL_ADDR_LOW_MAX (512 << 20) 514 + # define CRASH_KERNEL_ADDR_HIGH_MAX (512 << 20) 513 515 #else 514 - # define CRASH_KERNEL_ADDR_MAX MAXMEM 516 + # define CRASH_KERNEL_ADDR_LOW_MAX (896UL<<20) 517 + # define CRASH_KERNEL_ADDR_HIGH_MAX MAXMEM 515 518 #endif 516 519 517 520 static void __init reserve_crashkernel_low(void) ··· 524 521 unsigned long long low_base = 0, low_size = 0; 525 522 unsigned long total_low_mem; 526 523 unsigned long long base; 524 + bool auto_set = false; 527 525 int ret; 528 526 529 527 total_low_mem = memblock_mem_size(1UL<<(32-PAGE_SHIFT)); 528 + /* crashkernel=Y,low */ 530 529 ret = parse_crashkernel_low(boot_command_line, total_low_mem, 531 530 &low_size, &base); 532 - if (ret != 0 || low_size <= 0) 533 - return; 531 + if (ret != 0) { 532 + /* 533 + * two parts from lib/swiotlb.c: 534 + * swiotlb size: user specified with swiotlb= or default. 535 + * swiotlb overflow buffer: now is hardcoded to 32k. 536 + * We round it to 8M for other buffers that 537 + * may need to stay low too. 538 + */ 539 + low_size = swiotlb_size_or_default() + (8UL<<20); 540 + auto_set = true; 541 + } else { 542 + /* passed with crashkernel=0,low ? 
*/ 543 + if (!low_size) 544 + return; 545 + } 534 546 535 547 low_base = memblock_find_in_range(low_size, (1ULL<<32), 536 548 low_size, alignment); 537 549 538 550 if (!low_base) { 539 - pr_info("crashkernel low reservation failed - No suitable area found.\n"); 551 + if (!auto_set) 552 + pr_info("crashkernel low reservation failed - No suitable area found.\n"); 540 553 541 554 return; 542 555 } ··· 573 554 const unsigned long long alignment = 16<<20; /* 16M */ 574 555 unsigned long long total_mem; 575 556 unsigned long long crash_size, crash_base; 557 + bool high = false; 576 558 int ret; 577 559 578 560 total_mem = memblock_phys_mem_size(); 579 561 562 + /* crashkernel=XM */ 580 563 ret = parse_crashkernel(boot_command_line, total_mem, 581 564 &crash_size, &crash_base); 582 - if (ret != 0 || crash_size <= 0) 583 - return; 565 + if (ret != 0 || crash_size <= 0) { 566 + /* crashkernel=X,high */ 567 + ret = parse_crashkernel_high(boot_command_line, total_mem, 568 + &crash_size, &crash_base); 569 + if (ret != 0 || crash_size <= 0) 570 + return; 571 + high = true; 572 + } 584 573 585 574 /* 0 means: find the address automatically */ 586 575 if (crash_base <= 0) { ··· 596 569 * kexec want bzImage is below CRASH_KERNEL_ADDR_MAX 597 570 */ 598 571 crash_base = memblock_find_in_range(alignment, 599 - CRASH_KERNEL_ADDR_MAX, crash_size, alignment); 572 + high ? CRASH_KERNEL_ADDR_HIGH_MAX : 573 + CRASH_KERNEL_ADDR_LOW_MAX, 574 + crash_size, alignment); 600 575 601 576 if (!crash_base) { 602 577 pr_info("crashkernel reservation failed - No suitable area found.\n");
+2
include/linux/kexec.h
··· 200 200 201 201 int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, 202 202 unsigned long long *crash_size, unsigned long long *crash_base); 203 + int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, 204 + unsigned long long *crash_size, unsigned long long *crash_base); 203 205 int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, 204 206 unsigned long long *crash_size, unsigned long long *crash_base); 205 207 int crash_shrink_memory(unsigned long new_size);
+1
include/linux/swiotlb.h
··· 25 25 extern void swiotlb_init(int verbose); 26 26 int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose); 27 27 extern unsigned long swiotlb_nr_tbl(void); 28 + unsigned long swiotlb_size_or_default(void); 28 29 extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs); 29 30 30 31 /*
+105 -13
kernel/kexec.c
··· 55 55 .flags = IORESOURCE_BUSY | IORESOURCE_MEM 56 56 }; 57 57 struct resource crashk_low_res = { 58 - .name = "Crash kernel low", 58 + .name = "Crash kernel", 59 59 .start = 0, 60 60 .end = 0, 61 61 .flags = IORESOURCE_BUSY | IORESOURCE_MEM ··· 1368 1368 return 0; 1369 1369 } 1370 1370 1371 + #define SUFFIX_HIGH 0 1372 + #define SUFFIX_LOW 1 1373 + #define SUFFIX_NULL 2 1374 + static __initdata char *suffix_tbl[] = { 1375 + [SUFFIX_HIGH] = ",high", 1376 + [SUFFIX_LOW] = ",low", 1377 + [SUFFIX_NULL] = NULL, 1378 + }; 1379 + 1371 1380 /* 1372 - * That function is the entry point for command line parsing and should be 1373 - * called from the arch-specific code. 1381 + * That function parses "suffix" crashkernel command lines like 1382 + * 1383 + * crashkernel=size,[high|low] 1384 + * 1385 + * It returns 0 on success and -EINVAL on failure. 1374 1386 */ 1387 + static int __init parse_crashkernel_suffix(char *cmdline, 1388 + unsigned long long *crash_size, 1389 + unsigned long long *crash_base, 1390 + const char *suffix) 1391 + { 1392 + char *cur = cmdline; 1393 + 1394 + *crash_size = memparse(cmdline, &cur); 1395 + if (cmdline == cur) { 1396 + pr_warn("crashkernel: memory value expected\n"); 1397 + return -EINVAL; 1398 + } 1399 + 1400 + /* check with suffix */ 1401 + if (strncmp(cur, suffix, strlen(suffix))) { 1402 + pr_warn("crashkernel: unrecognized char\n"); 1403 + return -EINVAL; 1404 + } 1405 + cur += strlen(suffix); 1406 + if (*cur != ' ' && *cur != '\0') { 1407 + pr_warn("crashkernel: unrecognized char\n"); 1408 + return -EINVAL; 1409 + } 1410 + 1411 + return 0; 1412 + } 1413 + 1414 + static __init char *get_last_crashkernel(char *cmdline, 1415 + const char *name, 1416 + const char *suffix) 1417 + { 1418 + char *p = cmdline, *ck_cmdline = NULL; 1419 + 1420 + /* find crashkernel and use the last one if there are more */ 1421 + p = strstr(p, name); 1422 + while (p) { 1423 + char *end_p = strchr(p, ' '); 1424 + char *q; 1425 + 1426 + if (!end_p) 1427 + end_p 
= p + strlen(p); 1428 + 1429 + if (!suffix) { 1430 + int i; 1431 + 1432 + /* skip the one with any known suffix */ 1433 + for (i = 0; suffix_tbl[i]; i++) { 1434 + q = end_p - strlen(suffix_tbl[i]); 1435 + if (!strncmp(q, suffix_tbl[i], 1436 + strlen(suffix_tbl[i]))) 1437 + goto next; 1438 + } 1439 + ck_cmdline = p; 1440 + } else { 1441 + q = end_p - strlen(suffix); 1442 + if (!strncmp(q, suffix, strlen(suffix))) 1443 + ck_cmdline = p; 1444 + } 1445 + next: 1446 + p = strstr(p+1, name); 1447 + } 1448 + 1449 + if (!ck_cmdline) 1450 + return NULL; 1451 + 1452 + return ck_cmdline; 1453 + } 1454 + 1375 1455 static int __init __parse_crashkernel(char *cmdline, 1376 1456 unsigned long long system_ram, 1377 1457 unsigned long long *crash_size, 1378 1458 unsigned long long *crash_base, 1379 - const char *name) 1459 + const char *name, 1460 + const char *suffix) 1380 1461 { 1381 - char *p = cmdline, *ck_cmdline = NULL; 1382 1462 char *first_colon, *first_space; 1463 + char *ck_cmdline; 1383 1464 1384 1465 BUG_ON(!crash_size || !crash_base); 1385 1466 *crash_size = 0; 1386 1467 *crash_base = 0; 1387 1468 1388 - /* find crashkernel and use the last one if there are more */ 1389 - p = strstr(p, name); 1390 - while (p) { 1391 - ck_cmdline = p; 1392 - p = strstr(p+1, name); 1393 - } 1469 + ck_cmdline = get_last_crashkernel(cmdline, name, suffix); 1394 1470 1395 1471 if (!ck_cmdline) 1396 1472 return -EINVAL; 1397 1473 1398 1474 ck_cmdline += strlen(name); 1399 1475 1476 + if (suffix) 1477 + return parse_crashkernel_suffix(ck_cmdline, crash_size, 1478 + crash_base, suffix); 1400 1479 /* 1401 1480 * if the commandline contains a ':', then that's the extended 1402 1481 * syntax -- if not, it must be the classic syntax ··· 1492 1413 return 0; 1493 1414 } 1494 1415 1416 + /* 1417 + * That function is the entry point for command line parsing and should be 1418 + * called from the arch-specific code. 
1419 + */ 1495 1420 int __init parse_crashkernel(char *cmdline, 1496 1421 unsigned long long system_ram, 1497 1422 unsigned long long *crash_size, 1498 1423 unsigned long long *crash_base) 1499 1424 { 1500 1425 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, 1501 - "crashkernel="); 1426 + "crashkernel=", NULL); 1427 + } 1428 + 1429 + int __init parse_crashkernel_high(char *cmdline, 1430 + unsigned long long system_ram, 1431 + unsigned long long *crash_size, 1432 + unsigned long long *crash_base) 1433 + { 1434 + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, 1435 + "crashkernel=", suffix_tbl[SUFFIX_HIGH]); 1502 1436 } 1503 1437 1504 1438 int __init parse_crashkernel_low(char *cmdline, ··· 1520 1428 unsigned long long *crash_base) 1521 1429 { 1522 1430 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, 1523 - "crashkernel_low="); 1431 + "crashkernel=", suffix_tbl[SUFFIX_LOW]); 1524 1432 } 1525 1433 1526 1434 static void update_vmcoreinfo_note(void)
+15 -4
lib/swiotlb.c
··· 105 105 if (!strcmp(str, "force")) 106 106 swiotlb_force = 1; 107 107 108 - return 1; 108 + return 0; 109 109 } 110 - __setup("swiotlb=", setup_io_tlb_npages); 110 + early_param("swiotlb", setup_io_tlb_npages); 111 111 /* make io_tlb_overflow tunable too? */ 112 112 113 113 unsigned long swiotlb_nr_tbl(void) ··· 115 115 return io_tlb_nslabs; 116 116 } 117 117 EXPORT_SYMBOL_GPL(swiotlb_nr_tbl); 118 + 119 + /* default to 64MB */ 120 + #define IO_TLB_DEFAULT_SIZE (64UL<<20) 121 + unsigned long swiotlb_size_or_default(void) 122 + { 123 + unsigned long size; 124 + 125 + size = io_tlb_nslabs << IO_TLB_SHIFT; 126 + 127 + return size ? size : (IO_TLB_DEFAULT_SIZE); 128 + } 129 + 118 130 /* Note that this doesn't work with highmem page */ 119 131 static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, 120 132 volatile void *address) ··· 200 188 void __init 201 189 swiotlb_init(int verbose) 202 190 { 203 - /* default to 64MB */ 204 - size_t default_size = 64UL<<20; 191 + size_t default_size = IO_TLB_DEFAULT_SIZE; 205 192 unsigned char *vstart; 206 193 unsigned long bytes; 207 194