x86, UV: Disable BAU on network congestion

The NUMAlink network can become so congested that TLB shootdown
using the Broadcast Assist Unit (BAU) becomes slower than using IPIs.

In that case, disable use of the BAU for a tunable period of time.
When the period expires, use of the BAU is re-enabled. Counts of
these disable and re-enable actions are added to the statistics file.
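
In outline: every first-try, fully-completed broadcast adds its elapsed
time to a per-cpu running total; when a completion is itself slow, enough
requests have accumulated, and the running average exceeds the threshold
at which IPIs win, the BAU is switched off on every cpu and the cpu that
tripped the switch is charged with turning it back on after the period.
Below is a minimal userspace model of that logic, not the patch itself:
the names mirror the kernel code, but clock_gettime() stands in for
get_cycles(), the threshold constants are stand-ins for the congested_*
tunables, and the locking and per-cpu state are omitted.

/* Userspace model of the congestion throttle (illustrative only). */
#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define CONGESTED_RESPONSE_NS	1000000		/* a 'slow' completion */
#define CONGESTED_REPS		10		/* requests measured before the average can trip */
#define CONGESTED_PERIOD_NS	(30ULL * 1000000000)	/* how long to stay off */

static int baudisabled;			/* models the global + per-cpu flags */
static uint64_t set_bau_on_time;	/* when to re-enable */
static uint64_t period_requests;	/* requests in the current period */
static uint64_t period_time;		/* their total elapsed time */

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000 + ts.tv_nsec;
}

/* After each first-try completed broadcast (cf. uv_flush_send_and_wait). */
static void account(uint64_t elapsed)
{
	period_requests++;
	period_time += elapsed;
	if (!baudisabled && elapsed > CONGESTED_RESPONSE_NS &&
	    period_requests > CONGESTED_REPS &&
	    period_time / period_requests > CONGESTED_RESPONSE_NS) {
		baudisabled = 1;	/* fall back to IPIs */
		set_bau_on_time = now_ns() + CONGESTED_PERIOD_NS;
		printf("BAU disabled for congestion\n");
	}
}

/* At the top of each shootdown (cf. uv_flush_tlb_others). */
static int bau_usable(void)
{
	if (baudisabled && now_ns() >= set_bau_on_time) {
		baudisabled = 0;	/* period expired: re-enable */
		period_requests = 0;
		period_time = 0;
		printf("BAU re-enabled\n");
	}
	return !baudisabled;
}

int main(void)
{
	int i;

	for (i = 0; i < 15; i++) {
		if (bau_usable())
			account(2000000);	/* pretend each broadcast took 2 ms */
		/* else: the kernel would fall back to IPI shootdown here */
	}
	return 0;
}

The kernel version differs mainly in that disable_for_congestion() takes
disable_lock so only one cpu performs the disabling, and the flag and
period counters are kept per-cpu.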

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: gregkh@suse.de
LKML-Reference: <E1OJvNy-0004a4-0a@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

Authored by Cliff Wickman, committed by Ingo Molnar · 50fb55ac e8e5e8a8

+77 -3

arch/x86/include/asm/uv/uv_bau.h  +4 -0
@@ -34,6 +34,7 @@
  */
 
 #define UV_ITEMS_PER_DESCRIPTOR		8
+/* the 'throttle' to prevent the hardware stay-busy bug */
 #define MAX_BAU_CONCURRENT		3
 #define UV_CPUS_PER_ACT_STATUS		32
 #define UV_ACT_STATUS_MASK		0x3
@@ -339,6 +338,7 @@
 	int timeout_tries;
 	int ipi_attempts;
 	int conseccompletes;
+	int baudisabled;
 	int set_bau_off;
 	short cpu;
 	short uvhub_cpu;
@@ -391,6 +389,8 @@
 	unsigned long s_busy; /* status stayed busy past s/w timer */
 	unsigned long s_throttles; /* waits in throttle */
 	unsigned long s_retry_messages; /* retry broadcasts */
+	unsigned long s_bau_reenabled; /* for bau enable/disable */
+	unsigned long s_bau_disabled; /* for bau enable/disable */
 	/* destination statistics */
 	unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */
 	unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */
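
Of the new fields, baudisabled is the per-cpu copy of the off switch that
uv_flush_tlb_others() now tests on every shootdown, and s_bau_disabled /
s_bau_reenabled are the source-side counters behind the two new columns
of the statistics file.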
arch/x86/kernel/tlb_uv.c  +73 -3
@@ -44,6 +44,9 @@
 };
 static int timeout_us;
 static int nobau;
+static int baudisabled;
+static spinlock_t disable_lock;
+static cycles_t congested_cycles;
 
 /* tunables: */
 static int max_bau_concurrent = MAX_BAU_CONCURRENT;
@@ -522,6 +519,35 @@
 	return 1;
 }
 
+/*
+ * Completions are taking a very long time due to a congested numalink
+ * network.
+ */
+static void
+disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat)
+{
+	int tcpu;
+	struct bau_control *tbcp;
+
+	/* let only one cpu do this disabling */
+	spin_lock(&disable_lock);
+	if (!baudisabled && bcp->period_requests &&
+	    ((bcp->period_time / bcp->period_requests) > congested_cycles)) {
+		/* it becomes this cpu's job to turn on the use of the
+		   BAU again */
+		baudisabled = 1;
+		bcp->set_bau_off = 1;
+		bcp->set_bau_on_time = get_cycles() +
+			sec_2_cycles(bcp->congested_period);
+		stat->s_bau_disabled++;
+		for_each_present_cpu(tcpu) {
+			tbcp = &per_cpu(bau_control, tcpu);
+			tbcp->baudisabled = 1;
+		}
+	}
+	spin_unlock(&disable_lock);
+}
+
 /**
  * uv_flush_send_and_wait
  *
@@ -713,6 +681,14 @@
 	if (time2 > time1) {
 		elapsed = time2 - time1;
 		stat->s_time += elapsed;
+		if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
+			bcp->period_requests++;
+			bcp->period_time += elapsed;
+			if ((elapsed > congested_cycles) &&
+			    (bcp->period_requests > bcp->congested_reps)) {
+				disable_for_congestion(bcp, stat);
+			}
+		}
 	} else
 		stat->s_requestor--; /* don't count this one */
 	if (completion_status == FLUSH_COMPLETE && try > 1)
@@ -787,12 +747,32 @@
 	struct cpumask *flush_mask;
 	struct ptc_stats *stat;
 	struct bau_control *bcp;
+	struct bau_control *tbcp;
 
 	/* kernel was booted 'nobau' */
 	if (nobau)
 		return cpumask;
 
 	bcp = &per_cpu(bau_control, cpu);
+	stat = &per_cpu(ptcstats, cpu);
+
+	/* bau was disabled due to slow response */
+	if (bcp->baudisabled) {
+		/* the cpu that disabled it must re-enable it */
+		if (bcp->set_bau_off) {
+			if (get_cycles() >= bcp->set_bau_on_time) {
+				stat->s_bau_reenabled++;
+				baudisabled = 0;
+				for_each_present_cpu(tcpu) {
+					tbcp = &per_cpu(bau_control, tcpu);
+					tbcp->baudisabled = 0;
+					tbcp->period_requests = 0;
+					tbcp->period_time = 0;
+				}
+			}
+		}
+		return cpumask;
+	}
 
 	/*
 	 * Each sending cpu has a per-cpu mask which it fills from the caller's
@@ -853,7 +793,6 @@
 	else
 		return NULL;
 	}
-	stat = &per_cpu(ptcstats, cpu);
 	stat->s_requestor++;
 	stat->s_ntargcpu += remotes;
 	remotes = bau_uvhub_weight(&bau_desc->distribution);
@@ -1032,7 +973,9 @@
 		seq_printf(file,
 			"sw_ack recv rtime all ");
 		seq_printf(file,
-			"one mult none retry canc nocan reset rcan\n");
+			"one mult none retry canc nocan reset rcan ");
+		seq_printf(file,
+			"disable enable\n");
 	}
 	if (cpu < num_possible_cpus() && cpu_online(cpu)) {
 		stat = &per_cpu(ptcstats, cpu);
@@ -1054,7 +993,7 @@
 
 		/* destination side statistics */
 		seq_printf(file,
-			"%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
+			"%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
 			uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
 				UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
 			stat->d_requestee, cycles_2_us(stat->d_time),
@@ -1062,6 +1001,8 @@
 			stat->d_nomsg, stat->d_retries, stat->d_canceled,
 			stat->d_nocanceled, stat->d_resets,
 			stat->d_rcanceled);
+		seq_printf(file, "%ld %ld\n",
+			stat->s_bau_disabled, stat->s_bau_reenabled);
 	}
 
 	return 0;
@@ -1175,6 +1112,10 @@
 		"reset: number of ipi-style reset requests processed\n");
 		printk(KERN_DEBUG
 		"rcan: number messages canceled by reset requests\n");
+		printk(KERN_DEBUG
+		"disable: number times use of the BAU was disabled\n");
+		printk(KERN_DEBUG
+		"enable: number times use of the BAU was re-enabled\n");
 	} else if (input_arg == -1) {
 		for_each_present_cpu(cpu) {
 			stat = &per_cpu(ptcstats, cpu);
@@ -1635,6 +1568,7 @@
 	kfree(uvhub_descs);
 	for_each_present_cpu(cpu) {
 		bcp = &per_cpu(bau_control, cpu);
+		bcp->baudisabled = 0;
 		/* time interval to catch a hardware stay-busy bug */
 		bcp->timeout_interval = microsec_2_cycles(2*timeout_us);
 		bcp->max_bau_concurrent = max_bau_concurrent;
@@ -1677,6 +1609,8 @@
 	uv_nshift = uv_hub_info->m_val;
 	uv_mmask = (1UL << uv_hub_info->m_val) - 1;
 	nuvhubs = uv_num_possible_blades();
+	spin_lock_init(&disable_lock);
+	congested_cycles = microsec_2_cycles(congested_response_us);
 
 	uv_init_per_cpu(nuvhubs);
 
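
With the patch applied, the header and each cpu's line of
/proc/sgi_uv/ptc_statistics gain two trailing columns, disable and
enable, and the legend printed by the file's write handler documents
both. The throttle is governed by the existing tunables referenced
above: congested_response_us (a completion slower than this counts as
congested), congested_reps (how many requests must be measured before
the running average can trip the throttle), and congested_period (how
many seconds the BAU stays disabled).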