x86, UV: Disable BAU on network congestion

The numalink network can become so congested that TLB shootdown
using the Broadcast Assist Unit becomes slower than using IPIs.

In that case, disable the use of the BAU for a period of time.
The period is tunable. When the period expires, the use of the
BAU is re-enabled. A count of these actions is added to the
statistics file.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: gregkh@suse.de
LKML-Reference: <E1OJvNy-0004a4-0a@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

authored by Cliff Wickman and committed by Ingo Molnar 50fb55ac e8e5e8a8

+77 -3
+4
arch/x86/include/asm/uv/uv_bau.h
··· 34 */ 35 36 #define UV_ITEMS_PER_DESCRIPTOR 8 37 #define MAX_BAU_CONCURRENT 3 38 #define UV_CPUS_PER_ACT_STATUS 32 39 #define UV_ACT_STATUS_MASK 0x3 ··· 339 int timeout_tries; 340 int ipi_attempts; 341 int conseccompletes; 342 int set_bau_off; 343 short cpu; 344 short uvhub_cpu; ··· 391 unsigned long s_busy; /* status stayed busy past s/w timer */ 392 unsigned long s_throttles; /* waits in throttle */ 393 unsigned long s_retry_messages; /* retry broadcasts */ 394 /* destination statistics */ 395 unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */ 396 unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */
··· 34 */ 35 36 #define UV_ITEMS_PER_DESCRIPTOR 8 37 + /* the 'throttle' to prevent the hardware stay-busy bug */ 38 #define MAX_BAU_CONCURRENT 3 39 #define UV_CPUS_PER_ACT_STATUS 32 40 #define UV_ACT_STATUS_MASK 0x3 ··· 338 int timeout_tries; 339 int ipi_attempts; 340 int conseccompletes; 341 + int baudisabled; 342 int set_bau_off; 343 short cpu; 344 short uvhub_cpu; ··· 389 unsigned long s_busy; /* status stayed busy past s/w timer */ 390 unsigned long s_throttles; /* waits in throttle */ 391 unsigned long s_retry_messages; /* retry broadcasts */ 392 + unsigned long s_bau_reenabled; /* for bau enable/disable */ 393 + unsigned long s_bau_disabled; /* for bau enable/disable */ 394 /* destination statistics */ 395 unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */ 396 unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */
+73 -3
arch/x86/kernel/tlb_uv.c
··· 44 }; 45 static int timeout_us; 46 static int nobau; 47 48 /* tunables: */ 49 static int max_bau_concurrent = MAX_BAU_CONCURRENT; ··· 522 return 1; 523 } 524 525 /** 526 * uv_flush_send_and_wait 527 * ··· 713 if (time2 > time1) { 714 elapsed = time2 - time1; 715 stat->s_time += elapsed; 716 } else 717 stat->s_requestor--; /* don't count this one */ 718 if (completion_status == FLUSH_COMPLETE && try > 1) ··· 787 struct cpumask *flush_mask; 788 struct ptc_stats *stat; 789 struct bau_control *bcp; 790 791 /* kernel was booted 'nobau' */ 792 if (nobau) 793 return cpumask; 794 795 bcp = &per_cpu(bau_control, cpu); 796 797 /* 798 * Each sending cpu has a per-cpu mask which it fills from the caller's ··· 853 else 854 return NULL; 855 } 856 - stat = &per_cpu(ptcstats, cpu); 857 stat->s_requestor++; 858 stat->s_ntargcpu += remotes; 859 remotes = bau_uvhub_weight(&bau_desc->distribution); ··· 1032 seq_printf(file, 1033 "sw_ack recv rtime all "); 1034 seq_printf(file, 1035 - "one mult none retry canc nocan reset rcan\n"); 1036 } 1037 if (cpu < num_possible_cpus() && cpu_online(cpu)) { 1038 stat = &per_cpu(ptcstats, cpu); ··· 1054 1055 /* destination side statistics */ 1056 seq_printf(file, 1057 - "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", 1058 uv_read_global_mmr64(uv_cpu_to_pnode(cpu), 1059 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), 1060 stat->d_requestee, cycles_2_us(stat->d_time), ··· 1062 stat->d_nomsg, stat->d_retries, stat->d_canceled, 1063 stat->d_nocanceled, stat->d_resets, 1064 stat->d_rcanceled); 1065 } 1066 1067 return 0; ··· 1175 "reset: number of ipi-style reset requests processed\n"); 1176 printk(KERN_DEBUG 1177 "rcan: number messages canceled by reset requests\n"); 1178 } else if (input_arg == -1) { 1179 for_each_present_cpu(cpu) { 1180 stat = &per_cpu(ptcstats, cpu); ··· 1635 kfree(uvhub_descs); 1636 for_each_present_cpu(cpu) { 1637 bcp = &per_cpu(bau_control, cpu); 1638 /* time interval to catch a hardware stay-busy bug */ 1639 bcp->timeout_interval 
= microsec_2_cycles(2*timeout_us); 1640 bcp->max_bau_concurrent = max_bau_concurrent; ··· 1677 uv_nshift = uv_hub_info->m_val; 1678 uv_mmask = (1UL << uv_hub_info->m_val) - 1; 1679 nuvhubs = uv_num_possible_blades(); 1680 1681 uv_init_per_cpu(nuvhubs); 1682
··· 44 }; 45 static int timeout_us; 46 static int nobau; 47 + static int baudisabled; 48 + static spinlock_t disable_lock; 49 + static cycles_t congested_cycles; 50 51 /* tunables: */ 52 static int max_bau_concurrent = MAX_BAU_CONCURRENT; ··· 519 return 1; 520 } 521 522 + /* 523 + * Completions are taking a very long time due to a congested numalink 524 + * network. 525 + */ 526 + static void 527 + disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat) 528 + { 529 + int tcpu; 530 + struct bau_control *tbcp; 531 + 532 + /* let only one cpu do this disabling */ 533 + spin_lock(&disable_lock); 534 + if (!baudisabled && bcp->period_requests && 535 + ((bcp->period_time / bcp->period_requests) > congested_cycles)) { 536 + /* it becomes this cpu's job to turn on the use of the 537 + BAU again */ 538 + baudisabled = 1; 539 + bcp->set_bau_off = 1; 540 + bcp->set_bau_on_time = get_cycles() + 541 + sec_2_cycles(bcp->congested_period); 542 + stat->s_bau_disabled++; 543 + for_each_present_cpu(tcpu) { 544 + tbcp = &per_cpu(bau_control, tcpu); 545 + tbcp->baudisabled = 1; 546 + } 547 + } 548 + spin_unlock(&disable_lock); 549 + } 550 + 551 /** 552 * uv_flush_send_and_wait 553 * ··· 681 if (time2 > time1) { 682 elapsed = time2 - time1; 683 stat->s_time += elapsed; 684 + if ((completion_status == FLUSH_COMPLETE) && (try == 1)) { 685 + bcp->period_requests++; 686 + bcp->period_time += elapsed; 687 + if ((elapsed > congested_cycles) && 688 + (bcp->period_requests > bcp->congested_reps)) { 689 + disable_for_congestion(bcp, stat); 690 + } 691 + } 692 } else 693 stat->s_requestor--; /* don't count this one */ 694 if (completion_status == FLUSH_COMPLETE && try > 1) ··· 747 struct cpumask *flush_mask; 748 struct ptc_stats *stat; 749 struct bau_control *bcp; 750 + struct bau_control *tbcp; 751 752 /* kernel was booted 'nobau' */ 753 if (nobau) 754 return cpumask; 755 756 bcp = &per_cpu(bau_control, cpu); 757 + stat = &per_cpu(ptcstats, cpu); 758 + 759 + /* bau was disabled 
due to slow response */ 760 + if (bcp->baudisabled) { 761 + /* the cpu that disabled it must re-enable it */ 762 + if (bcp->set_bau_off) { 763 + if (get_cycles() >= bcp->set_bau_on_time) { 764 + stat->s_bau_reenabled++; 765 + baudisabled = 0; 766 + for_each_present_cpu(tcpu) { 767 + tbcp = &per_cpu(bau_control, tcpu); 768 + tbcp->baudisabled = 0; 769 + tbcp->period_requests = 0; 770 + tbcp->period_time = 0; 771 + } 772 + } 773 + } 774 + return cpumask; 775 + } 776 777 /* 778 * Each sending cpu has a per-cpu mask which it fills from the caller's ··· 793 else 794 return NULL; 795 } 796 stat->s_requestor++; 797 stat->s_ntargcpu += remotes; 798 remotes = bau_uvhub_weight(&bau_desc->distribution); ··· 973 seq_printf(file, 974 "sw_ack recv rtime all "); 975 seq_printf(file, 976 + "one mult none retry canc nocan reset rcan "); 977 + seq_printf(file, 978 + "disable enable\n"); 979 } 980 if (cpu < num_possible_cpus() && cpu_online(cpu)) { 981 stat = &per_cpu(ptcstats, cpu); ··· 993 994 /* destination side statistics */ 995 seq_printf(file, 996 + "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", 997 uv_read_global_mmr64(uv_cpu_to_pnode(cpu), 998 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), 999 stat->d_requestee, cycles_2_us(stat->d_time), ··· 1001 stat->d_nomsg, stat->d_retries, stat->d_canceled, 1002 stat->d_nocanceled, stat->d_resets, 1003 stat->d_rcanceled); 1004 + seq_printf(file, "%ld %ld\n", 1005 + stat->s_bau_disabled, stat->s_bau_reenabled); 1006 } 1007 1008 return 0; ··· 1112 "reset: number of ipi-style reset requests processed\n"); 1113 printk(KERN_DEBUG 1114 "rcan: number messages canceled by reset requests\n"); 1115 + printk(KERN_DEBUG 1116 + "disable: number times use of the BAU was disabled\n"); 1117 + printk(KERN_DEBUG 1118 + "enable: number times use of the BAU was re-enabled\n"); 1119 } else if (input_arg == -1) { 1120 for_each_present_cpu(cpu) { 1121 stat = &per_cpu(ptcstats, cpu); ··· 1568 kfree(uvhub_descs); 1569 for_each_present_cpu(cpu) { 1570 bcp = 
&per_cpu(bau_control, cpu); 1571 + bcp->baudisabled = 0; 1572 /* time interval to catch a hardware stay-busy bug */ 1573 bcp->timeout_interval = microsec_2_cycles(2*timeout_us); 1574 bcp->max_bau_concurrent = max_bau_concurrent; ··· 1609 uv_nshift = uv_hub_info->m_val; 1610 uv_mmask = (1UL << uv_hub_info->m_val) - 1; 1611 nuvhubs = uv_num_possible_blades(); 1612 + spin_lock_init(&disable_lock); 1613 + congested_cycles = microsec_2_cycles(congested_response_us); 1614 1615 uv_init_per_cpu(nuvhubs); 1616