x86, UV: BAU broadcast to the local hub

Make the Broadcast Assist Unit driver use the BAU for TLB
shootdowns of cpus on the local uvhub.

It was previously thought that IPIs might be faster to the cpus
on the local uvhub. But the IPI operation would have to follow
the completion of the BAU broadcast anyway. So we now broadcast
to the local uvhub in all cases except when the current cpu is
the only local cpu in the mask.
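
Condensed from the tlb_uv.c hunk below, the sender's target
selection now puts every targeted cpu, local or remote, into the
descriptor's uvhub distribution map; only the sending cpu itself
is excluded from the hardware broadcast:

    cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
    for_each_cpu(tcpu, flush_mask) {
            uvhub = uv_cpu_to_blade_id(tcpu);
            bau_uvhub_set(uvhub, &bau_desc->distribution);
            if (uvhub == bcp->uvhub)
                    locals++;
            else
                    remotes++;
    }
    if ((locals + remotes) == 0)    /* sender was the only cpu in the mask */
            return NULL;            /* no broadcast; just flush locally */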

This simplifies uv_flush_send_and_wait(): it now returns with
either all shootdowns complete or none.
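
Concretely, the caller maps that all-or-none result onto the return
convention the kernel expects (condensed from the uv_flush_tlb_others()
hunk below):

    if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp))
            return NULL;            /* all shootdowns were done by the BAU */
    else
            return cpumask;         /* none were; kernel falls back to IPIs */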

Adjust the statistics to account for shootdowns on the local
uvhub.
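
The per-cpu counts are split into self, local and remote targets,
and the uvhub counts into local and remote hubs (new fields in
uv_bau.h, updated as in the tlb_uv.c hunk below):

    stat->s_ntargcpu += remotes + locals;
    stat->s_ntargremotes += remotes;
    stat->s_ntarglocals += locals;

    hubs = bau_uvhub_weight(&bau_desc->distribution);
    if (locals) {
            stat->s_ntarglocaluvhub++;
            stat->s_ntargremoteuvhub += (hubs - 1);
    } else
            stat->s_ntargremoteuvhub += hubs;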

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: gregkh@suse.de
LKML-Reference: <E1OJvNy-0004aq-G7@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

Authored by Cliff Wickman and committed by Ingo Molnar (450a007e, 7fba1bcd)

2 files changed, 58 insertions(+), 85 deletions(-)

arch/x86/include/asm/uv/uv_bau.h (+5 -0):
@@ ... @@
 	unsigned long s_time;			/* time spent in sending side */
 	unsigned long s_retriesok;		/* successful retries */
 	unsigned long s_ntargcpu;		/* total number of cpu's targeted */
+	unsigned long s_ntargself;		/* times the sending cpu was targeted */
+	unsigned long s_ntarglocals;		/* targets of cpus on the local blade */
+	unsigned long s_ntargremotes;		/* targets of cpus on remote blades */
+	unsigned long s_ntarglocaluvhub;	/* targets of the local hub */
+	unsigned long s_ntargremoteuvhub;	/* remotes hubs targeted */
 	unsigned long s_ntarguvhub;		/* total number of uvhubs targeted */
 	unsigned long s_ntarguvhub16;		/* number of times target hubs >= 16*/
 	unsigned long s_ntarguvhub8;		/* number of times target hubs >= 8 */
arch/x86/kernel/tlb_uv.c (+53 -85):
@@ ... @@
 		unsigned long mmr_offset, int right_shift, int this_cpu,
 		struct bau_control *bcp, struct bau_control *smaster, long try)
 {
-	int relaxes = 0;
 	unsigned long descriptor_status;
-	unsigned long mmr;
-	unsigned long mask;
 	cycles_t ttime;
 	struct ptc_stats *stat = bcp->statp;
 	struct bau_control *hmaster;
@@ ... @@
  * The flush_mask contains the cpus the broadcast is to be sent to, plus
  * cpus that are on the local uvhub.
  *
- * Returns NULL if all flushing represented in the mask was done. The mask
- * is zeroed.
- * Returns @flush_mask if some remote flushing remains to be done. The
- * mask will have some bits still set, representing any cpus on the local
- * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed.
+ * Returns 0 if all flushing represented in the mask was done.
+ * Returns 1 if it gives up entirely and the original cpu mask is to be
+ * returned to the kernel.
  */
-const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
-					     struct cpumask *flush_mask,
-					     struct bau_control *bcp)
+int uv_flush_send_and_wait(struct bau_desc *bau_desc,
+			   struct cpumask *flush_mask, struct bau_control *bcp)
 {
 	int right_shift;
-	int uvhub;
-	int bit;
 	int completion_status = 0;
 	int seq_number = 0;
 	long try = 0;
 	int cpu = bcp->uvhub_cpu;
 	int this_cpu = bcp->cpu;
-	int this_uvhub = bcp->uvhub;
 	unsigned long mmr_offset;
 	unsigned long index;
 	cycles_t time1;
@@ ... @@
 	struct bau_control *smaster = bcp->socket_master;
 	struct bau_control *hmaster = bcp->uvhub_master;
 
-	/*
-	 * Spin here while there are hmaster->max_bau_concurrent or more active
-	 * descriptors. This is the per-uvhub 'throttle'.
-	 */
 	if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
 			&hmaster->active_descriptor_count,
 			hmaster->max_bau_concurrent)) {
@@ ... @@
 	index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
 		bcp->uvhub_cpu;
 	bcp->send_message = get_cycles();
-
 	uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
-
 	try++;
 	completion_status = uv_wait_completion(bau_desc, mmr_offset,
 			right_shift, this_cpu, bcp, smaster, try);
@@ ... @@
 		(hmaster->max_bau_concurrent <
 		hmaster->max_bau_concurrent_constant))
 			hmaster->max_bau_concurrent++;
-
-	/*
-	 * hold any cpu not timing out here; no other cpu currently held by
-	 * the 'throttle' should enter the activation code
-	 */
 	while (hmaster->uvhub_quiesce)
 		cpu_relax();
 	atomic_dec(&hmaster->active_descriptor_count);
-
-	/* guard against cycles wrap */
 	if (time2 > time1) {
 		elapsed = time2 - time1;
 		stat->s_time += elapsed;
@@ ... @@
 			}
 		}
 	} else
-		stat->s_requestor--; /* don't count this one */
+		stat->s_requestor--;
 	if (completion_status == FLUSH_COMPLETE && try > 1)
 		stat->s_retriesok++;
 	else if (completion_status == FLUSH_GIVEUP) {
-		/*
-		 * Cause the caller to do an IPI-style TLB shootdown on
-		 * the target cpu's, all of which are still in the mask.
-		 */
 		stat->s_giveup++;
-		return flush_mask;
+		return 1;
 	}
-
-	/*
-	 * Success, so clear the remote cpu's from the mask so we don't
-	 * use the IPI method of shootdown on them.
-	 */
-	for_each_cpu(bit, flush_mask) {
-		uvhub = uv_cpu_to_blade_id(bit);
-		if (uvhub == this_uvhub)
-			continue;
-		cpumask_clear_cpu(bit, flush_mask);
-	}
-	if (!cpumask_empty(flush_mask))
-		return flush_mask;
-
-	return NULL;
+	return 0;
 }
@@ ... @@
 		struct mm_struct *mm,
 		unsigned long va, unsigned int cpu)
 {
-	int remotes;
 	int tcpu;
 	int uvhub;
 	int locals = 0;
+	int remotes = 0;
+	int hubs = 0;
 	struct bau_desc *bau_desc;
 	struct cpumask *flush_mask;
 	struct ptc_stats *stat;
@@ ... @@
 
 	/*
 	 * Each sending cpu has a per-cpu mask which it fills from the caller's
-	 * cpu mask. Only remote cpus are converted to uvhubs and copied.
+	 * cpu mask. All cpus are converted to uvhubs and copied to the
+	 * activation descriptor.
 	 */
 	flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
-	/*
-	 * copy cpumask to flush_mask, removing current cpu
-	 * (current cpu should already have been flushed by the caller and
-	 * should never be returned if we return flush_mask)
-	 */
+	/* don't actually do a shootdown of the local cpu */
 	cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
 	if (cpu_isset(cpu, *cpumask))
-		locals++;  /* current cpu was targeted */
+		stat->s_ntargself++;
 
 	bau_desc = bcp->descriptor_base;
 	bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
 
 	bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
-	remotes = 0;
+
+	/* cpu statistics */
 	for_each_cpu(tcpu, flush_mask) {
 		uvhub = uv_cpu_to_blade_id(tcpu);
-		if (uvhub == bcp->uvhub) {
-			locals++;
-			continue;
-		}
 		bau_uvhub_set(uvhub, &bau_desc->distribution);
-		remotes++;
-	}
-	if (remotes == 0) {
-		/*
-		 * No off_hub flushing; return status for local hub.
-		 * Return the caller's mask if all were local (the current
-		 * cpu may be in that mask).
-		 */
-		if (locals)
-			return cpumask;
+		if (uvhub == bcp->uvhub)
+			locals++;
 		else
-			return NULL;
+			remotes++;
 	}
+	if ((locals + remotes) == 0)
+		return NULL;
 	stat->s_requestor++;
-	stat->s_ntargcpu += remotes;
+	stat->s_ntargcpu += remotes + locals;
+	stat->s_ntargremotes += remotes;
+	stat->s_ntarglocals += locals;
 	remotes = bau_uvhub_weight(&bau_desc->distribution);
-	stat->s_ntarguvhub += remotes;
-	if (remotes >= 16)
+
+	/* uvhub statistics */
+	hubs = bau_uvhub_weight(&bau_desc->distribution);
+	if (locals) {
+		stat->s_ntarglocaluvhub++;
+		stat->s_ntargremoteuvhub += (hubs - 1);
+	} else
+		stat->s_ntargremoteuvhub += hubs;
+	stat->s_ntarguvhub += hubs;
+	if (hubs >= 16)
 		stat->s_ntarguvhub16++;
-	else if (remotes >= 8)
+	else if (hubs >= 8)
 		stat->s_ntarguvhub8++;
-	else if (remotes >= 4)
+	else if (hubs >= 4)
 		stat->s_ntarguvhub4++;
-	else if (remotes >= 2)
+	else if (hubs >= 2)
 		stat->s_ntarguvhub2++;
 	else
 		stat->s_ntarguvhub1++;
@@ ... @@
 	bau_desc->payload.sending_cpu = cpu;
 
 	/*
-	 * uv_flush_send_and_wait returns null if all cpu's were messaged, or
-	 * the adjusted flush_mask if any cpu's were not messaged.
+	 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
+	 * or 1 if it gave up and the original cpumask should be returned.
 	 */
-	return uv_flush_send_and_wait(bau_desc, flush_mask, bcp);
+	if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp))
+		return NULL;
+	else
+		return cpumask;
 }
@@ ... @@
 
 	if (!cpu) {
 		seq_printf(file,
-		"# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 ");
+		"# cpu sent stime self locals remotes ncpus localhub ");
 		seq_printf(file,
-		"numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto ");
+		"remotehub numuvhubs numuvhubs16 numuvhubs8 ");
+		seq_printf(file,
+		"numuvhubs4 numuvhubs2 numuvhubs1 dto ");
 		seq_printf(file,
 		"retries rok resetp resett giveup sto bz throt ");
 		seq_printf(file,
@@ ... @@
 		seq_printf(file,
 			"cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
 			cpu, stat->s_requestor, cycles_2_us(stat->s_time),
-			stat->s_ntarguvhub, stat->s_ntarguvhub16,
+			stat->s_ntargself, stat->s_ntarglocals,
+			stat->s_ntargremotes, stat->s_ntargcpu,
+			stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
+			stat->s_ntarguvhub, stat->s_ntarguvhub16);
+		seq_printf(file, "%ld %ld %ld %ld %ld ",
 			stat->s_ntarguvhub8, stat->s_ntarguvhub4,
 			stat->s_ntarguvhub2, stat->s_ntarguvhub1,
-			stat->s_ntargcpu, stat->s_dtimeout);
+			stat->s_dtimeout);
 		seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
 			stat->s_retry_messages, stat->s_retriesok,
 			stat->s_resets_plug, stat->s_resets_timeout,
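
With the header change above, each per-cpu line of the driver's
statistics output (exposed via /proc/sgi_uv/ptc_statistics, the path
given by UV_PTC_BASENAME in tlb_uv.c) now begins:

    # cpu sent stime self locals remotes ncpus localhub remotehub numuvhubs numuvhubs16 numuvhubs8 numuvhubs4 numuvhubs2 numuvhubs1 dto ...

where self, locals, remotes and ncpus report the new s_ntargself,
s_ntarglocals, s_ntargremotes and s_ntargcpu counters.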