x86, UV: BAU broadcast to the local hub

Make the Broadcast Assist Unit (BAU) driver use the BAU for TLB
shootdowns of cpus on the local uvhub.

It was previously thought that an IPI might be faster for the cpus
on the local hub. But the IPI operation would have to follow
the completion of the BAU broadcast anyway. So we now broadcast to
the local uvhub in all cases except when the current cpu is the
only local cpu in the mask.

This simplifies uv_flush_send_and_wait(): it now reports either
that all shootdowns completed, or that none did.

Adjust the statistics to account for shootdowns on the local
uvhub.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: gregkh@suse.de
LKML-Reference: <E1OJvNy-0004aq-G7@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

authored by Cliff Wickman and committed by Ingo Molnar 450a007e 7fba1bcd

+58 -85
+5
arch/x86/include/asm/uv/uv_bau.h
··· 346 346 unsigned long s_time; /* time spent in sending side */ 347 347 unsigned long s_retriesok; /* successful retries */ 348 348 unsigned long s_ntargcpu; /* total number of cpu's targeted */ 349 + unsigned long s_ntargself; /* times the sending cpu was targeted */ 350 + unsigned long s_ntarglocals; /* targets of cpus on the local blade */ 351 + unsigned long s_ntargremotes; /* targets of cpus on remote blades */ 352 + unsigned long s_ntarglocaluvhub; /* targets of the local hub */ 353 + unsigned long s_ntargremoteuvhub; /* remotes hubs targeted */ 349 354 unsigned long s_ntarguvhub; /* total number of uvhubs targeted */ 350 355 unsigned long s_ntarguvhub16; /* number of times target hubs >= 16*/ 351 356 unsigned long s_ntarguvhub8; /* number of times target hubs >= 8 */
+53 -85
arch/x86/kernel/tlb_uv.c
··· 400 400 unsigned long mmr_offset, int right_shift, int this_cpu, 401 401 struct bau_control *bcp, struct bau_control *smaster, long try) 402 402 { 403 - int relaxes = 0; 404 403 unsigned long descriptor_status; 405 - unsigned long mmr; 406 - unsigned long mask; 407 404 cycles_t ttime; 408 405 struct ptc_stats *stat = bcp->statp; 409 406 struct bau_control *hmaster; ··· 521 524 * The flush_mask contains the cpus the broadcast is to be sent to, plus 522 525 * cpus that are on the local uvhub. 523 526 * 524 - * Returns NULL if all flushing represented in the mask was done. The mask 525 - * is zeroed. 526 - * Returns @flush_mask if some remote flushing remains to be done. The 527 - * mask will have some bits still set, representing any cpus on the local 528 - * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed. 527 + * Returns 0 if all flushing represented in the mask was done. 528 + * Returns 1 if it gives up entirely and the original cpu mask is to be 529 + * returned to the kernel. 529 530 */ 530 - const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, 531 - struct cpumask *flush_mask, 532 - struct bau_control *bcp) 531 + int uv_flush_send_and_wait(struct bau_desc *bau_desc, 532 + struct cpumask *flush_mask, struct bau_control *bcp) 533 533 { 534 534 int right_shift; 535 - int uvhub; 536 - int bit; 537 535 int completion_status = 0; 538 536 int seq_number = 0; 539 537 long try = 0; 540 538 int cpu = bcp->uvhub_cpu; 541 539 int this_cpu = bcp->cpu; 542 - int this_uvhub = bcp->uvhub; 543 540 unsigned long mmr_offset; 544 541 unsigned long index; 545 542 cycles_t time1; ··· 543 552 struct bau_control *smaster = bcp->socket_master; 544 553 struct bau_control *hmaster = bcp->uvhub_master; 545 554 546 - /* 547 - * Spin here while there are hmaster->max_bau_concurrent or more active 548 - * descriptors. This is the per-uvhub 'throttle'. 
549 - */ 550 555 if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, 551 556 &hmaster->active_descriptor_count, 552 557 hmaster->max_bau_concurrent)) { ··· 578 591 index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | 579 592 bcp->uvhub_cpu; 580 593 bcp->send_message = get_cycles(); 581 - 582 594 uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); 583 - 584 595 try++; 585 596 completion_status = uv_wait_completion(bau_desc, mmr_offset, 586 597 right_shift, this_cpu, bcp, smaster, try); ··· 637 652 (hmaster->max_bau_concurrent < 638 653 hmaster->max_bau_concurrent_constant)) 639 654 hmaster->max_bau_concurrent++; 640 - 641 - /* 642 - * hold any cpu not timing out here; no other cpu currently held by 643 - * the 'throttle' should enter the activation code 644 - */ 645 655 while (hmaster->uvhub_quiesce) 646 656 cpu_relax(); 647 657 atomic_dec(&hmaster->active_descriptor_count); 648 - 649 - /* guard against cycles wrap */ 650 658 if (time2 > time1) { 651 659 elapsed = time2 - time1; 652 660 stat->s_time += elapsed; ··· 652 674 } 653 675 } 654 676 } else 655 - stat->s_requestor--; /* don't count this one */ 677 + stat->s_requestor--; 656 678 if (completion_status == FLUSH_COMPLETE && try > 1) 657 679 stat->s_retriesok++; 658 680 else if (completion_status == FLUSH_GIVEUP) { 659 - /* 660 - * Cause the caller to do an IPI-style TLB shootdown on 661 - * the target cpu's, all of which are still in the mask. 662 - */ 663 681 stat->s_giveup++; 664 - return flush_mask; 682 + return 1; 665 683 } 666 - 667 - /* 668 - * Success, so clear the remote cpu's from the mask so we don't 669 - * use the IPI method of shootdown on them. 
670 - */ 671 - for_each_cpu(bit, flush_mask) { 672 - uvhub = uv_cpu_to_blade_id(bit); 673 - if (uvhub == this_uvhub) 674 - continue; 675 - cpumask_clear_cpu(bit, flush_mask); 676 - } 677 - if (!cpumask_empty(flush_mask)) 678 - return flush_mask; 679 - 680 - return NULL; 684 + return 0; 681 685 } 682 686 683 687 /** ··· 691 731 struct mm_struct *mm, 692 732 unsigned long va, unsigned int cpu) 693 733 { 694 - int remotes; 695 734 int tcpu; 696 735 int uvhub; 697 736 int locals = 0; 737 + int remotes = 0; 738 + int hubs = 0; 698 739 struct bau_desc *bau_desc; 699 740 struct cpumask *flush_mask; 700 741 struct ptc_stats *stat; ··· 729 768 730 769 /* 731 770 * Each sending cpu has a per-cpu mask which it fills from the caller's 732 - * cpu mask. Only remote cpus are converted to uvhubs and copied. 771 + * cpu mask. All cpus are converted to uvhubs and copied to the 772 + * activation descriptor. 733 773 */ 734 774 flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu); 735 - /* 736 - * copy cpumask to flush_mask, removing current cpu 737 - * (current cpu should already have been flushed by the caller and 738 - * should never be returned if we return flush_mask) 739 - */ 775 + /* don't actually do a shootdown of the local cpu */ 740 776 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); 741 777 if (cpu_isset(cpu, *cpumask)) 742 - locals++; /* current cpu was targeted */ 778 + stat->s_ntargself++; 743 779 744 780 bau_desc = bcp->descriptor_base; 745 781 bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; 746 782 747 783 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); 748 - remotes = 0; 784 + 785 + /* cpu statistics */ 749 786 for_each_cpu(tcpu, flush_mask) { 750 787 uvhub = uv_cpu_to_blade_id(tcpu); 751 - if (uvhub == bcp->uvhub) { 752 - locals++; 753 - continue; 754 - } 755 788 bau_uvhub_set(uvhub, &bau_desc->distribution); 756 - remotes++; 757 - } 758 - if (remotes == 0) { 759 - /* 760 - * No off_hub flushing; return status for local hub. 
761 - * Return the caller's mask if all were local (the current 762 - * cpu may be in that mask). 763 - */ 764 - if (locals) 765 - return cpumask; 789 + if (uvhub == bcp->uvhub) 790 + locals++; 766 791 else 767 - return NULL; 792 + remotes++; 768 793 } 794 + if ((locals + remotes) == 0) 795 + return NULL; 769 796 stat->s_requestor++; 770 - stat->s_ntargcpu += remotes; 797 + stat->s_ntargcpu += remotes + locals; 798 + stat->s_ntargremotes += remotes; 799 + stat->s_ntarglocals += locals; 771 800 remotes = bau_uvhub_weight(&bau_desc->distribution); 772 - stat->s_ntarguvhub += remotes; 773 - if (remotes >= 16) 801 + 802 + /* uvhub statistics */ 803 + hubs = bau_uvhub_weight(&bau_desc->distribution); 804 + if (locals) { 805 + stat->s_ntarglocaluvhub++; 806 + stat->s_ntargremoteuvhub += (hubs - 1); 807 + } else 808 + stat->s_ntargremoteuvhub += hubs; 809 + stat->s_ntarguvhub += hubs; 810 + if (hubs >= 16) 774 811 stat->s_ntarguvhub16++; 775 - else if (remotes >= 8) 812 + else if (hubs >= 8) 776 813 stat->s_ntarguvhub8++; 777 - else if (remotes >= 4) 814 + else if (hubs >= 4) 778 815 stat->s_ntarguvhub4++; 779 - else if (remotes >= 2) 816 + else if (hubs >= 2) 780 817 stat->s_ntarguvhub2++; 781 818 else 782 819 stat->s_ntarguvhub1++; ··· 783 824 bau_desc->payload.sending_cpu = cpu; 784 825 785 826 /* 786 - * uv_flush_send_and_wait returns null if all cpu's were messaged, or 787 - * the adjusted flush_mask if any cpu's were not messaged. 827 + * uv_flush_send_and_wait returns 0 if all cpu's were messaged, 828 + * or 1 if it gave up and the original cpumask should be returned. 
788 829 */ 789 - return uv_flush_send_and_wait(bau_desc, flush_mask, bcp); 830 + if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp)) 831 + return NULL; 832 + else 833 + return cpumask; 790 834 } 791 835 792 836 /* ··· 938 976 939 977 if (!cpu) { 940 978 seq_printf(file, 941 - "# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 "); 979 + "# cpu sent stime self locals remotes ncpus localhub "); 942 980 seq_printf(file, 943 - "numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto "); 981 + "remotehub numuvhubs numuvhubs16 numuvhubs8 "); 982 + seq_printf(file, 983 + "numuvhubs4 numuvhubs2 numuvhubs1 dto "); 944 984 seq_printf(file, 945 985 "retries rok resetp resett giveup sto bz throt "); 946 986 seq_printf(file, ··· 958 994 seq_printf(file, 959 995 "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", 960 996 cpu, stat->s_requestor, cycles_2_us(stat->s_time), 961 - stat->s_ntarguvhub, stat->s_ntarguvhub16, 997 + stat->s_ntargself, stat->s_ntarglocals, 998 + stat->s_ntargremotes, stat->s_ntargcpu, 999 + stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub, 1000 + stat->s_ntarguvhub, stat->s_ntarguvhub16); 1001 + seq_printf(file, "%ld %ld %ld %ld %ld ", 962 1002 stat->s_ntarguvhub8, stat->s_ntarguvhub4, 963 1003 stat->s_ntarguvhub2, stat->s_ntarguvhub1, 964 - stat->s_ntargcpu, stat->s_dtimeout); 1004 + stat->s_dtimeout); 965 1005 seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", 966 1006 stat->s_retry_messages, stat->s_retriesok, 967 1007 stat->s_resets_plug, stat->s_resets_timeout,