Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

powerpc: Detect the presence of big-cores via "ibm,thread-groups"

On IBM POWER9, the device tree exposes a property array identified by
"ibm,thread-groups" which will indicate which groups of threads share
a particular set of resources.

As of today we only have one form of grouping identifying the group of
threads in the core that share the L1 cache, translation cache and
instruction data flow.

This patch adds helper functions to parse the contents of
"ibm,thread-groups" and populate a per-cpu variable to cache
information about siblings of each CPU that share the L1, translation
cache and instruction data-flow.

It also defines a new global variable named "has_big_cores" which
indicates if the cores on this configuration have multiple groups of
threads that share L1 cache.

For each online CPU, it maintains a cpu_smallcore_mask, which
indicates the online siblings which share the L1-cache with it.

Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

authored by

Gautham R. Shenoy and committed by
Michael Ellerman
425752c6 bf6cbd0c

+235
+2
arch/powerpc/include/asm/cputhreads.h
··· 23 23 extern int threads_per_core; 24 24 extern int threads_per_subcore; 25 25 extern int threads_shift; 26 + extern bool has_big_cores; 26 27 extern cpumask_t threads_core_mask; 27 28 #else 28 29 #define threads_per_core 1 29 30 #define threads_per_subcore 1 30 31 #define threads_shift 0 32 + #define has_big_cores 0 31 33 #define threads_core_mask (*get_cpu_mask(0)) 32 34 #endif 33 35
+11
arch/powerpc/include/asm/smp.h
··· 100 100 DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); 101 101 DECLARE_PER_CPU(cpumask_var_t, cpu_l2_cache_map); 102 102 DECLARE_PER_CPU(cpumask_var_t, cpu_core_map); 103 + DECLARE_PER_CPU(cpumask_var_t, cpu_smallcore_map); 103 104 104 105 static inline struct cpumask *cpu_sibling_mask(int cpu) 105 106 { ··· 115 114 static inline struct cpumask *cpu_l2_cache_mask(int cpu) 116 115 { 117 116 return per_cpu(cpu_l2_cache_map, cpu); 117 + } 118 + 119 + static inline struct cpumask *cpu_smallcore_mask(int cpu) 120 + { 121 + return per_cpu(cpu_smallcore_map, cpu); 118 122 } 119 123 120 124 extern int cpu_to_core_id(int cpu); ··· 168 162 static inline void inhibit_secondary_onlining(void) {} 169 163 static inline void uninhibit_secondary_onlining(void) {} 170 164 static inline const struct cpumask *cpu_sibling_mask(int cpu) 165 + { 166 + return cpumask_of(cpu); 167 + } 168 + 169 + static inline const struct cpumask *cpu_smallcore_mask(int cpu) 171 170 { 172 171 return cpumask_of(cpu); 173 172 }
+222
arch/powerpc/kernel/smp.c
··· 75 75 #endif 76 76 77 77 struct thread_info *secondary_ti; 78 + bool has_big_cores; 78 79 79 80 DEFINE_PER_CPU(cpumask_var_t, cpu_sibling_map); 81 + DEFINE_PER_CPU(cpumask_var_t, cpu_smallcore_map); 80 82 DEFINE_PER_CPU(cpumask_var_t, cpu_l2_cache_map); 81 83 DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); 82 84 83 85 EXPORT_PER_CPU_SYMBOL(cpu_sibling_map); 84 86 EXPORT_PER_CPU_SYMBOL(cpu_l2_cache_map); 85 87 EXPORT_PER_CPU_SYMBOL(cpu_core_map); 88 + EXPORT_SYMBOL_GPL(has_big_cores); 89 + 90 + #define MAX_THREAD_LIST_SIZE 8 91 + #define THREAD_GROUP_SHARE_L1 1 92 + struct thread_groups { 93 + unsigned int property; 94 + unsigned int nr_groups; 95 + unsigned int threads_per_group; 96 + unsigned int thread_list[MAX_THREAD_LIST_SIZE]; 97 + }; 98 + 99 + /* 100 + * On big-cores system, cpu_l1_cache_map for each CPU corresponds to 101 + * the set its siblings that share the L1-cache. 102 + */ 103 + DEFINE_PER_CPU(cpumask_var_t, cpu_l1_cache_map); 86 104 87 105 /* SMP operations for this machine */ 88 106 struct smp_ops_t *smp_ops; ··· 693 675 } 694 676 #endif 695 677 678 + /* 679 + * parse_thread_groups: Parses the "ibm,thread-groups" device tree 680 + * property for the CPU device node @dn and stores 681 + * the parsed output in the thread_groups 682 + * structure @tg if the ibm,thread-groups[0] 683 + * matches @property. 684 + * 685 + * @dn: The device node of the CPU device. 686 + * @tg: Pointer to a thread group structure into which the parsed 687 + * output of "ibm,thread-groups" is stored. 688 + * @property: The property of the thread-group that the caller is 689 + * interested in. 690 + * 691 + * ibm,thread-groups[0..N-1] array defines which group of threads in 692 + * the CPU-device node can be grouped together based on the property. 693 + * 694 + * ibm,thread-groups[0] tells us the property based on which the 695 + * threads are being grouped together. If this value is 1, it implies 696 + * that the threads in the same group share L1, translation cache. 
697 + * 698 + * ibm,thread-groups[1] tells us how many such thread groups exist. 699 + * 700 + * ibm,thread-groups[2] tells us the number of threads in each such 701 + * group. 702 + * 703 + * ibm,thread-groups[3..N-1] is the list of threads identified by 704 + * "ibm,ppc-interrupt-server#s" arranged as per their membership in 705 + * the grouping. 706 + * 707 + * Example: If ibm,thread-groups = [1,2,4,5,6,7,8,9,10,11,12] it 708 + * implies that there are 2 groups of 4 threads each, where each group 709 + * of threads share L1, translation cache. 710 + * 711 + * The "ibm,ppc-interrupt-server#s" of the first group is {5,6,7,8} 712 + * and the "ibm,ppc-interrupt-server#s" of the second group is {9, 10, 713 + * 11, 12} structure 714 + * 715 + * Returns 0 on success, -EINVAL if the property does not exist, 716 + * -ENODATA if property does not have a value, and -EOVERFLOW if the 717 + * property data isn't large enough. 718 + */ 719 + static int parse_thread_groups(struct device_node *dn, 720 + struct thread_groups *tg, 721 + unsigned int property) 722 + { 723 + int i; 724 + u32 thread_group_array[3 + MAX_THREAD_LIST_SIZE]; 725 + u32 *thread_list; 726 + size_t total_threads; 727 + int ret; 728 + 729 + ret = of_property_read_u32_array(dn, "ibm,thread-groups", 730 + thread_group_array, 3); 731 + if (ret) 732 + return ret; 733 + 734 + tg->property = thread_group_array[0]; 735 + tg->nr_groups = thread_group_array[1]; 736 + tg->threads_per_group = thread_group_array[2]; 737 + if (tg->property != property || 738 + tg->nr_groups < 1 || 739 + tg->threads_per_group < 1) 740 + return -ENODATA; 741 + 742 + total_threads = tg->nr_groups * tg->threads_per_group; 743 + 744 + ret = of_property_read_u32_array(dn, "ibm,thread-groups", 745 + thread_group_array, 746 + 3 + total_threads); 747 + if (ret) 748 + return ret; 749 + 750 + thread_list = &thread_group_array[3]; 751 + 752 + for (i = 0 ; i < total_threads; i++) 753 + tg->thread_list[i] = thread_list[i]; 754 + 755 + return 0; 756 + 
} 757 + 758 + /* 759 + * get_cpu_thread_group_start : Searches the thread group in tg->thread_list 760 + * that @cpu belongs to. 761 + * 762 + * @cpu : The logical CPU whose thread group is being searched. 763 + * @tg : The thread-group structure of the CPU node which @cpu belongs 764 + * to. 765 + * 766 + * Returns the index to tg->thread_list that points to the start 767 + * of the thread_group that @cpu belongs to. 768 + * 769 + * Returns -1 if cpu doesn't belong to any of the groups pointed to by 770 + * tg->thread_list. 771 + */ 772 + static int get_cpu_thread_group_start(int cpu, struct thread_groups *tg) 773 + { 774 + int hw_cpu_id = get_hard_smp_processor_id(cpu); 775 + int i, j; 776 + 777 + for (i = 0; i < tg->nr_groups; i++) { 778 + int group_start = i * tg->threads_per_group; 779 + 780 + for (j = 0; j < tg->threads_per_group; j++) { 781 + int idx = group_start + j; 782 + 783 + if (tg->thread_list[idx] == hw_cpu_id) 784 + return group_start; 785 + } 786 + } 787 + 788 + return -1; 789 + } 790 + 791 + static int init_cpu_l1_cache_map(int cpu) 792 + 793 + { 794 + struct device_node *dn = of_get_cpu_node(cpu, NULL); 795 + struct thread_groups tg = {.property = 0, 796 + .nr_groups = 0, 797 + .threads_per_group = 0}; 798 + int first_thread = cpu_first_thread_sibling(cpu); 799 + int i, cpu_group_start = -1, err = 0; 800 + 801 + if (!dn) 802 + return -ENODATA; 803 + 804 + err = parse_thread_groups(dn, &tg, THREAD_GROUP_SHARE_L1); 805 + if (err) 806 + goto out; 807 + 808 + zalloc_cpumask_var_node(&per_cpu(cpu_l1_cache_map, cpu), 809 + GFP_KERNEL, 810 + cpu_to_node(cpu)); 811 + 812 + cpu_group_start = get_cpu_thread_group_start(cpu, &tg); 813 + 814 + if (unlikely(cpu_group_start == -1)) { 815 + WARN_ON_ONCE(1); 816 + err = -ENODATA; 817 + goto out; 818 + } 819 + 820 + for (i = first_thread; i < first_thread + threads_per_core; i++) { 821 + int i_group_start = get_cpu_thread_group_start(i, &tg); 822 + 823 + if (unlikely(i_group_start == -1)) { 824 + 
WARN_ON_ONCE(1); 825 + err = -ENODATA; 826 + goto out; 827 + } 828 + 829 + if (i_group_start == cpu_group_start) 830 + cpumask_set_cpu(i, per_cpu(cpu_l1_cache_map, cpu)); 831 + } 832 + 833 + out: 834 + of_node_put(dn); 835 + return err; 836 + } 837 + 838 + static int init_big_cores(void) 839 + { 840 + int cpu; 841 + 842 + for_each_possible_cpu(cpu) { 843 + int err = init_cpu_l1_cache_map(cpu); 844 + 845 + if (err) 846 + return err; 847 + 848 + zalloc_cpumask_var_node(&per_cpu(cpu_smallcore_map, cpu), 849 + GFP_KERNEL, 850 + cpu_to_node(cpu)); 851 + } 852 + 853 + has_big_cores = true; 854 + return 0; 855 + } 856 + 696 857 void __init smp_prepare_cpus(unsigned int max_cpus) 697 858 { 698 859 unsigned int cpu; ··· 909 712 cpumask_set_cpu(boot_cpuid, cpu_sibling_mask(boot_cpuid)); 910 713 cpumask_set_cpu(boot_cpuid, cpu_l2_cache_mask(boot_cpuid)); 911 714 cpumask_set_cpu(boot_cpuid, cpu_core_mask(boot_cpuid)); 715 + 716 + init_big_cores(); 717 + if (has_big_cores) { 718 + cpumask_set_cpu(boot_cpuid, 719 + cpu_smallcore_mask(boot_cpuid)); 720 + } 912 721 913 722 if (smp_ops && smp_ops->probe) 914 723 smp_ops->probe(); ··· 1206 1003 set_cpus_unrelated(cpu, i, cpu_core_mask); 1207 1004 set_cpus_unrelated(cpu, i, cpu_l2_cache_mask); 1208 1005 set_cpus_unrelated(cpu, i, cpu_sibling_mask); 1006 + if (has_big_cores) 1007 + set_cpus_unrelated(cpu, i, cpu_smallcore_mask); 1209 1008 } 1210 1009 } 1211 1010 #endif 1011 + 1012 + static inline void add_cpu_to_smallcore_masks(int cpu) 1013 + { 1014 + struct cpumask *this_l1_cache_map = per_cpu(cpu_l1_cache_map, cpu); 1015 + int i, first_thread = cpu_first_thread_sibling(cpu); 1016 + 1017 + if (!has_big_cores) 1018 + return; 1019 + 1020 + cpumask_set_cpu(cpu, cpu_smallcore_mask(cpu)); 1021 + 1022 + for (i = first_thread; i < first_thread + threads_per_core; i++) { 1023 + if (cpu_online(i) && cpumask_test_cpu(i, this_l1_cache_map)) 1024 + set_cpus_related(i, cpu, cpu_smallcore_mask); 1025 + } 1026 + } 1212 1027 1213 1028 static void 
add_cpu_to_masks(int cpu) 1214 1029 { ··· 1244 1023 if (cpu_online(i)) 1245 1024 set_cpus_related(i, cpu, cpu_sibling_mask); 1246 1025 1026 + add_cpu_to_smallcore_masks(cpu); 1247 1027 /* 1248 1028 * Copy the thread sibling mask into the cache sibling mask 1249 1029 * and mark any CPUs that share an L2 with this CPU.