Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

mm and cache_info: remove unnecessary CPU cache info update

For each CPU hotplug event, we will update per-CPU data slice size and
corresponding PCP configuration for every online CPU to make the
implementation simple. But Kyle reported that this takes tens of seconds
during boot on a machine with 34 zones and 3840 CPUs.

So, in this patch, for each CPU hotplug event, we only update per-CPU data
slice size and corresponding PCP configuration for the CPUs that share
caches with the hotplugged CPU. With the patch, the system boot time is
reduced by 67 seconds on the machine.

Link: https://lkml.kernel.org/r/20240126081944.414520-1-ying.huang@intel.com
Fixes: 362d37a106dd ("mm, pcp: reduce lock contention for draining high-order pages")
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Originally-by: Kyle Meyer <kyle.meyer@hpe.com>
Reported-and-tested-by: Kyle Meyer <kyle.meyer@hpe.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>

Authored by Huang Ying, committed by Andrew Morton
5cec4eb7 96200c91

+63 -28
+44 -6
drivers/base/cacheinfo.c
··· 898 898 return rc; 899 899 } 900 900 901 + static unsigned int cpu_map_shared_cache(bool online, unsigned int cpu, 902 + cpumask_t **map) 903 + { 904 + struct cacheinfo *llc, *sib_llc; 905 + unsigned int sibling; 906 + 907 + if (!last_level_cache_is_valid(cpu)) 908 + return 0; 909 + 910 + llc = per_cpu_cacheinfo_idx(cpu, cache_leaves(cpu) - 1); 911 + 912 + if (llc->type != CACHE_TYPE_DATA && llc->type != CACHE_TYPE_UNIFIED) 913 + return 0; 914 + 915 + if (online) { 916 + *map = &llc->shared_cpu_map; 917 + return cpumask_weight(*map); 918 + } 919 + 920 + /* shared_cpu_map of offlined CPU will be cleared, so use sibling map */ 921 + for_each_cpu(sibling, &llc->shared_cpu_map) { 922 + if (sibling == cpu || !last_level_cache_is_valid(sibling)) 923 + continue; 924 + sib_llc = per_cpu_cacheinfo_idx(sibling, cache_leaves(sibling) - 1); 925 + *map = &sib_llc->shared_cpu_map; 926 + return cpumask_weight(*map); 927 + } 928 + 929 + return 0; 930 + } 931 + 901 932 /* 902 933 * Calculate the size of the per-CPU data cache slice. 
This can be 903 934 * used to estimate the size of the data cache slice that can be used ··· 960 929 ci->per_cpu_data_slice_size = llc->size / nr_shared; 961 930 } 962 931 963 - static void update_per_cpu_data_slice_size(bool cpu_online, unsigned int cpu) 932 + static void update_per_cpu_data_slice_size(bool cpu_online, unsigned int cpu, 933 + cpumask_t *cpu_map) 964 934 { 965 935 unsigned int icpu; 966 936 967 - for_each_online_cpu(icpu) { 937 + for_each_cpu(icpu, cpu_map) { 968 938 if (!cpu_online && icpu == cpu) 969 939 continue; 970 940 update_per_cpu_data_slice_size_cpu(icpu); 941 + setup_pcp_cacheinfo(icpu); 971 942 } 972 943 } 973 944 974 945 static int cacheinfo_cpu_online(unsigned int cpu) 975 946 { 976 947 int rc = detect_cache_attributes(cpu); 948 + cpumask_t *cpu_map; 977 949 978 950 if (rc) 979 951 return rc; 980 952 rc = cache_add_dev(cpu); 981 953 if (rc) 982 954 goto err; 983 - update_per_cpu_data_slice_size(true, cpu); 984 - setup_pcp_cacheinfo(); 955 + if (cpu_map_shared_cache(true, cpu, &cpu_map)) 956 + update_per_cpu_data_slice_size(true, cpu, cpu_map); 985 957 return 0; 986 958 err: 987 959 free_cache_attributes(cpu); ··· 993 959 994 960 static int cacheinfo_cpu_pre_down(unsigned int cpu) 995 961 { 962 + cpumask_t *cpu_map; 963 + unsigned int nr_shared; 964 + 965 + nr_shared = cpu_map_shared_cache(false, cpu, &cpu_map); 996 966 if (cpumask_test_and_clear_cpu(cpu, &cache_dev_map)) 997 967 cpu_cache_sysfs_exit(cpu); 998 968 999 969 free_cache_attributes(cpu); 1000 - update_per_cpu_data_slice_size(false, cpu); 1001 - setup_pcp_cacheinfo(); 970 + if (nr_shared > 1) 971 + update_per_cpu_data_slice_size(false, cpu, cpu_map); 1002 972 return 0; 1003 973 } 1004 974
+1 -1
include/linux/gfp.h
··· 334 334 void drain_local_pages(struct zone *zone); 335 335 336 336 void page_alloc_init_late(void); 337 - void setup_pcp_cacheinfo(void); 337 + void setup_pcp_cacheinfo(unsigned int cpu); 338 338 339 339 /* 340 340 * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what
+18 -21
mm/page_alloc.c
··· 5572 5572 mutex_unlock(&pcp_batch_high_lock); 5573 5573 } 5574 5574 5575 - static void zone_pcp_update_cacheinfo(struct zone *zone) 5575 + static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu) 5576 5576 { 5577 - int cpu; 5578 5577 struct per_cpu_pages *pcp; 5579 5578 struct cpu_cacheinfo *cci; 5580 5579 5581 - for_each_online_cpu(cpu) { 5582 - pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); 5583 - cci = get_cpu_cacheinfo(cpu); 5584 - /* 5585 - * If data cache slice of CPU is large enough, "pcp->batch" 5586 - * pages can be preserved in PCP before draining PCP for 5587 - * consecutive high-order pages freeing without allocation. 5588 - * This can reduce zone lock contention without hurting 5589 - * cache-hot pages sharing. 5590 - */ 5591 - spin_lock(&pcp->lock); 5592 - if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch) 5593 - pcp->flags |= PCPF_FREE_HIGH_BATCH; 5594 - else 5595 - pcp->flags &= ~PCPF_FREE_HIGH_BATCH; 5596 - spin_unlock(&pcp->lock); 5597 - } 5580 + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); 5581 + cci = get_cpu_cacheinfo(cpu); 5582 + /* 5583 + * If data cache slice of CPU is large enough, "pcp->batch" 5584 + * pages can be preserved in PCP before draining PCP for 5585 + * consecutive high-order pages freeing without allocation. 5586 + * This can reduce zone lock contention without hurting 5587 + * cache-hot pages sharing. 5588 + */ 5589 + spin_lock(&pcp->lock); 5590 + if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch) 5591 + pcp->flags |= PCPF_FREE_HIGH_BATCH; 5592 + else 5593 + pcp->flags &= ~PCPF_FREE_HIGH_BATCH; 5594 + spin_unlock(&pcp->lock); 5598 5595 } 5599 5596 5600 - void setup_pcp_cacheinfo(void) 5597 + void setup_pcp_cacheinfo(unsigned int cpu) 5601 5598 { 5602 5599 struct zone *zone; 5603 5600 5604 5601 for_each_populated_zone(zone) 5605 - zone_pcp_update_cacheinfo(zone); 5602 + zone_pcp_update_cacheinfo(zone, cpu); 5606 5603 } 5607 5604 5608 5605 /*