Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge branch 'x86-cache-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 cache allocation interface from Thomas Gleixner:
"This provides support for Intel's Cache Allocation Technology, a cache
partitioning mechanism.

The interface is odd, but the hardware interface of that CAT stuff is
odd as well.

We tried hard to come up with an abstraction, but whatever we came up
with only allowed rather simple partitioning, with no way of sharing
and no way to deal with the per-package nature of this mechanism.

In the end we decided to expose the allocation bitmaps directly so all
combinations of the hardware can be utilized.

There are two ways of associating a cache partition:

- Task

A task can be added to a resource group. It uses the cache
partition associated with the group.

- CPU

All tasks which are not members of a resource group use the group
associated with the CPU they are running on.

That allows for simple CPU-based partitioning schemes.

The main expected users are:

- Virtualization, so a VM can trash only the associated part of the
cache without disturbing others

- Real-Time systems to separate RT and general workloads.

- Latency sensitive enterprise workloads

- In theory this can also be used to protect against cache
side-channel attacks"

[ Intel RDT is "Resource Director Technology". The interface really is
rather odd and very specific, which delayed this pull request while I
was thinking about it. The pull request itself came in early during
the merge window; I just delayed it until things had calmed down and I
had more time.

But people tell me they'll use this, and the good news is that it is
_so_ specific that it's rather independent of anything else, and no
user is going to depend on the interface since it's pretty rare. So if
push comes to shove, we can just remove the interface and nothing will
break ]
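
To make the two association modes described above concrete, here is a
minimal usage sketch based on the resctrl interface added by this
series (the mount point, group name, PID and CPU mask are arbitrary
examples; see Documentation/x86/intel_rdt_ui.txt below for the full
details):

# mount -t resctrl resctrl /sys/fs/resctrl
# mkdir /sys/fs/resctrl/p0

Task association: PID 1234 now uses p0's cache partition no matter
which CPU it runs on.

# echo 1234 > /sys/fs/resctrl/p0/tasks

CPU association: tasks that are not in any group use p0's partition
while they run on CPUs 2-3 (the "cpus" file takes a hex cpumask).

# echo c > /sys/fs/resctrl/p0/cpus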

* 'x86-cache-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (31 commits)
x86/intel_rdt: Implement show_options() for resctrlfs
x86/intel_rdt: Call intel_rdt_sched_in() with preemption disabled
x86/intel_rdt: Update task closid immediately on CPU in rmdir and unmount
x86/intel_rdt: Fix setting of closid when adding CPUs to a group
x86/intel_rdt: Update percpu closid immediately on CPUs affected by change
x86/intel_rdt: Reset per cpu closids on unmount
x86/intel_rdt: Select KERNFS when enabling INTEL_RDT_A
x86/intel_rdt: Prevent deadlock against hotplug lock
x86/intel_rdt: Protect info directory from removal
x86/intel_rdt: Add info files to Documentation
x86/intel_rdt: Export the minimum number of set mask bits in sysfs
x86/intel_rdt: Propagate error in rdt_mount() properly
x86/intel_rdt: Add a missing #include
MAINTAINERS: Add maintainer for Intel RDT resource allocation
x86/intel_rdt: Add scheduler hook
x86/intel_rdt: Add schemata file
x86/intel_rdt: Add tasks files
x86/intel_rdt: Add cpus file
x86/intel_rdt: Add mkdir to resctrl file system
x86/intel_rdt: Add "info" files to resctrl file system
...

+2320 -25
+16
Documentation/ABI/testing/sysfs-devices-system-cpu
··· 272 272 the modified cache line is written to main 273 273 memory only when it is replaced 274 274 275 + 276 + What: /sys/devices/system/cpu/cpu*/cache/index*/id 277 + Date: September 2016 278 + Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org> 279 + Description: Cache id 280 + 281 + The id provides a unique number for a specific instance of 282 + a cache of a particular type. E.g. there may be a level 283 + 3 unified cache on each socket in a server and we may 284 + assign them ids 0, 1, 2, ... 285 + 286 + Note that id value can be non-contiguous. E.g. level 1 287 + caches typically exist per core, but there may not be a 288 + power of two cores on a socket, so these caches may be 289 + numbered 0, 1, 2, 3, 4, 5, 8, 9, 10, ... 290 + 275 291 What: /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats 276 292 /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/turbo_stat 277 293 /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/sub_turbo_stat
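
For reference, a quick way to read the new cache "id" attribute
described above on a running system (just a sketch; "grep ." is used
so each value is printed together with its file name):

# grep . /sys/devices/system/cpu/cpu*/cache/index*/id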
+214
Documentation/x86/intel_rdt_ui.txt
··· 1 + User Interface for Resource Allocation in Intel Resource Director Technology 2 + 3 + Copyright (C) 2016 Intel Corporation 4 + 5 + Fenghua Yu <fenghua.yu@intel.com> 6 + Tony Luck <tony.luck@intel.com> 7 + 8 + This feature is enabled by the CONFIG_INTEL_RDT_A Kconfig and the 9 + X86 /proc/cpuinfo flag bits "rdt", "cat_l3" and "cdp_l3". 10 + 11 + To use the feature mount the file system: 12 + 13 + # mount -t resctrl resctrl [-o cdp] /sys/fs/resctrl 14 + 15 + mount options are: 16 + 17 + "cdp": Enable code/data prioritization in L3 cache allocations. 18 + 19 + 20 + Info directory 21 + -------------- 22 + 23 + The 'info' directory contains information about the enabled 24 + resources. Each resource has its own subdirectory. The subdirectory 25 + names reflect the resource names. Each subdirectory contains the 26 + following files: 27 + 28 + "num_closids": The number of CLOSIDs which are valid for this 29 + resource. The kernel uses the smallest number of 30 + CLOSIDs of all enabled resources as limit. 31 + 32 + "cbm_mask": The bitmask which is valid for this resource. This 33 + mask is equivalent to 100%. 34 + 35 + "min_cbm_bits": The minimum number of consecutive bits which must be 36 + set when writing a mask. 37 + 38 + 39 + Resource groups 40 + --------------- 41 + Resource groups are represented as directories in the resctrl file 42 + system. The default group is the root directory. Other groups may be 43 + created as desired by the system administrator using the "mkdir(1)" 44 + command, and removed using "rmdir(1)". 45 + 46 + There are three files associated with each group: 47 + 48 + "tasks": A list of tasks that belongs to this group. Tasks can be 49 + added to a group by writing the task ID to the "tasks" file 50 + (which will automatically remove them from the previous 51 + group to which they belonged). New tasks created by fork(2) 52 + and clone(2) are added to the same group as their parent. 53 + If a pid is not in any sub partition, it is in root partition 54 + (i.e. default partition). 55 + 56 + "cpus": A bitmask of logical CPUs assigned to this group. Writing 57 + a new mask can add/remove CPUs from this group. Added CPUs 58 + are removed from their previous group. Removed ones are 59 + given to the default (root) group. You cannot remove CPUs 60 + from the default group. 61 + 62 + "schemata": A list of all the resources available to this group. 63 + Each resource has its own line and format - see below for 64 + details. 65 + 66 + When a task is running the following rules define which resources 67 + are available to it: 68 + 69 + 1) If the task is a member of a non-default group, then the schemata 70 + for that group is used. 71 + 72 + 2) Else if the task belongs to the default group, but is running on a 73 + CPU that is assigned to some specific group, then the schemata for 74 + the CPU's group is used. 75 + 76 + 3) Otherwise the schemata for the default group is used. 77 + 78 + 79 + Schemata files - general concepts 80 + --------------------------------- 81 + Each line in the file describes one resource. The line starts with 82 + the name of the resource, followed by specific values to be applied 83 + in each of the instances of that resource on the system. 84 + 85 + Cache IDs 86 + --------- 87 + On current generation systems there is one L3 cache per socket and L2 88 + caches are generally just shared by the hyperthreads on a core, but this 89 + isn't an architectural requirement. 
We could have multiple separate L3 90 + caches on a socket, multiple cores could share an L2 cache. So instead 91 + of using "socket" or "core" to define the set of logical cpus sharing 92 + a resource we use a "Cache ID". At a given cache level this will be a 93 + unique number across the whole system (but it isn't guaranteed to be a 94 + contiguous sequence, there may be gaps). To find the ID for each logical 95 + CPU look in /sys/devices/system/cpu/cpu*/cache/index*/id 96 + 97 + Cache Bit Masks (CBM) 98 + --------------------- 99 + For cache resources we describe the portion of the cache that is available 100 + for allocation using a bitmask. The maximum value of the mask is defined 101 + by each cpu model (and may be different for different cache levels). It 102 + is found using CPUID, but is also provided in the "info" directory of 103 + the resctrl file system in "info/{resource}/cbm_mask". X86 hardware 104 + requires that these masks have all the '1' bits in a contiguous block. So 105 + 0x3, 0x6 and 0xC are legal 4-bit masks with two bits set, but 0x5, 0x9 106 + and 0xA are not. On a system with a 20-bit mask each bit represents 5% 107 + of the capacity of the cache. You could partition the cache into four 108 + equal parts with masks: 0x1f, 0x3e0, 0x7c00, 0xf8000. 109 + 110 + 111 + L3 details (code and data prioritization disabled) 112 + -------------------------------------------------- 113 + With CDP disabled the L3 schemata format is: 114 + 115 + L3:<cache_id0>=<cbm>;<cache_id1>=<cbm>;... 116 + 117 + L3 details (CDP enabled via mount option to resctrl) 118 + ---------------------------------------------------- 119 + When CDP is enabled L3 control is split into two separate resources 120 + so you can specify independent masks for code and data like this: 121 + 122 + L3data:<cache_id0>=<cbm>;<cache_id1>=<cbm>;... 123 + L3code:<cache_id0>=<cbm>;<cache_id1>=<cbm>;... 124 + 125 + L2 details 126 + ---------- 127 + L2 cache does not support code and data prioritization, so the 128 + schemata format is always: 129 + 130 + L2:<cache_id0>=<cbm>;<cache_id1>=<cbm>;... 131 + 132 + Example 1 133 + --------- 134 + On a two socket machine (one L3 cache per socket) with just four bits 135 + for cache bit masks 136 + 137 + # mount -t resctrl resctrl /sys/fs/resctrl 138 + # cd /sys/fs/resctrl 139 + # mkdir p0 p1 140 + # echo "L3:0=3;1=c" > /sys/fs/resctrl/p0/schemata 141 + # echo "L3:0=3;1=3" > /sys/fs/resctrl/p1/schemata 142 + 143 + The default resource group is unmodified, so we have access to all parts 144 + of all caches (its schemata file reads "L3:0=f;1=f"). 145 + 146 + Tasks that are under the control of group "p0" may only allocate from the 147 + "lower" 50% on cache ID 0, and the "upper" 50% of cache ID 1. 148 + Tasks in group "p1" use the "lower" 50% of cache on both sockets. 149 + 150 + Example 2 151 + --------- 152 + Again two sockets, but this time with a more realistic 20-bit mask. 153 + 154 + Two real time tasks pid=1234 running on processor 0 and pid=5678 running on 155 + processor 1 on socket 0 on a 2-socket and dual core machine. To avoid noisy 156 + neighbors, each of the two real-time tasks exclusively occupies one quarter 157 + of L3 cache on socket 0. 
158 + 159 + # mount -t resctrl resctrl /sys/fs/resctrl 160 + # cd /sys/fs/resctrl 161 + 162 + First we reset the schemata for the default group so that the "upper" 163 + 50% of the L3 cache on socket 0 cannot be used by ordinary tasks: 164 + 165 + # echo "L3:0=3ff;1=fffff" > schemata 166 + 167 + Next we make a resource group for our first real time task and give 168 + it access to the "top" 25% of the cache on socket 0. 169 + 170 + # mkdir p0 171 + # echo "L3:0=f8000;1=fffff" > p0/schemata 172 + 173 + Finally we move our first real time task into this resource group. We 174 + also use taskset(1) to ensure the task always runs on a dedicated CPU 175 + on socket 0. Most uses of resource groups will also constrain which 176 + processors tasks run on. 177 + 178 + # echo 1234 > p0/tasks 179 + # taskset -cp 1 1234 180 + 181 + Ditto for the second real time task (with the remaining 25% of cache): 182 + 183 + # mkdir p1 184 + # echo "L3:0=7c00;1=fffff" > p1/schemata 185 + # echo 5678 > p1/tasks 186 + # taskset -cp 2 5678 187 + 188 + Example 3 189 + --------- 190 + 191 + A single socket system which has real-time tasks running on core 4-7 and 192 + non real-time workload assigned to core 0-3. The real-time tasks share text 193 + and data, so a per task association is not required and due to interaction 194 + with the kernel it's desired that the kernel on these cores shares L3 with 195 + the tasks. 196 + 197 + # mount -t resctrl resctrl /sys/fs/resctrl 198 + # cd /sys/fs/resctrl 199 + 200 + First we reset the schemata for the default group so that the "upper" 201 + 50% of the L3 cache on socket 0 cannot be used by ordinary tasks: 202 + 203 + # echo "L3:0=3ff" > schemata 204 + 205 + Next we make a resource group for our real time cores and give 206 + it access to the "top" 50% of the cache on socket 0. 207 + 208 + # mkdir p0 209 + # echo "L3:0=ffc00;" > p0/schemata 210 + 211 + Finally we move core 4-7 over to the new group and make sure that the 212 + kernel and the tasks running there get 50% of the cache. 213 + 214 + # echo C0 > p0/cpus
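
A small sketch of inspecting the "info" directory limits described
above before writing any schemata, assuming the L3 resource is enabled
and the file system is mounted at the usual location:

# mount -t resctrl resctrl /sys/fs/resctrl
# cat /sys/fs/resctrl/info/L3/num_closids
# cat /sys/fs/resctrl/info/L3/cbm_mask
# cat /sys/fs/resctrl/info/L3/min_cbm_bits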
+8
MAINTAINERS
··· 10327 10327 S: Supported 10328 10328 F: drivers/infiniband/sw/rdmavt 10329 10329 10330 + RDT - RESOURCE ALLOCATION 10331 + M: Fenghua Yu <fenghua.yu@intel.com> 10332 + L: linux-kernel@vger.kernel.org 10333 + S: Supported 10334 + F: arch/x86/kernel/cpu/intel_rdt* 10335 + F: arch/x86/include/asm/intel_rdt* 10336 + F: Documentation/x86/intel_rdt* 10337 + 10330 10338 READ-COPY UPDATE (RCU) 10331 10339 M: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> 10332 10340 M: Josh Triplett <josh@joshtriplett.org>
+13
arch/x86/Kconfig
··· 412 412 def_bool y 413 413 depends on X86_GOLDFISH 414 414 415 + config INTEL_RDT_A 416 + bool "Intel Resource Director Technology Allocation support" 417 + default n 418 + depends on X86 && CPU_SUP_INTEL 419 + select KERNFS 420 + help 421 + Select to enable resource allocation which is a sub-feature of 422 + Intel Resource Director Technology(RDT). More information about 423 + RDT can be found in the Intel x86 Architecture Software 424 + Developer Manual. 425 + 426 + Say N if unsure. 427 + 415 428 if X86_32 416 429 config X86_EXTENDED_PLATFORM 417 430 bool "Support for extended (non-PC) x86 platforms"
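
Before mounting resctrl it is worth confirming that both the Kconfig
option above and the CPU support are present. A hedged sketch (the
config file path varies by distribution; the flag names follow the
cpufeatures.h additions below and may differ slightly from the
shorthand used in the documentation):

# grep CONFIG_INTEL_RDT_A /boot/config-$(uname -r)
# grep -o -w -E 'rdt_a|cat_l3|cat_l2|cdp_l3' /proc/cpuinfo | sort -u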
+2 -21
arch/x86/events/intel/cqm.c
··· 7 7 #include <linux/perf_event.h> 8 8 #include <linux/slab.h> 9 9 #include <asm/cpu_device_id.h> 10 + #include <asm/intel_rdt_common.h> 10 11 #include "../perf_event.h" 11 12 12 - #define MSR_IA32_PQR_ASSOC 0x0c8f 13 13 #define MSR_IA32_QM_CTR 0x0c8e 14 14 #define MSR_IA32_QM_EVTSEL 0x0c8d 15 15 ··· 24 24 static bool cqm_enabled, mbm_enabled; 25 25 unsigned int mbm_socket_max; 26 26 27 - /** 28 - * struct intel_pqr_state - State cache for the PQR MSR 29 - * @rmid: The cached Resource Monitoring ID 30 - * @closid: The cached Class Of Service ID 31 - * @rmid_usecnt: The usage counter for rmid 32 - * 33 - * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the 34 - * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always 35 - * contains both parts, so we need to cache them. 36 - * 37 - * The cache also helps to avoid pointless updates if the value does 38 - * not change. 39 - */ 40 - struct intel_pqr_state { 41 - u32 rmid; 42 - u32 closid; 43 - int rmid_usecnt; 44 - }; 45 - 46 27 /* 47 28 * The cached intel_pqr_state is strictly per CPU and can never be 48 29 * updated from a remote CPU. Both functions which modify the state 49 30 * (intel_cqm_event_start and intel_cqm_event_stop) are called with 50 31 * interrupts disabled, which is sufficient for the protection. 51 32 */ 52 - static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); 33 + DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); 53 34 static struct hrtimer *mbm_timers; 54 35 /** 55 36 * struct sample - mbm event's (local or total) data
+4
arch/x86/include/asm/cpufeatures.h
··· 189 189 190 190 #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ 191 191 #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ 192 + #define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ 193 + #define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ 194 + #define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ 192 195 193 196 #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ 194 197 #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ ··· 225 222 #define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ 226 223 #define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ 227 224 #define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ 225 + #define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ 228 226 #define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ 229 227 #define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ 230 228 #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
+224
arch/x86/include/asm/intel_rdt.h
··· 1 + #ifndef _ASM_X86_INTEL_RDT_H 2 + #define _ASM_X86_INTEL_RDT_H 3 + 4 + #ifdef CONFIG_INTEL_RDT_A 5 + 6 + #include <linux/kernfs.h> 7 + #include <linux/jump_label.h> 8 + 9 + #include <asm/intel_rdt_common.h> 10 + 11 + #define IA32_L3_QOS_CFG 0xc81 12 + #define IA32_L3_CBM_BASE 0xc90 13 + #define IA32_L2_CBM_BASE 0xd10 14 + 15 + #define L3_QOS_CDP_ENABLE 0x01ULL 16 + 17 + /** 18 + * struct rdtgroup - store rdtgroup's data in resctrl file system. 19 + * @kn: kernfs node 20 + * @rdtgroup_list: linked list for all rdtgroups 21 + * @closid: closid for this rdtgroup 22 + * @cpu_mask: CPUs assigned to this rdtgroup 23 + * @flags: status bits 24 + * @waitcount: how many cpus expect to find this 25 + * group when they acquire rdtgroup_mutex 26 + */ 27 + struct rdtgroup { 28 + struct kernfs_node *kn; 29 + struct list_head rdtgroup_list; 30 + int closid; 31 + struct cpumask cpu_mask; 32 + int flags; 33 + atomic_t waitcount; 34 + }; 35 + 36 + /* rdtgroup.flags */ 37 + #define RDT_DELETED 1 38 + 39 + /* List of all resource groups */ 40 + extern struct list_head rdt_all_groups; 41 + 42 + int __init rdtgroup_init(void); 43 + 44 + /** 45 + * struct rftype - describe each file in the resctrl file system 46 + * @name: file name 47 + * @mode: access mode 48 + * @kf_ops: operations 49 + * @seq_show: show content of the file 50 + * @write: write to the file 51 + */ 52 + struct rftype { 53 + char *name; 54 + umode_t mode; 55 + struct kernfs_ops *kf_ops; 56 + 57 + int (*seq_show)(struct kernfs_open_file *of, 58 + struct seq_file *sf, void *v); 59 + /* 60 + * write() is the generic write callback which maps directly to 61 + * kernfs write operation and overrides all other operations. 62 + * Maximum write size is determined by ->max_write_len. 63 + */ 64 + ssize_t (*write)(struct kernfs_open_file *of, 65 + char *buf, size_t nbytes, loff_t off); 66 + }; 67 + 68 + /** 69 + * struct rdt_resource - attributes of an RDT resource 70 + * @enabled: Is this feature enabled on this machine 71 + * @capable: Is this feature available on this machine 72 + * @name: Name to use in "schemata" file 73 + * @num_closid: Number of CLOSIDs available 74 + * @max_cbm: Largest Cache Bit Mask allowed 75 + * @min_cbm_bits: Minimum number of consecutive bits to be set 76 + * in a cache bit mask 77 + * @domains: All domains for this resource 78 + * @num_domains: Number of domains active 79 + * @msr_base: Base MSR address for CBMs 80 + * @tmp_cbms: Scratch space when updating schemata 81 + * @num_tmp_cbms: Number of CBMs in tmp_cbms 82 + * @cache_level: Which cache level defines scope of this domain 83 + * @cbm_idx_multi: Multiplier of CBM index 84 + * @cbm_idx_offset: Offset of CBM index. 
CBM index is computed by: 85 + * closid * cbm_idx_multi + cbm_idx_offset 86 + */ 87 + struct rdt_resource { 88 + bool enabled; 89 + bool capable; 90 + char *name; 91 + int num_closid; 92 + int cbm_len; 93 + int min_cbm_bits; 94 + u32 max_cbm; 95 + struct list_head domains; 96 + int num_domains; 97 + int msr_base; 98 + u32 *tmp_cbms; 99 + int num_tmp_cbms; 100 + int cache_level; 101 + int cbm_idx_multi; 102 + int cbm_idx_offset; 103 + }; 104 + 105 + /** 106 + * struct rdt_domain - group of cpus sharing an RDT resource 107 + * @list: all instances of this resource 108 + * @id: unique id for this instance 109 + * @cpu_mask: which cpus share this resource 110 + * @cbm: array of cache bit masks (indexed by CLOSID) 111 + */ 112 + struct rdt_domain { 113 + struct list_head list; 114 + int id; 115 + struct cpumask cpu_mask; 116 + u32 *cbm; 117 + }; 118 + 119 + /** 120 + * struct msr_param - set a range of MSRs from a domain 121 + * @res: The resource to use 122 + * @low: Beginning index from base MSR 123 + * @high: End index 124 + */ 125 + struct msr_param { 126 + struct rdt_resource *res; 127 + int low; 128 + int high; 129 + }; 130 + 131 + extern struct mutex rdtgroup_mutex; 132 + 133 + extern struct rdt_resource rdt_resources_all[]; 134 + extern struct rdtgroup rdtgroup_default; 135 + DECLARE_STATIC_KEY_FALSE(rdt_enable_key); 136 + 137 + int __init rdtgroup_init(void); 138 + 139 + enum { 140 + RDT_RESOURCE_L3, 141 + RDT_RESOURCE_L3DATA, 142 + RDT_RESOURCE_L3CODE, 143 + RDT_RESOURCE_L2, 144 + 145 + /* Must be the last */ 146 + RDT_NUM_RESOURCES, 147 + }; 148 + 149 + #define for_each_capable_rdt_resource(r) \ 150 + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ 151 + r++) \ 152 + if (r->capable) 153 + 154 + #define for_each_enabled_rdt_resource(r) \ 155 + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ 156 + r++) \ 157 + if (r->enabled) 158 + 159 + /* CPUID.(EAX=10H, ECX=ResID=1).EAX */ 160 + union cpuid_0x10_1_eax { 161 + struct { 162 + unsigned int cbm_len:5; 163 + } split; 164 + unsigned int full; 165 + }; 166 + 167 + /* CPUID.(EAX=10H, ECX=ResID=1).EDX */ 168 + union cpuid_0x10_1_edx { 169 + struct { 170 + unsigned int cos_max:16; 171 + } split; 172 + unsigned int full; 173 + }; 174 + 175 + DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid); 176 + 177 + void rdt_cbm_update(void *arg); 178 + struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); 179 + void rdtgroup_kn_unlock(struct kernfs_node *kn); 180 + ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, 181 + char *buf, size_t nbytes, loff_t off); 182 + int rdtgroup_schemata_show(struct kernfs_open_file *of, 183 + struct seq_file *s, void *v); 184 + 185 + /* 186 + * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR 187 + * 188 + * Following considerations are made so that this has minimal impact 189 + * on scheduler hot path: 190 + * - This will stay as no-op unless we are running on an Intel SKU 191 + * which supports resource control and we enable by mounting the 192 + * resctrl file system. 193 + * - Caches the per cpu CLOSid values and does the MSR write only 194 + * when a task with a different CLOSid is scheduled in. 195 + * 196 + * Must be called with preemption disabled. 197 + */ 198 + static inline void intel_rdt_sched_in(void) 199 + { 200 + if (static_branch_likely(&rdt_enable_key)) { 201 + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); 202 + int closid; 203 + 204 + /* 205 + * If this task has a closid assigned, use it. 
206 + * Else use the closid assigned to this cpu. 207 + */ 208 + closid = current->closid; 209 + if (closid == 0) 210 + closid = this_cpu_read(cpu_closid); 211 + 212 + if (closid != state->closid) { 213 + state->closid = closid; 214 + wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid); 215 + } 216 + } 217 + } 218 + 219 + #else 220 + 221 + static inline void intel_rdt_sched_in(void) {} 222 + 223 + #endif /* CONFIG_INTEL_RDT_A */ 224 + #endif /* _ASM_X86_INTEL_RDT_H */
+27
arch/x86/include/asm/intel_rdt_common.h
··· 1 + #ifndef _ASM_X86_INTEL_RDT_COMMON_H 2 + #define _ASM_X86_INTEL_RDT_COMMON_H 3 + 4 + #define MSR_IA32_PQR_ASSOC 0x0c8f 5 + 6 + /** 7 + * struct intel_pqr_state - State cache for the PQR MSR 8 + * @rmid: The cached Resource Monitoring ID 9 + * @closid: The cached Class Of Service ID 10 + * @rmid_usecnt: The usage counter for rmid 11 + * 12 + * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the 13 + * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always 14 + * contains both parts, so we need to cache them. 15 + * 16 + * The cache also helps to avoid pointless updates if the value does 17 + * not change. 18 + */ 19 + struct intel_pqr_state { 20 + u32 rmid; 21 + u32 closid; 22 + int rmid_usecnt; 23 + }; 24 + 25 + DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); 26 + 27 + #endif /* _ASM_X86_INTEL_RDT_COMMON_H */
+2
arch/x86/kernel/cpu/Makefile
··· 32 32 obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o 33 33 obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o 34 34 35 + obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o 36 + 35 37 obj-$(CONFIG_X86_MCE) += mcheck/ 36 38 obj-$(CONFIG_MTRR) += mtrr/ 37 39 obj-$(CONFIG_MICROCODE) += microcode/
+20
arch/x86/kernel/cpu/intel_cacheinfo.c
··· 153 153 union _cpuid4_leaf_eax eax; 154 154 union _cpuid4_leaf_ebx ebx; 155 155 union _cpuid4_leaf_ecx ecx; 156 + unsigned int id; 156 157 unsigned long size; 157 158 struct amd_northbridge *nb; 158 159 }; ··· 895 894 static void ci_leaf_init(struct cacheinfo *this_leaf, 896 895 struct _cpuid4_info_regs *base) 897 896 { 897 + this_leaf->id = base->id; 898 + this_leaf->attributes = CACHE_ID; 898 899 this_leaf->level = base->eax.split.level; 899 900 this_leaf->type = cache_type_map[base->eax.split.type]; 900 901 this_leaf->coherency_line_size = ··· 923 920 return 0; 924 921 } 925 922 923 + /* 924 + * The max shared threads number comes from CPUID.4:EAX[25-14] with input 925 + * ECX as cache index. Then right shift apicid by the number's order to get 926 + * cache id for this cache node. 927 + */ 928 + static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs) 929 + { 930 + struct cpuinfo_x86 *c = &cpu_data(cpu); 931 + unsigned long num_threads_sharing; 932 + int index_msb; 933 + 934 + num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing; 935 + index_msb = get_count_order(num_threads_sharing); 936 + id4_regs->id = c->apicid >> index_msb; 937 + } 938 + 926 939 static int __populate_cache_leaves(unsigned int cpu) 927 940 { 928 941 unsigned int idx, ret; ··· 950 931 ret = cpuid4_cache_lookup_regs(idx, &id4_regs); 951 932 if (ret) 952 933 return ret; 934 + get_cache_id(cpu, &id4_regs); 953 935 ci_leaf_init(this_leaf++, &id4_regs); 954 936 __cache_cpumap_setup(cpu, idx, &id4_regs); 955 937 }
+403
arch/x86/kernel/cpu/intel_rdt.c
··· 1 + /* 2 + * Resource Director Technology(RDT) 3 + * - Cache Allocation code. 4 + * 5 + * Copyright (C) 2016 Intel Corporation 6 + * 7 + * Authors: 8 + * Fenghua Yu <fenghua.yu@intel.com> 9 + * Tony Luck <tony.luck@intel.com> 10 + * Vikas Shivappa <vikas.shivappa@intel.com> 11 + * 12 + * This program is free software; you can redistribute it and/or modify it 13 + * under the terms and conditions of the GNU General Public License, 14 + * version 2, as published by the Free Software Foundation. 15 + * 16 + * This program is distributed in the hope it will be useful, but WITHOUT 17 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 18 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 19 + * more details. 20 + * 21 + * More information about RDT be found in the Intel (R) x86 Architecture 22 + * Software Developer Manual June 2016, volume 3, section 17.17. 23 + */ 24 + 25 + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 26 + 27 + #include <linux/slab.h> 28 + #include <linux/err.h> 29 + #include <linux/cacheinfo.h> 30 + #include <linux/cpuhotplug.h> 31 + 32 + #include <asm/intel-family.h> 33 + #include <asm/intel_rdt.h> 34 + 35 + /* Mutex to protect rdtgroup access. */ 36 + DEFINE_MUTEX(rdtgroup_mutex); 37 + 38 + DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); 39 + 40 + #define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) 41 + 42 + struct rdt_resource rdt_resources_all[] = { 43 + { 44 + .name = "L3", 45 + .domains = domain_init(RDT_RESOURCE_L3), 46 + .msr_base = IA32_L3_CBM_BASE, 47 + .min_cbm_bits = 1, 48 + .cache_level = 3, 49 + .cbm_idx_multi = 1, 50 + .cbm_idx_offset = 0 51 + }, 52 + { 53 + .name = "L3DATA", 54 + .domains = domain_init(RDT_RESOURCE_L3DATA), 55 + .msr_base = IA32_L3_CBM_BASE, 56 + .min_cbm_bits = 1, 57 + .cache_level = 3, 58 + .cbm_idx_multi = 2, 59 + .cbm_idx_offset = 0 60 + }, 61 + { 62 + .name = "L3CODE", 63 + .domains = domain_init(RDT_RESOURCE_L3CODE), 64 + .msr_base = IA32_L3_CBM_BASE, 65 + .min_cbm_bits = 1, 66 + .cache_level = 3, 67 + .cbm_idx_multi = 2, 68 + .cbm_idx_offset = 1 69 + }, 70 + { 71 + .name = "L2", 72 + .domains = domain_init(RDT_RESOURCE_L2), 73 + .msr_base = IA32_L2_CBM_BASE, 74 + .min_cbm_bits = 1, 75 + .cache_level = 2, 76 + .cbm_idx_multi = 1, 77 + .cbm_idx_offset = 0 78 + }, 79 + }; 80 + 81 + static int cbm_idx(struct rdt_resource *r, int closid) 82 + { 83 + return closid * r->cbm_idx_multi + r->cbm_idx_offset; 84 + } 85 + 86 + /* 87 + * cache_alloc_hsw_probe() - Have to probe for Intel haswell server CPUs 88 + * as they do not have CPUID enumeration support for Cache allocation. 89 + * The check for Vendor/Family/Model is not enough to guarantee that 90 + * the MSRs won't #GP fault because only the following SKUs support 91 + * CAT: 92 + * Intel(R) Xeon(R) CPU E5-2658 v3 @ 2.20GHz 93 + * Intel(R) Xeon(R) CPU E5-2648L v3 @ 1.80GHz 94 + * Intel(R) Xeon(R) CPU E5-2628L v3 @ 2.00GHz 95 + * Intel(R) Xeon(R) CPU E5-2618L v3 @ 2.30GHz 96 + * Intel(R) Xeon(R) CPU E5-2608L v3 @ 2.00GHz 97 + * Intel(R) Xeon(R) CPU E5-2658A v3 @ 2.20GHz 98 + * 99 + * Probe by trying to write the first of the L3 cach mask registers 100 + * and checking that the bits stick. Max CLOSids is always 4 and max cbm length 101 + * is always 20 on hsw server parts. The minimum cache bitmask length 102 + * allowed for HSW server is always 2 bits. Hardcode all of them. 
103 + */ 104 + static inline bool cache_alloc_hsw_probe(void) 105 + { 106 + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && 107 + boot_cpu_data.x86 == 6 && 108 + boot_cpu_data.x86_model == INTEL_FAM6_HASWELL_X) { 109 + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; 110 + u32 l, h, max_cbm = BIT_MASK(20) - 1; 111 + 112 + if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0)) 113 + return false; 114 + rdmsr(IA32_L3_CBM_BASE, l, h); 115 + 116 + /* If all the bits were set in MSR, return success */ 117 + if (l != max_cbm) 118 + return false; 119 + 120 + r->num_closid = 4; 121 + r->cbm_len = 20; 122 + r->max_cbm = max_cbm; 123 + r->min_cbm_bits = 2; 124 + r->capable = true; 125 + r->enabled = true; 126 + 127 + return true; 128 + } 129 + 130 + return false; 131 + } 132 + 133 + static void rdt_get_config(int idx, struct rdt_resource *r) 134 + { 135 + union cpuid_0x10_1_eax eax; 136 + union cpuid_0x10_1_edx edx; 137 + u32 ebx, ecx; 138 + 139 + cpuid_count(0x00000010, idx, &eax.full, &ebx, &ecx, &edx.full); 140 + r->num_closid = edx.split.cos_max + 1; 141 + r->cbm_len = eax.split.cbm_len + 1; 142 + r->max_cbm = BIT_MASK(eax.split.cbm_len + 1) - 1; 143 + r->capable = true; 144 + r->enabled = true; 145 + } 146 + 147 + static void rdt_get_cdp_l3_config(int type) 148 + { 149 + struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; 150 + struct rdt_resource *r = &rdt_resources_all[type]; 151 + 152 + r->num_closid = r_l3->num_closid / 2; 153 + r->cbm_len = r_l3->cbm_len; 154 + r->max_cbm = r_l3->max_cbm; 155 + r->capable = true; 156 + /* 157 + * By default, CDP is disabled. CDP can be enabled by mount parameter 158 + * "cdp" during resctrl file system mount time. 159 + */ 160 + r->enabled = false; 161 + } 162 + 163 + static inline bool get_rdt_resources(void) 164 + { 165 + bool ret = false; 166 + 167 + if (cache_alloc_hsw_probe()) 168 + return true; 169 + 170 + if (!boot_cpu_has(X86_FEATURE_RDT_A)) 171 + return false; 172 + 173 + if (boot_cpu_has(X86_FEATURE_CAT_L3)) { 174 + rdt_get_config(1, &rdt_resources_all[RDT_RESOURCE_L3]); 175 + if (boot_cpu_has(X86_FEATURE_CDP_L3)) { 176 + rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); 177 + rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); 178 + } 179 + ret = true; 180 + } 181 + if (boot_cpu_has(X86_FEATURE_CAT_L2)) { 182 + /* CPUID 0x10.2 fields are same format at 0x10.1 */ 183 + rdt_get_config(2, &rdt_resources_all[RDT_RESOURCE_L2]); 184 + ret = true; 185 + } 186 + 187 + return ret; 188 + } 189 + 190 + static int get_cache_id(int cpu, int level) 191 + { 192 + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); 193 + int i; 194 + 195 + for (i = 0; i < ci->num_leaves; i++) { 196 + if (ci->info_list[i].level == level) 197 + return ci->info_list[i].id; 198 + } 199 + 200 + return -1; 201 + } 202 + 203 + void rdt_cbm_update(void *arg) 204 + { 205 + struct msr_param *m = (struct msr_param *)arg; 206 + struct rdt_resource *r = m->res; 207 + int i, cpu = smp_processor_id(); 208 + struct rdt_domain *d; 209 + 210 + list_for_each_entry(d, &r->domains, list) { 211 + /* Find the domain that contains this CPU */ 212 + if (cpumask_test_cpu(cpu, &d->cpu_mask)) 213 + goto found; 214 + } 215 + pr_info_once("cpu %d not found in any domain for resource %s\n", 216 + cpu, r->name); 217 + 218 + return; 219 + 220 + found: 221 + for (i = m->low; i < m->high; i++) { 222 + int idx = cbm_idx(r, i); 223 + 224 + wrmsrl(r->msr_base + idx, d->cbm[i]); 225 + } 226 + } 227 + 228 + /* 229 + * rdt_find_domain - Find a domain in a resource that matches input resource id 230 + * 231 + * 
Search resource r's domain list to find the resource id. If the resource 232 + * id is found in a domain, return the domain. Otherwise, if requested by 233 + * caller, return the first domain whose id is bigger than the input id. 234 + * The domain list is sorted by id in ascending order. 235 + */ 236 + static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, 237 + struct list_head **pos) 238 + { 239 + struct rdt_domain *d; 240 + struct list_head *l; 241 + 242 + if (id < 0) 243 + return ERR_PTR(id); 244 + 245 + list_for_each(l, &r->domains) { 246 + d = list_entry(l, struct rdt_domain, list); 247 + /* When id is found, return its domain. */ 248 + if (id == d->id) 249 + return d; 250 + /* Stop searching when finding id's position in sorted list. */ 251 + if (id < d->id) 252 + break; 253 + } 254 + 255 + if (pos) 256 + *pos = l; 257 + 258 + return NULL; 259 + } 260 + 261 + /* 262 + * domain_add_cpu - Add a cpu to a resource's domain list. 263 + * 264 + * If an existing domain in the resource r's domain list matches the cpu's 265 + * resource id, add the cpu in the domain. 266 + * 267 + * Otherwise, a new domain is allocated and inserted into the right position 268 + * in the domain list sorted by id in ascending order. 269 + * 270 + * The order in the domain list is visible to users when we print entries 271 + * in the schemata file and schemata input is validated to have the same order 272 + * as this list. 273 + */ 274 + static void domain_add_cpu(int cpu, struct rdt_resource *r) 275 + { 276 + int i, id = get_cache_id(cpu, r->cache_level); 277 + struct list_head *add_pos = NULL; 278 + struct rdt_domain *d; 279 + 280 + d = rdt_find_domain(r, id, &add_pos); 281 + if (IS_ERR(d)) { 282 + pr_warn("Could't find cache id for cpu %d\n", cpu); 283 + return; 284 + } 285 + 286 + if (d) { 287 + cpumask_set_cpu(cpu, &d->cpu_mask); 288 + return; 289 + } 290 + 291 + d = kzalloc_node(sizeof(*d), GFP_KERNEL, cpu_to_node(cpu)); 292 + if (!d) 293 + return; 294 + 295 + d->id = id; 296 + 297 + d->cbm = kmalloc_array(r->num_closid, sizeof(*d->cbm), GFP_KERNEL); 298 + if (!d->cbm) { 299 + kfree(d); 300 + return; 301 + } 302 + 303 + for (i = 0; i < r->num_closid; i++) { 304 + int idx = cbm_idx(r, i); 305 + 306 + d->cbm[i] = r->max_cbm; 307 + wrmsrl(r->msr_base + idx, d->cbm[i]); 308 + } 309 + 310 + cpumask_set_cpu(cpu, &d->cpu_mask); 311 + list_add_tail(&d->list, add_pos); 312 + r->num_domains++; 313 + } 314 + 315 + static void domain_remove_cpu(int cpu, struct rdt_resource *r) 316 + { 317 + int id = get_cache_id(cpu, r->cache_level); 318 + struct rdt_domain *d; 319 + 320 + d = rdt_find_domain(r, id, NULL); 321 + if (IS_ERR_OR_NULL(d)) { 322 + pr_warn("Could't find cache id for cpu %d\n", cpu); 323 + return; 324 + } 325 + 326 + cpumask_clear_cpu(cpu, &d->cpu_mask); 327 + if (cpumask_empty(&d->cpu_mask)) { 328 + r->num_domains--; 329 + kfree(d->cbm); 330 + list_del(&d->list); 331 + kfree(d); 332 + } 333 + } 334 + 335 + static void clear_closid(int cpu) 336 + { 337 + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); 338 + 339 + per_cpu(cpu_closid, cpu) = 0; 340 + state->closid = 0; 341 + wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0); 342 + } 343 + 344 + static int intel_rdt_online_cpu(unsigned int cpu) 345 + { 346 + struct rdt_resource *r; 347 + 348 + mutex_lock(&rdtgroup_mutex); 349 + for_each_capable_rdt_resource(r) 350 + domain_add_cpu(cpu, r); 351 + /* The cpu is set in default rdtgroup after online. 
*/ 352 + cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); 353 + clear_closid(cpu); 354 + mutex_unlock(&rdtgroup_mutex); 355 + 356 + return 0; 357 + } 358 + 359 + static int intel_rdt_offline_cpu(unsigned int cpu) 360 + { 361 + struct rdtgroup *rdtgrp; 362 + struct rdt_resource *r; 363 + 364 + mutex_lock(&rdtgroup_mutex); 365 + for_each_capable_rdt_resource(r) 366 + domain_remove_cpu(cpu, r); 367 + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { 368 + if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) 369 + break; 370 + } 371 + clear_closid(cpu); 372 + mutex_unlock(&rdtgroup_mutex); 373 + 374 + return 0; 375 + } 376 + 377 + static int __init intel_rdt_late_init(void) 378 + { 379 + struct rdt_resource *r; 380 + int state, ret; 381 + 382 + if (!get_rdt_resources()) 383 + return -ENODEV; 384 + 385 + state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, 386 + "x86/rdt/cat:online:", 387 + intel_rdt_online_cpu, intel_rdt_offline_cpu); 388 + if (state < 0) 389 + return state; 390 + 391 + ret = rdtgroup_init(); 392 + if (ret) { 393 + cpuhp_remove_state(state); 394 + return ret; 395 + } 396 + 397 + for_each_capable_rdt_resource(r) 398 + pr_info("Intel RDT %s allocation detected\n", r->name); 399 + 400 + return 0; 401 + } 402 + 403 + late_initcall(intel_rdt_late_init);
+1115
arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
··· 1 + /* 2 + * User interface for Resource Alloction in Resource Director Technology(RDT) 3 + * 4 + * Copyright (C) 2016 Intel Corporation 5 + * 6 + * Author: Fenghua Yu <fenghua.yu@intel.com> 7 + * 8 + * This program is free software; you can redistribute it and/or modify it 9 + * under the terms and conditions of the GNU General Public License, 10 + * version 2, as published by the Free Software Foundation. 11 + * 12 + * This program is distributed in the hope it will be useful, but WITHOUT 13 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 15 + * more details. 16 + * 17 + * More information about RDT be found in the Intel (R) x86 Architecture 18 + * Software Developer Manual. 19 + */ 20 + 21 + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 22 + 23 + #include <linux/cpu.h> 24 + #include <linux/fs.h> 25 + #include <linux/sysfs.h> 26 + #include <linux/kernfs.h> 27 + #include <linux/seq_file.h> 28 + #include <linux/sched.h> 29 + #include <linux/slab.h> 30 + #include <linux/cpu.h> 31 + #include <linux/task_work.h> 32 + 33 + #include <uapi/linux/magic.h> 34 + 35 + #include <asm/intel_rdt.h> 36 + #include <asm/intel_rdt_common.h> 37 + 38 + DEFINE_STATIC_KEY_FALSE(rdt_enable_key); 39 + struct kernfs_root *rdt_root; 40 + struct rdtgroup rdtgroup_default; 41 + LIST_HEAD(rdt_all_groups); 42 + 43 + /* Kernel fs node for "info" directory under root */ 44 + static struct kernfs_node *kn_info; 45 + 46 + /* 47 + * Trivial allocator for CLOSIDs. Since h/w only supports a small number, 48 + * we can keep a bitmap of free CLOSIDs in a single integer. 49 + * 50 + * Using a global CLOSID across all resources has some advantages and 51 + * some drawbacks: 52 + * + We can simply set "current->closid" to assign a task to a resource 53 + * group. 54 + * + Context switch code can avoid extra memory references deciding which 55 + * CLOSID to load into the PQR_ASSOC MSR 56 + * - We give up some options in configuring resource groups across multi-socket 57 + * systems. 58 + * - Our choices on how to configure each resource become progressively more 59 + * limited as the number of resources grows. 
60 + */ 61 + static int closid_free_map; 62 + 63 + static void closid_init(void) 64 + { 65 + struct rdt_resource *r; 66 + int rdt_min_closid = 32; 67 + 68 + /* Compute rdt_min_closid across all resources */ 69 + for_each_enabled_rdt_resource(r) 70 + rdt_min_closid = min(rdt_min_closid, r->num_closid); 71 + 72 + closid_free_map = BIT_MASK(rdt_min_closid) - 1; 73 + 74 + /* CLOSID 0 is always reserved for the default group */ 75 + closid_free_map &= ~1; 76 + } 77 + 78 + int closid_alloc(void) 79 + { 80 + int closid = ffs(closid_free_map); 81 + 82 + if (closid == 0) 83 + return -ENOSPC; 84 + closid--; 85 + closid_free_map &= ~(1 << closid); 86 + 87 + return closid; 88 + } 89 + 90 + static void closid_free(int closid) 91 + { 92 + closid_free_map |= 1 << closid; 93 + } 94 + 95 + /* set uid and gid of rdtgroup dirs and files to that of the creator */ 96 + static int rdtgroup_kn_set_ugid(struct kernfs_node *kn) 97 + { 98 + struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, 99 + .ia_uid = current_fsuid(), 100 + .ia_gid = current_fsgid(), }; 101 + 102 + if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && 103 + gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) 104 + return 0; 105 + 106 + return kernfs_setattr(kn, &iattr); 107 + } 108 + 109 + static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) 110 + { 111 + struct kernfs_node *kn; 112 + int ret; 113 + 114 + kn = __kernfs_create_file(parent_kn, rft->name, rft->mode, 115 + 0, rft->kf_ops, rft, NULL, NULL); 116 + if (IS_ERR(kn)) 117 + return PTR_ERR(kn); 118 + 119 + ret = rdtgroup_kn_set_ugid(kn); 120 + if (ret) { 121 + kernfs_remove(kn); 122 + return ret; 123 + } 124 + 125 + return 0; 126 + } 127 + 128 + static int rdtgroup_add_files(struct kernfs_node *kn, struct rftype *rfts, 129 + int len) 130 + { 131 + struct rftype *rft; 132 + int ret; 133 + 134 + lockdep_assert_held(&rdtgroup_mutex); 135 + 136 + for (rft = rfts; rft < rfts + len; rft++) { 137 + ret = rdtgroup_add_file(kn, rft); 138 + if (ret) 139 + goto error; 140 + } 141 + 142 + return 0; 143 + error: 144 + pr_warn("Failed to add %s, err=%d\n", rft->name, ret); 145 + while (--rft >= rfts) 146 + kernfs_remove_by_name(kn, rft->name); 147 + return ret; 148 + } 149 + 150 + static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) 151 + { 152 + struct kernfs_open_file *of = m->private; 153 + struct rftype *rft = of->kn->priv; 154 + 155 + if (rft->seq_show) 156 + return rft->seq_show(of, m, arg); 157 + return 0; 158 + } 159 + 160 + static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf, 161 + size_t nbytes, loff_t off) 162 + { 163 + struct rftype *rft = of->kn->priv; 164 + 165 + if (rft->write) 166 + return rft->write(of, buf, nbytes, off); 167 + 168 + return -EINVAL; 169 + } 170 + 171 + static struct kernfs_ops rdtgroup_kf_single_ops = { 172 + .atomic_write_len = PAGE_SIZE, 173 + .write = rdtgroup_file_write, 174 + .seq_show = rdtgroup_seqfile_show, 175 + }; 176 + 177 + static int rdtgroup_cpus_show(struct kernfs_open_file *of, 178 + struct seq_file *s, void *v) 179 + { 180 + struct rdtgroup *rdtgrp; 181 + int ret = 0; 182 + 183 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 184 + 185 + if (rdtgrp) 186 + seq_printf(s, "%*pb\n", cpumask_pr_args(&rdtgrp->cpu_mask)); 187 + else 188 + ret = -ENOENT; 189 + rdtgroup_kn_unlock(of->kn); 190 + 191 + return ret; 192 + } 193 + 194 + /* 195 + * This is safe against intel_rdt_sched_in() called from __switch_to() 196 + * because __switch_to() is executed with interrupts disabled. 
A local call 197 + * from rdt_update_closid() is proteced against __switch_to() because 198 + * preemption is disabled. 199 + */ 200 + static void rdt_update_cpu_closid(void *closid) 201 + { 202 + if (closid) 203 + this_cpu_write(cpu_closid, *(int *)closid); 204 + /* 205 + * We cannot unconditionally write the MSR because the current 206 + * executing task might have its own closid selected. Just reuse 207 + * the context switch code. 208 + */ 209 + intel_rdt_sched_in(); 210 + } 211 + 212 + /* 213 + * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, 214 + * 215 + * Per task closids must have been set up before calling this function. 216 + * 217 + * The per cpu closids are updated with the smp function call, when @closid 218 + * is not NULL. If @closid is NULL then all affected percpu closids must 219 + * have been set up before calling this function. 220 + */ 221 + static void 222 + rdt_update_closid(const struct cpumask *cpu_mask, int *closid) 223 + { 224 + int cpu = get_cpu(); 225 + 226 + if (cpumask_test_cpu(cpu, cpu_mask)) 227 + rdt_update_cpu_closid(closid); 228 + smp_call_function_many(cpu_mask, rdt_update_cpu_closid, closid, 1); 229 + put_cpu(); 230 + } 231 + 232 + static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, 233 + char *buf, size_t nbytes, loff_t off) 234 + { 235 + cpumask_var_t tmpmask, newmask; 236 + struct rdtgroup *rdtgrp, *r; 237 + int ret; 238 + 239 + if (!buf) 240 + return -EINVAL; 241 + 242 + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) 243 + return -ENOMEM; 244 + if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) { 245 + free_cpumask_var(tmpmask); 246 + return -ENOMEM; 247 + } 248 + 249 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 250 + if (!rdtgrp) { 251 + ret = -ENOENT; 252 + goto unlock; 253 + } 254 + 255 + ret = cpumask_parse(buf, newmask); 256 + if (ret) 257 + goto unlock; 258 + 259 + /* check that user didn't specify any offline cpus */ 260 + cpumask_andnot(tmpmask, newmask, cpu_online_mask); 261 + if (cpumask_weight(tmpmask)) { 262 + ret = -EINVAL; 263 + goto unlock; 264 + } 265 + 266 + /* Check whether cpus are dropped from this group */ 267 + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); 268 + if (cpumask_weight(tmpmask)) { 269 + /* Can't drop from default group */ 270 + if (rdtgrp == &rdtgroup_default) { 271 + ret = -EINVAL; 272 + goto unlock; 273 + } 274 + /* Give any dropped cpus to rdtgroup_default */ 275 + cpumask_or(&rdtgroup_default.cpu_mask, 276 + &rdtgroup_default.cpu_mask, tmpmask); 277 + rdt_update_closid(tmpmask, &rdtgroup_default.closid); 278 + } 279 + 280 + /* 281 + * If we added cpus, remove them from previous group that owned them 282 + * and update per-cpu closid 283 + */ 284 + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); 285 + if (cpumask_weight(tmpmask)) { 286 + list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { 287 + if (r == rdtgrp) 288 + continue; 289 + cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask); 290 + } 291 + rdt_update_closid(tmpmask, &rdtgrp->closid); 292 + } 293 + 294 + /* Done pushing/pulling - update this group with new mask */ 295 + cpumask_copy(&rdtgrp->cpu_mask, newmask); 296 + 297 + unlock: 298 + rdtgroup_kn_unlock(of->kn); 299 + free_cpumask_var(tmpmask); 300 + free_cpumask_var(newmask); 301 + 302 + return ret ?: nbytes; 303 + } 304 + 305 + struct task_move_callback { 306 + struct callback_head work; 307 + struct rdtgroup *rdtgrp; 308 + }; 309 + 310 + static void move_myself(struct callback_head *head) 311 + { 312 + struct task_move_callback *callback; 313 + struct rdtgroup *rdtgrp; 314 
+ 315 + callback = container_of(head, struct task_move_callback, work); 316 + rdtgrp = callback->rdtgrp; 317 + 318 + /* 319 + * If resource group was deleted before this task work callback 320 + * was invoked, then assign the task to root group and free the 321 + * resource group. 322 + */ 323 + if (atomic_dec_and_test(&rdtgrp->waitcount) && 324 + (rdtgrp->flags & RDT_DELETED)) { 325 + current->closid = 0; 326 + kfree(rdtgrp); 327 + } 328 + 329 + preempt_disable(); 330 + /* update PQR_ASSOC MSR to make resource group go into effect */ 331 + intel_rdt_sched_in(); 332 + preempt_enable(); 333 + 334 + kfree(callback); 335 + } 336 + 337 + static int __rdtgroup_move_task(struct task_struct *tsk, 338 + struct rdtgroup *rdtgrp) 339 + { 340 + struct task_move_callback *callback; 341 + int ret; 342 + 343 + callback = kzalloc(sizeof(*callback), GFP_KERNEL); 344 + if (!callback) 345 + return -ENOMEM; 346 + callback->work.func = move_myself; 347 + callback->rdtgrp = rdtgrp; 348 + 349 + /* 350 + * Take a refcount, so rdtgrp cannot be freed before the 351 + * callback has been invoked. 352 + */ 353 + atomic_inc(&rdtgrp->waitcount); 354 + ret = task_work_add(tsk, &callback->work, true); 355 + if (ret) { 356 + /* 357 + * Task is exiting. Drop the refcount and free the callback. 358 + * No need to check the refcount as the group cannot be 359 + * deleted before the write function unlocks rdtgroup_mutex. 360 + */ 361 + atomic_dec(&rdtgrp->waitcount); 362 + kfree(callback); 363 + } else { 364 + tsk->closid = rdtgrp->closid; 365 + } 366 + return ret; 367 + } 368 + 369 + static int rdtgroup_task_write_permission(struct task_struct *task, 370 + struct kernfs_open_file *of) 371 + { 372 + const struct cred *tcred = get_task_cred(task); 373 + const struct cred *cred = current_cred(); 374 + int ret = 0; 375 + 376 + /* 377 + * Even if we're attaching all tasks in the thread group, we only 378 + * need to check permissions on one of them. 
379 + */ 380 + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && 381 + !uid_eq(cred->euid, tcred->uid) && 382 + !uid_eq(cred->euid, tcred->suid)) 383 + ret = -EPERM; 384 + 385 + put_cred(tcred); 386 + return ret; 387 + } 388 + 389 + static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp, 390 + struct kernfs_open_file *of) 391 + { 392 + struct task_struct *tsk; 393 + int ret; 394 + 395 + rcu_read_lock(); 396 + if (pid) { 397 + tsk = find_task_by_vpid(pid); 398 + if (!tsk) { 399 + rcu_read_unlock(); 400 + return -ESRCH; 401 + } 402 + } else { 403 + tsk = current; 404 + } 405 + 406 + get_task_struct(tsk); 407 + rcu_read_unlock(); 408 + 409 + ret = rdtgroup_task_write_permission(tsk, of); 410 + if (!ret) 411 + ret = __rdtgroup_move_task(tsk, rdtgrp); 412 + 413 + put_task_struct(tsk); 414 + return ret; 415 + } 416 + 417 + static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, 418 + char *buf, size_t nbytes, loff_t off) 419 + { 420 + struct rdtgroup *rdtgrp; 421 + int ret = 0; 422 + pid_t pid; 423 + 424 + if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) 425 + return -EINVAL; 426 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 427 + 428 + if (rdtgrp) 429 + ret = rdtgroup_move_task(pid, rdtgrp, of); 430 + else 431 + ret = -ENOENT; 432 + 433 + rdtgroup_kn_unlock(of->kn); 434 + 435 + return ret ?: nbytes; 436 + } 437 + 438 + static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) 439 + { 440 + struct task_struct *p, *t; 441 + 442 + rcu_read_lock(); 443 + for_each_process_thread(p, t) { 444 + if (t->closid == r->closid) 445 + seq_printf(s, "%d\n", t->pid); 446 + } 447 + rcu_read_unlock(); 448 + } 449 + 450 + static int rdtgroup_tasks_show(struct kernfs_open_file *of, 451 + struct seq_file *s, void *v) 452 + { 453 + struct rdtgroup *rdtgrp; 454 + int ret = 0; 455 + 456 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 457 + if (rdtgrp) 458 + show_rdt_tasks(rdtgrp, s); 459 + else 460 + ret = -ENOENT; 461 + rdtgroup_kn_unlock(of->kn); 462 + 463 + return ret; 464 + } 465 + 466 + /* Files in each rdtgroup */ 467 + static struct rftype rdtgroup_base_files[] = { 468 + { 469 + .name = "cpus", 470 + .mode = 0644, 471 + .kf_ops = &rdtgroup_kf_single_ops, 472 + .write = rdtgroup_cpus_write, 473 + .seq_show = rdtgroup_cpus_show, 474 + }, 475 + { 476 + .name = "tasks", 477 + .mode = 0644, 478 + .kf_ops = &rdtgroup_kf_single_ops, 479 + .write = rdtgroup_tasks_write, 480 + .seq_show = rdtgroup_tasks_show, 481 + }, 482 + { 483 + .name = "schemata", 484 + .mode = 0644, 485 + .kf_ops = &rdtgroup_kf_single_ops, 486 + .write = rdtgroup_schemata_write, 487 + .seq_show = rdtgroup_schemata_show, 488 + }, 489 + }; 490 + 491 + static int rdt_num_closids_show(struct kernfs_open_file *of, 492 + struct seq_file *seq, void *v) 493 + { 494 + struct rdt_resource *r = of->kn->parent->priv; 495 + 496 + seq_printf(seq, "%d\n", r->num_closid); 497 + 498 + return 0; 499 + } 500 + 501 + static int rdt_cbm_mask_show(struct kernfs_open_file *of, 502 + struct seq_file *seq, void *v) 503 + { 504 + struct rdt_resource *r = of->kn->parent->priv; 505 + 506 + seq_printf(seq, "%x\n", r->max_cbm); 507 + 508 + return 0; 509 + } 510 + 511 + static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, 512 + struct seq_file *seq, void *v) 513 + { 514 + struct rdt_resource *r = of->kn->parent->priv; 515 + 516 + seq_printf(seq, "%d\n", r->min_cbm_bits); 517 + 518 + return 0; 519 + } 520 + 521 + /* rdtgroup information files for one cache resource. 
*/ 522 + static struct rftype res_info_files[] = { 523 + { 524 + .name = "num_closids", 525 + .mode = 0444, 526 + .kf_ops = &rdtgroup_kf_single_ops, 527 + .seq_show = rdt_num_closids_show, 528 + }, 529 + { 530 + .name = "cbm_mask", 531 + .mode = 0444, 532 + .kf_ops = &rdtgroup_kf_single_ops, 533 + .seq_show = rdt_cbm_mask_show, 534 + }, 535 + { 536 + .name = "min_cbm_bits", 537 + .mode = 0444, 538 + .kf_ops = &rdtgroup_kf_single_ops, 539 + .seq_show = rdt_min_cbm_bits_show, 540 + }, 541 + }; 542 + 543 + static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) 544 + { 545 + struct kernfs_node *kn_subdir; 546 + struct rdt_resource *r; 547 + int ret; 548 + 549 + /* create the directory */ 550 + kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); 551 + if (IS_ERR(kn_info)) 552 + return PTR_ERR(kn_info); 553 + kernfs_get(kn_info); 554 + 555 + for_each_enabled_rdt_resource(r) { 556 + kn_subdir = kernfs_create_dir(kn_info, r->name, 557 + kn_info->mode, r); 558 + if (IS_ERR(kn_subdir)) { 559 + ret = PTR_ERR(kn_subdir); 560 + goto out_destroy; 561 + } 562 + kernfs_get(kn_subdir); 563 + ret = rdtgroup_kn_set_ugid(kn_subdir); 564 + if (ret) 565 + goto out_destroy; 566 + ret = rdtgroup_add_files(kn_subdir, res_info_files, 567 + ARRAY_SIZE(res_info_files)); 568 + if (ret) 569 + goto out_destroy; 570 + kernfs_activate(kn_subdir); 571 + } 572 + 573 + /* 574 + * This extra ref will be put in kernfs_remove() and guarantees 575 + * that @rdtgrp->kn is always accessible. 576 + */ 577 + kernfs_get(kn_info); 578 + 579 + ret = rdtgroup_kn_set_ugid(kn_info); 580 + if (ret) 581 + goto out_destroy; 582 + 583 + kernfs_activate(kn_info); 584 + 585 + return 0; 586 + 587 + out_destroy: 588 + kernfs_remove(kn_info); 589 + return ret; 590 + } 591 + 592 + static void l3_qos_cfg_update(void *arg) 593 + { 594 + bool *enable = arg; 595 + 596 + wrmsrl(IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL); 597 + } 598 + 599 + static int set_l3_qos_cfg(struct rdt_resource *r, bool enable) 600 + { 601 + cpumask_var_t cpu_mask; 602 + struct rdt_domain *d; 603 + int cpu; 604 + 605 + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) 606 + return -ENOMEM; 607 + 608 + list_for_each_entry(d, &r->domains, list) { 609 + /* Pick one CPU from each domain instance to update MSR */ 610 + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); 611 + } 612 + cpu = get_cpu(); 613 + /* Update QOS_CFG MSR on this cpu if it's in cpu_mask. */ 614 + if (cpumask_test_cpu(cpu, cpu_mask)) 615 + l3_qos_cfg_update(&enable); 616 + /* Update QOS_CFG MSR on all other cpus in cpu_mask. 
*/ 617 + smp_call_function_many(cpu_mask, l3_qos_cfg_update, &enable, 1); 618 + put_cpu(); 619 + 620 + free_cpumask_var(cpu_mask); 621 + 622 + return 0; 623 + } 624 + 625 + static int cdp_enable(void) 626 + { 627 + struct rdt_resource *r_l3data = &rdt_resources_all[RDT_RESOURCE_L3DATA]; 628 + struct rdt_resource *r_l3code = &rdt_resources_all[RDT_RESOURCE_L3CODE]; 629 + struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; 630 + int ret; 631 + 632 + if (!r_l3->capable || !r_l3data->capable || !r_l3code->capable) 633 + return -EINVAL; 634 + 635 + ret = set_l3_qos_cfg(r_l3, true); 636 + if (!ret) { 637 + r_l3->enabled = false; 638 + r_l3data->enabled = true; 639 + r_l3code->enabled = true; 640 + } 641 + return ret; 642 + } 643 + 644 + static void cdp_disable(void) 645 + { 646 + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; 647 + 648 + r->enabled = r->capable; 649 + 650 + if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) { 651 + rdt_resources_all[RDT_RESOURCE_L3DATA].enabled = false; 652 + rdt_resources_all[RDT_RESOURCE_L3CODE].enabled = false; 653 + set_l3_qos_cfg(r, false); 654 + } 655 + } 656 + 657 + static int parse_rdtgroupfs_options(char *data) 658 + { 659 + char *token, *o = data; 660 + int ret = 0; 661 + 662 + while ((token = strsep(&o, ",")) != NULL) { 663 + if (!*token) 664 + return -EINVAL; 665 + 666 + if (!strcmp(token, "cdp")) 667 + ret = cdp_enable(); 668 + } 669 + 670 + return ret; 671 + } 672 + 673 + /* 674 + * We don't allow rdtgroup directories to be created anywhere 675 + * except the root directory. Thus when looking for the rdtgroup 676 + * structure for a kernfs node we are either looking at a directory, 677 + * in which case the rdtgroup structure is pointed at by the "priv" 678 + * field, otherwise we have a file, and need only look to the parent 679 + * to find the rdtgroup. 680 + */ 681 + static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn) 682 + { 683 + if (kernfs_type(kn) == KERNFS_DIR) { 684 + /* 685 + * All the resource directories use "kn->priv" 686 + * to point to the "struct rdtgroup" for the 687 + * resource. "info" and its subdirectories don't 688 + * have rdtgroup structures, so return NULL here. 689 + */ 690 + if (kn == kn_info || kn->parent == kn_info) 691 + return NULL; 692 + else 693 + return kn->priv; 694 + } else { 695 + return kn->parent->priv; 696 + } 697 + } 698 + 699 + struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) 700 + { 701 + struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); 702 + 703 + if (!rdtgrp) 704 + return NULL; 705 + 706 + atomic_inc(&rdtgrp->waitcount); 707 + kernfs_break_active_protection(kn); 708 + 709 + mutex_lock(&rdtgroup_mutex); 710 + 711 + /* Was this group deleted while we waited? 
*/ 712 + if (rdtgrp->flags & RDT_DELETED) 713 + return NULL; 714 + 715 + return rdtgrp; 716 + } 717 + 718 + void rdtgroup_kn_unlock(struct kernfs_node *kn) 719 + { 720 + struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); 721 + 722 + if (!rdtgrp) 723 + return; 724 + 725 + mutex_unlock(&rdtgroup_mutex); 726 + 727 + if (atomic_dec_and_test(&rdtgrp->waitcount) && 728 + (rdtgrp->flags & RDT_DELETED)) { 729 + kernfs_unbreak_active_protection(kn); 730 + kernfs_put(kn); 731 + kfree(rdtgrp); 732 + } else { 733 + kernfs_unbreak_active_protection(kn); 734 + } 735 + } 736 + 737 + static struct dentry *rdt_mount(struct file_system_type *fs_type, 738 + int flags, const char *unused_dev_name, 739 + void *data) 740 + { 741 + struct dentry *dentry; 742 + int ret; 743 + 744 + mutex_lock(&rdtgroup_mutex); 745 + /* 746 + * resctrl file system can only be mounted once. 747 + */ 748 + if (static_branch_unlikely(&rdt_enable_key)) { 749 + dentry = ERR_PTR(-EBUSY); 750 + goto out; 751 + } 752 + 753 + ret = parse_rdtgroupfs_options(data); 754 + if (ret) { 755 + dentry = ERR_PTR(ret); 756 + goto out_cdp; 757 + } 758 + 759 + closid_init(); 760 + 761 + ret = rdtgroup_create_info_dir(rdtgroup_default.kn); 762 + if (ret) { 763 + dentry = ERR_PTR(ret); 764 + goto out_cdp; 765 + } 766 + 767 + dentry = kernfs_mount(fs_type, flags, rdt_root, 768 + RDTGROUP_SUPER_MAGIC, NULL); 769 + if (IS_ERR(dentry)) 770 + goto out_cdp; 771 + 772 + static_branch_enable(&rdt_enable_key); 773 + goto out; 774 + 775 + out_cdp: 776 + cdp_disable(); 777 + out: 778 + mutex_unlock(&rdtgroup_mutex); 779 + 780 + return dentry; 781 + } 782 + 783 + static int reset_all_cbms(struct rdt_resource *r) 784 + { 785 + struct msr_param msr_param; 786 + cpumask_var_t cpu_mask; 787 + struct rdt_domain *d; 788 + int i, cpu; 789 + 790 + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) 791 + return -ENOMEM; 792 + 793 + msr_param.res = r; 794 + msr_param.low = 0; 795 + msr_param.high = r->num_closid; 796 + 797 + /* 798 + * Disable resource control for this resource by setting all 799 + * CBMs in all domains to the maximum mask value. Pick one CPU 800 + * from each domain to update the MSRs below. 801 + */ 802 + list_for_each_entry(d, &r->domains, list) { 803 + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); 804 + 805 + for (i = 0; i < r->num_closid; i++) 806 + d->cbm[i] = r->max_cbm; 807 + } 808 + cpu = get_cpu(); 809 + /* Update CBM on this cpu if it's in cpu_mask. */ 810 + if (cpumask_test_cpu(cpu, cpu_mask)) 811 + rdt_cbm_update(&msr_param); 812 + /* Update CBM on all other cpus in cpu_mask. */ 813 + smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1); 814 + put_cpu(); 815 + 816 + free_cpumask_var(cpu_mask); 817 + 818 + return 0; 819 + } 820 + 821 + /* 822 + * Move tasks from one to the other group. If @from is NULL, then all tasks 823 + * in the systems are moved unconditionally (used for teardown). 824 + * 825 + * If @mask is not NULL the cpus on which moved tasks are running are set 826 + * in that mask so the update smp function call is restricted to affected 827 + * cpus. 
828 + */ 829 + static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, 830 + struct cpumask *mask) 831 + { 832 + struct task_struct *p, *t; 833 + 834 + read_lock(&tasklist_lock); 835 + for_each_process_thread(p, t) { 836 + if (!from || t->closid == from->closid) { 837 + t->closid = to->closid; 838 + #ifdef CONFIG_SMP 839 + /* 840 + * This is safe on x86 w/o barriers as the ordering 841 + * of writing to task_cpu() and t->on_cpu is 842 + * reverse to the reading here. The detection is 843 + * inaccurate as tasks might move or schedule 844 + * before the smp function call takes place. In 845 + * such a case the function call is pointless, but 846 + * there is no other side effect. 847 + */ 848 + if (mask && t->on_cpu) 849 + cpumask_set_cpu(task_cpu(t), mask); 850 + #endif 851 + } 852 + } 853 + read_unlock(&tasklist_lock); 854 + } 855 + 856 + /* 857 + * Forcibly remove all of subdirectories under root. 858 + */ 859 + static void rmdir_all_sub(void) 860 + { 861 + struct rdtgroup *rdtgrp, *tmp; 862 + 863 + /* Move all tasks to the default resource group */ 864 + rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); 865 + 866 + list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { 867 + /* Remove each rdtgroup other than root */ 868 + if (rdtgrp == &rdtgroup_default) 869 + continue; 870 + 871 + /* 872 + * Give any CPUs back to the default group. We cannot copy 873 + * cpu_online_mask because a CPU might have executed the 874 + * offline callback already, but is still marked online. 875 + */ 876 + cpumask_or(&rdtgroup_default.cpu_mask, 877 + &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); 878 + 879 + kernfs_remove(rdtgrp->kn); 880 + list_del(&rdtgrp->rdtgroup_list); 881 + kfree(rdtgrp); 882 + } 883 + /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ 884 + get_online_cpus(); 885 + rdt_update_closid(cpu_online_mask, &rdtgroup_default.closid); 886 + put_online_cpus(); 887 + 888 + kernfs_remove(kn_info); 889 + } 890 + 891 + static void rdt_kill_sb(struct super_block *sb) 892 + { 893 + struct rdt_resource *r; 894 + 895 + mutex_lock(&rdtgroup_mutex); 896 + 897 + /*Put everything back to default values. */ 898 + for_each_enabled_rdt_resource(r) 899 + reset_all_cbms(r); 900 + cdp_disable(); 901 + rmdir_all_sub(); 902 + static_branch_disable(&rdt_enable_key); 903 + kernfs_kill_sb(sb); 904 + mutex_unlock(&rdtgroup_mutex); 905 + } 906 + 907 + static struct file_system_type rdt_fs_type = { 908 + .name = "resctrl", 909 + .mount = rdt_mount, 910 + .kill_sb = rdt_kill_sb, 911 + }; 912 + 913 + static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, 914 + umode_t mode) 915 + { 916 + struct rdtgroup *parent, *rdtgrp; 917 + struct kernfs_node *kn; 918 + int ret, closid; 919 + 920 + /* Only allow mkdir in the root directory */ 921 + if (parent_kn != rdtgroup_default.kn) 922 + return -EPERM; 923 + 924 + /* Do not accept '\n' to avoid unparsable situation. */ 925 + if (strchr(name, '\n')) 926 + return -EINVAL; 927 + 928 + parent = rdtgroup_kn_lock_live(parent_kn); 929 + if (!parent) { 930 + ret = -ENODEV; 931 + goto out_unlock; 932 + } 933 + 934 + ret = closid_alloc(); 935 + if (ret < 0) 936 + goto out_unlock; 937 + closid = ret; 938 + 939 + /* allocate the rdtgroup. 
*/ 940 + rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); 941 + if (!rdtgrp) { 942 + ret = -ENOSPC; 943 + goto out_closid_free; 944 + } 945 + rdtgrp->closid = closid; 946 + list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); 947 + 948 + /* kernfs creates the directory for rdtgrp */ 949 + kn = kernfs_create_dir(parent->kn, name, mode, rdtgrp); 950 + if (IS_ERR(kn)) { 951 + ret = PTR_ERR(kn); 952 + goto out_cancel_ref; 953 + } 954 + rdtgrp->kn = kn; 955 + 956 + /* 957 + * kernfs_remove() will drop the reference count on "kn" which 958 + * will free it. But we still need it to stick around for the 959 + * rdtgroup_kn_unlock(kn} call below. Take one extra reference 960 + * here, which will be dropped inside rdtgroup_kn_unlock(). 961 + */ 962 + kernfs_get(kn); 963 + 964 + ret = rdtgroup_kn_set_ugid(kn); 965 + if (ret) 966 + goto out_destroy; 967 + 968 + ret = rdtgroup_add_files(kn, rdtgroup_base_files, 969 + ARRAY_SIZE(rdtgroup_base_files)); 970 + if (ret) 971 + goto out_destroy; 972 + 973 + kernfs_activate(kn); 974 + 975 + ret = 0; 976 + goto out_unlock; 977 + 978 + out_destroy: 979 + kernfs_remove(rdtgrp->kn); 980 + out_cancel_ref: 981 + list_del(&rdtgrp->rdtgroup_list); 982 + kfree(rdtgrp); 983 + out_closid_free: 984 + closid_free(closid); 985 + out_unlock: 986 + rdtgroup_kn_unlock(parent_kn); 987 + return ret; 988 + } 989 + 990 + static int rdtgroup_rmdir(struct kernfs_node *kn) 991 + { 992 + int ret, cpu, closid = rdtgroup_default.closid; 993 + struct rdtgroup *rdtgrp; 994 + cpumask_var_t tmpmask; 995 + 996 + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) 997 + return -ENOMEM; 998 + 999 + rdtgrp = rdtgroup_kn_lock_live(kn); 1000 + if (!rdtgrp) { 1001 + ret = -EPERM; 1002 + goto out; 1003 + } 1004 + 1005 + /* Give any tasks back to the default group */ 1006 + rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); 1007 + 1008 + /* Give any CPUs back to the default group */ 1009 + cpumask_or(&rdtgroup_default.cpu_mask, 1010 + &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); 1011 + 1012 + /* Update per cpu closid of the moved CPUs first */ 1013 + for_each_cpu(cpu, &rdtgrp->cpu_mask) 1014 + per_cpu(cpu_closid, cpu) = closid; 1015 + /* 1016 + * Update the MSR on moved CPUs and CPUs which have moved 1017 + * task running on them. 
1018 + */ 1019 + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); 1020 + rdt_update_closid(tmpmask, NULL); 1021 + 1022 + rdtgrp->flags = RDT_DELETED; 1023 + closid_free(rdtgrp->closid); 1024 + list_del(&rdtgrp->rdtgroup_list); 1025 + 1026 + /* 1027 + * one extra hold on this, will drop when we kfree(rdtgrp) 1028 + * in rdtgroup_kn_unlock() 1029 + */ 1030 + kernfs_get(kn); 1031 + kernfs_remove(rdtgrp->kn); 1032 + ret = 0; 1033 + out: 1034 + rdtgroup_kn_unlock(kn); 1035 + free_cpumask_var(tmpmask); 1036 + return ret; 1037 + } 1038 + 1039 + static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) 1040 + { 1041 + if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) 1042 + seq_puts(seq, ",cdp"); 1043 + return 0; 1044 + } 1045 + 1046 + static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { 1047 + .mkdir = rdtgroup_mkdir, 1048 + .rmdir = rdtgroup_rmdir, 1049 + .show_options = rdtgroup_show_options, 1050 + }; 1051 + 1052 + static int __init rdtgroup_setup_root(void) 1053 + { 1054 + int ret; 1055 + 1056 + rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, 1057 + KERNFS_ROOT_CREATE_DEACTIVATED, 1058 + &rdtgroup_default); 1059 + if (IS_ERR(rdt_root)) 1060 + return PTR_ERR(rdt_root); 1061 + 1062 + mutex_lock(&rdtgroup_mutex); 1063 + 1064 + rdtgroup_default.closid = 0; 1065 + list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); 1066 + 1067 + ret = rdtgroup_add_files(rdt_root->kn, rdtgroup_base_files, 1068 + ARRAY_SIZE(rdtgroup_base_files)); 1069 + if (ret) { 1070 + kernfs_destroy_root(rdt_root); 1071 + goto out; 1072 + } 1073 + 1074 + rdtgroup_default.kn = rdt_root->kn; 1075 + kernfs_activate(rdtgroup_default.kn); 1076 + 1077 + out: 1078 + mutex_unlock(&rdtgroup_mutex); 1079 + 1080 + return ret; 1081 + } 1082 + 1083 + /* 1084 + * rdtgroup_init - rdtgroup initialization 1085 + * 1086 + * Setup resctrl file system including set up root, create mount point, 1087 + * register rdtgroup filesystem, and initialize files under root directory. 1088 + * 1089 + * Return: 0 on success or -errno 1090 + */ 1091 + int __init rdtgroup_init(void) 1092 + { 1093 + int ret = 0; 1094 + 1095 + ret = rdtgroup_setup_root(); 1096 + if (ret) 1097 + return ret; 1098 + 1099 + ret = sysfs_create_mount_point(fs_kobj, "resctrl"); 1100 + if (ret) 1101 + goto cleanup_root; 1102 + 1103 + ret = register_filesystem(&rdt_fs_type); 1104 + if (ret) 1105 + goto cleanup_mountpoint; 1106 + 1107 + return 0; 1108 + 1109 + cleanup_mountpoint: 1110 + sysfs_remove_mount_point(fs_kobj, "resctrl"); 1111 + cleanup_root: 1112 + kernfs_destroy_root(rdt_root); 1113 + 1114 + return ret; 1115 + }
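For orientation, the hunks above give resctrl its user-visible lifecycle: rdt_mount() accepts the optional "cdp" mount option, rdtgroup_mkdir() allocates a CLOSID and creates a group directory (only directly under the root), and rdt_kill_sb() resets all CBMs and removes every group on unmount. A minimal user-space sketch of that flow follows; it is illustrative only, and the /sys/fs/resctrl mount point (created via sysfs_create_mount_point() in rdtgroup_init()) and the group name "grp0" are assumptions.

  /* Illustrative only: mount resctrl and create one resource group. */
  #include <stdio.h>
  #include <sys/mount.h>
  #include <sys/stat.h>

  int main(void)
  {
          /* Passing "cdp" as the mount data would enable code/data
           * prioritization, see parse_rdtgroupfs_options()/cdp_enable(). */
          if (mount("resctrl", "/sys/fs/resctrl", "resctrl", 0, NULL)) {
                  perror("mount resctrl");
                  return 1;
          }

          /* Groups can only be created directly under the root,
           * see the parent_kn check in rdtgroup_mkdir(). */
          if (mkdir("/sys/fs/resctrl/grp0", 0755)) {
                  perror("mkdir grp0");
                  return 1;
          }
          return 0;
  }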
+245
arch/x86/kernel/cpu/intel_rdt_schemata.c
··· 1 + /* 2 + * Resource Director Technology(RDT) 3 + * - Cache Allocation code. 4 + * 5 + * Copyright (C) 2016 Intel Corporation 6 + * 7 + * Authors: 8 + * Fenghua Yu <fenghua.yu@intel.com> 9 + * Tony Luck <tony.luck@intel.com> 10 + * 11 + * This program is free software; you can redistribute it and/or modify it 12 + * under the terms and conditions of the GNU General Public License, 13 + * version 2, as published by the Free Software Foundation. 14 + * 15 + * This program is distributed in the hope it will be useful, but WITHOUT 16 + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 17 + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 18 + * more details. 19 + * 20 + * More information about RDT be found in the Intel (R) x86 Architecture 21 + * Software Developer Manual June 2016, volume 3, section 17.17. 22 + */ 23 + 24 + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 25 + 26 + #include <linux/kernfs.h> 27 + #include <linux/seq_file.h> 28 + #include <linux/slab.h> 29 + #include <asm/intel_rdt.h> 30 + 31 + /* 32 + * Check whether a cache bit mask is valid. The SDM says: 33 + * Please note that all (and only) contiguous '1' combinations 34 + * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). 35 + * Additionally Haswell requires at least two bits set. 36 + */ 37 + static bool cbm_validate(unsigned long var, struct rdt_resource *r) 38 + { 39 + unsigned long first_bit, zero_bit; 40 + 41 + if (var == 0 || var > r->max_cbm) 42 + return false; 43 + 44 + first_bit = find_first_bit(&var, r->cbm_len); 45 + zero_bit = find_next_zero_bit(&var, r->cbm_len, first_bit); 46 + 47 + if (find_next_bit(&var, r->cbm_len, zero_bit) < r->cbm_len) 48 + return false; 49 + 50 + if ((zero_bit - first_bit) < r->min_cbm_bits) 51 + return false; 52 + return true; 53 + } 54 + 55 + /* 56 + * Read one cache bit mask (hex). Check that it is valid for the current 57 + * resource type. 58 + */ 59 + static int parse_cbm(char *buf, struct rdt_resource *r) 60 + { 61 + unsigned long data; 62 + int ret; 63 + 64 + ret = kstrtoul(buf, 16, &data); 65 + if (ret) 66 + return ret; 67 + if (!cbm_validate(data, r)) 68 + return -EINVAL; 69 + r->tmp_cbms[r->num_tmp_cbms++] = data; 70 + 71 + return 0; 72 + } 73 + 74 + /* 75 + * For each domain in this resource we expect to find a series of: 76 + * id=mask 77 + * separated by ";". The "id" is in decimal, and must appear in the 78 + * right order. 79 + */ 80 + static int parse_line(char *line, struct rdt_resource *r) 81 + { 82 + char *dom = NULL, *id; 83 + struct rdt_domain *d; 84 + unsigned long dom_id; 85 + 86 + list_for_each_entry(d, &r->domains, list) { 87 + dom = strsep(&line, ";"); 88 + if (!dom) 89 + return -EINVAL; 90 + id = strsep(&dom, "="); 91 + if (kstrtoul(id, 10, &dom_id) || dom_id != d->id) 92 + return -EINVAL; 93 + if (parse_cbm(dom, r)) 94 + return -EINVAL; 95 + } 96 + 97 + /* Any garbage at the end of the line? 
*/ 98 + if (line && line[0]) 99 + return -EINVAL; 100 + return 0; 101 + } 102 + 103 + static int update_domains(struct rdt_resource *r, int closid) 104 + { 105 + struct msr_param msr_param; 106 + cpumask_var_t cpu_mask; 107 + struct rdt_domain *d; 108 + int cpu, idx = 0; 109 + 110 + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) 111 + return -ENOMEM; 112 + 113 + msr_param.low = closid; 114 + msr_param.high = msr_param.low + 1; 115 + msr_param.res = r; 116 + 117 + list_for_each_entry(d, &r->domains, list) { 118 + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); 119 + d->cbm[msr_param.low] = r->tmp_cbms[idx++]; 120 + } 121 + cpu = get_cpu(); 122 + /* Update CBM on this cpu if it's in cpu_mask. */ 123 + if (cpumask_test_cpu(cpu, cpu_mask)) 124 + rdt_cbm_update(&msr_param); 125 + /* Update CBM on other cpus. */ 126 + smp_call_function_many(cpu_mask, rdt_cbm_update, &msr_param, 1); 127 + put_cpu(); 128 + 129 + free_cpumask_var(cpu_mask); 130 + 131 + return 0; 132 + } 133 + 134 + ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, 135 + char *buf, size_t nbytes, loff_t off) 136 + { 137 + struct rdtgroup *rdtgrp; 138 + struct rdt_resource *r; 139 + char *tok, *resname; 140 + int closid, ret = 0; 141 + u32 *l3_cbms = NULL; 142 + 143 + /* Valid input requires a trailing newline */ 144 + if (nbytes == 0 || buf[nbytes - 1] != '\n') 145 + return -EINVAL; 146 + buf[nbytes - 1] = '\0'; 147 + 148 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 149 + if (!rdtgrp) { 150 + rdtgroup_kn_unlock(of->kn); 151 + return -ENOENT; 152 + } 153 + 154 + closid = rdtgrp->closid; 155 + 156 + /* get scratch space to save all the masks while we validate input */ 157 + for_each_enabled_rdt_resource(r) { 158 + r->tmp_cbms = kcalloc(r->num_domains, sizeof(*l3_cbms), 159 + GFP_KERNEL); 160 + if (!r->tmp_cbms) { 161 + ret = -ENOMEM; 162 + goto out; 163 + } 164 + r->num_tmp_cbms = 0; 165 + } 166 + 167 + while ((tok = strsep(&buf, "\n")) != NULL) { 168 + resname = strsep(&tok, ":"); 169 + if (!tok) { 170 + ret = -EINVAL; 171 + goto out; 172 + } 173 + for_each_enabled_rdt_resource(r) { 174 + if (!strcmp(resname, r->name) && 175 + closid < r->num_closid) { 176 + ret = parse_line(tok, r); 177 + if (ret) 178 + goto out; 179 + break; 180 + } 181 + } 182 + if (!r->name) { 183 + ret = -EINVAL; 184 + goto out; 185 + } 186 + } 187 + 188 + /* Did the parser find all the masks we need? 
*/ 189 + for_each_enabled_rdt_resource(r) { 190 + if (r->num_tmp_cbms != r->num_domains) { 191 + ret = -EINVAL; 192 + goto out; 193 + } 194 + } 195 + 196 + for_each_enabled_rdt_resource(r) { 197 + ret = update_domains(r, closid); 198 + if (ret) 199 + goto out; 200 + } 201 + 202 + out: 203 + rdtgroup_kn_unlock(of->kn); 204 + for_each_enabled_rdt_resource(r) { 205 + kfree(r->tmp_cbms); 206 + r->tmp_cbms = NULL; 207 + } 208 + return ret ?: nbytes; 209 + } 210 + 211 + static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid) 212 + { 213 + struct rdt_domain *dom; 214 + bool sep = false; 215 + 216 + seq_printf(s, "%s:", r->name); 217 + list_for_each_entry(dom, &r->domains, list) { 218 + if (sep) 219 + seq_puts(s, ";"); 220 + seq_printf(s, "%d=%x", dom->id, dom->cbm[closid]); 221 + sep = true; 222 + } 223 + seq_puts(s, "\n"); 224 + } 225 + 226 + int rdtgroup_schemata_show(struct kernfs_open_file *of, 227 + struct seq_file *s, void *v) 228 + { 229 + struct rdtgroup *rdtgrp; 230 + struct rdt_resource *r; 231 + int closid, ret = 0; 232 + 233 + rdtgrp = rdtgroup_kn_lock_live(of->kn); 234 + if (rdtgrp) { 235 + closid = rdtgrp->closid; 236 + for_each_enabled_rdt_resource(r) { 237 + if (closid < r->num_closid) 238 + show_doms(s, r, closid); 239 + } 240 + } else { 241 + ret = -ENOENT; 242 + } 243 + rdtgroup_kn_unlock(of->kn); 244 + return ret; 245 + }
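Putting the parser above together: a write to a group's schemata file must end with a newline and supply one line per enabled resource of the form "<resource>:<domain id>=<hex mask>;...", with domain ids in decimal and in list order, and every mask a single contiguous run of set bits of at least min_cbm_bits (cbm_validate()). The sketch below composes such a line from user space; the path, the two-domain L3 layout and the helper name contiguous_cbm() are illustrative assumptions, not part of the patch. Reading the file back goes through rdtgroup_schemata_show(), which emits the same format.

  #include <stdbool.h>
  #include <stdio.h>

  /* Mirrors the cbm_validate() rule: exactly one contiguous run of 1s. */
  static bool contiguous_cbm(unsigned long cbm)
  {
          if (cbm == 0)
                  return false;
          while (!(cbm & 1))              /* strip trailing zeros */
                  cbm >>= 1;
          return ((cbm + 1) & cbm) == 0;  /* remainder must be 2^n - 1 */
  }

  int main(void)
  {
          unsigned long cbm = 0xff;       /* e.g. the low eight ways */
          FILE *f;

          if (!contiguous_cbm(cbm))
                  return 1;
          f = fopen("/sys/fs/resctrl/grp0/schemata", "w");
          if (!f)
                  return 1;
          /* One "id=mask" entry per L3 cache domain, in domain-id order. */
          fprintf(f, "L3:0=%lx;1=%lx\n", cbm, cbm);
          return fclose(f) ? 1 : 0;
  }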
+7 -4
arch/x86/kernel/cpu/scattered.c
··· 20 20 /* Please keep the leaf sorted by cpuid_bit.level for faster search. */ 21 21 static const struct cpuid_bit cpuid_bits[] = { 22 22 { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, 23 - { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, 24 - { X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 }, 23 + { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, 24 + { X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 }, 25 25 { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 }, 26 26 { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 }, 27 - { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, 28 - { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, 27 + { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, 28 + { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, 29 + { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, 30 + { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, 31 + { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, 29 32 { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, 30 33 { 0, 0, 0, 0, 0 } 31 34 };
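The three new entries above hook up CPUID leaf 0x10: sub-leaf 0, EBX bit 1 signals L3 CAT, EBX bit 2 signals L2 CAT, and sub-leaf 1, ECX bit 2 signals L3 CDP. A user-space probe of the same bits could look like the sketch below (assumes the GCC/clang <cpuid.h> helper __get_cpuid_count(); not part of the patch).

  #include <cpuid.h>
  #include <stdio.h>

  int main(void)
  {
          unsigned int eax, ebx, ecx, edx;

          /* CPUID.(EAX=0x10, ECX=0): cache allocation enumeration. */
          if (!__get_cpuid_count(0x10, 0, &eax, &ebx, &ecx, &edx))
                  return 1;
          printf("L3 CAT: %s\n", (ebx & (1u << 1)) ? "yes" : "no");
          printf("L2 CAT: %s\n", (ebx & (1u << 2)) ? "yes" : "no");

          /* CPUID.(EAX=0x10, ECX=1): L3 details, including the CDP bit. */
          if (__get_cpuid_count(0x10, 1, &eax, &ebx, &ecx, &edx))
                  printf("L3 CDP: %s\n", (ecx & (1u << 2)) ? "yes" : "no");
          return 0;
  }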
+4
arch/x86/kernel/process_32.c
··· 53 53 #include <asm/debugreg.h> 54 54 #include <asm/switch_to.h> 55 55 #include <asm/vm86.h> 56 + #include <asm/intel_rdt.h> 56 57 57 58 void __show_regs(struct pt_regs *regs, int all) 58 59 { ··· 296 295 switch_fpu_finish(next_fpu, cpu); 297 296 298 297 this_cpu_write(current_task, next_p); 298 + 299 + /* Load the Intel cache allocation PQR MSR. */ 300 + intel_rdt_sched_in(); 299 301 300 302 return prev_p; 301 303 }
+4
arch/x86/kernel/process_64.c
··· 49 49 #include <asm/switch_to.h> 50 50 #include <asm/xen/hypervisor.h> 51 51 #include <asm/vdso.h> 52 + #include <asm/intel_rdt.h> 52 53 53 54 __visible DEFINE_PER_CPU(unsigned long, rsp_scratch); 54 55 ··· 476 475 if (ss_sel != __KERNEL_DS) 477 476 loadsegment(ss, __KERNEL_DS); 478 477 } 478 + 479 + /* Load the Intel cache allocation PQR MSR. */ 480 + intel_rdt_sched_in(); 479 481 480 482 return prev_p; 481 483 }
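Both the 32-bit and 64-bit __switch_to() now finish by calling intel_rdt_sched_in(). The helper itself lives in <asm/intel_rdt.h> and is not part of these hunks; as a simplified sketch only (the real version also caches the last value written to avoid redundant MSR writes), it resolves the incoming task's CLOSID, preferring a per-task assignment over the per-CPU default, and programs it into the PQR_ASSOC MSR:

  /* Simplified sketch of the scheduler hook, not the verbatim implementation. */
  static inline void intel_rdt_sched_in(void)
  {
          if (static_branch_likely(&rdt_enable_key)) {
                  int closid;

                  /* A task placed in a group via its tasks file wins over the
                   * group of the CPU it runs on; closid 0 is the default group. */
                  closid = this_cpu_read(cpu_closid);
                  if (current->closid)
                          closid = current->closid;

                  /* Low word carries the RMID (cache monitoring, unused here). */
                  wrmsr(MSR_IA32_PQR_ASSOC, 0, closid);
          }
  }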
+5
drivers/base/cacheinfo.c
··· 363 363 return sprintf(buf, "%u\n", this_leaf->object); \ 364 364 } 365 365 366 + show_one(id, id); 366 367 show_one(level, level); 367 368 show_one(coherency_line_size, coherency_line_size); 368 369 show_one(number_of_sets, number_of_sets); ··· 445 444 return n; 446 445 } 447 446 447 + static DEVICE_ATTR_RO(id); 448 448 static DEVICE_ATTR_RO(level); 449 449 static DEVICE_ATTR_RO(type); 450 450 static DEVICE_ATTR_RO(coherency_line_size); ··· 459 457 static DEVICE_ATTR_RO(physical_line_partition); 460 458 461 459 static struct attribute *cache_default_attrs[] = { 460 + &dev_attr_id.attr, 462 461 &dev_attr_type.attr, 463 462 &dev_attr_level.attr, 464 463 &dev_attr_shared_cpu_map.attr, ··· 483 480 const struct cpumask *mask = &this_leaf->shared_cpu_map; 484 481 umode_t mode = attr->mode; 485 482 483 + if ((attr == &dev_attr_id.attr) && (this_leaf->attributes & CACHE_ID)) 484 + return mode; 486 485 if ((attr == &dev_attr_type.attr) && this_leaf->type) 487 486 return mode; 488 487 if ((attr == &dev_attr_level.attr) && this_leaf->level)
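With the hunk above, any cache leaf whose architecture code sets CACHE_ID in cacheinfo->attributes exports an "id" file alongside "level" and "type". A trivial user-space read, with cpu0/index3 as an arbitrary example path:

  #include <stdio.h>

  int main(void)
  {
          unsigned int id;
          /* Index numbering is machine-specific, and "id" is absent when
           * CACHE_ID is not set for the leaf. */
          FILE *f = fopen("/sys/devices/system/cpu/cpu0/cache/index3/id", "r");

          if (!f)
                  return 1;
          if (fscanf(f, "%u", &id) == 1)
                  printf("cpu0 index3 cache id: %u\n", id);
          fclose(f);
          return 0;
  }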
+3
include/linux/cacheinfo.h
··· 18 18 19 19 /** 20 20 * struct cacheinfo - represent a cache leaf node 21 + * @id: This cache's id. It is unique among caches with the same (type, level). 21 22 * @type: type of the cache - data, inst or unified 22 23 * @level: represents the hierarchy in the multi-level cache 23 24 * @coherency_line_size: size of each cache line usually representing ··· 45 44 * keeping, the remaining members form the core properties of the cache 46 45 */ 47 46 struct cacheinfo { 47 + unsigned int id; 48 48 enum cache_type type; 49 49 unsigned int level; 50 50 unsigned int coherency_line_size; ··· 63 61 #define CACHE_WRITE_ALLOCATE BIT(3) 64 62 #define CACHE_ALLOCATE_POLICY_MASK \ 65 63 (CACHE_READ_ALLOCATE | CACHE_WRITE_ALLOCATE) 64 + #define CACHE_ID BIT(4) 66 65 67 66 struct device_node *of_node; 68 67 bool disable_sysfs;
+3
include/linux/sched.h
··· 1821 1821 /* cg_list protected by css_set_lock and tsk->alloc_lock */ 1822 1822 struct list_head cg_list; 1823 1823 #endif 1824 + #ifdef CONFIG_INTEL_RDT_A 1825 + int closid; 1826 + #endif 1824 1827 #ifdef CONFIG_FUTEX 1825 1828 struct robust_list_head __user *robust_list; 1826 1829 #ifdef CONFIG_COMPAT
+1
include/uapi/linux/magic.h
··· 57 57 #define CGROUP_SUPER_MAGIC 0x27e0eb 58 58 #define CGROUP2_SUPER_MAGIC 0x63677270 59 59 60 + #define RDTGROUP_SUPER_MAGIC 0x7655821 60 61 61 62 #define STACK_END_MAGIC 0x57AC6E9D 62 63