Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

cpufreq: add virtual-cpufreq driver

Introduce a virtualized cpufreq driver for guest kernels to improve
performance and power of workloads within VMs.

This driver does two main things:

1. Sends the frequency of vCPUs as a hint to the host. The host uses the
hint to schedule the vCPU threads and decide physical CPU frequency.

2. If a VM does not support a virtualized FIE (like AMUs), it queries the
host CPU frequency by reading an MMIO region of a virtual cpufreq device
to update the guest's frequency scaling factor periodically. This enables
accurate Per-Entity Load Tracking for tasks running in the guest.

Co-developed-by: Saravana Kannan <saravanak@google.com>
Signed-off-by: Saravana Kannan <saravanak@google.com>
Signed-off-by: David Dai <davidai@google.com>
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>

authored by

David Dai and committed by
Viresh Kumar
4fd06a53 984638e4

+349
+14
drivers/cpufreq/Kconfig
··· 217 217 218 218 If in doubt, say N. 219 219 220 + config CPUFREQ_VIRT 221 + tristate "Virtual cpufreq driver" 222 + depends on GENERIC_ARCH_TOPOLOGY 223 + help 224 + This adds a virtualized cpufreq driver for guest kernels that 225 + read/writes to a MMIO region for a virtualized cpufreq device to 226 + communicate with the host. It sends performance requests to the host 227 + which gets used as a hint to schedule vCPU threads and select CPU 228 + frequency. If a VM does not support a virtualized FIE such as AMUs, 229 + it updates the frequency scaling factor by polling host CPU frequency 230 + to enable accurate Per-Entity Load Tracking for tasks running in the guest. 231 + 232 + If in doubt, say N. 233 + 220 234 config CPUFREQ_DT_PLATDEV 221 235 tristate "Generic DT based cpufreq platdev driver" 222 236 depends on OF
+1
drivers/cpufreq/Makefile
··· 16 16 17 17 obj-$(CONFIG_CPUFREQ_DT) += cpufreq-dt.o 18 18 obj-$(CONFIG_CPUFREQ_DT_PLATDEV) += cpufreq-dt-platdev.o 19 + obj-$(CONFIG_CPUFREQ_VIRT) += virtual-cpufreq.o 19 20 20 21 # Traces 21 22 CFLAGS_amd-pstate-trace.o := -I$(src)
+333
drivers/cpufreq/virtual-cpufreq.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (C) 2024 Google LLC 4 + */ 5 + 6 + #include <linux/arch_topology.h> 7 + #include <linux/cpufreq.h> 8 + #include <linux/init.h> 9 + #include <linux/sched.h> 10 + #include <linux/kernel.h> 11 + #include <linux/module.h> 12 + #include <linux/of_address.h> 13 + #include <linux/of_platform.h> 14 + #include <linux/platform_device.h> 15 + #include <linux/slab.h> 16 + 17 + /* 18 + * CPU0..CPUn 19 + * +-------------+-------------------------------+--------+-------+ 20 + * | Register | Description | Offset | Len | 21 + * +-------------+-------------------------------+--------+-------+ 22 + * | cur_perf | read this register to get | 0x0 | 0x4 | 23 + * | | the current perf (integer val | | | 24 + * | | representing perf relative to | | | 25 + * | | max performance) | | | 26 + * | | that vCPU is running at | | | 27 + * +-------------+-------------------------------+--------+-------+ 28 + * | set_perf | write to this register to set | 0x4 | 0x4 | 29 + * | | perf value of the vCPU | | | 30 + * +-------------+-------------------------------+--------+-------+ 31 + * | perftbl_len | number of entries in perf | 0x8 | 0x4 | 32 + * | | table. A single entry in the | | | 33 + * | | perf table denotes no table | | | 34 + * | | and the entry contains | | | 35 + * | | the maximum perf value | | | 36 + * | | that this vCPU supports. | | | 37 + * | | The guest can request any | | | 38 + * | | value between 1 and max perf | | | 39 + * | | when perftbls are not used. 
| | | 40 + * +---------------------------------------------+--------+-------+ 41 + * | perftbl_sel | write to this register to | 0xc | 0x4 | 42 + * | | select perf table entry to | | | 43 + * | | read from | | | 44 + * +---------------------------------------------+--------+-------+ 45 + * | perftbl_rd | read this register to get | 0x10 | 0x4 | 46 + * | | perf value of the selected | | | 47 + * | | entry based on perftbl_sel | | | 48 + * +---------------------------------------------+--------+-------+ 49 + * | perf_domain | performance domain number | 0x14 | 0x4 | 50 + * | | that this vCPU belongs to. | | | 51 + * | | vCPUs sharing the same perf | | | 52 + * | | domain number are part of the | | | 53 + * | | same performance domain. | | | 54 + * +-------------+-------------------------------+--------+-------+ 55 + */ 56 + 57 + #define REG_CUR_PERF_STATE_OFFSET 0x0 58 + #define REG_SET_PERF_STATE_OFFSET 0x4 59 + #define REG_PERFTBL_LEN_OFFSET 0x8 60 + #define REG_PERFTBL_SEL_OFFSET 0xc 61 + #define REG_PERFTBL_RD_OFFSET 0x10 62 + #define REG_PERF_DOMAIN_OFFSET 0x14 63 + #define PER_CPU_OFFSET 0x1000 64 + 65 + #define PERFTBL_MAX_ENTRIES 64U 66 + 67 + static void __iomem *base; 68 + static DEFINE_PER_CPU(u32, perftbl_num_entries); 69 + 70 + static void virt_scale_freq_tick(void) 71 + { 72 + int cpu = smp_processor_id(); 73 + u32 max_freq = (u32)cpufreq_get_hw_max_freq(cpu); 74 + u64 cur_freq; 75 + unsigned long scale; 76 + 77 + cur_freq = (u64)readl_relaxed(base + cpu * PER_CPU_OFFSET 78 + + REG_CUR_PERF_STATE_OFFSET); 79 + 80 + cur_freq <<= SCHED_CAPACITY_SHIFT; 81 + scale = (unsigned long)div_u64(cur_freq, max_freq); 82 + scale = min(scale, SCHED_CAPACITY_SCALE); 83 + 84 + this_cpu_write(arch_freq_scale, scale); 85 + } 86 + 87 + static struct scale_freq_data virt_sfd = { 88 + .source = SCALE_FREQ_SOURCE_VIRT, 89 + .set_freq_scale = virt_scale_freq_tick, 90 + }; 91 + 92 + static unsigned int virt_cpufreq_set_perf(struct cpufreq_policy *policy, 93 + unsigned int 
target_freq) 94 + { 95 + writel_relaxed(target_freq, 96 + base + policy->cpu * PER_CPU_OFFSET + REG_SET_PERF_STATE_OFFSET); 97 + return 0; 98 + } 99 + 100 + static unsigned int virt_cpufreq_fast_switch(struct cpufreq_policy *policy, 101 + unsigned int target_freq) 102 + { 103 + virt_cpufreq_set_perf(policy, target_freq); 104 + return target_freq; 105 + } 106 + 107 + static u32 virt_cpufreq_get_perftbl_entry(int cpu, u32 idx) 108 + { 109 + writel_relaxed(idx, base + cpu * PER_CPU_OFFSET + 110 + REG_PERFTBL_SEL_OFFSET); 111 + return readl_relaxed(base + cpu * PER_CPU_OFFSET + 112 + REG_PERFTBL_RD_OFFSET); 113 + } 114 + 115 + static int virt_cpufreq_target(struct cpufreq_policy *policy, 116 + unsigned int target_freq, 117 + unsigned int relation) 118 + { 119 + struct cpufreq_freqs freqs; 120 + int ret = 0; 121 + 122 + freqs.old = policy->cur; 123 + freqs.new = target_freq; 124 + 125 + cpufreq_freq_transition_begin(policy, &freqs); 126 + ret = virt_cpufreq_set_perf(policy, target_freq); 127 + cpufreq_freq_transition_end(policy, &freqs, ret != 0); 128 + 129 + return ret; 130 + } 131 + 132 + static int virt_cpufreq_get_sharing_cpus(struct cpufreq_policy *policy) 133 + { 134 + u32 cur_perf_domain, perf_domain; 135 + struct device *cpu_dev; 136 + int cpu; 137 + 138 + cur_perf_domain = readl_relaxed(base + policy->cpu * 139 + PER_CPU_OFFSET + REG_PERF_DOMAIN_OFFSET); 140 + 141 + for_each_possible_cpu(cpu) { 142 + cpu_dev = get_cpu_device(cpu); 143 + if (!cpu_dev) 144 + continue; 145 + 146 + perf_domain = readl_relaxed(base + cpu * 147 + PER_CPU_OFFSET + REG_PERF_DOMAIN_OFFSET); 148 + 149 + if (perf_domain == cur_perf_domain) 150 + cpumask_set_cpu(cpu, policy->cpus); 151 + } 152 + 153 + return 0; 154 + } 155 + 156 + static int virt_cpufreq_get_freq_info(struct cpufreq_policy *policy) 157 + { 158 + struct cpufreq_frequency_table *table; 159 + u32 num_perftbl_entries, idx; 160 + 161 + num_perftbl_entries = per_cpu(perftbl_num_entries, policy->cpu); 162 + 163 + if 
(num_perftbl_entries == 1) { 164 + policy->cpuinfo.min_freq = 1; 165 + policy->cpuinfo.max_freq = virt_cpufreq_get_perftbl_entry(policy->cpu, 0); 166 + 167 + policy->min = policy->cpuinfo.min_freq; 168 + policy->max = policy->cpuinfo.max_freq; 169 + 170 + policy->cur = policy->max; 171 + return 0; 172 + } 173 + 174 + table = kcalloc(num_perftbl_entries + 1, sizeof(*table), GFP_KERNEL); 175 + if (!table) 176 + return -ENOMEM; 177 + 178 + for (idx = 0; idx < num_perftbl_entries; idx++) 179 + table[idx].frequency = virt_cpufreq_get_perftbl_entry(policy->cpu, idx); 180 + 181 + table[idx].frequency = CPUFREQ_TABLE_END; 182 + policy->freq_table = table; 183 + 184 + return 0; 185 + } 186 + 187 + static int virt_cpufreq_cpu_init(struct cpufreq_policy *policy) 188 + { 189 + struct device *cpu_dev; 190 + int ret; 191 + 192 + cpu_dev = get_cpu_device(policy->cpu); 193 + if (!cpu_dev) 194 + return -ENODEV; 195 + 196 + ret = virt_cpufreq_get_freq_info(policy); 197 + if (ret) { 198 + dev_warn(cpu_dev, "failed to get cpufreq info\n"); 199 + return ret; 200 + } 201 + 202 + ret = virt_cpufreq_get_sharing_cpus(policy); 203 + if (ret) { 204 + dev_warn(cpu_dev, "failed to get sharing cpumask\n"); 205 + return ret; 206 + } 207 + 208 + /* 209 + * To simplify and improve latency of handling frequency requests on 210 + * the host side, this ensures that the vCPU thread triggering the MMIO 211 + * abort is the same thread whose performance constraints (Ex. uclamp 212 + * settings) need to be updated. This simplifies the VMM (Virtual 213 + * Machine Manager) having to find the correct vCPU thread and/or 214 + * facing permission issues when configuring other threads. 
215 + */ 216 + policy->dvfs_possible_from_any_cpu = false; 217 + policy->fast_switch_possible = true; 218 + 219 + /* 220 + * Using the default SCALE_FREQ_SOURCE_CPUFREQ is insufficient since 221 + * the actual physical CPU frequency may not match requested frequency 222 + * from the vCPU thread due to frequency update latencies or other 223 + * inputs to the physical CPU frequency selection. This additional FIE 224 + * source allows for more accurate freq_scale updates and only takes 225 + * effect if another FIE source such as AMUs have not been registered. 226 + */ 227 + topology_set_scale_freq_source(&virt_sfd, policy->cpus); 228 + 229 + return 0; 230 + } 231 + 232 + static void virt_cpufreq_cpu_exit(struct cpufreq_policy *policy) 233 + { 234 + topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_VIRT, policy->related_cpus); 235 + kfree(policy->freq_table); 236 + } 237 + 238 + static int virt_cpufreq_online(struct cpufreq_policy *policy) 239 + { 240 + /* Nothing to restore. */ 241 + return 0; 242 + } 243 + 244 + static int virt_cpufreq_offline(struct cpufreq_policy *policy) 245 + { 246 + /* Dummy offline() to avoid exit() being called and freeing resources. 
*/ 247 + return 0; 248 + } 249 + 250 + static int virt_cpufreq_verify_policy(struct cpufreq_policy_data *policy) 251 + { 252 + if (policy->freq_table) 253 + return cpufreq_frequency_table_verify(policy, policy->freq_table); 254 + 255 + cpufreq_verify_within_cpu_limits(policy); 256 + return 0; 257 + } 258 + 259 + static struct cpufreq_driver cpufreq_virt_driver = { 260 + .name = "virt-cpufreq", 261 + .init = virt_cpufreq_cpu_init, 262 + .exit = virt_cpufreq_cpu_exit, 263 + .online = virt_cpufreq_online, 264 + .offline = virt_cpufreq_offline, 265 + .verify = virt_cpufreq_verify_policy, 266 + .target = virt_cpufreq_target, 267 + .fast_switch = virt_cpufreq_fast_switch, 268 + .attr = cpufreq_generic_attr, 269 + }; 270 + 271 + static int virt_cpufreq_driver_probe(struct platform_device *pdev) 272 + { 273 + u32 num_perftbl_entries; 274 + int ret, cpu; 275 + 276 + base = devm_platform_ioremap_resource(pdev, 0); 277 + if (IS_ERR(base)) 278 + return PTR_ERR(base); 279 + 280 + for_each_possible_cpu(cpu) { 281 + num_perftbl_entries = readl_relaxed(base + cpu * PER_CPU_OFFSET + 282 + REG_PERFTBL_LEN_OFFSET); 283 + 284 + if (!num_perftbl_entries || num_perftbl_entries > PERFTBL_MAX_ENTRIES) 285 + return -ENODEV; 286 + 287 + per_cpu(perftbl_num_entries, cpu) = num_perftbl_entries; 288 + } 289 + 290 + ret = cpufreq_register_driver(&cpufreq_virt_driver); 291 + if (ret) { 292 + dev_err(&pdev->dev, "Virtual CPUFreq driver failed to register: %d\n", ret); 293 + return ret; 294 + } 295 + 296 + dev_dbg(&pdev->dev, "Virtual CPUFreq driver initialized\n"); 297 + return 0; 298 + } 299 + 300 + static void virt_cpufreq_driver_remove(struct platform_device *pdev) 301 + { 302 + cpufreq_unregister_driver(&cpufreq_virt_driver); 303 + } 304 + 305 + static const struct of_device_id virt_cpufreq_match[] = { 306 + { .compatible = "qemu,virtual-cpufreq", .data = NULL}, 307 + {} 308 + }; 309 + MODULE_DEVICE_TABLE(of, virt_cpufreq_match); 310 + 311 + static struct platform_driver virt_cpufreq_driver = 
{ 312 + .probe = virt_cpufreq_driver_probe, 313 + .remove = virt_cpufreq_driver_remove, 314 + .driver = { 315 + .name = "virt-cpufreq", 316 + .of_match_table = virt_cpufreq_match, 317 + }, 318 + }; 319 + 320 + static int __init virt_cpufreq_init(void) 321 + { 322 + return platform_driver_register(&virt_cpufreq_driver); 323 + } 324 + postcore_initcall(virt_cpufreq_init); 325 + 326 + static void __exit virt_cpufreq_exit(void) 327 + { 328 + platform_driver_unregister(&virt_cpufreq_driver); 329 + } 330 + module_exit(virt_cpufreq_exit); 331 + 332 + MODULE_DESCRIPTION("Virtual cpufreq driver"); 333 + MODULE_LICENSE("GPL");
+1
include/linux/arch_topology.h
··· 49 49 SCALE_FREQ_SOURCE_CPUFREQ = 0, 50 50 SCALE_FREQ_SOURCE_ARCH, 51 51 SCALE_FREQ_SOURCE_CPPC, 52 + SCALE_FREQ_SOURCE_VIRT, 52 53 }; 53 54 54 55 struct scale_freq_data {