intel_idle: native hardware cpuidle driver for latest Intel processors

This EXPERIMENTAL driver supersedes acpi_idle on
Intel Atom Processors, Intel Core i3/i5/i7 Processors
and associated Intel Xeon processors.

It does not support the Intel Core2 processor or earlier.

For kernels configured with ACPI, CONFIG_INTEL_IDLE=y
allows intel_idle to probe before the ACPI processor driver.
Booting with "intel_idle.max_cstate=0" disables intel_idle,
and the system falls back to ACPI's "acpi_idle".
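
As a rough sketch (condensed from intel_idle_probe() in the diff
below), the boot parameter works by failing the probe before the
driver ever registers with cpuidle, so acpi_idle wins by default:

	/* condensed from intel_idle_probe(); not the complete check list */
	static int max_cstate = MWAIT_MAX_NUM_CSTATES - 1;	/* intel_idle.max_cstate= */

	static int intel_idle_probe(void)
	{
		if (max_cstate == 0) {
			pr_debug(PREFIX "disabled\n");
			return -EPERM;	/* probe fails; intel_idle never registers */
		}
		/* ... CPUID, vendor, and cpu-model checks follow ... */
		return 0;
	}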

Typical Linux distributions load the ACPI processor module early,
so CONFIG_INTEL_IDLE=m is not easily useful on ACPI platforms.

intel_idle probes all processors at module_init time.
Processors that are hot-added later will be limited
to using C1 in idle.
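
The reason is visible in intel_idle_cpuidle_devices_init() in the
diff below (condensed here): cpuidle devices are registered only for
CPUs that are online at init time, so a CPU onlined later has no
intel_idle states registered and idles with the default C1:

	/* condensed from intel_idle_cpuidle_devices_init(), below */
	static int intel_idle_cpuidle_devices_init(void)
	{
		int i;
		struct cpuidle_device *dev;

		intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device);
		if (intel_idle_cpuidle_devices == NULL)
			return -ENOMEM;

		for_each_online_cpu(i) {
			dev = per_cpu_ptr(intel_idle_cpuidle_devices, i);
			/* ... copy supported MWAIT C-states into dev->states[] ... */
			dev->cpu = i;
			if (cpuidle_register_device(dev))
				return -EIO;	/* never re-run for hot-added CPUs */
		}
		return 0;
	}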

Signed-off-by: Len Brown <len.brown@intel.com>

Len Brown 26717172 02cf4f98

6 files changed, 486 insertions(+), 2 deletions(-)
MAINTAINERS (+7)

···
 S: Maintained
 F: drivers/input/
 
+INTEL IDLE DRIVER
+M: Len Brown <lenb@kernel.org>
+L: linux-pm@lists.linux-foundation.org
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/lenb/linux-idle-2.6.git
+S: Supported
+F: drivers/idle/intel_idle.c
+
 INTEL FRAMEBUFFER DRIVER (excluding 810 and 815)
 M: Maik Broemme <mbroemme@plusserver.de>
 L: linux-fbdev@vger.kernel.org
drivers/Makefile (+1, -1)

···
 obj-$(CONFIG_PARISC) += parisc/
 obj-$(CONFIG_RAPIDIO) += rapidio/
 obj-y += video/
+obj-y += idle/
 obj-$(CONFIG_ACPI) += acpi/
 obj-$(CONFIG_SFI) += sfi/
 # PnP must come after ACPI since it will eventually need to check if acpi
···
 obj-y += lguest/
 obj-$(CONFIG_CPU_FREQ) += cpufreq/
 obj-$(CONFIG_CPU_IDLE) += cpuidle/
-obj-y += idle/
 obj-$(CONFIG_MMC) += mmc/
 obj-$(CONFIG_MEMSTICK) += memstick/
 obj-$(CONFIG_NEW_LEDS) += leds/
drivers/acpi/processor_driver.c (+5, -1)

···
 		return -ENOMEM;
 #endif
 
-	if (!cpuidle_register_driver(&acpi_idle_driver))
+	if (!cpuidle_register_driver(&acpi_idle_driver)) {
 		printk(KERN_DEBUG "ACPI: %s registered with cpuidle\n",
 			acpi_idle_driver.name);
+	} else {
+		printk(KERN_DEBUG "ACPI: acpi_idle yielding to %s",
+			cpuidle_get_driver()->name);
+	}
 
 	result = acpi_bus_register_driver(&acpi_processor_driver);
 	if (result < 0)
drivers/idle/Kconfig (+11)

···
+config INTEL_IDLE
+	tristate "Cpuidle Driver for Intel Processors"
+	depends on CPU_IDLE
+	depends on X86
+	depends on CPU_SUP_INTEL
+	depends on EXPERIMENTAL
+	help
+	  Enable intel_idle, a cpuidle driver that includes knowledge of
+	  native Intel hardware idle features.  The acpi_idle driver
+	  can be configured at the same time, in order to handle
+	  processors intel_idle does not support.
 
 menu "Memory power savings"
 	depends on X86_64
drivers/idle/Makefile (+1)

···
 obj-$(CONFIG_I7300_IDLE) += i7300_idle.o
+obj-$(CONFIG_INTEL_IDLE) += intel_idle.o
 
drivers/idle/intel_idle.c (new file, +461)

/*
 * intel_idle.c - native hardware idle loop for modern Intel processors
 *
 * Copyright (c) 2010, Intel Corporation.
 * Len Brown <len.brown@intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 */

/*
 * intel_idle is a cpuidle driver that loads on specific Intel processors
 * in lieu of the legacy ACPI processor_idle driver.  The intent is to
 * make Linux more efficient on these processors, as intel_idle knows
 * more than ACPI, as well as make Linux more immune to ACPI BIOS bugs.
 */

/*
 * Design Assumptions
 *
 * All CPUs have same idle states as boot CPU
 *
 * Chipset BM_STS (bus master status) bit is a NOP
 *	for preventing entry into deep C-states
 */

/*
 * Known limitations
 *
 * The driver currently initializes for_each_online_cpu() upon modprobe.
 * It is unaware of subsequent processors hot-added to the system.
 * This means that if you boot with maxcpus=n and later online
 * processors above n, those processors will use C1 only.
 *
 * ACPI has a .suspend hack to turn off deep c-states during suspend
 * to avoid complications with the lapic timer workaround.
 * Have not seen issues with suspend, but may need same workaround here.
 *
 * There is currently no kernel-based automatic probing/loading mechanism
 * if the driver is built as a module.
 */

/* un-comment DEBUG to enable pr_debug() statements */
#define DEBUG

#include <linux/kernel.h>
#include <linux/cpuidle.h>
#include <linux/clockchips.h>
#include <linux/hrtimer.h>	/* ktime_get_real() */
#include <trace/events/power.h>
#include <linux/sched.h>

#define INTEL_IDLE_VERSION "0.4"
#define PREFIX "intel_idle: "

#define MWAIT_SUBSTATE_MASK	(0xf)
#define MWAIT_CSTATE_MASK	(0xf)
#define MWAIT_SUBSTATE_SIZE	(4)
#define MWAIT_MAX_NUM_CSTATES	8
#define CPUID_MWAIT_LEAF	(5)
#define CPUID5_ECX_EXTENSIONS_SUPPORTED	(0x1)
#define CPUID5_ECX_INTERRUPT_BREAK	(0x2)

static struct cpuidle_driver intel_idle_driver = {
	.name = "intel_idle",
	.owner = THIS_MODULE,
};
/* intel_idle.max_cstate=0 disables driver */
static int max_cstate = MWAIT_MAX_NUM_CSTATES - 1;
static int power_policy = 7; /* 0 = max perf; 15 = max powersave */

static unsigned int substates;
static int (*choose_substate)(int);

/* Reliable LAPIC Timer States, bit 1 for C1 etc. */
static unsigned int lapic_timer_reliable_states;

static struct cpuidle_device *intel_idle_cpuidle_devices;
static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state);

static struct cpuidle_state *cpuidle_state_table;

/*
 * States are indexed by the cstate number,
 * which is also the index into the MWAIT hint array.
 * Thus C0 is a dummy.
 */
static struct cpuidle_state nehalem_cstates[MWAIT_MAX_NUM_CSTATES] = {
	{ /* MWAIT C0 */ },
	{ /* MWAIT C1 */
		.name = "NHM-C1",
		.desc = "MWAIT 0x00",
		.driver_data = (void *) 0x00,
		.flags = CPUIDLE_FLAG_TIME_VALID,
		.exit_latency = 3,
		.power_usage = 1000,
		.target_residency = 6,
		.enter = &intel_idle },
	{ /* MWAIT C2 */
		.name = "NHM-C3",
		.desc = "MWAIT 0x10",
		.driver_data = (void *) 0x10,
		.flags = CPUIDLE_FLAG_TIME_VALID,
		.exit_latency = 20,
		.power_usage = 500,
		.target_residency = 80,
		.enter = &intel_idle },
	{ /* MWAIT C3 */
		.name = "NHM-C6",
		.desc = "MWAIT 0x20",
		.driver_data = (void *) 0x20,
		.flags = CPUIDLE_FLAG_TIME_VALID,
		.exit_latency = 200,
		.power_usage = 350,
		.target_residency = 800,
		.enter = &intel_idle },
};

static struct cpuidle_state atom_cstates[MWAIT_MAX_NUM_CSTATES] = {
	{ /* MWAIT C0 */ },
	{ /* MWAIT C1 */
		.name = "ATM-C1",
		.desc = "MWAIT 0x00",
		.driver_data = (void *) 0x00,
		.flags = CPUIDLE_FLAG_TIME_VALID,
		.exit_latency = 1,
		.power_usage = 1000,
		.target_residency = 4,
		.enter = &intel_idle },
	{ /* MWAIT C2 */
		.name = "ATM-C2",
		.desc = "MWAIT 0x10",
		.driver_data = (void *) 0x10,
		.flags = CPUIDLE_FLAG_TIME_VALID,
		.exit_latency = 20,
		.power_usage = 500,
		.target_residency = 80,
		.enter = &intel_idle },
	{ /* MWAIT C3 */ },
	{ /* MWAIT C4 */
		.name = "ATM-C4",
		.desc = "MWAIT 0x30",
		.driver_data = (void *) 0x30,
		.flags = CPUIDLE_FLAG_TIME_VALID,
		.exit_latency = 100,
		.power_usage = 250,
		.target_residency = 400,
		.enter = &intel_idle },
	{ /* MWAIT C5 */ },
	{ /* MWAIT C6 */
		.name = "ATM-C6",
		.desc = "MWAIT 0x40",
		.driver_data = (void *) 0x40,
		.flags = CPUIDLE_FLAG_TIME_VALID,
		.exit_latency = 200,
		.power_usage = 150,
		.target_residency = 800,
		.enter = NULL },	/* disabled */
};

/*
 * choose_tunable_substate()
 *
 * Run-time decision on which C-state substate to invoke
 * If power_policy = 0, choose shallowest substate (0)
 * If power_policy = 15, choose deepest substate
 * If power_policy = middle, choose middle substate etc.
 */
static int choose_tunable_substate(int cstate)
{
	unsigned int num_substates;
	unsigned int substate_choice;

	power_policy &= 0xF;	/* valid range: 0-15 */
	cstate &= 7;		/* valid range: 0-7 */

	num_substates = (substates >> ((cstate) * 4)) & MWAIT_SUBSTATE_MASK;

	if (num_substates <= 1)
		return 0;

	substate_choice = ((power_policy + (power_policy + 1) *
				(num_substates - 1)) / 16);

	return substate_choice;
}

/*
 * choose_zero_substate()
 */
static int choose_zero_substate(int cstate)
{
	return 0;
}

/**
 * intel_idle
 * @dev: cpuidle_device
 * @state: cpuidle state
 *
 */
static int intel_idle(struct cpuidle_device *dev, struct cpuidle_state *state)
{
	unsigned long ecx = 1; /* break on interrupt flag */
	unsigned long eax = (unsigned long)cpuidle_get_statedata(state);
	unsigned int cstate;
	ktime_t kt_before, kt_after;
	s64 usec_delta;
	int cpu = smp_processor_id();

	cstate = (((eax) >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1;

	eax = eax + (choose_substate)(cstate);

	local_irq_disable();

	if (!(lapic_timer_reliable_states & (1 << (cstate))))
		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);

	kt_before = ktime_get_real();

	stop_critical_timings();
#ifndef MODULE
	trace_power_start(POWER_CSTATE, (eax >> 4) + 1);
#endif
	if (!need_resched()) {

		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__mwait(eax, ecx);
	}

	start_critical_timings();

	kt_after = ktime_get_real();
	usec_delta = ktime_to_us(ktime_sub(kt_after, kt_before));

	local_irq_enable();

	if (!(lapic_timer_reliable_states & (1 << (cstate))))
		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);

	return usec_delta;
}

/*
 * intel_idle_probe()
 */
static int intel_idle_probe(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (max_cstate == 0) {
		pr_debug(PREFIX "disabled\n");
		return -EPERM;
	}

	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
		return -ENODEV;

	if (!boot_cpu_has(X86_FEATURE_MWAIT))
		return -ENODEV;

	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
		return -ENODEV;

	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
		!(ecx & CPUID5_ECX_INTERRUPT_BREAK))
			return -ENODEV;
#ifdef DEBUG
	if (substates == 0)	/* can over-ride via modparam */
#endif
		substates = edx;

	pr_debug(PREFIX "MWAIT substates: 0x%x\n", substates);

	if (boot_cpu_has(X86_FEATURE_ARAT))	/* Always Reliable APIC Timer */
		lapic_timer_reliable_states = 0xFFFFFFFF;

	if (boot_cpu_data.x86 != 6)	/* family 6 */
		return -ENODEV;

	switch (boot_cpu_data.x86_model) {

	case 0x1A:	/* Core i7, Xeon 5500 series */
	case 0x1E:	/* Core i7 and i5 Processor - Lynnfield Jasper Forest */
	case 0x1F:	/* Core i7 and i5 Processor - Nehalem */
	case 0x2E:	/* Nehalem-EX Xeon */
		lapic_timer_reliable_states = (1 << 1);	/* C1 */

	case 0x25:	/* Westmere */
	case 0x2C:	/* Westmere */
		cpuidle_state_table = nehalem_cstates;
		choose_substate = choose_tunable_substate;
		break;

	case 0x1C:	/* 28 - Atom Processor */
		lapic_timer_reliable_states = (1 << 2) | (1 << 1); /* C2, C1 */
		cpuidle_state_table = atom_cstates;
		choose_substate = choose_zero_substate;
		break;
#ifdef FUTURE_USE
	case 0x17:	/* 23 - Core 2 Duo */
		lapic_timer_reliable_states = (1 << 2) | (1 << 1); /* C2, C1 */
#endif

	default:
		pr_debug(PREFIX "does not run on family %d model %d\n",
			boot_cpu_data.x86, boot_cpu_data.x86_model);
		return -ENODEV;
	}

	pr_debug(PREFIX "v" INTEL_IDLE_VERSION
		" model 0x%X\n", boot_cpu_data.x86_model);

	pr_debug(PREFIX "lapic_timer_reliable_states 0x%x\n",
		lapic_timer_reliable_states);
	return 0;
}

/*
 * intel_idle_cpuidle_devices_uninit()
 * unregister, free cpuidle_devices
 */
static void intel_idle_cpuidle_devices_uninit(void)
{
	int i;
	struct cpuidle_device *dev;

	for_each_online_cpu(i) {
		dev = per_cpu_ptr(intel_idle_cpuidle_devices, i);
		cpuidle_unregister_device(dev);
	}

	free_percpu(intel_idle_cpuidle_devices);
	return;
}
/*
 * intel_idle_cpuidle_devices_init()
 * allocate, initialize, register cpuidle_devices
 */
static int intel_idle_cpuidle_devices_init(void)
{
	int i, cstate;
	struct cpuidle_device *dev;

	intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device);
	if (intel_idle_cpuidle_devices == NULL)
		return -ENOMEM;

	for_each_online_cpu(i) {
		dev = per_cpu_ptr(intel_idle_cpuidle_devices, i);

		dev->state_count = 1;

		for (cstate = 1; cstate < MWAIT_MAX_NUM_CSTATES; ++cstate) {
			int num_substates;

			if (cstate > max_cstate) {
				printk(PREFIX "max_cstate %d reached\n",
					max_cstate);
				break;
			}

			/* does the state exist in CPUID.MWAIT? */
			num_substates = (substates >> ((cstate) * 4))
						& MWAIT_SUBSTATE_MASK;
			if (num_substates == 0)
				continue;
			/* is the state not enabled? */
			if (cpuidle_state_table[cstate].enter == NULL) {
				/* does the driver not know about the state? */
				if (*cpuidle_state_table[cstate].name == '\0')
					pr_debug(PREFIX "unaware of model 0x%x"
						" MWAIT %d please"
						" contact lenb@kernel.org",
					boot_cpu_data.x86_model, cstate);
				continue;
			}

			if ((cstate > 2) &&
				!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
				mark_tsc_unstable("TSC halts in idle"
					" states deeper than C2");

			dev->states[dev->state_count] =	/* structure copy */
				cpuidle_state_table[cstate];

			dev->state_count += 1;
		}

		dev->cpu = i;
		if (cpuidle_register_device(dev)) {
			pr_debug(PREFIX "cpuidle_register_device %d failed!\n",
				 i);
			intel_idle_cpuidle_devices_uninit();
			return -EIO;
		}
	}

	return 0;
}


static int __init intel_idle_init(void)
{
	int retval;

	retval = intel_idle_probe();
	if (retval)
		return retval;

	retval = cpuidle_register_driver(&intel_idle_driver);
	if (retval) {
		printk(KERN_DEBUG PREFIX "intel_idle yielding to %s",
			cpuidle_get_driver()->name);
		return retval;
	}

	retval = intel_idle_cpuidle_devices_init();
	if (retval) {
		cpuidle_unregister_driver(&intel_idle_driver);
		return retval;
	}

	return 0;
}

static void __exit intel_idle_exit(void)
{
	intel_idle_cpuidle_devices_uninit();
	cpuidle_unregister_driver(&intel_idle_driver);

	return;
}

module_init(intel_idle_init);
module_exit(intel_idle_exit);

module_param(power_policy, int, 0644);
module_param(max_cstate, int, 0444);
#ifdef DEBUG
module_param(substates, int, 0444);
#endif

MODULE_AUTHOR("Len Brown <len.brown@intel.com>");
MODULE_DESCRIPTION("Cpuidle driver for Intel Hardware v" INTEL_IDLE_VERSION);
MODULE_LICENSE("GPL");