Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ACPI: Processor native C-states using MWAIT

Intel processors starting with the Core Duo
support processor native C-states using the MWAIT instruction.
Refer: Intel Architecture Software Developer's Manual
http://www.intel.com/design/Pentium4/manuals/253668.htm

Platform firmware exports the support for Native C-state to OS using
ACPI _PDC and _CST methods.
Refer: Intel Processor Vendor-Specific ACPI: Interface Specification
http://www.intel.com/technology/iapc/acpi/downloads/302223.htm

With Processor Native C-states, we use the 'MWAIT' instruction on the processor
to enter different C-states (C1, C2, C3). We won't use the special IO
ports to enter a C-state, and no SMM mode etc. is required to enter a C-state.
Overall this will mean better C-state support.

One major advantage of using MWAIT for all C-states is that, combined with the
"treat interrupt as break event" feature of MWAIT, we can now get accurate
timing for the time spent in the C1, C2, .. states.

Signed-off-by: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Len Brown <len.brown@intel.com>

authored by

Venkatesh Pallipadi and committed by
Len Brown
991528d7 b4bd8c66

+242 -56
+121 -1
arch/i386/kernel/acpi/cstate.c
··· 10 10 #include <linux/module.h> 11 11 #include <linux/init.h> 12 12 #include <linux/acpi.h> 13 + #include <linux/cpu.h> 13 14 14 15 #include <acpi/processor.h> 15 16 #include <asm/acpi.h> ··· 42 41 flags->bm_check = 1; 43 42 } 44 43 } 45 - 46 44 EXPORT_SYMBOL(acpi_processor_power_init_bm_check); 45 + 46 + /* The code below handles cstate entry with monitor-mwait pair on Intel*/ 47 + 48 + struct cstate_entry_s { 49 + struct { 50 + unsigned int eax; 51 + unsigned int ecx; 52 + } states[ACPI_PROCESSOR_MAX_POWER]; 53 + }; 54 + static struct cstate_entry_s *cpu_cstate_entry; /* per CPU ptr */ 55 + 56 + static short mwait_supported[ACPI_PROCESSOR_MAX_POWER]; 57 + 58 + #define MWAIT_SUBSTATE_MASK (0xf) 59 + #define MWAIT_SUBSTATE_SIZE (4) 60 + 61 + #define CPUID_MWAIT_LEAF (5) 62 + #define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1) 63 + #define CPUID5_ECX_INTERRUPT_BREAK (0x2) 64 + 65 + #define MWAIT_ECX_INTERRUPT_BREAK (0x1) 66 + 67 + #define NATIVE_CSTATE_BEYOND_HALT (2) 68 + 69 + int acpi_processor_ffh_cstate_probe(unsigned int cpu, 70 + struct acpi_processor_cx *cx, struct acpi_power_register *reg) 71 + { 72 + struct cstate_entry_s *percpu_entry; 73 + struct cpuinfo_x86 *c = cpu_data + cpu; 74 + 75 + cpumask_t saved_mask; 76 + int retval; 77 + unsigned int eax, ebx, ecx, edx; 78 + unsigned int edx_part; 79 + unsigned int cstate_type; /* C-state type and not ACPI C-state type */ 80 + unsigned int num_cstate_subtype; 81 + 82 + if (!cpu_cstate_entry || c->cpuid_level < CPUID_MWAIT_LEAF ) 83 + return -1; 84 + 85 + if (reg->bit_offset != NATIVE_CSTATE_BEYOND_HALT) 86 + return -1; 87 + 88 + percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu); 89 + percpu_entry->states[cx->index].eax = 0; 90 + percpu_entry->states[cx->index].ecx = 0; 91 + 92 + /* Make sure we are running on right CPU */ 93 + saved_mask = current->cpus_allowed; 94 + retval = set_cpus_allowed(current, cpumask_of_cpu(cpu)); 95 + if (retval) 96 + return -1; 97 + 98 + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); 
99 + 100 + /* Check whether this particular cx_type (in CST) is supported or not */ 101 + cstate_type = (cx->address >> MWAIT_SUBSTATE_SIZE) + 1; 102 + edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE); 103 + num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK; 104 + 105 + retval = 0; 106 + if (num_cstate_subtype < (cx->address & MWAIT_SUBSTATE_MASK)) { 107 + retval = -1; 108 + goto out; 109 + } 110 + 111 + /* mwait ecx extensions INTERRUPT_BREAK should be supported for C2/C3 */ 112 + if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) || 113 + !(ecx & CPUID5_ECX_INTERRUPT_BREAK)) { 114 + retval = -1; 115 + goto out; 116 + } 117 + percpu_entry->states[cx->index].ecx = MWAIT_ECX_INTERRUPT_BREAK; 118 + 119 + /* Use the hint in CST */ 120 + percpu_entry->states[cx->index].eax = cx->address; 121 + 122 + if (!mwait_supported[cstate_type]) { 123 + mwait_supported[cstate_type] = 1; 124 + printk(KERN_DEBUG "Monitor-Mwait will be used to enter C-%d " 125 + "state\n", cx->type); 126 + } 127 + 128 + out: 129 + set_cpus_allowed(current, saved_mask); 130 + return retval; 131 + } 132 + EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); 133 + 134 + void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx) 135 + { 136 + unsigned int cpu = smp_processor_id(); 137 + struct cstate_entry_s *percpu_entry; 138 + 139 + percpu_entry = per_cpu_ptr(cpu_cstate_entry, cpu); 140 + mwait_idle_with_hints(percpu_entry->states[cx->index].eax, 141 + percpu_entry->states[cx->index].ecx); 142 + } 143 + EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_enter); 144 + 145 + static int __init ffh_cstate_init(void) 146 + { 147 + struct cpuinfo_x86 *c = &boot_cpu_data; 148 + if (c->x86_vendor != X86_VENDOR_INTEL) 149 + return -1; 150 + 151 + cpu_cstate_entry = alloc_percpu(struct cstate_entry_s); 152 + return 0; 153 + } 154 + 155 + static void __exit ffh_cstate_exit(void) 156 + { 157 + if (cpu_cstate_entry) { 158 + free_percpu(cpu_cstate_entry); 159 + cpu_cstate_entry = NULL; 160 + } 161 + } 162 + 163 + 
arch_initcall(ffh_cstate_init); 164 + __exitcall(ffh_cstate_exit);
+16 -8
arch/i386/kernel/process.c
··· 236 236 * We execute MONITOR against need_resched and enter optimized wait state 237 237 * through MWAIT. Whenever someone changes need_resched, we would be woken 238 238 * up from MWAIT (without an IPI). 239 + * 240 + * New with Core Duo processors, MWAIT can take some hints based on CPU 241 + * capability. 239 242 */ 243 + void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) 244 + { 245 + if (!need_resched()) { 246 + __monitor((void *)&current_thread_info()->flags, 0, 0); 247 + smp_mb(); 248 + if (!need_resched()) 249 + __mwait(eax, ecx); 250 + } 251 + } 252 + 253 + /* Default MONITOR/MWAIT with no hints, used for default C1 state */ 240 254 static void mwait_idle(void) 241 255 { 242 256 local_irq_enable(); 243 - 244 - while (!need_resched()) { 245 - __monitor((void *)&current_thread_info()->flags, 0, 0); 246 - smp_mb(); 247 - if (need_resched()) 248 - break; 249 - __mwait(0, 0); 250 - } 257 + while (!need_resched()) 258 + mwait_idle_with_hints(0, 0); 251 259 } 252 260 253 261 void __devinit select_idle_routine(const struct cpuinfo_x86 *c)
+16 -8
arch/x86_64/kernel/process.c
··· 238 238 * We execute MONITOR against need_resched and enter optimized wait state 239 239 * through MWAIT. Whenever someone changes need_resched, we would be woken 240 240 * up from MWAIT (without an IPI). 241 + * 242 + * New with Core Duo processors, MWAIT can take some hints based on CPU 243 + * capability. 241 244 */ 245 + void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) 246 + { 247 + if (!need_resched()) { 248 + __monitor((void *)&current_thread_info()->flags, 0, 0); 249 + smp_mb(); 250 + if (!need_resched()) 251 + __mwait(eax, ecx); 252 + } 253 + } 254 + 255 + /* Default MONITOR/MWAIT with no hints, used for default C1 state */ 242 256 static void mwait_idle(void) 243 257 { 244 258 local_irq_enable(); 245 - 246 - while (!need_resched()) { 247 - __monitor((void *)&current_thread_info()->flags, 0, 0); 248 - smp_mb(); 249 - if (need_resched()) 250 - break; 251 - __mwait(0, 0); 252 - } 259 + while (!need_resched()) 260 + mwait_idle_with_hints(0,0); 253 261 } 254 262 255 263 void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
+61 -36
drivers/acpi/processor_idle.c
··· 219 219 220 220 static atomic_t c3_cpu_count; 221 221 222 + /* Common C-state entry for C2, C3, .. */ 223 + static void acpi_cstate_enter(struct acpi_processor_cx *cstate) 224 + { 225 + if (cstate->space_id == ACPI_CSTATE_FFH) { 226 + /* Call into architectural FFH based C-state */ 227 + acpi_processor_ffh_cstate_enter(cstate); 228 + } else { 229 + int unused; 230 + /* IO port based C-state */ 231 + inb(cstate->address); 232 + /* Dummy wait op - must do something useless after P_LVL2 read 233 + because chipsets cannot guarantee that STPCLK# signal 234 + gets asserted in time to freeze execution properly. */ 235 + unused = inl(acpi_fadt.xpm_tmr_blk.address); 236 + } 237 + } 238 + 222 239 static void acpi_processor_idle(void) 223 240 { 224 241 struct acpi_processor *pr = NULL; ··· 378 361 /* Get start time (ticks) */ 379 362 t1 = inl(acpi_fadt.xpm_tmr_blk.address); 380 363 /* Invoke C2 */ 381 - inb(cx->address); 382 - /* Dummy wait op - must do something useless after P_LVL2 read 383 - because chipsets cannot guarantee that STPCLK# signal 384 - gets asserted in time to freeze execution properly. */ 385 - t2 = inl(acpi_fadt.xpm_tmr_blk.address); 364 + acpi_cstate_enter(cx); 386 365 /* Get end time (ticks) */ 387 366 t2 = inl(acpi_fadt.xpm_tmr_blk.address); 388 367 ··· 414 401 /* Get start time (ticks) */ 415 402 t1 = inl(acpi_fadt.xpm_tmr_blk.address); 416 403 /* Invoke C3 */ 417 - inb(cx->address); 418 - /* Dummy wait op (see above) */ 419 - t2 = inl(acpi_fadt.xpm_tmr_blk.address); 404 + acpi_cstate_enter(cx); 420 405 /* Get end time (ticks) */ 421 406 t2 = inl(acpi_fadt.xpm_tmr_blk.address); 422 407 if (pr->flags.bm_check) { ··· 639 628 return 0; 640 629 } 641 630 642 - static int acpi_processor_get_power_info_default_c1(struct acpi_processor *pr) 631 + static int acpi_processor_get_power_info_default(struct acpi_processor *pr) 643 632 { 644 - 645 - /* Zero initialize all the C-states info. 
*/ 646 - memset(pr->power.states, 0, sizeof(pr->power.states)); 647 - 648 - /* set the first C-State to C1 */ 649 - pr->power.states[ACPI_STATE_C1].type = ACPI_STATE_C1; 650 - 651 - /* the C0 state only exists as a filler in our array, 652 - * and all processors need to support C1 */ 633 + if (!pr->power.states[ACPI_STATE_C1].valid) { 634 + /* set the first C-State to C1 */ 635 + /* all processors need to support C1 */ 636 + pr->power.states[ACPI_STATE_C1].type = ACPI_STATE_C1; 637 + pr->power.states[ACPI_STATE_C1].valid = 1; 638 + } 639 + /* the C0 state only exists as a filler in our array */ 653 640 pr->power.states[ACPI_STATE_C0].valid = 1; 654 - pr->power.states[ACPI_STATE_C1].valid = 1; 655 - 656 641 return 0; 657 642 } 658 643 ··· 665 658 if (nocst) 666 659 return -ENODEV; 667 660 668 - current_count = 1; 669 - 670 - /* Zero initialize C2 onwards and prepare for fresh CST lookup */ 671 - for (i = 2; i < ACPI_PROCESSOR_MAX_POWER; i++) 672 - memset(&(pr->power.states[i]), 0, 673 - sizeof(struct acpi_processor_cx)); 661 + current_count = 0; 674 662 675 663 status = acpi_evaluate_object(pr->handle, "_CST", NULL, &buffer); 676 664 if (ACPI_FAILURE(status)) { ··· 720 718 (reg->space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) 721 719 continue; 722 720 723 - cx.address = (reg->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE) ? 724 - 0 : reg->address; 725 - 726 721 /* There should be an easy way to extract an integer... 
*/ 727 722 obj = (union acpi_object *)&(element->package.elements[1]); 728 723 if (obj->type != ACPI_TYPE_INTEGER) 729 724 continue; 730 725 731 726 cx.type = obj->integer.value; 727 + /* 728 + * Some buggy BIOSes won't list C1 in _CST - 729 + * Let acpi_processor_get_power_info_default() handle them later 730 + */ 731 + if (i == 1 && cx.type != ACPI_STATE_C1) 732 + current_count++; 732 733 733 - if ((cx.type != ACPI_STATE_C1) && 734 - (reg->space_id != ACPI_ADR_SPACE_SYSTEM_IO)) 735 - continue; 734 + cx.address = reg->address; 735 + cx.index = current_count + 1; 736 736 737 - if ((cx.type < ACPI_STATE_C2) || (cx.type > ACPI_STATE_C3)) 738 - continue; 737 + cx.space_id = ACPI_CSTATE_SYSTEMIO; 738 + if (reg->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE) { 739 + if (acpi_processor_ffh_cstate_probe 740 + (pr->id, &cx, reg) == 0) { 741 + cx.space_id = ACPI_CSTATE_FFH; 742 + } else if (cx.type != ACPI_STATE_C1) { 743 + /* 744 + * C1 is a special case where FIXED_HARDWARE 745 + * can be handled in non-MWAIT way as well. 746 + * In that case, save this _CST entry info. 747 + * That is, we retain space_id of SYSTEM_IO for 748 + * halt based C1. 749 + * Otherwise, ignore this info and continue. 750 + */ 751 + continue; 752 + } 753 + } 739 754 740 755 obj = (union acpi_object *)&(element->package.elements[2]); 741 756 if (obj->type != ACPI_TYPE_INTEGER) ··· 957 938 /* NOTE: the idle thread may not be running while calling 958 939 * this function */ 959 940 960 - /* Adding C1 state */ 961 - acpi_processor_get_power_info_default_c1(pr); 941 + /* Zero initialize all the C-states info. */ 942 + memset(pr->power.states, 0, sizeof(pr->power.states)); 943 + 962 944 result = acpi_processor_get_power_info_cst(pr); 963 945 if (result == -ENODEV) 964 946 acpi_processor_get_power_info_fadt(pr); 947 + 948 + if (result) 949 + return result; 950 + 951 + acpi_processor_get_power_info_default(pr); 965 952 966 953 pr->power.count = acpi_processor_power_verify(pr); 967 954
+6 -3
include/acpi/pdc_intel.h
··· 13 13 #define ACPI_PDC_SMP_C_SWCOORD (0x0040) 14 14 #define ACPI_PDC_SMP_T_SWCOORD (0x0080) 15 15 #define ACPI_PDC_C_C1_FFH (0x0100) 16 + #define ACPI_PDC_C_C2C3_FFH (0x0200) 16 17 17 18 #define ACPI_PDC_EST_CAPABILITY_SMP (ACPI_PDC_SMP_C1PT | \ 18 19 ACPI_PDC_C_C1_HALT | \ ··· 24 23 ACPI_PDC_SMP_P_SWCOORD | \ 25 24 ACPI_PDC_P_FFH) 26 25 27 - #define ACPI_PDC_C_CAPABILITY_SMP (ACPI_PDC_SMP_C2C3 | \ 28 - ACPI_PDC_SMP_C1PT | \ 29 - ACPI_PDC_C_C1_HALT) 26 + #define ACPI_PDC_C_CAPABILITY_SMP (ACPI_PDC_SMP_C2C3 | \ 27 + ACPI_PDC_SMP_C1PT | \ 28 + ACPI_PDC_C_C1_HALT | \ 29 + ACPI_PDC_C_C1_FFH | \ 30 + ACPI_PDC_C_C2C3_FFH) 30 31 31 32 #endif /* __PDC_INTEL_H__ */
+18
include/acpi/processor.h
··· 29 29 #define DOMAIN_COORD_TYPE_SW_ANY 0xfd 30 30 #define DOMAIN_COORD_TYPE_HW_ALL 0xfe 31 31 32 + #define ACPI_CSTATE_SYSTEMIO (0) 33 + #define ACPI_CSTATE_FFH (1) 34 + 32 35 /* Power Management */ 33 36 34 37 struct acpi_processor_cx; ··· 61 58 u8 valid; 62 59 u8 type; 63 60 u32 address; 61 + u8 space_id; 62 + u8 index; 64 63 u32 latency; 65 64 u32 latency_ticks; 66 65 u32 power; ··· 211 206 #ifdef ARCH_HAS_POWER_INIT 212 207 void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, 213 208 unsigned int cpu); 209 + int acpi_processor_ffh_cstate_probe(unsigned int cpu, 210 + struct acpi_processor_cx *cx, struct acpi_power_register *reg); 211 + void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cstate); 214 212 #else 215 213 static inline void acpi_processor_power_init_bm_check(struct 216 214 acpi_processor_flags 217 215 *flags, unsigned int cpu) 218 216 { 219 217 flags->bm_check = 1; 218 + return; 219 + } 220 + static inline int acpi_processor_ffh_cstate_probe(unsigned int cpu, 221 + struct acpi_processor_cx *cx, struct acpi_power_register *reg) 222 + { 223 + return -1; 224 + } 225 + static inline void acpi_processor_ffh_cstate_enter( 226 + struct acpi_processor_cx *cstate) 227 + { 220 228 return; 221 229 } 222 230 #endif
+2
include/asm-i386/processor.h
··· 306 306 : :"a" (eax), "c" (ecx)); 307 307 } 308 308 309 + extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx); 310 + 309 311 /* from system description table in BIOS. Mostly for MCA use, but 310 312 others may find it useful. */ 311 313 extern unsigned int machine_id;
+2
include/asm-x86_64/processor.h
··· 475 475 : :"a" (eax), "c" (ecx)); 476 476 } 477 477 478 + extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx); 479 + 478 480 #define stack_current() \ 479 481 ({ \ 480 482 struct thread_info *ti; \