Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

powernv/cpuidle: Redesign idle states management

Deep idle states like sleep and winkle are per core idle states. A core
enters these states only when all the threads enter either the
particular idle state or a deeper one. There are tasks like fastsleep
hardware bug workaround and hypervisor core state save which have to be
done only by the last thread of the core entering deep idle state and
similarly, tasks like timebase resync and hypervisor core register restore
have to be done only by the first thread waking up from these
states.

The current idle state management does not have a way to distinguish the
first/last thread of the core waking/entering idle states. Tasks like
timebase resync are done for all the threads. This is not only
suboptimal, but can also cause functionality issues when subcores and KVM
are involved.

This patch adds the necessary infrastructure to track idle states of
threads in a per-core structure. It uses this info to perform tasks like
fastsleep workaround and timebase resync only once per core.

Signed-off-by: Shreyas B. Prabhu <shreyas@linux.vnet.ibm.com>
Originally-by: Preeti U. Murthy <preeti@linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: linux-pm@vger.kernel.org
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>

authored by

Shreyas B. Prabhu and committed by
Michael Ellerman
7cba160a 8eb8ac89

+297 -59
+20
arch/powerpc/include/asm/cpuidle.h
··· 1 + #ifndef _ASM_POWERPC_CPUIDLE_H 2 + #define _ASM_POWERPC_CPUIDLE_H 3 + 4 + #ifdef CONFIG_PPC_POWERNV 5 + /* Used in powernv idle state management */ 6 + #define PNV_THREAD_RUNNING 0 7 + #define PNV_THREAD_NAP 1 8 + #define PNV_THREAD_SLEEP 2 9 + #define PNV_THREAD_WINKLE 3 10 + #define PNV_CORE_IDLE_LOCK_BIT 0x100 11 + #define PNV_CORE_IDLE_THREAD_BITS 0x0FF 12 + 13 + #ifndef __ASSEMBLY__ 14 + extern u32 pnv_fastsleep_workaround_at_entry[]; 15 + extern u32 pnv_fastsleep_workaround_at_exit[]; 16 + #endif 17 + 18 + #endif 19 + 20 + #endif
+2
arch/powerpc/include/asm/opal.h
··· 160 160 #define OPAL_PCI_ERR_INJECT 96 161 161 #define OPAL_PCI_EEH_FREEZE_SET 97 162 162 #define OPAL_HANDLE_HMI 98 163 + #define OPAL_CONFIG_CPU_IDLE_STATE 99 163 164 #define OPAL_REGISTER_DUMP_REGION 101 164 165 #define OPAL_UNREGISTER_DUMP_REGION 102 165 166 #define OPAL_WRITE_TPO 103 ··· 176 175 */ 177 176 #define OPAL_PM_NAP_ENABLED 0x00010000 178 177 #define OPAL_PM_SLEEP_ENABLED 0x00020000 178 + #define OPAL_PM_SLEEP_ENABLED_ER1 0x00080000 179 179 180 180 #ifndef __ASSEMBLY__ 181 181
+8
arch/powerpc/include/asm/paca.h
··· 152 152 u64 tm_scratch; /* TM scratch area for reclaim */ 153 153 #endif 154 154 155 + #ifdef CONFIG_PPC_POWERNV 156 + /* Per-core mask tracking idle threads and a lock bit-[L][TTTTTTTT] */ 157 + u32 *core_idle_state_ptr; 158 + u8 thread_idle_state; /* PNV_THREAD_RUNNING/NAP/SLEEP */ 159 + /* Mask to indicate thread id in core */ 160 + u8 thread_mask; 161 + #endif 162 + 155 163 #ifdef CONFIG_PPC_BOOK3S_64 156 164 /* Exclusive emergency stack pointer for machine check exception. */ 157 165 void *mc_emergency_sp;
+1 -1
arch/powerpc/include/asm/processor.h
··· 452 452 453 453 extern int powersave_nap; /* set if nap mode can be used in idle loop */ 454 454 extern unsigned long power7_nap(int check_irq); 455 - extern void power7_sleep(void); 455 + extern unsigned long power7_sleep(void); 456 456 extern void flush_instruction_cache(void); 457 457 extern void hard_reset_now(void); 458 458 extern void poweroff_now(void);
+9
arch/powerpc/kernel/asm-offsets.c
··· 726 726 arch.timing_last_enter.tv32.tbl)); 727 727 #endif 728 728 729 + #ifdef CONFIG_PPC_POWERNV 730 + DEFINE(PACA_CORE_IDLE_STATE_PTR, 731 + offsetof(struct paca_struct, core_idle_state_ptr)); 732 + DEFINE(PACA_THREAD_IDLE_STATE, 733 + offsetof(struct paca_struct, thread_idle_state)); 734 + DEFINE(PACA_THREAD_MASK, 735 + offsetof(struct paca_struct, thread_mask)); 736 + #endif 737 + 729 738 return 0; 730 739 }
+15 -9
arch/powerpc/kernel/exceptions-64s.S
··· 15 15 #include <asm/hw_irq.h> 16 16 #include <asm/exception-64s.h> 17 17 #include <asm/ptrace.h> 18 + #include <asm/cpuidle.h> 18 19 19 20 /* 20 21 * We layout physical memory as follows: ··· 110 109 rlwinm. r13,r13,47-31,30,31 111 110 beq 9f 112 111 113 - /* waking up from powersave (nap) state */ 114 - cmpwi cr1,r13,2 115 - /* Total loss of HV state is fatal, we could try to use the 116 - * PIR to locate a PACA, then use an emergency stack etc... 117 - * OPAL v3 based powernv platforms have new idle states 118 - * which fall in this catagory. 119 - */ 120 - bgt cr1,8f 112 + cmpwi cr3,r13,2 113 + 121 114 GET_PACA(r13) 115 + lbz r0,PACA_THREAD_IDLE_STATE(r13) 116 + cmpwi cr2,r0,PNV_THREAD_NAP 117 + bgt cr2,8f /* Either sleep or Winkle */ 118 + 119 + /* Waking up from nap should not cause hypervisor state loss */ 120 + bgt cr3,. 121 + 122 + /* Waking up from nap */ 123 + li r0,PNV_THREAD_RUNNING 124 + stb r0,PACA_THREAD_IDLE_STATE(r13) /* Clear thread state */ 122 125 123 126 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 124 127 li r0,KVM_HWTHREAD_IN_KERNEL ··· 138 133 139 134 /* Return SRR1 from power7_nap() */ 140 135 mfspr r3,SPRN_SRR1 141 - beq cr1,2f 136 + beq cr3,2f 142 137 b power7_wakeup_noloss 143 138 2: b power7_wakeup_loss 144 139 ··· 1387 1382 MACHINE_CHECK_HANDLER_WINDUP 1388 1383 GET_PACA(r13) 1389 1384 ld r1,PACAR1(r13) 1385 + li r3,PNV_THREAD_NAP 1390 1386 b power7_enter_nap_mode 1391 1387 4: 1392 1388 #endif
+154 -45
arch/powerpc/kernel/idle_power7.S
··· 18 18 #include <asm/hw_irq.h> 19 19 #include <asm/kvm_book3s_asm.h> 20 20 #include <asm/opal.h> 21 + #include <asm/cpuidle.h> 21 22 22 23 #undef DEBUG 23 24 ··· 38 37 39 38 /* 40 39 * Pass requested state in r3: 41 - * 0 - nap 42 - * 1 - sleep 40 + * r3 - PNV_THREAD_NAP/SLEEP/WINKLE 43 41 * 44 42 * To check IRQ_HAPPENED in r4 45 43 * 0 - don't check ··· 123 123 li r4,KVM_HWTHREAD_IN_NAP 124 124 stb r4,HSTATE_HWTHREAD_STATE(r13) 125 125 #endif 126 - cmpwi cr0,r3,1 127 - beq 2f 126 + stb r3,PACA_THREAD_IDLE_STATE(r13) 127 + cmpwi cr1,r3,PNV_THREAD_SLEEP 128 + bge cr1,2f 128 129 IDLE_STATE_ENTER_SEQ(PPC_NAP) 129 130 /* No return */ 130 - 2: IDLE_STATE_ENTER_SEQ(PPC_SLEEP) 131 - /* No return */ 131 + 2: 132 + /* Sleep or winkle */ 133 + lbz r7,PACA_THREAD_MASK(r13) 134 + ld r14,PACA_CORE_IDLE_STATE_PTR(r13) 135 + lwarx_loop1: 136 + lwarx r15,0,r14 137 + andc r15,r15,r7 /* Clear thread bit */ 138 + 139 + andi. r15,r15,PNV_CORE_IDLE_THREAD_BITS 140 + 141 + /* 142 + * If cr0 = 0, then current thread is the last thread of the core entering 143 + * sleep. Last thread needs to execute the hardware bug workaround code if 144 + * required by the platform. 145 + * Make the workaround call unconditionally here. The below branch call is 146 + * patched out when the idle states are discovered if the platform does not 147 + * require it. 148 + */ 149 + .global pnv_fastsleep_workaround_at_entry 150 + pnv_fastsleep_workaround_at_entry: 151 + beq fastsleep_workaround_at_entry 152 + 153 + stwcx. r15,0,r14 154 + bne- lwarx_loop1 155 + isync 156 + 157 + common_enter: /* common code for all the threads entering sleep */ 158 + IDLE_STATE_ENTER_SEQ(PPC_SLEEP) 159 + 160 + fastsleep_workaround_at_entry: 161 + ori r15,r15,PNV_CORE_IDLE_LOCK_BIT 162 + stwcx. 
r15,0,r14 163 + bne- lwarx_loop1 164 + isync 165 + 166 + /* Fast sleep workaround */ 167 + li r3,1 168 + li r4,1 169 + li r0,OPAL_CONFIG_CPU_IDLE_STATE 170 + bl opal_call_realmode 171 + 172 + /* Clear Lock bit */ 173 + li r0,0 174 + lwsync 175 + stw r0,0(r14) 176 + b common_enter 177 + 132 178 133 179 _GLOBAL(power7_idle) 134 180 /* Now check if user or arch enabled NAP mode */ ··· 187 141 188 142 _GLOBAL(power7_nap) 189 143 mr r4,r3 190 - li r3,0 144 + li r3,PNV_THREAD_NAP 191 145 b power7_powersave_common 192 146 /* No return */ 193 147 194 148 _GLOBAL(power7_sleep) 195 - li r3,1 149 + li r3,PNV_THREAD_SLEEP 196 150 li r4,1 197 151 b power7_powersave_common 198 152 /* No return */ 199 - 200 - /* 201 - * Make opal call in realmode. This is a generic function to be called 202 - * from realmode from reset vector. It handles endianess. 203 - * 204 - * r13 - paca pointer 205 - * r1 - stack pointer 206 - * r3 - opal token 207 - */ 208 - opal_call_realmode: 209 - mflr r12 210 - std r12,_LINK(r1) 211 - ld r2,PACATOC(r13) 212 - /* Set opal return address */ 213 - LOAD_REG_ADDR(r0,return_from_opal_call) 214 - mtlr r0 215 - /* Handle endian-ness */ 216 - li r0,MSR_LE 217 - mfmsr r12 218 - andc r12,r12,r0 219 - mtspr SPRN_HSRR1,r12 220 - mr r0,r3 /* Move opal token to r0 */ 221 - LOAD_REG_ADDR(r11,opal) 222 - ld r12,8(r11) 223 - ld r2,0(r11) 224 - mtspr SPRN_HSRR0,r12 225 - hrfid 226 - 227 - return_from_opal_call: 228 - FIXUP_ENDIAN 229 - ld r0,_LINK(r1) 230 - mtlr r0 231 - blr 232 153 233 154 #define CHECK_HMI_INTERRUPT \ 234 155 mfspr r0,SPRN_SRR1; \ ··· 210 197 ld r2,PACATOC(r13); \ 211 198 ld r1,PACAR1(r13); \ 212 199 std r3,ORIG_GPR3(r1); /* Save original r3 */ \ 213 - li r3,OPAL_HANDLE_HMI; /* Pass opal token argument*/ \ 200 + li r0,OPAL_HANDLE_HMI; /* Pass opal token argument*/ \ 214 201 bl opal_call_realmode; \ 215 202 ld r3,ORIG_GPR3(r1); /* Restore original r3 */ \ 216 203 20: nop; ··· 219 206 _GLOBAL(power7_wakeup_tb_loss) 220 207 ld r2,PACATOC(r13); 221 208 ld 
r1,PACAR1(r13) 209 + /* 210 + * Before entering any idle state, the NVGPRs are saved in the stack 211 + * and they are restored before switching to the process context. Hence 212 + * until they are restored, they are free to be used. 213 + * 214 + * Save SRR1 in a NVGPR as it might be clobbered in opal_call_realmode 215 + * (called in CHECK_HMI_INTERRUPT). SRR1 is required to determine the 216 + * wakeup reason if we branch to kvm_start_guest. 217 + */ 222 218 219 + mfspr r16,SPRN_SRR1 223 220 BEGIN_FTR_SECTION 224 221 CHECK_HMI_INTERRUPT 225 222 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) 226 - /* Time base re-sync */ 227 - li r3,OPAL_RESYNC_TIMEBASE 228 - bl opal_call_realmode; 229 223 224 + lbz r7,PACA_THREAD_MASK(r13) 225 + ld r14,PACA_CORE_IDLE_STATE_PTR(r13) 226 + lwarx_loop2: 227 + lwarx r15,0,r14 228 + andi. r9,r15,PNV_CORE_IDLE_LOCK_BIT 229 + /* 230 + * Lock bit is set in one of the 2 cases- 231 + * a. In the sleep/winkle enter path, the last thread is executing 232 + * fastsleep workaround code. 233 + * b. In the wake up path, another thread is executing fastsleep 234 + * workaround undo code or resyncing timebase or restoring context 235 + * In either case loop until the lock bit is cleared. 236 + */ 237 + bne core_idle_lock_held 238 + 239 + cmpwi cr2,r15,0 240 + or r15,r15,r7 /* Set thread bit */ 241 + 242 + beq cr2,first_thread 243 + 244 + /* Not first thread in core to wake up */ 245 + stwcx. r15,0,r14 246 + bne- lwarx_loop2 247 + isync 248 + b common_exit 249 + 250 + core_idle_lock_held: 251 + HMT_LOW 252 + core_idle_lock_loop: 253 + lwz r15,0(14) 254 + andi. r9,r15,PNV_CORE_IDLE_LOCK_BIT 255 + bne core_idle_lock_loop 256 + HMT_MEDIUM 257 + b lwarx_loop2 258 + 259 + first_thread: 260 + /* First thread in core to wakeup */ 261 + ori r15,r15,PNV_CORE_IDLE_LOCK_BIT 262 + stwcx. r15,0,r14 263 + bne- lwarx_loop2 264 + isync 265 + 266 + /* 267 + * First thread in the core waking up from fastsleep. 
It needs to 268 + * call the fastsleep workaround code if the platform requires it. 269 + * Call it unconditionally here. The below branch instruction will 270 + * be patched out when the idle states are discovered if platform 271 + * does not require workaround. 272 + */ 273 + .global pnv_fastsleep_workaround_at_exit 274 + pnv_fastsleep_workaround_at_exit: 275 + b fastsleep_workaround_at_exit 276 + 277 + timebase_resync: 278 + /* Do timebase resync if we are waking up from sleep. Use cr3 value 279 + * set in exceptions-64s.S */ 280 + ble cr3,clear_lock 281 + /* Time base re-sync */ 282 + li r0,OPAL_RESYNC_TIMEBASE 283 + bl opal_call_realmode; 230 284 /* TODO: Check r3 for failure */ 285 + 286 + clear_lock: 287 + andi. r15,r15,PNV_CORE_IDLE_THREAD_BITS 288 + lwsync 289 + stw r15,0(r14) 290 + 291 + common_exit: 292 + li r5,PNV_THREAD_RUNNING 293 + stb r5,PACA_THREAD_IDLE_STATE(r13) 294 + 295 + mtspr SPRN_SRR1,r16 296 + #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE 297 + li r0,KVM_HWTHREAD_IN_KERNEL 298 + stb r0,HSTATE_HWTHREAD_STATE(r13) 299 + /* Order setting hwthread_state vs. testing hwthread_req */ 300 + sync 301 + lbz r0,HSTATE_HWTHREAD_REQ(r13) 302 + cmpwi r0,0 303 + beq 6f 304 + b kvm_start_guest 305 + 6: 306 + #endif 231 307 232 308 REST_NVGPRS(r1) 233 309 REST_GPR(2, r1) ··· 329 227 mtspr SPRN_SRR1,r4 330 228 mtspr SPRN_SRR0,r5 331 229 rfid 230 + 231 + fastsleep_workaround_at_exit: 232 + li r3,1 233 + li r4,0 234 + li r0,OPAL_CONFIG_CPU_IDLE_STATE 235 + bl opal_call_realmode 236 + b timebase_resync 332 237 333 238 /* 334 239 * R3 here contains the value that will be returned to the caller
+37
arch/powerpc/platforms/powernv/opal-wrappers.S
··· 158 158 blr 159 159 #endif 160 160 161 + /* 162 + * Make opal call in realmode. This is a generic function to be called 163 + * from realmode. It handles endianness. 164 + * 165 + * r13 - paca pointer 166 + * r1 - stack pointer 167 + * r0 - opal token 168 + */ 169 + _GLOBAL(opal_call_realmode) 170 + mflr r12 171 + std r12,PPC_LR_STKOFF(r1) 172 + ld r2,PACATOC(r13) 173 + /* Set opal return address */ 174 + LOAD_REG_ADDR(r12,return_from_opal_call) 175 + mtlr r12 176 + 177 + mfmsr r12 178 + #ifdef __LITTLE_ENDIAN__ 179 + /* Handle endian-ness */ 180 + li r11,MSR_LE 181 + andc r12,r12,r11 182 + #endif 183 + mtspr SPRN_HSRR1,r12 184 + LOAD_REG_ADDR(r11,opal) 185 + ld r12,8(r11) 186 + ld r2,0(r11) 187 + mtspr SPRN_HSRR0,r12 188 + hrfid 189 + 190 + return_from_opal_call: 191 + #ifdef __LITTLE_ENDIAN__ 192 + FIXUP_ENDIAN 193 + #endif 194 + ld r12,PPC_LR_STKOFF(r1) 195 + mtlr r12 196 + blr 197 + 161 198 OPAL_CALL(opal_invalid_call, OPAL_INVALID_CALL); 162 199 OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE); 163 200 OPAL_CALL(opal_console_read, OPAL_CONSOLE_READ);
+47 -2
arch/powerpc/platforms/powernv/setup.c
··· 36 36 #include <asm/opal.h> 37 37 #include <asm/kexec.h> 38 38 #include <asm/smp.h> 39 + #include <asm/cputhreads.h> 40 + #include <asm/cpuidle.h> 41 + #include <asm/code-patching.h> 39 42 40 43 #include "powernv.h" 41 44 ··· 293 290 294 291 static u32 supported_cpuidle_states; 295 292 293 + static void pnv_alloc_idle_core_states(void) 294 + { 295 + int i, j; 296 + int nr_cores = cpu_nr_cores(); 297 + u32 *core_idle_state; 298 + 299 + /* 300 + * core_idle_state - First 8 bits track the idle state of each thread 301 + * of the core. The 8th bit is the lock bit. Initially all thread bits 302 + * are set. They are cleared when the thread enters deep idle state 303 + * like sleep and winkle. Initially the lock bit is cleared. 304 + * The lock bit has 2 purposes 305 + * a. While the first thread is restoring core state, it prevents 306 + * other threads in the core from switching to process context. 307 + * b. While the last thread in the core is saving the core state, it 308 + * prevents a different thread from waking up. 
309 + */ 310 + for (i = 0; i < nr_cores; i++) { 311 + int first_cpu = i * threads_per_core; 312 + int node = cpu_to_node(first_cpu); 313 + 314 + core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node); 315 + *core_idle_state = PNV_CORE_IDLE_THREAD_BITS; 316 + 317 + for (j = 0; j < threads_per_core; j++) { 318 + int cpu = first_cpu + j; 319 + 320 + paca[cpu].core_idle_state_ptr = core_idle_state; 321 + paca[cpu].thread_idle_state = PNV_THREAD_RUNNING; 322 + paca[cpu].thread_mask = 1 << j; 323 + } 324 + } 325 + } 326 + 296 327 u32 pnv_get_supported_cpuidle_states(void) 297 328 { 298 329 return supported_cpuidle_states; 299 330 } 331 + EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states); 300 332 301 333 static int __init pnv_init_idle_states(void) 302 334 { ··· 368 330 flags = be32_to_cpu(idle_state_flags[i]); 369 331 supported_cpuidle_states |= flags; 370 332 } 371 - 333 + if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { 334 + patch_instruction( 335 + (unsigned int *)pnv_fastsleep_workaround_at_entry, 336 + PPC_INST_NOP); 337 + patch_instruction( 338 + (unsigned int *)pnv_fastsleep_workaround_at_exit, 339 + PPC_INST_NOP); 340 + } 341 + pnv_alloc_idle_core_states(); 372 342 return 0; 373 343 } 374 344 375 345 subsys_initcall(pnv_init_idle_states); 376 - 377 346 378 347 static int __init pnv_probe(void) 379 348 {
+2 -1
arch/powerpc/platforms/powernv/smp.c
··· 168 168 mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1); 169 169 while (!generic_check_cpu_restart(cpu)) { 170 170 ppc64_runlatch_off(); 171 - if (idle_states & OPAL_PM_SLEEP_ENABLED) 171 + if ((idle_states & OPAL_PM_SLEEP_ENABLED) || 172 + (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) 172 173 srr1 = power7_sleep(); 173 174 else 174 175 srr1 = power7_nap(1);
+2 -1
drivers/cpuidle/cpuidle-powernv.c
··· 208 208 nr_idle_states++; 209 209 } 210 210 211 - if (flags & OPAL_PM_SLEEP_ENABLED) { 211 + if (flags & OPAL_PM_SLEEP_ENABLED || 212 + flags & OPAL_PM_SLEEP_ENABLED_ER1) { 212 213 /* Add FASTSLEEP state */ 213 214 strcpy(powernv_states[nr_idle_states].name, "FastSleep"); 214 215 strcpy(powernv_states[nr_idle_states].desc, "FastSleep");