Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

KVM: PPC: Book3S HV: Streamline guest entry and exit

On entry to the guest, secondary threads now wait for the primary to
switch the MMU after loading up most of their state, rather than before.
This means that the secondary threads get into the guest sooner, in the
common case where the secondary threads get to kvmppc_hv_entry before
the primary thread.

On exit, the first thread out increments the exit count and interrupts
the other threads (to get them out of the guest) before saving most
of its state, rather than after. That means that the other threads
exit sooner and means that the first thread doesn't spend so much
time waiting for the other threads at the point where the MMU gets
switched back to the host.

This pulls out the code that increments the exit count and interrupts
other threads into a separate function, kvmhv_commence_exit().
This also makes sure that r12 and vcpu->arch.trap are set correctly
in some corner cases.

Statistics from /sys/kernel/debug/kvm/vm*/vcpu*/timings show the
improvement. Aggregating across vcpus for a guest with 32 vcpus,
8 threads/vcore, running on a POWER8, gives this before the change:

rm_entry: avg 4537.3ns (222 - 48444, 1068878 samples)
rm_exit: avg 4787.6ns (152 - 165490, 1010717 samples)
rm_intr: avg 1673.6ns (12 - 341304, 3818691 samples)

and this after the change:

rm_entry: avg 3427.7ns (232 - 68150, 1118921 samples)
rm_exit: avg 4716.0ns (12 - 150720, 1119477 samples)
rm_intr: avg 1614.8ns (12 - 522436, 3850432 samples)

showing a substantial reduction in the time spent per guest entry in
the real-mode guest entry code, and smaller reductions in the real
mode guest exit and interrupt handling times. (The test was to start
the guest and boot Fedora 20 big-endian to the login prompt.)

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Alexander Graf <agraf@suse.de>

Authored by Paul Mackerras; committed by Alexander Graf.
6af27c84 7d6c40da

+126 -86
arch/powerpc/kvm/book3s_hv_rmhandlers.S
··· 175 175 /* put the HDEC into the DEC, since HDEC interrupts don't wake us */ 176 176 mfspr r3, SPRN_HDEC 177 177 mtspr SPRN_DEC, r3 178 + /* 179 + * Make sure the primary has finished the MMU switch. 180 + * We should never get here on a secondary thread, but 181 + * check it for robustness' sake. 182 + */ 183 + ld r5, HSTATE_KVM_VCORE(r13) 184 + 65: lbz r0, VCORE_IN_GUEST(r5) 185 + cmpwi r0, 0 186 + beq 65b 187 + /* Set LPCR. */ 188 + ld r8,VCORE_LPCR(r5) 189 + mtspr SPRN_LPCR,r8 190 + isync 178 191 /* set our bit in napping_threads */ 179 192 ld r5, HSTATE_KVM_VCORE(r13) 180 193 lbz r7, HSTATE_PTID(r13) ··· 219 206 220 207 /* check the wake reason */ 221 208 bl kvmppc_check_wake_reason 222 - 209 + 223 210 /* see if any other thread is already exiting */ 224 211 lwz r0, VCORE_ENTRY_EXIT(r5) 225 212 cmpwi r0, 0x100 ··· 257 244 b kvmppc_got_guest 258 245 259 246 kvm_novcpu_exit: 260 - b hdec_soon 247 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 248 + ld r4, HSTATE_KVM_VCPU(r13) 249 + cmpdi r4, 0 250 + beq 13f 251 + addi r3, r4, VCPU_TB_RMEXIT 252 + bl kvmhv_accumulate_time 253 + #endif 254 + 13: bl kvmhv_commence_exit 255 + b kvmhv_switch_to_host 261 256 262 257 /* 263 258 * We come in here when wakened from nap mode. ··· 443 422 /* Primary thread switches to guest partition. */ 444 423 ld r9,VCORE_KVM(r5) /* pointer to struct kvm */ 445 424 cmpwi r6,0 446 - bne 20f 425 + bne 10f 447 426 ld r6,KVM_SDR1(r9) 448 427 lwz r7,KVM_LPID(r9) 449 428 li r0,LPID_RSVD /* switch to reserved LPID */ ··· 514 493 515 494 li r0,1 516 495 stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */ 517 - b 10f 518 - 519 - /* Secondary threads wait for primary to have done partition switch */ 520 - 20: lbz r0,VCORE_IN_GUEST(r5) 521 - cmpwi r0,0 522 - beq 20b 523 - 524 - /* Set LPCR. 
*/ 525 - 10: ld r8,VCORE_LPCR(r5) 526 - mtspr SPRN_LPCR,r8 527 - isync 528 - 529 - /* Check if HDEC expires soon */ 530 - mfspr r3,SPRN_HDEC 531 - cmpwi r3,512 /* 1 microsecond */ 532 - li r12,BOOK3S_INTERRUPT_HV_DECREMENTER 533 - blt hdec_soon 534 496 535 497 /* Do we have a guest vcpu to run? */ 536 - cmpdi r4, 0 498 + 10: cmpdi r4, 0 537 499 beq kvmppc_primary_no_guest 538 500 kvmppc_got_guest: 539 501 ··· 841 837 clrrdi r6,r6,1 842 838 mtspr SPRN_CTRLT,r6 843 839 4: 840 + /* Secondary threads wait for primary to have done partition switch */ 841 + ld r5, HSTATE_KVM_VCORE(r13) 842 + lbz r6, HSTATE_PTID(r13) 843 + cmpwi r6, 0 844 + beq 21f 845 + lbz r0, VCORE_IN_GUEST(r5) 846 + cmpwi r0, 0 847 + bne 21f 848 + HMT_LOW 849 + 20: lbz r0, VCORE_IN_GUEST(r5) 850 + cmpwi r0, 0 851 + beq 20b 852 + HMT_MEDIUM 853 + 21: 854 + /* Set LPCR. */ 855 + ld r8,VCORE_LPCR(r5) 856 + mtspr SPRN_LPCR,r8 857 + isync 858 + 859 + /* Check if HDEC expires soon */ 860 + mfspr r3, SPRN_HDEC 861 + cmpwi r3, 512 /* 1 microsecond */ 862 + blt hdec_soon 863 + 844 864 ld r6, VCPU_CTR(r4) 845 865 lwz r7, VCPU_XER(r4) 846 866 ··· 970 942 hrfid 971 943 b . 
972 944 973 - #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 974 945 secondary_too_late: 946 + li r12, 0 975 947 cmpdi r4, 0 976 948 beq 11f 949 + stw r12, VCPU_TRAP(r4) 950 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 977 951 addi r3, r4, VCPU_TB_RMEXIT 978 952 bl kvmhv_accumulate_time 953 + #endif 979 954 11: b kvmhv_switch_to_host 980 955 981 956 hdec_soon: 982 - ld r4, HSTATE_KVM_VCPU(r13) 983 - cmpdi r4, 0 984 - beq 12f 957 + li r12, BOOK3S_INTERRUPT_HV_DECREMENTER 958 + stw r12, VCPU_TRAP(r4) 959 + mr r9, r4 960 + #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 985 961 addi r3, r4, VCPU_TB_RMEXIT 986 962 bl kvmhv_accumulate_time 987 - 12: b kvmhv_do_exit 988 963 #endif 964 + b guest_exit_cont 989 965 990 966 /****************************************************************************** 991 967 * * ··· 1145 1113 stw r7, VCPU_DSISR(r9) 1146 1114 /* don't overwrite fault_dar/fault_dsisr if HDSI */ 1147 1115 cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE 1148 - beq 6f 1116 + beq mc_cont 1149 1117 std r6, VCPU_FAULT_DAR(r9) 1150 1118 stw r7, VCPU_FAULT_DSISR(r9) 1151 1119 ··· 1159 1127 bl kvmhv_accumulate_time 1160 1128 #endif 1161 1129 1130 + /* Increment exit count, poke other threads to exit */ 1131 + bl kvmhv_commence_exit 1132 + 1162 1133 /* Save guest CTRL register, set runlatch to 1 */ 1163 - 6: mfspr r6,SPRN_CTRLF 1134 + mfspr r6,SPRN_CTRLF 1164 1135 stw r6,VCPU_CTRL(r9) 1165 1136 andi. r0,r6,1 1166 1137 bne 4f ··· 1505 1470 slbia 1506 1471 ptesync 1507 1472 1508 - #ifndef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 1509 - hdec_soon: 1510 - #endif 1511 - kvmhv_do_exit: /* r12 = trap, r13 = paca */ 1512 1473 /* 1513 1474 * POWER7/POWER8 guest -> host partition switch code. 1514 1475 * We don't have to lock against tlbies but we do 1515 1476 * have to coordinate the hardware threads. 
1516 1477 */ 1517 - /* Set our bit in the threads-exiting-guest map in the 0xff00 1518 - bits of vcore->entry_exit_map */ 1519 - ld r5, HSTATE_KVM_VCORE(r13) 1520 - lbz r4, HSTATE_PTID(r13) 1521 - li r7, 0x100 1522 - sld r7, r7, r4 1523 - addi r6, r5, VCORE_ENTRY_EXIT 1524 - 41: lwarx r3, 0, r6 1525 - or r0, r3, r7 1526 - stwcx. r0, 0, r6 1527 - bne 41b 1528 - isync /* order stwcx. vs. reading napping_threads */ 1529 - 1530 - /* 1531 - * At this point we have an interrupt that we have to pass 1532 - * up to the kernel or qemu; we can't handle it in real mode. 1533 - * Thus we have to do a partition switch, so we have to 1534 - * collect the other threads, if we are the first thread 1535 - * to take an interrupt. To do this, we send a message or 1536 - * IPI to all the threads that have their bit set in the entry 1537 - * map in vcore->entry_exit_map (other than ourselves). 1538 - * However, we don't need to bother if this is an HDEC 1539 - * interrupt, since the other threads will already be on their 1540 - * way here in that case. 1541 - */ 1542 - cmpwi r3,0x100 /* Are we the first here? */ 1543 - bge 43f 1544 - cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER 1545 - beq 43f 1546 - 1547 - srwi r0,r7,8 1548 - andc. r3,r3,r0 /* no sense IPI'ing ourselves */ 1549 - beq 43f 1550 - /* Order entry/exit update vs. IPIs */ 1551 - sync 1552 - mulli r4,r4,PACA_SIZE /* get paca for thread 0 */ 1553 - subf r6,r4,r13 1554 - 42: andi. r0,r3,1 1555 - beq 44f 1556 - ld r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */ 1557 - li r0,IPI_PRIORITY 1558 - li r7,XICS_MFRR 1559 - stbcix r0,r7,r8 /* trigger the IPI */ 1560 - 44: srdi. 
r3,r3,1 1561 - addi r6,r6,PACA_SIZE 1562 - bne 42b 1563 - 1564 - #ifndef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING 1565 - secondary_too_late: 1566 - #endif 1567 1478 kvmhv_switch_to_host: 1568 1479 /* Secondary threads wait for primary to do partition switch */ 1569 - 43: ld r5,HSTATE_KVM_VCORE(r13) 1480 + ld r5,HSTATE_KVM_VCORE(r13) 1570 1481 ld r4,VCORE_KVM(r5) /* pointer to struct kvm */ 1571 1482 lbz r3,HSTATE_PTID(r13) 1572 1483 cmpwi r3,0 ··· 1611 1630 1612 1631 ld r0, 112+PPC_LR_STKOFF(r1) 1613 1632 addi r1, r1, 112 1633 + mtlr r0 1634 + blr 1635 + 1636 + kvmhv_commence_exit: /* r12 = trap, r13 = paca, doesn't trash r9 */ 1637 + mflr r0 1638 + std r0, PPC_LR_STKOFF(r1) 1639 + stdu r1, -PPC_MIN_STKFRM(r1) 1640 + 1641 + /* Set our bit in the threads-exiting-guest map in the 0xff00 1642 + bits of vcore->entry_exit_map */ 1643 + ld r5, HSTATE_KVM_VCORE(r13) 1644 + lbz r4, HSTATE_PTID(r13) 1645 + li r7, 0x100 1646 + sld r7, r7, r4 1647 + addi r6, r5, VCORE_ENTRY_EXIT 1648 + 41: lwarx r3, 0, r6 1649 + or r0, r3, r7 1650 + stwcx. r0, 0, r6 1651 + bne 41b 1652 + isync /* order stwcx. vs. reading napping_threads */ 1653 + 1654 + /* 1655 + * At this point we have an interrupt that we have to pass 1656 + * up to the kernel or qemu; we can't handle it in real mode. 1657 + * Thus we have to do a partition switch, so we have to 1658 + * collect the other threads, if we are the first thread 1659 + * to take an interrupt. To do this, we send a message or 1660 + * IPI to all the threads that have their bit set in the entry 1661 + * map in vcore->entry_exit_map (other than ourselves). 1662 + * However, we don't need to bother if this is an HDEC 1663 + * interrupt, since the other threads will already be on their 1664 + * way here in that case. 1665 + */ 1666 + cmpwi r3,0x100 /* Are we the first here? */ 1667 + bge 43f 1668 + cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER 1669 + beq 43f 1670 + 1671 + srwi r0,r7,8 1672 + andc. 
r3,r3,r0 /* no sense IPI'ing ourselves */ 1673 + beq 43f 1674 + /* Order entry/exit update vs. IPIs */ 1675 + sync 1676 + mulli r4,r4,PACA_SIZE /* get paca for thread 0 */ 1677 + subf r6,r4,r13 1678 + 42: andi. r0,r3,1 1679 + beq 44f 1680 + ld r8,HSTATE_XICS_PHYS(r6) /* get thread's XICS reg addr */ 1681 + li r0,IPI_PRIORITY 1682 + li r7,XICS_MFRR 1683 + stbcix r0,r7,r8 /* trigger the IPI */ 1684 + 44: srdi. r3,r3,1 1685 + addi r6,r6,PACA_SIZE 1686 + bne 42b 1687 + 1688 + 43: ld r0, PPC_MIN_STKFRM+PPC_LR_STKOFF(r1) 1689 + addi r1, r1, PPC_MIN_STKFRM 1614 1690 mtlr r0 1615 1691 blr 1616 1692 ··· 2106 2068 lbz r5,VCPU_PRODDED(r3) 2107 2069 cmpwi r5,0 2108 2070 bne kvm_cede_prodded 2109 - li r0,0 /* set trap to 0 to say hcall is handled */ 2110 - stw r0,VCPU_TRAP(r3) 2071 + li r12,0 /* set trap to 0 to say hcall is handled */ 2072 + stw r12,VCPU_TRAP(r3) 2111 2073 li r0,H_SUCCESS 2112 2074 std r0,VCPU_GPR(R3)(r3) 2113 2075 ··· 2313 2275 2314 2276 /* we've ceded but we want to give control to the host */ 2315 2277 kvm_cede_exit: 2316 - b hcall_real_fallback 2278 + ld r9, HSTATE_KVM_VCPU(r13) 2279 + b guest_exit_cont 2317 2280 2318 2281 /* Try to handle a machine check in real mode */ 2319 2282 machine_check_realmode: ··· 2444 2405 bne- 43f 2445 2406 2446 2407 /* OK, it's an IPI for us */ 2408 + li r12, 0 2447 2409 li r3, -1 2448 2410 1: blr 2449 2411