Merge tag 'kvm-3.8-1' of git://git.kernel.org/pub/scm/virt/kvm/kvm

+135 -5

Documentation/virtual/kvm/api.txt

··· 1194 1194 This ioctl fetches PV specific information that need to be passed to the guest 1195 1195 using the device tree or other means from vm context. 1196 1196 1197 - For now the only implemented piece of information distributed here is an array 1198 - of 4 instructions that make up a hypercall. 1197 + The hcall array defines 4 instructions that make up a hypercall. 1199 1198 1200 1199 If any additional field gets added to this structure later on, a bit for that 1201 1200 additional piece of information will be set in the flags bitmap. 1202 1201 1202 + The flags bitmap is defined as: 1203 + 1204 + /* the host supports the ePAPR idle hcall */ 1205 + #define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0) 1203 1206 1204 1207 4.48 KVM_ASSIGN_PCI_DEVICE 1205 1208 ··· 1734 1731 Arch | Register | Width (bits) 1735 1732 | | 1736 1733 PPC | KVM_REG_PPC_HIOR | 64 1737 - 1734 + PPC | KVM_REG_PPC_IAC1 | 64 1735 + PPC | KVM_REG_PPC_IAC2 | 64 1736 + PPC | KVM_REG_PPC_IAC3 | 64 1737 + PPC | KVM_REG_PPC_IAC4 | 64 1738 + PPC | KVM_REG_PPC_DAC1 | 64 1739 + PPC | KVM_REG_PPC_DAC2 | 64 1740 + PPC | KVM_REG_PPC_DABR | 64 1741 + PPC | KVM_REG_PPC_DSCR | 64 1742 + PPC | KVM_REG_PPC_PURR | 64 1743 + PPC | KVM_REG_PPC_SPURR | 64 1744 + PPC | KVM_REG_PPC_DAR | 64 1745 + PPC | KVM_REG_PPC_DSISR | 32 1746 + PPC | KVM_REG_PPC_AMR | 64 1747 + PPC | KVM_REG_PPC_UAMOR | 64 1748 + PPC | KVM_REG_PPC_MMCR0 | 64 1749 + PPC | KVM_REG_PPC_MMCR1 | 64 1750 + PPC | KVM_REG_PPC_MMCRA | 64 1751 + PPC | KVM_REG_PPC_PMC1 | 32 1752 + PPC | KVM_REG_PPC_PMC2 | 32 1753 + PPC | KVM_REG_PPC_PMC3 | 32 1754 + PPC | KVM_REG_PPC_PMC4 | 32 1755 + PPC | KVM_REG_PPC_PMC5 | 32 1756 + PPC | KVM_REG_PPC_PMC6 | 32 1757 + PPC | KVM_REG_PPC_PMC7 | 32 1758 + PPC | KVM_REG_PPC_PMC8 | 32 1759 + PPC | KVM_REG_PPC_FPR0 | 64 1760 + ... 1761 + PPC | KVM_REG_PPC_FPR31 | 64 1762 + PPC | KVM_REG_PPC_VR0 | 128 1763 + ... 1764 + PPC | KVM_REG_PPC_VR31 | 128 1765 + PPC | KVM_REG_PPC_VSR0 | 128 1766 + ... 
1767 + PPC | KVM_REG_PPC_VSR31 | 128 1768 + PPC | KVM_REG_PPC_FPSCR | 64 1769 + PPC | KVM_REG_PPC_VSCR | 32 1770 + PPC | KVM_REG_PPC_VPA_ADDR | 64 1771 + PPC | KVM_REG_PPC_VPA_SLB | 128 1772 + PPC | KVM_REG_PPC_VPA_DTL | 128 1773 + PPC | KVM_REG_PPC_EPCR | 32 1738 1774 1739 1775 4.69 KVM_GET_ONE_REG 1740 1776 ··· 1789 1747 at the memory location pointed to by "addr". 1790 1748 1791 1749 The list of registers accessible using this interface is identical to the 1792 - list in 4.64. 1750 + list in 4.68. 1793 1751 1794 1752 1795 1753 4.70 KVM_KVMCLOCK_CTRL ··· 2039 1997 the virtualized real-mode area (VRMA) facility, the kernel will 2040 1998 re-create the VMRA HPTEs on the next KVM_RUN of any vcpu.) 2041 1999 2000 + 4.77 KVM_S390_INTERRUPT 2001 + 2002 + Capability: basic 2003 + Architectures: s390 2004 + Type: vm ioctl, vcpu ioctl 2005 + Parameters: struct kvm_s390_interrupt (in) 2006 + Returns: 0 on success, -1 on error 2007 + 2008 + Allows injecting an interrupt into the guest. Interrupts can be floating 2009 + (vm ioctl) or per cpu (vcpu ioctl), depending on the interrupt type. 2010 + 2011 + Interrupt parameters are passed via kvm_s390_interrupt: 2012 + 2013 + struct kvm_s390_interrupt { 2014 + __u32 type; 2015 + __u32 parm; 2016 + __u64 parm64; 2017 + }; 2018 + 2019 + type can be one of the following: 2020 + 2021 + KVM_S390_SIGP_STOP (vcpu) - sigp stop 2022 + KVM_S390_PROGRAM_INT (vcpu) - program check; code in parm 2023 + KVM_S390_SIGP_SET_PREFIX (vcpu) - sigp set prefix; prefix address in parm 2024 + KVM_S390_RESTART (vcpu) - restart 2025 + KVM_S390_INT_VIRTIO (vm) - virtio external interrupt; external interrupt 2026 + parameters in parm and parm64 2027 + KVM_S390_INT_SERVICE (vm) - sclp external interrupt; sclp parameter in parm 2028 + KVM_S390_INT_EMERGENCY (vcpu) - sigp emergency; source cpu in parm 2029 + KVM_S390_INT_EXTERNAL_CALL (vcpu) - sigp external call; source cpu in parm 2030 + 2031 + Note that the vcpu ioctl is asynchronous to vcpu execution. 
2032 + 2033 + 4.78 KVM_PPC_GET_HTAB_FD 2034 + 2035 + Capability: KVM_CAP_PPC_HTAB_FD 2036 + Architectures: powerpc 2037 + Type: vm ioctl 2038 + Parameters: Pointer to struct kvm_get_htab_fd (in) 2039 + Returns: file descriptor number (>= 0) on success, -1 on error 2040 + 2041 + This returns a file descriptor that can be used either to read out the 2042 + entries in the guest's hashed page table (HPT), or to write entries to 2043 + initialize the HPT. The returned fd can only be written to if the 2044 + KVM_GET_HTAB_WRITE bit is set in the flags field of the argument, and 2045 + can only be read if that bit is clear. The argument struct looks like 2046 + this: 2047 + 2048 + /* For KVM_PPC_GET_HTAB_FD */ 2049 + struct kvm_get_htab_fd { 2050 + __u64 flags; 2051 + __u64 start_index; 2052 + __u64 reserved[2]; 2053 + }; 2054 + 2055 + /* Values for kvm_get_htab_fd.flags */ 2056 + #define KVM_GET_HTAB_BOLTED_ONLY ((__u64)0x1) 2057 + #define KVM_GET_HTAB_WRITE ((__u64)0x2) 2058 + 2059 + The `start_index' field gives the index in the HPT of the entry at 2060 + which to start reading. It is ignored when writing. 2061 + 2062 + Reads on the fd will initially supply information about all 2063 + "interesting" HPT entries. Interesting entries are those with the 2064 + bolted bit set, if the KVM_GET_HTAB_BOLTED_ONLY bit is set, otherwise 2065 + all entries. When the end of the HPT is reached, the read() will 2066 + return. If read() is called again on the fd, it will start again from 2067 + the beginning of the HPT, but will only return HPT entries that have 2068 + changed since they were last read. 2069 + 2070 + Data read or written is structured as a header (8 bytes) followed by a 2071 + series of valid HPT entries (16 bytes) each. The header indicates how 2072 + many valid HPT entries there are and how many invalid entries follow 2073 + the valid entries. The invalid entries are not represented explicitly 2074 + in the stream. 
The header format is: 2075 + 2076 + struct kvm_get_htab_header { 2077 + __u32 index; 2078 + __u16 n_valid; 2079 + __u16 n_invalid; 2080 + }; 2081 + 2082 + Writes to the fd create HPT entries starting at the index given in the 2083 + header; first `n_valid' valid entries with contents from the data 2084 + written, then `n_invalid' invalid entries, invalidating any previously 2085 + valid entries found. 2086 + 2042 2087 2043 2088 5. The kvm_run structure 2044 2089 ------------------------ ··· 2238 2109 by kvm. The 'data' member contains the written data if 'is_write' is 2239 2110 true, and should be filled by application code otherwise. 2240 2111 2241 - NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO and KVM_EXIT_OSI, the corresponding 2112 + NOTE: For KVM_EXIT_IO, KVM_EXIT_MMIO, KVM_EXIT_OSI, KVM_EXIT_DCR 2113 + and KVM_EXIT_PAPR the corresponding 2242 2114 operations are complete (and guest state is consistent) only after userspace 2243 2115 has re-entered the kernel with KVM_RUN. The kernel side will first finish 2244 2116 incomplete operations and then check for pending signals. Userspace

+3 -2

MAINTAINERS

··· 4314 4314 F: virt/kvm/ 4315 4315 4316 4316 KERNEL VIRTUAL MACHINE (KVM) FOR AMD-V 4317 - M: Joerg Roedel <joerg.roedel@amd.com> 4317 + M: Joerg Roedel <joro@8bytes.org> 4318 4318 L: kvm@vger.kernel.org 4319 4319 W: http://kvm.qumranet.com 4320 - S: Supported 4320 + S: Maintained 4321 4321 F: arch/x86/include/asm/svm.h 4322 4322 F: arch/x86/kvm/svm.c 4323 4323 ··· 4325 4325 M: Alexander Graf <agraf@suse.de> 4326 4326 L: kvm-ppc@vger.kernel.org 4327 4327 W: http://kvm.qumranet.com 4328 + T: git git://github.com/agraf/linux-2.6.git 4328 4329 S: Supported 4329 4330 F: arch/powerpc/include/asm/kvm* 4330 4331 F: arch/powerpc/kvm/

+5 -2

arch/ia64/kvm/kvm-ia64.c

··· 1330 1330 return 0; 1331 1331 } 1332 1332 1333 + int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) 1334 + { 1335 + return 0; 1336 + } 1337 + 1333 1338 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 1334 1339 { 1335 1340 return -EINVAL; ··· 1367 1362 struct kvm_memslots *slots; 1368 1363 struct kvm_memory_slot *memslot; 1369 1364 int j; 1370 - unsigned long base_gfn; 1371 1365 1372 1366 slots = kvm_memslots(kvm); 1373 1367 kvm_for_each_memslot(memslot, slots) { 1374 - base_gfn = memslot->base_gfn; 1375 1368 for (j = 0; j < memslot->npages; j++) { 1376 1369 if (memslot->rmap[j]) 1377 1370 put_page((struct page *)memslot->rmap[j]);

-1

arch/powerpc/include/asm/Kbuild

··· 1 1 2 - 3 2 generic-y += clkdev.h 4 3 generic-y += rwsem.h 5 4 generic-y += trace_clock.h

+16 -67

arch/powerpc/include/asm/epapr_hcalls.h

··· 50 50 #ifndef _EPAPR_HCALLS_H 51 51 #define _EPAPR_HCALLS_H 52 52 53 + #include <uapi/asm/epapr_hcalls.h> 54 + 55 + #ifndef __ASSEMBLY__ 53 56 #include <linux/types.h> 54 57 #include <linux/errno.h> 55 58 #include <asm/byteorder.h> 56 - 57 - #define EV_BYTE_CHANNEL_SEND 1 58 - #define EV_BYTE_CHANNEL_RECEIVE 2 59 - #define EV_BYTE_CHANNEL_POLL 3 60 - #define EV_INT_SET_CONFIG 4 61 - #define EV_INT_GET_CONFIG 5 62 - #define EV_INT_SET_MASK 6 63 - #define EV_INT_GET_MASK 7 64 - #define EV_INT_IACK 9 65 - #define EV_INT_EOI 10 66 - #define EV_INT_SEND_IPI 11 67 - #define EV_INT_SET_TASK_PRIORITY 12 68 - #define EV_INT_GET_TASK_PRIORITY 13 69 - #define EV_DOORBELL_SEND 14 70 - #define EV_MSGSND 15 71 - #define EV_IDLE 16 72 - 73 - /* vendor ID: epapr */ 74 - #define EV_LOCAL_VENDOR_ID 0 /* for private use */ 75 - #define EV_EPAPR_VENDOR_ID 1 76 - #define EV_FSL_VENDOR_ID 2 /* Freescale Semiconductor */ 77 - #define EV_IBM_VENDOR_ID 3 /* IBM */ 78 - #define EV_GHS_VENDOR_ID 4 /* Green Hills Software */ 79 - #define EV_ENEA_VENDOR_ID 5 /* Enea */ 80 - #define EV_WR_VENDOR_ID 6 /* Wind River Systems */ 81 - #define EV_AMCC_VENDOR_ID 7 /* Applied Micro Circuits */ 82 - #define EV_KVM_VENDOR_ID 42 /* KVM */ 83 - 84 - /* The max number of bytes that a byte channel can send or receive per call */ 85 - #define EV_BYTE_CHANNEL_MAX_BYTES 16 86 - 87 - 88 - #define _EV_HCALL_TOKEN(id, num) (((id) << 16) | (num)) 89 - #define EV_HCALL_TOKEN(hcall_num) _EV_HCALL_TOKEN(EV_EPAPR_VENDOR_ID, hcall_num) 90 - 91 - /* epapr error codes */ 92 - #define EV_EPERM 1 /* Operation not permitted */ 93 - #define EV_ENOENT 2 /* Entry Not Found */ 94 - #define EV_EIO 3 /* I/O error occured */ 95 - #define EV_EAGAIN 4 /* The operation had insufficient 96 - * resources to complete and should be 97 - * retried 98 - */ 99 - #define EV_ENOMEM 5 /* There was insufficient memory to 100 - * complete the operation */ 101 - #define EV_EFAULT 6 /* Bad guest address */ 102 - #define EV_ENODEV 7 /* No such 
device */ 103 - #define EV_EINVAL 8 /* An argument supplied to the hcall 104 - was out of range or invalid */ 105 - #define EV_INTERNAL 9 /* An internal error occured */ 106 - #define EV_CONFIG 10 /* A configuration error was detected */ 107 - #define EV_INVALID_STATE 11 /* The object is in an invalid state */ 108 - #define EV_UNIMPLEMENTED 12 /* Unimplemented hypercall */ 109 - #define EV_BUFFER_OVERFLOW 13 /* Caller-supplied buffer too small */ 110 59 111 60 /* 112 61 * Hypercall register clobber list ··· 142 193 r5 = priority; 143 194 r6 = destination; 144 195 145 - __asm__ __volatile__ ("sc 1" 196 + asm volatile("bl epapr_hypercall_start" 146 197 : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6) 147 198 : : EV_HCALL_CLOBBERS4 148 199 ); ··· 171 222 r11 = EV_HCALL_TOKEN(EV_INT_GET_CONFIG); 172 223 r3 = interrupt; 173 224 174 - __asm__ __volatile__ ("sc 1" 225 + asm volatile("bl epapr_hypercall_start" 175 226 : "+r" (r11), "+r" (r3), "=r" (r4), "=r" (r5), "=r" (r6) 176 227 : : EV_HCALL_CLOBBERS4 177 228 ); ··· 201 252 r3 = interrupt; 202 253 r4 = mask; 203 254 204 - __asm__ __volatile__ ("sc 1" 255 + asm volatile("bl epapr_hypercall_start" 205 256 : "+r" (r11), "+r" (r3), "+r" (r4) 206 257 : : EV_HCALL_CLOBBERS2 207 258 ); ··· 226 277 r11 = EV_HCALL_TOKEN(EV_INT_GET_MASK); 227 278 r3 = interrupt; 228 279 229 - __asm__ __volatile__ ("sc 1" 280 + asm volatile("bl epapr_hypercall_start" 230 281 : "+r" (r11), "+r" (r3), "=r" (r4) 231 282 : : EV_HCALL_CLOBBERS2 232 283 ); ··· 254 305 r11 = EV_HCALL_TOKEN(EV_INT_EOI); 255 306 r3 = interrupt; 256 307 257 - __asm__ __volatile__ ("sc 1" 308 + asm volatile("bl epapr_hypercall_start" 258 309 : "+r" (r11), "+r" (r3) 259 310 : : EV_HCALL_CLOBBERS1 260 311 ); ··· 293 344 r7 = be32_to_cpu(p[2]); 294 345 r8 = be32_to_cpu(p[3]); 295 346 296 - __asm__ __volatile__ ("sc 1" 347 + asm volatile("bl epapr_hypercall_start" 297 348 : "+r" (r11), "+r" (r3), 298 349 "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7), "+r" (r8) 299 350 : : 
EV_HCALL_CLOBBERS6 ··· 332 383 r3 = handle; 333 384 r4 = *count; 334 385 335 - __asm__ __volatile__ ("sc 1" 386 + asm volatile("bl epapr_hypercall_start" 336 387 : "+r" (r11), "+r" (r3), "+r" (r4), 337 388 "=r" (r5), "=r" (r6), "=r" (r7), "=r" (r8) 338 389 : : EV_HCALL_CLOBBERS6 ··· 370 421 r11 = EV_HCALL_TOKEN(EV_BYTE_CHANNEL_POLL); 371 422 r3 = handle; 372 423 373 - __asm__ __volatile__ ("sc 1" 424 + asm volatile("bl epapr_hypercall_start" 374 425 : "+r" (r11), "+r" (r3), "=r" (r4), "=r" (r5) 375 426 : : EV_HCALL_CLOBBERS3 376 427 ); ··· 403 454 r11 = EV_HCALL_TOKEN(EV_INT_IACK); 404 455 r3 = handle; 405 456 406 - __asm__ __volatile__ ("sc 1" 457 + asm volatile("bl epapr_hypercall_start" 407 458 : "+r" (r11), "+r" (r3), "=r" (r4) 408 459 : : EV_HCALL_CLOBBERS2 409 460 ); ··· 427 478 r11 = EV_HCALL_TOKEN(EV_DOORBELL_SEND); 428 479 r3 = handle; 429 480 430 - __asm__ __volatile__ ("sc 1" 481 + asm volatile("bl epapr_hypercall_start" 431 482 : "+r" (r11), "+r" (r3) 432 483 : : EV_HCALL_CLOBBERS1 433 484 ); ··· 447 498 448 499 r11 = EV_HCALL_TOKEN(EV_IDLE); 449 500 450 - __asm__ __volatile__ ("sc 1" 501 + asm volatile("bl epapr_hypercall_start" 451 502 : "+r" (r11), "=r" (r3) 452 503 : : EV_HCALL_CLOBBERS1 453 504 ); 454 505 455 506 return r3; 456 507 } 457 - 458 - #endif 508 + #endif /* !__ASSEMBLY__ */ 509 + #endif /* _EPAPR_HCALLS_H */

+18 -18

arch/powerpc/include/asm/fsl_hcalls.h

··· 96 96 r11 = FH_HCALL_TOKEN(FH_SEND_NMI); 97 97 r3 = vcpu_mask; 98 98 99 - __asm__ __volatile__ ("sc 1" 99 + asm volatile("bl epapr_hypercall_start" 100 100 : "+r" (r11), "+r" (r3) 101 101 : : EV_HCALL_CLOBBERS1 102 102 ); ··· 151 151 r9 = (uint32_t)propvalue_addr; 152 152 r10 = *propvalue_len; 153 153 154 - __asm__ __volatile__ ("sc 1" 154 + asm volatile("bl epapr_hypercall_start" 155 155 : "+r" (r11), 156 156 "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7), 157 157 "+r" (r8), "+r" (r9), "+r" (r10) ··· 205 205 r9 = (uint32_t)propvalue_addr; 206 206 r10 = propvalue_len; 207 207 208 - __asm__ __volatile__ ("sc 1" 208 + asm volatile("bl epapr_hypercall_start" 209 209 : "+r" (r11), 210 210 "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7), 211 211 "+r" (r8), "+r" (r9), "+r" (r10) ··· 229 229 r11 = FH_HCALL_TOKEN(FH_PARTITION_RESTART); 230 230 r3 = partition; 231 231 232 - __asm__ __volatile__ ("sc 1" 232 + asm volatile("bl epapr_hypercall_start" 233 233 : "+r" (r11), "+r" (r3) 234 234 : : EV_HCALL_CLOBBERS1 235 235 ); ··· 262 262 r11 = FH_HCALL_TOKEN(FH_PARTITION_GET_STATUS); 263 263 r3 = partition; 264 264 265 - __asm__ __volatile__ ("sc 1" 265 + asm volatile("bl epapr_hypercall_start" 266 266 : "+r" (r11), "+r" (r3), "=r" (r4) 267 267 : : EV_HCALL_CLOBBERS2 268 268 ); ··· 295 295 r4 = entry_point; 296 296 r5 = load; 297 297 298 - __asm__ __volatile__ ("sc 1" 298 + asm volatile("bl epapr_hypercall_start" 299 299 : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5) 300 300 : : EV_HCALL_CLOBBERS3 301 301 ); ··· 317 317 r11 = FH_HCALL_TOKEN(FH_PARTITION_STOP); 318 318 r3 = partition; 319 319 320 - __asm__ __volatile__ ("sc 1" 320 + asm volatile("bl epapr_hypercall_start" 321 321 : "+r" (r11), "+r" (r3) 322 322 : : EV_HCALL_CLOBBERS1 323 323 ); ··· 376 376 #endif 377 377 r7 = count; 378 378 379 - __asm__ __volatile__ ("sc 1" 379 + asm volatile("bl epapr_hypercall_start" 380 380 : "+r" (r11), 381 381 "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), "+r" (r7) 382 382 : : 
EV_HCALL_CLOBBERS5 ··· 399 399 r11 = FH_HCALL_TOKEN(FH_DMA_ENABLE); 400 400 r3 = liodn; 401 401 402 - __asm__ __volatile__ ("sc 1" 402 + asm volatile("bl epapr_hypercall_start" 403 403 : "+r" (r11), "+r" (r3) 404 404 : : EV_HCALL_CLOBBERS1 405 405 ); ··· 421 421 r11 = FH_HCALL_TOKEN(FH_DMA_DISABLE); 422 422 r3 = liodn; 423 423 424 - __asm__ __volatile__ ("sc 1" 424 + asm volatile("bl epapr_hypercall_start" 425 425 : "+r" (r11), "+r" (r3) 426 426 : : EV_HCALL_CLOBBERS1 427 427 ); ··· 447 447 r11 = FH_HCALL_TOKEN(FH_VMPIC_GET_MSIR); 448 448 r3 = interrupt; 449 449 450 - __asm__ __volatile__ ("sc 1" 450 + asm volatile("bl epapr_hypercall_start" 451 451 : "+r" (r11), "+r" (r3), "=r" (r4) 452 452 : : EV_HCALL_CLOBBERS2 453 453 ); ··· 469 469 470 470 r11 = FH_HCALL_TOKEN(FH_SYSTEM_RESET); 471 471 472 - __asm__ __volatile__ ("sc 1" 472 + asm volatile("bl epapr_hypercall_start" 473 473 : "+r" (r11), "=r" (r3) 474 474 : : EV_HCALL_CLOBBERS1 475 475 ); ··· 506 506 r6 = addr_lo; 507 507 r7 = peek; 508 508 509 - __asm__ __volatile__ ("sc 1" 509 + asm volatile("bl epapr_hypercall_start" 510 510 : "+r" (r11), "+r" (r3), "+r" (r4), "+r" (r5), "+r" (r6), 511 511 "+r" (r7) 512 512 : : EV_HCALL_CLOBBERS5 ··· 542 542 r3 = handle; 543 543 r4 = vcpu; 544 544 545 - __asm__ __volatile__ ("sc 1" 545 + asm volatile("bl epapr_hypercall_start" 546 546 : "+r" (r11), "+r" (r3), "+r" (r4) 547 547 : : EV_HCALL_CLOBBERS2 548 548 ); ··· 572 572 r3 = handle; 573 573 r4 = vcpu; 574 574 575 - __asm__ __volatile__ ("sc 1" 575 + asm volatile("bl epapr_hypercall_start" 576 576 : "+r" (r11), "+r" (r3), "+r" (r4) 577 577 : : EV_HCALL_CLOBBERS2 578 578 ); ··· 597 597 r3 = handle; 598 598 r4 = vcpu; 599 599 600 - __asm__ __volatile__ ("sc 1" 600 + asm volatile("bl epapr_hypercall_start" 601 601 : "+r" (r11), "+r" (r3), "+r" (r4) 602 602 : : EV_HCALL_CLOBBERS2 603 603 ); ··· 618 618 r11 = FH_HCALL_TOKEN(FH_CLAIM_DEVICE); 619 619 r3 = handle; 620 620 621 - __asm__ __volatile__ ("sc 1" 621 + asm volatile("bl 
epapr_hypercall_start" 622 622 : "+r" (r11), "+r" (r3) 623 623 : : EV_HCALL_CLOBBERS1 624 624 ); ··· 645 645 r11 = FH_HCALL_TOKEN(FH_PARTITION_STOP_DMA); 646 646 r3 = handle; 647 647 648 - __asm__ __volatile__ ("sc 1" 648 + asm volatile("bl epapr_hypercall_start" 649 649 : "+r" (r11), "+r" (r3) 650 650 : : EV_HCALL_CLOBBERS1 651 651 );

+1

arch/powerpc/include/asm/kvm_asm.h

··· 118 118 119 119 #define RESUME_FLAG_NV (1<<0) /* Reload guest nonvolatile state? */ 120 120 #define RESUME_FLAG_HOST (1<<1) /* Resume host? */ 121 + #define RESUME_FLAG_ARCH1 (1<<2) 121 122 122 123 #define RESUME_GUEST 0 123 124 #define RESUME_GUEST_NV RESUME_FLAG_NV

+9 -3

arch/powerpc/include/asm/kvm_book3s.h

··· 81 81 u64 sdr1; 82 82 u64 hior; 83 83 u64 msr_mask; 84 + u64 purr_offset; 85 + u64 spurr_offset; 84 86 #ifdef CONFIG_PPC_BOOK3S_32 85 87 u32 vsid_pool[VSID_POOL_SIZE]; 86 88 u32 vsid_next; ··· 159 157 extern void kvmppc_unpin_guest_page(struct kvm *kvm, void *addr); 160 158 extern long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, 161 159 long pte_index, unsigned long pteh, unsigned long ptel); 162 - extern long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, 163 - long pte_index, unsigned long pteh, unsigned long ptel); 160 + extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, 161 + long pte_index, unsigned long pteh, unsigned long ptel, 162 + pgd_t *pgdir, bool realmode, unsigned long *idx_ret); 163 + extern long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, 164 + unsigned long pte_index, unsigned long avpn, 165 + unsigned long *hpret); 164 166 extern long kvmppc_hv_get_dirty_log(struct kvm *kvm, 165 - struct kvm_memory_slot *memslot); 167 + struct kvm_memory_slot *memslot, unsigned long *map); 166 168 167 169 extern void kvmppc_entry_trampoline(void); 168 170 extern void kvmppc_hv_entry_trampoline(void);

+32 -1

arch/powerpc/include/asm/kvm_book3s_64.h

··· 50 50 #define HPTE_V_HVLOCK 0x40UL 51 51 #define HPTE_V_ABSENT 0x20UL 52 52 53 + /* 54 + * We use this bit in the guest_rpte field of the revmap entry 55 + * to indicate a modified HPTE. 56 + */ 57 + #define HPTE_GR_MODIFIED (1ul << 62) 58 + 59 + /* These bits are reserved in the guest view of the HPTE */ 60 + #define HPTE_GR_RESERVED HPTE_GR_MODIFIED 61 + 53 62 static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits) 54 63 { 55 64 unsigned long tmp, old; ··· 69 60 " ori %0,%0,%4\n" 70 61 " stdcx. %0,0,%2\n" 71 62 " beq+ 2f\n" 72 - " li %1,%3\n" 63 + " mr %1,%3\n" 73 64 "2: isync" 74 65 : "=&r" (tmp), "=&r" (old) 75 66 : "r" (hpte), "r" (bits), "i" (HPTE_V_HVLOCK) ··· 244 235 if (pagesize <= PAGE_SIZE) 245 236 return 1; 246 237 return !(memslot->base_gfn & mask) && !(memslot->npages & mask); 238 + } 239 + 240 + /* 241 + * This works for 4k, 64k and 16M pages on POWER7, 242 + * and 4k and 16M pages on PPC970. 243 + */ 244 + static inline unsigned long slb_pgsize_encoding(unsigned long psize) 245 + { 246 + unsigned long senc = 0; 247 + 248 + if (psize > 0x1000) { 249 + senc = SLB_VSID_L; 250 + if (psize == 0x10000) 251 + senc |= SLB_VSID_LP_01; 252 + } 253 + return senc; 254 + } 255 + 256 + static inline int is_vrma_hpte(unsigned long hpte_v) 257 + { 258 + return (hpte_v & ~0xffffffUL) == 259 + (HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16))); 247 260 } 248 261 249 262 #endif /* __ASM_KVM_BOOK3S_64_H__ */

+27 -2

arch/powerpc/include/asm/kvm_booke_hv_asm.h

··· 17 17 * there are no exceptions for which we fall through directly to 18 18 * the normal host handler. 19 19 * 20 + * 32-bit host 20 21 * Expected inputs (normal exceptions): 21 22 * SCRATCH0 = saved r10 22 23 * r10 = thread struct ··· 34 33 * *(r8 + GPR9) = saved r9 35 34 * *(r8 + GPR10) = saved r10 (r10 not yet clobbered) 36 35 * *(r8 + GPR11) = saved r11 36 + * 37 + * 64-bit host 38 + * Expected inputs (GEN/GDBELL/DBG/MC exception types): 39 + * r10 = saved CR 40 + * r13 = PACA_POINTER 41 + * *(r13 + PACA_EX##type + EX_R10) = saved r10 42 + * *(r13 + PACA_EX##type + EX_R11) = saved r11 43 + * SPRN_SPRG_##type##_SCRATCH = saved r13 44 + * 45 + * Expected inputs (CRIT exception type): 46 + * r10 = saved CR 47 + * r13 = PACA_POINTER 48 + * *(r13 + PACA_EX##type + EX_R10) = saved r10 49 + * *(r13 + PACA_EX##type + EX_R11) = saved r11 50 + * *(r13 + PACA_EX##type + EX_R13) = saved r13 51 + * 52 + * Expected inputs (TLB exception type): 53 + * r10 = saved CR 54 + * r13 = PACA_POINTER 55 + * *(r13 + PACA_EX##type + EX_TLB_R10) = saved r10 56 + * *(r13 + PACA_EX##type + EX_TLB_R11) = saved r11 57 + * SPRN_SPRG_GEN_SCRATCH = saved r13 58 + * 59 + * Only the bolted version of TLB miss exception handlers is supported now. 37 60 */ 38 61 .macro DO_KVM intno srr1 39 62 #ifdef CONFIG_KVM_BOOKE_HV 40 63 BEGIN_FTR_SECTION 41 64 mtocrf 0x80, r11 /* check MSR[GS] without clobbering reg */ 42 - bf 3, kvmppc_resume_\intno\()_\srr1 65 + bf 3, 1975f 43 66 b kvmppc_handler_\intno\()_\srr1 44 - kvmppc_resume_\intno\()_\srr1: 67 + 1975: 45 68 END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) 46 69 #endif 47 70 .endm

+52 -16

arch/powerpc/include/asm/kvm_host.h

··· 46 46 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 47 47 #endif 48 48 49 - #ifdef CONFIG_KVM_BOOK3S_64_HV 49 + #if !defined(CONFIG_KVM_440) 50 50 #include <linux/mmu_notifier.h> 51 51 52 52 #define KVM_ARCH_WANT_MMU_NOTIFIER ··· 204 204 }; 205 205 206 206 /* 207 - * We use the top bit of each memslot->rmap entry as a lock bit, 207 + * We use the top bit of each memslot->arch.rmap entry as a lock bit, 208 208 * and bit 32 as a present flag. The bottom 32 bits are the 209 209 * index in the guest HPT of a HPTE that points to the page. 210 210 */ ··· 215 215 #define KVMPPC_RMAP_PRESENT 0x100000000ul 216 216 #define KVMPPC_RMAP_INDEX 0xfffffffful 217 217 218 - /* Low-order bits in kvm->arch.slot_phys[][] */ 218 + /* Low-order bits in memslot->arch.slot_phys[] */ 219 219 #define KVMPPC_PAGE_ORDER_MASK 0x1f 220 220 #define KVMPPC_PAGE_NO_CACHE HPTE_R_I /* 0x20 */ 221 221 #define KVMPPC_PAGE_WRITETHRU HPTE_R_W /* 0x40 */ 222 222 #define KVMPPC_GOT_PAGE 0x80 223 223 224 224 struct kvm_arch_memory_slot { 225 + #ifdef CONFIG_KVM_BOOK3S_64_HV 225 226 unsigned long *rmap; 227 + unsigned long *slot_phys; 228 + #endif /* CONFIG_KVM_BOOK3S_64_HV */ 226 229 }; 227 230 228 231 struct kvm_arch { ··· 246 243 int using_mmu_notifiers; 247 244 u32 hpt_order; 248 245 atomic_t vcpus_running; 246 + u32 online_vcores; 249 247 unsigned long hpt_npte; 250 248 unsigned long hpt_mask; 249 + atomic_t hpte_mod_interest; 251 250 spinlock_t slot_phys_lock; 252 - unsigned long *slot_phys[KVM_MEM_SLOTS_NUM]; 253 - int slot_npages[KVM_MEM_SLOTS_NUM]; 254 - unsigned short last_vcpu[NR_CPUS]; 251 + cpumask_t need_tlb_flush; 255 252 struct kvmppc_vcore *vcores[KVM_MAX_VCORES]; 256 253 struct kvmppc_linear_info *hpt_li; 257 254 #endif /* CONFIG_KVM_BOOK3S_64_HV */ ··· 276 273 int nap_count; 277 274 int napping_threads; 278 275 u16 pcpu; 276 + u16 last_cpu; 279 277 u8 vcore_state; 280 278 u8 in_guest; 281 279 struct list_head runnable_threads; ··· 292 288 293 289 /* Values for vcore_state */ 294 290 
#define VCORE_INACTIVE 0 295 - #define VCORE_RUNNING 1 296 - #define VCORE_EXITING 2 297 - #define VCORE_SLEEPING 3 291 + #define VCORE_SLEEPING 1 292 + #define VCORE_STARTING 2 293 + #define VCORE_RUNNING 3 294 + #define VCORE_EXITING 4 298 295 299 296 /* 300 297 * Struct used to manage memory for a virtual processor area ··· 351 346 bool class : 1; 352 347 }; 353 348 349 + # ifdef CONFIG_PPC_FSL_BOOK3E 350 + #define KVMPPC_BOOKE_IAC_NUM 2 351 + #define KVMPPC_BOOKE_DAC_NUM 2 352 + # else 353 + #define KVMPPC_BOOKE_IAC_NUM 4 354 + #define KVMPPC_BOOKE_DAC_NUM 2 355 + # endif 356 + #define KVMPPC_BOOKE_MAX_IAC 4 357 + #define KVMPPC_BOOKE_MAX_DAC 2 358 + 359 + struct kvmppc_booke_debug_reg { 360 + u32 dbcr0; 361 + u32 dbcr1; 362 + u32 dbcr2; 363 + #ifdef CONFIG_KVM_E500MC 364 + u32 dbcr4; 365 + #endif 366 + u64 iac[KVMPPC_BOOKE_MAX_IAC]; 367 + u64 dac[KVMPPC_BOOKE_MAX_DAC]; 368 + }; 369 + 354 370 struct kvm_vcpu_arch { 355 371 ulong host_stack; 356 372 u32 host_pid; ··· 406 380 u32 host_mas4; 407 381 u32 host_mas6; 408 382 u32 shadow_epcr; 409 - u32 epcr; 410 383 u32 shadow_msrp; 411 384 u32 eplc; 412 385 u32 epsc; 413 386 u32 oldpir; 387 + #endif 388 + 389 + #if defined(CONFIG_BOOKE) 390 + #if defined(CONFIG_KVM_BOOKE_HV) || defined(CONFIG_64BIT) 391 + u32 epcr; 392 + #endif 414 393 #endif 415 394 416 395 #ifdef CONFIG_PPC_BOOK3S ··· 471 440 472 441 u32 ccr0; 473 442 u32 ccr1; 474 - u32 dbcr0; 475 - u32 dbcr1; 476 443 u32 dbsr; 477 444 478 445 u64 mmcr[3]; ··· 500 471 ulong fault_esr; 501 472 ulong queued_dear; 502 473 ulong queued_esr; 474 + spinlock_t wdt_lock; 475 + struct timer_list wdt_timer; 503 476 u32 tlbcfg[4]; 504 477 u32 mmucfg; 505 478 u32 epr; 479 + struct kvmppc_booke_debug_reg dbg_reg; 506 480 #endif 507 481 gpa_t paddr_accessed; 508 482 gva_t vaddr_accessed; ··· 518 486 u8 osi_needed; 519 487 u8 osi_enabled; 520 488 u8 papr_enabled; 489 + u8 watchdog_enabled; 521 490 u8 sane; 522 491 u8 cpu_type; 523 492 u8 hcall_needed; ··· 530 497 u64 
dec_jiffies; 531 498 u64 dec_expires; 532 499 unsigned long pending_exceptions; 533 - u16 last_cpu; 534 500 u8 ceded; 535 501 u8 prodded; 536 502 u32 last_inst; ··· 566 534 unsigned long dtl_index; 567 535 u64 stolen_logged; 568 536 struct kvmppc_vpa slb_shadow; 537 + 538 + spinlock_t tbacct_lock; 539 + u64 busy_stolen; 540 + u64 busy_preempt; 569 541 #endif 570 542 }; 571 543 572 544 /* Values for vcpu->arch.state */ 573 - #define KVMPPC_VCPU_STOPPED 0 574 - #define KVMPPC_VCPU_BUSY_IN_HOST 1 575 - #define KVMPPC_VCPU_RUNNABLE 2 545 + #define KVMPPC_VCPU_NOTREADY 0 546 + #define KVMPPC_VCPU_RUNNABLE 1 547 + #define KVMPPC_VCPU_BUSY_IN_HOST 2 576 548 577 549 /* Values for vcpu->arch.io_gpr */ 578 550 #define KVM_MMIO_REG_MASK 0x001f

+7 -8

arch/powerpc/include/asm/kvm_para.h

··· 21 21 22 22 #include <uapi/asm/kvm_para.h> 23 23 24 - 25 24 #ifdef CONFIG_KVM_GUEST 26 25 27 26 #include <linux/of.h> ··· 54 55 unsigned long *out, 55 56 unsigned long nr) 56 57 { 57 - return HC_EV_UNIMPLEMENTED; 58 + return EV_UNIMPLEMENTED; 58 59 } 59 60 60 61 #endif ··· 65 66 unsigned long out[8]; 66 67 unsigned long r; 67 68 68 - r = kvm_hypercall(in, out, nr | HC_VENDOR_KVM); 69 + r = kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr)); 69 70 *r2 = out[0]; 70 71 71 72 return r; ··· 76 77 unsigned long in[8]; 77 78 unsigned long out[8]; 78 79 79 - return kvm_hypercall(in, out, nr | HC_VENDOR_KVM); 80 + return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr)); 80 81 } 81 82 82 83 static inline long kvm_hypercall1(unsigned int nr, unsigned long p1) ··· 85 86 unsigned long out[8]; 86 87 87 88 in[0] = p1; 88 - return kvm_hypercall(in, out, nr | HC_VENDOR_KVM); 89 + return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr)); 89 90 } 90 91 91 92 static inline long kvm_hypercall2(unsigned int nr, unsigned long p1, ··· 96 97 97 98 in[0] = p1; 98 99 in[1] = p2; 99 - return kvm_hypercall(in, out, nr | HC_VENDOR_KVM); 100 + return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr)); 100 101 } 101 102 102 103 static inline long kvm_hypercall3(unsigned int nr, unsigned long p1, ··· 108 109 in[0] = p1; 109 110 in[1] = p2; 110 111 in[2] = p3; 111 - return kvm_hypercall(in, out, nr | HC_VENDOR_KVM); 112 + return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr)); 112 113 } 113 114 114 115 static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, ··· 122 123 in[1] = p2; 123 124 in[2] = p3; 124 125 in[3] = p4; 125 - return kvm_hypercall(in, out, nr | HC_VENDOR_KVM); 126 + return kvm_hypercall(in, out, KVM_HCALL_TOKEN(nr)); 126 127 } 127 128 128 129

+86 -1

arch/powerpc/include/asm/kvm_ppc.h

··· 28 28 #include <linux/types.h> 29 29 #include <linux/kvm_types.h> 30 30 #include <linux/kvm_host.h> 31 + #include <linux/bug.h> 31 32 #ifdef CONFIG_PPC_BOOK3S 32 33 #include <asm/kvm_book3s.h> 33 34 #else ··· 69 68 extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); 70 69 extern void kvmppc_decrementer_func(unsigned long data); 71 70 extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu); 71 + extern int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu); 72 + extern void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu); 72 73 73 74 /* Core-specific hooks */ 74 75 ··· 107 104 struct kvm_interrupt *irq); 108 105 extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, 109 106 struct kvm_interrupt *irq); 107 + extern void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu); 110 108 111 109 extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, 112 110 unsigned int op, int *advance); ··· 115 111 ulong val); 116 112 extern int kvmppc_core_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, 117 113 ulong *val); 114 + extern int kvmppc_core_check_requests(struct kvm_vcpu *vcpu); 118 115 119 116 extern int kvmppc_booke_init(void); 120 117 extern void kvmppc_booke_exit(void); ··· 144 139 extern void kvm_release_hpt(struct kvmppc_linear_info *li); 145 140 extern int kvmppc_core_init_vm(struct kvm *kvm); 146 141 extern void kvmppc_core_destroy_vm(struct kvm *kvm); 142 + extern void kvmppc_core_free_memslot(struct kvm_memory_slot *free, 143 + struct kvm_memory_slot *dont); 144 + extern int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, 145 + unsigned long npages); 147 146 extern int kvmppc_core_prepare_memory_region(struct kvm *kvm, 147 + struct kvm_memory_slot *memslot, 148 148 struct kvm_userspace_memory_region *mem); 149 149 extern void kvmppc_core_commit_memory_region(struct kvm *kvm, 150 - struct kvm_userspace_memory_region *mem); 150 + struct kvm_userspace_memory_region *mem, 151 + struct kvm_memory_slot old); 151 152 extern int 
kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, 152 153 struct kvm_ppc_smmu_info *info); 154 + extern void kvmppc_core_flush_memslot(struct kvm *kvm, 155 + struct kvm_memory_slot *memslot); 153 156 154 157 extern int kvmppc_bookehv_init(void); 155 158 extern void kvmppc_bookehv_exit(void); 159 + 160 + extern int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu); 161 + 162 + extern int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *); 156 163 157 164 /* 158 165 * Cuts out inst bits with ordering according to spec. ··· 199 182 return r; 200 183 } 201 184 185 + union kvmppc_one_reg { 186 + u32 wval; 187 + u64 dval; 188 + vector128 vval; 189 + u64 vsxval[2]; 190 + struct { 191 + u64 addr; 192 + u64 length; 193 + } vpaval; 194 + }; 195 + 196 + #define one_reg_size(id) \ 197 + (1ul << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT)) 198 + 199 + #define get_reg_val(id, reg) ({ \ 200 + union kvmppc_one_reg __u; \ 201 + switch (one_reg_size(id)) { \ 202 + case 4: __u.wval = (reg); break; \ 203 + case 8: __u.dval = (reg); break; \ 204 + default: BUG(); \ 205 + } \ 206 + __u; \ 207 + }) 208 + 209 + 210 + #define set_reg_val(id, val) ({ \ 211 + u64 __v; \ 212 + switch (one_reg_size(id)) { \ 213 + case 4: __v = (val).wval; break; \ 214 + case 8: __v = (val).dval; break; \ 215 + default: BUG(); \ 216 + } \ 217 + __v; \ 218 + }) 219 + 202 220 void kvmppc_core_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); 203 221 int kvmppc_core_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); 204 222 ··· 242 190 243 191 int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg); 244 192 int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg); 193 + int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *); 194 + int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *); 245 195 246 196 void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid); 247 197 ··· 284 230 } 285 231 } 286 232 
233 + /* Please call after prepare_to_enter. This function puts the lazy ee state 234 + back to normal mode, without actually enabling interrupts. */ 235 + static inline void kvmppc_lazy_ee_enable(void) 236 + { 237 + #ifdef CONFIG_PPC64 238 + /* Only need to enable IRQs by hard enabling them after this */ 239 + local_paca->irq_happened = 0; 240 + local_paca->soft_enabled = 1; 241 + #endif 242 + } 243 + 244 + static inline ulong kvmppc_get_ea_indexed(struct kvm_vcpu *vcpu, int ra, int rb) 245 + { 246 + ulong ea; 247 + ulong msr_64bit = 0; 248 + 249 + ea = kvmppc_get_gpr(vcpu, rb); 250 + if (ra) 251 + ea += kvmppc_get_gpr(vcpu, ra); 252 + 253 + #if defined(CONFIG_PPC_BOOK3E_64) 254 + msr_64bit = MSR_CM; 255 + #elif defined(CONFIG_PPC_BOOK3S_64) 256 + msr_64bit = MSR_SF; 257 + #endif 258 + 259 + if (!(vcpu->arch.shared->msr & msr_64bit)) 260 + ea = (uint32_t)ea; 261 + 262 + return ea; 263 + } 287 264 288 265 #endif /* __POWERPC_KVM_PPC_H__ */

+1 -1

arch/powerpc/include/asm/mmu-book3e.h

··· 59 59 #define MAS1_TSIZE_SHIFT 7 60 60 #define MAS1_TSIZE(x) (((x) << MAS1_TSIZE_SHIFT) & MAS1_TSIZE_MASK) 61 61 62 - #define MAS2_EPN 0xFFFFF000 62 + #define MAS2_EPN (~0xFFFUL) 63 63 #define MAS2_X0 0x00000040 64 64 #define MAS2_X1 0x00000020 65 65 #define MAS2_W 0x00000010

+10

arch/powerpc/include/asm/mmu-hash64.h

··· 121 121 #define PP_RXRX 3 /* Supervisor read, User read */ 122 122 #define PP_RXXX (HPTE_R_PP0 | 2) /* Supervisor read, user none */ 123 123 124 + /* Fields for tlbiel instruction in architecture 2.06 */ 125 + #define TLBIEL_INVAL_SEL_MASK 0xc00 /* invalidation selector */ 126 + #define TLBIEL_INVAL_PAGE 0x000 /* invalidate a single page */ 127 + #define TLBIEL_INVAL_SET_LPID 0x800 /* invalidate a set for current LPID */ 128 + #define TLBIEL_INVAL_SET 0xc00 /* invalidate a set for all LPIDs */ 129 + #define TLBIEL_INVAL_SET_MASK 0xfff000 /* set number to inval. */ 130 + #define TLBIEL_INVAL_SET_SHIFT 12 131 + 132 + #define POWER7_TLB_SETS 128 /* # sets in POWER7 TLB */ 133 + 124 134 #ifndef __ASSEMBLY__ 125 135 126 136 struct hash_pte {

+1

arch/powerpc/include/asm/reg.h

··· 518 518 #define SRR1_WS_DEEPER 0x00020000 /* Some resources not maintained */ 519 519 #define SRR1_WS_DEEP 0x00010000 /* All resources maintained */ 520 520 #define SRR1_PROGFPE 0x00100000 /* Floating Point Enabled */ 521 + #define SRR1_PROGILL 0x00080000 /* Illegal instruction */ 521 522 #define SRR1_PROGPRIV 0x00040000 /* Privileged instruction */ 522 523 #define SRR1_PROGTRAP 0x00020000 /* Trap */ 523 524 #define SRR1_PROGADDR 0x00010000 /* SRR0 contains subsequent addr */

+7

arch/powerpc/include/asm/reg_booke.h

··· 539 539 #define TCR_FIE 0x00800000 /* FIT Interrupt Enable */ 540 540 #define TCR_ARE 0x00400000 /* Auto Reload Enable */ 541 541 542 + #ifdef CONFIG_E500 543 + #define TCR_GET_WP(tcr) ((((tcr) & 0xC0000000) >> 30) | \ 544 + (((tcr) & 0x1E0000) >> 15)) 545 + #else 546 + #define TCR_GET_WP(tcr) (((tcr) & 0xC0000000) >> 30) 547 + #endif 548 + 542 549 /* Bit definitions for the TSR. */ 543 550 #define TSR_ENW 0x80000000 /* Enable Next Watchdog */ 544 551 #define TSR_WIS 0x40000000 /* WDT Interrupt Status */

+8

arch/powerpc/include/asm/smp.h

··· 67 67 void generic_set_cpu_dead(unsigned int cpu); 68 68 void generic_set_cpu_up(unsigned int cpu); 69 69 int generic_check_cpu_restart(unsigned int cpu); 70 + 71 + extern void inhibit_secondary_onlining(void); 72 + extern void uninhibit_secondary_onlining(void); 73 + 74 + #else /* HOTPLUG_CPU */ 75 + static inline void inhibit_secondary_onlining(void) {} 76 + static inline void uninhibit_secondary_onlining(void) {} 77 + 70 78 #endif 71 79 72 80 #ifdef CONFIG_PPC64

+1

arch/powerpc/include/uapi/asm/Kbuild

··· 7 7 header-y += byteorder.h 8 8 header-y += cputable.h 9 9 header-y += elf.h 10 + header-y += epapr_hcalls.h 10 11 header-y += errno.h 11 12 header-y += fcntl.h 12 13 header-y += ioctl.h

+98

arch/powerpc/include/uapi/asm/epapr_hcalls.h

··· 1 + /* 2 + * ePAPR hcall interface 3 + * 4 + * Copyright 2008-2011 Freescale Semiconductor, Inc. 5 + * 6 + * Author: Timur Tabi <timur@freescale.com> 7 + * 8 + * This file is provided under a dual BSD/GPL license. When using or 9 + * redistributing this file, you may do so under either license. 10 + * 11 + * Redistribution and use in source and binary forms, with or without 12 + * modification, are permitted provided that the following conditions are met: 13 + * * Redistributions of source code must retain the above copyright 14 + * notice, this list of conditions and the following disclaimer. 15 + * * Redistributions in binary form must reproduce the above copyright 16 + * notice, this list of conditions and the following disclaimer in the 17 + * documentation and/or other materials provided with the distribution. 18 + * * Neither the name of Freescale Semiconductor nor the 19 + * names of its contributors may be used to endorse or promote products 20 + * derived from this software without specific prior written permission. 21 + * 22 + * 23 + * ALTERNATIVELY, this software may be distributed under the terms of the 24 + * GNU General Public License ("GPL") as published by the Free Software 25 + * Foundation, either version 2 of that License or (at your option) any 26 + * later version. 27 + * 28 + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY 29 + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 30 + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 31 + * DISCLAIMED. 
IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY 32 + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 33 + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 34 + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 35 + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 36 + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 37 + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38 + */ 39 + 40 + #ifndef _UAPI_ASM_POWERPC_EPAPR_HCALLS_H 41 + #define _UAPI_ASM_POWERPC_EPAPR_HCALLS_H 42 + 43 + #define EV_BYTE_CHANNEL_SEND 1 44 + #define EV_BYTE_CHANNEL_RECEIVE 2 45 + #define EV_BYTE_CHANNEL_POLL 3 46 + #define EV_INT_SET_CONFIG 4 47 + #define EV_INT_GET_CONFIG 5 48 + #define EV_INT_SET_MASK 6 49 + #define EV_INT_GET_MASK 7 50 + #define EV_INT_IACK 9 51 + #define EV_INT_EOI 10 52 + #define EV_INT_SEND_IPI 11 53 + #define EV_INT_SET_TASK_PRIORITY 12 54 + #define EV_INT_GET_TASK_PRIORITY 13 55 + #define EV_DOORBELL_SEND 14 56 + #define EV_MSGSND 15 57 + #define EV_IDLE 16 58 + 59 + /* vendor ID: epapr */ 60 + #define EV_LOCAL_VENDOR_ID 0 /* for private use */ 61 + #define EV_EPAPR_VENDOR_ID 1 62 + #define EV_FSL_VENDOR_ID 2 /* Freescale Semiconductor */ 63 + #define EV_IBM_VENDOR_ID 3 /* IBM */ 64 + #define EV_GHS_VENDOR_ID 4 /* Green Hills Software */ 65 + #define EV_ENEA_VENDOR_ID 5 /* Enea */ 66 + #define EV_WR_VENDOR_ID 6 /* Wind River Systems */ 67 + #define EV_AMCC_VENDOR_ID 7 /* Applied Micro Circuits */ 68 + #define EV_KVM_VENDOR_ID 42 /* KVM */ 69 + 70 + /* The max number of bytes that a byte channel can send or receive per call */ 71 + #define EV_BYTE_CHANNEL_MAX_BYTES 16 72 + 73 + 74 + #define _EV_HCALL_TOKEN(id, num) (((id) << 16) | (num)) 75 + #define EV_HCALL_TOKEN(hcall_num) _EV_HCALL_TOKEN(EV_EPAPR_VENDOR_ID, hcall_num) 76 + 77 + /* epapr return codes */ 78 + #define EV_SUCCESS 0 79 + #define 
EV_EPERM 1 /* Operation not permitted */ 80 + #define EV_ENOENT 2 /* Entry Not Found */ 81 + #define EV_EIO 3 /* I/O error occurred */ 82 + #define EV_EAGAIN 4 /* The operation had insufficient 83 + * resources to complete and should be 84 + * retried 85 + */ 86 + #define EV_ENOMEM 5 /* There was insufficient memory to 87 + * complete the operation */ 88 + #define EV_EFAULT 6 /* Bad guest address */ 89 + #define EV_ENODEV 7 /* No such device */ 90 + #define EV_EINVAL 8 /* An argument supplied to the hcall 91 + was out of range or invalid */ 92 + #define EV_INTERNAL 9 /* An internal error occurred */ 93 + #define EV_CONFIG 10 /* A configuration error was detected */ 94 + #define EV_INVALID_STATE 11 /* The object is in an invalid state */ 95 + #define EV_UNIMPLEMENTED 12 /* Unimplemented hypercall */ 96 + #define EV_BUFFER_OVERFLOW 13 /* Caller-supplied buffer too small */ 97 + 98 + #endif /* _UAPI_ASM_POWERPC_EPAPR_HCALLS_H */

+86

arch/powerpc/include/uapi/asm/kvm.h

··· 221 221 222 222 __u32 dbsr; /* KVM_SREGS_E_UPDATE_DBSR */ 223 223 __u32 dbcr[3]; 224 + /* 225 + * iac/dac registers are 64bit wide, while this API 226 + * interface provides only lower 32 bits on 64 bit 227 + * processors. ONE_REG interface is added for 64bit 228 + * iac/dac registers. 229 + */ 224 230 __u32 iac[4]; 225 231 __u32 dac[2]; 226 232 __u32 dvc[2]; ··· 331 325 __u32 reserved[8]; 332 326 }; 333 327 328 + /* For KVM_PPC_GET_HTAB_FD */ 329 + struct kvm_get_htab_fd { 330 + __u64 flags; 331 + __u64 start_index; 332 + __u64 reserved[2]; 333 + }; 334 + 335 + /* Values for kvm_get_htab_fd.flags */ 336 + #define KVM_GET_HTAB_BOLTED_ONLY ((__u64)0x1) 337 + #define KVM_GET_HTAB_WRITE ((__u64)0x2) 338 + 339 + /* 340 + * Data read on the file descriptor is formatted as a series of 341 + * records, each consisting of a header followed by a series of 342 + * `n_valid' HPTEs (16 bytes each), which are all valid. Following 343 + * those valid HPTEs there are `n_invalid' invalid HPTEs, which 344 + * are not represented explicitly in the stream. The same format 345 + * is used for writing. 
346 + */ 347 + struct kvm_get_htab_header { 348 + __u32 index; 349 + __u16 n_valid; 350 + __u16 n_invalid; 351 + }; 352 + 334 353 #define KVM_REG_PPC_HIOR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x1) 354 + #define KVM_REG_PPC_IAC1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x2) 355 + #define KVM_REG_PPC_IAC2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x3) 356 + #define KVM_REG_PPC_IAC3 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x4) 357 + #define KVM_REG_PPC_IAC4 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x5) 358 + #define KVM_REG_PPC_DAC1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x6) 359 + #define KVM_REG_PPC_DAC2 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x7) 360 + #define KVM_REG_PPC_DABR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8) 361 + #define KVM_REG_PPC_DSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x9) 362 + #define KVM_REG_PPC_PURR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xa) 363 + #define KVM_REG_PPC_SPURR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xb) 364 + #define KVM_REG_PPC_DAR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc) 365 + #define KVM_REG_PPC_DSISR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xd) 366 + #define KVM_REG_PPC_AMR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xe) 367 + #define KVM_REG_PPC_UAMOR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xf) 368 + 369 + #define KVM_REG_PPC_MMCR0 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x10) 370 + #define KVM_REG_PPC_MMCR1 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x11) 371 + #define KVM_REG_PPC_MMCRA (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x12) 372 + 373 + #define KVM_REG_PPC_PMC1 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x18) 374 + #define KVM_REG_PPC_PMC2 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x19) 375 + #define KVM_REG_PPC_PMC3 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1a) 376 + #define KVM_REG_PPC_PMC4 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1b) 377 + #define KVM_REG_PPC_PMC5 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1c) 378 + #define KVM_REG_PPC_PMC6 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1d) 379 + #define KVM_REG_PPC_PMC7 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1e) 380 + #define KVM_REG_PPC_PMC8 (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x1f) 381 + 382 + /* 32 
floating-point registers */ 383 + #define KVM_REG_PPC_FPR0 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x20) 384 + #define KVM_REG_PPC_FPR(n) (KVM_REG_PPC_FPR0 + (n)) 385 + #define KVM_REG_PPC_FPR31 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x3f) 386 + 387 + /* 32 VMX/Altivec vector registers */ 388 + #define KVM_REG_PPC_VR0 (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x40) 389 + #define KVM_REG_PPC_VR(n) (KVM_REG_PPC_VR0 + (n)) 390 + #define KVM_REG_PPC_VR31 (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x5f) 391 + 392 + /* 32 double-width FP registers for VSX */ 393 + /* High-order halves overlap with FP regs */ 394 + #define KVM_REG_PPC_VSR0 (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x60) 395 + #define KVM_REG_PPC_VSR(n) (KVM_REG_PPC_VSR0 + (n)) 396 + #define KVM_REG_PPC_VSR31 (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x7f) 397 + 398 + /* FP and vector status/control registers */ 399 + #define KVM_REG_PPC_FPSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x80) 400 + #define KVM_REG_PPC_VSCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x81) 401 + 402 + /* Virtual processor areas */ 403 + /* For SLB & DTL, address in high (first) half, length in low half */ 404 + #define KVM_REG_PPC_VPA_ADDR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x82) 405 + #define KVM_REG_PPC_VPA_SLB (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x83) 406 + #define KVM_REG_PPC_VPA_DTL (KVM_REG_PPC | KVM_REG_SIZE_U128 | 0x84) 407 + 408 + #define KVM_REG_PPC_EPCR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0x85) 335 409 336 410 #endif /* __LINUX_KVM_POWERPC_H */

+4 -3

arch/powerpc/include/uapi/asm/kvm_para.h

··· 75 75 }; 76 76 77 77 #define KVM_SC_MAGIC_R0 0x4b564d21 /* "KVM!" */ 78 - #define HC_VENDOR_KVM (42 << 16) 79 - #define HC_EV_SUCCESS 0 80 - #define HC_EV_UNIMPLEMENTED 12 78 + 79 + #define KVM_HCALL_TOKEN(num) _EV_HCALL_TOKEN(EV_KVM_VENDOR_ID, num) 80 + 81 + #include <uapi/asm/epapr_hcalls.h> 81 82 82 83 #define KVM_FEATURE_MAGIC_PAGE 1 83 84

+1 -3

arch/powerpc/kernel/asm-offsets.c

··· 441 441 DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr)); 442 442 DEFINE(KVM_HOST_SDR1, offsetof(struct kvm, arch.host_sdr1)); 443 443 DEFINE(KVM_TLBIE_LOCK, offsetof(struct kvm, arch.tlbie_lock)); 444 - DEFINE(KVM_ONLINE_CPUS, offsetof(struct kvm, online_vcpus.counter)); 445 - DEFINE(KVM_LAST_VCPU, offsetof(struct kvm, arch.last_vcpu)); 444 + DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits)); 446 445 DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr)); 447 446 DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor)); 448 447 DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v)); ··· 469 470 DEFINE(VCPU_SLB, offsetof(struct kvm_vcpu, arch.slb)); 470 471 DEFINE(VCPU_SLB_MAX, offsetof(struct kvm_vcpu, arch.slb_max)); 471 472 DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr)); 472 - DEFINE(VCPU_LAST_CPU, offsetof(struct kvm_vcpu, arch.last_cpu)); 473 473 DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr)); 474 474 DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar)); 475 475 DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst));

+28

arch/powerpc/kernel/epapr_hcalls.S

··· 8 8 */ 9 9 10 10 #include <linux/threads.h> 11 + #include <asm/epapr_hcalls.h> 11 12 #include <asm/reg.h> 12 13 #include <asm/page.h> 13 14 #include <asm/cputable.h> 14 15 #include <asm/thread_info.h> 15 16 #include <asm/ppc_asm.h> 17 + #include <asm/asm-compat.h> 16 18 #include <asm/asm-offsets.h> 19 + 20 + /* epapr_ev_idle() was derived from e500_idle() */ 21 + _GLOBAL(epapr_ev_idle) 22 + CURRENT_THREAD_INFO(r3, r1) 23 + PPC_LL r4, TI_LOCAL_FLAGS(r3) /* set napping bit */ 24 + ori r4, r4,_TLF_NAPPING /* so when we take an exception */ 25 + PPC_STL r4, TI_LOCAL_FLAGS(r3) /* it will return to our caller */ 26 + 27 + wrteei 1 28 + 29 + idle_loop: 30 + LOAD_REG_IMMEDIATE(r11, EV_HCALL_TOKEN(EV_IDLE)) 31 + 32 + .global epapr_ev_idle_start 33 + epapr_ev_idle_start: 34 + li r3, -1 35 + nop 36 + nop 37 + nop 38 + 39 + /* 40 + * Guard against spurious wakeups from a hypervisor -- 41 + * only interrupt will cause us to return to LR due to 42 + * _TLF_NAPPING. 43 + */ 44 + b idle_loop 17 45 18 46 /* Hypercall entry point. Will be patched with device tree instructions. */ 19 47 .global epapr_hypercall_start

+10 -1

arch/powerpc/kernel/epapr_paravirt.c

··· 21 21 #include <asm/epapr_hcalls.h> 22 22 #include <asm/cacheflush.h> 23 23 #include <asm/code-patching.h> 24 + #include <asm/machdep.h> 25 + 26 + extern void epapr_ev_idle(void); 27 + extern u32 epapr_ev_idle_start[]; 24 28 25 29 bool epapr_paravirt_enabled; 26 30 ··· 45 41 if (len % 4 || len > (4 * 4)) 46 42 return -ENODEV; 47 43 48 - for (i = 0; i < (len / 4); i++) 44 + for (i = 0; i < (len / 4); i++) { 49 45 patch_instruction(epapr_hypercall_start + i, insts[i]); 46 + patch_instruction(epapr_ev_idle_start + i, insts[i]); 47 + } 48 + 49 + if (of_get_property(hyper_node, "has-idle", NULL)) 50 + ppc_md.power_save = epapr_ev_idle; 50 51 51 52 epapr_paravirt_enabled = true; 52 53

+1 -1

arch/powerpc/kernel/kvm.c

··· 419 419 in[0] = KVM_MAGIC_PAGE; 420 420 in[1] = KVM_MAGIC_PAGE; 421 421 422 - kvm_hypercall(in, out, HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE); 422 + kvm_hypercall(in, out, KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE)); 423 423 424 424 *features = out[0]; 425 425 }

+5

arch/powerpc/kernel/ppc_ksyms.c

··· 43 43 #include <asm/dcr.h> 44 44 #include <asm/ftrace.h> 45 45 #include <asm/switch_to.h> 46 + #include <asm/epapr_hcalls.h> 46 47 47 48 #ifdef CONFIG_PPC32 48 49 extern void transfer_to_handler(void); ··· 191 190 192 191 #ifdef CONFIG_PPC_BOOK3S_64 193 192 EXPORT_SYMBOL_GPL(mmu_psize_defs); 193 + #endif 194 + 195 + #ifdef CONFIG_EPAPR_PARAVIRT 196 + EXPORT_SYMBOL(epapr_hypercall_start); 194 197 #endif

+46

arch/powerpc/kernel/smp.c

··· 427 427 { 428 428 return per_cpu(cpu_state, cpu) == CPU_UP_PREPARE; 429 429 } 430 + 431 + static atomic_t secondary_inhibit_count; 432 + 433 + /* 434 + * Don't allow secondary CPU threads to come online 435 + */ 436 + void inhibit_secondary_onlining(void) 437 + { 438 + /* 439 + * This makes secondary_inhibit_count stable during cpu 440 + * online/offline operations. 441 + */ 442 + get_online_cpus(); 443 + 444 + atomic_inc(&secondary_inhibit_count); 445 + put_online_cpus(); 446 + } 447 + EXPORT_SYMBOL_GPL(inhibit_secondary_onlining); 448 + 449 + /* 450 + * Allow secondary CPU threads to come online again 451 + */ 452 + void uninhibit_secondary_onlining(void) 453 + { 454 + get_online_cpus(); 455 + atomic_dec(&secondary_inhibit_count); 456 + put_online_cpus(); 457 + } 458 + EXPORT_SYMBOL_GPL(uninhibit_secondary_onlining); 459 + 460 + static int secondaries_inhibited(void) 461 + { 462 + return atomic_read(&secondary_inhibit_count); 463 + } 464 + 465 + #else /* HOTPLUG_CPU */ 466 + 467 + #define secondaries_inhibited() 0 468 + 430 469 #endif 431 470 432 471 static void cpu_idle_thread_init(unsigned int cpu, struct task_struct *idle) ··· 483 444 int __cpuinit __cpu_up(unsigned int cpu, struct task_struct *tidle) 484 445 { 485 446 int rc, c; 447 + 448 + /* 449 + * Don't allow secondary threads to come online if inhibited 450 + */ 451 + if (threads_per_core > 1 && secondaries_inhibited() && 452 + cpu % threads_per_core != 0) 453 + return -EBUSY; 486 454 487 455 if (smp_ops == NULL || 488 456 (smp_ops->cpu_bootable && !smp_ops->cpu_bootable(cpu)))

+1

arch/powerpc/kvm/44x.c

··· 83 83 vcpu_44x->shadow_refs[i].gtlb_index = -1; 84 84 85 85 vcpu->arch.cpu_type = KVM_CPU_440; 86 + vcpu->arch.pvr = mfspr(SPRN_PVR); 86 87 87 88 return 0; 88 89 }

+68 -44

arch/powerpc/kvm/44x_emulate.c

··· 27 27 #include "booke.h" 28 28 #include "44x_tlb.h" 29 29 30 + #define XOP_MFDCRX 259 30 31 #define XOP_MFDCR 323 32 + #define XOP_MTDCRX 387 31 33 #define XOP_MTDCR 451 32 34 #define XOP_TLBSX 914 33 35 #define XOP_ICCCI 966 34 36 #define XOP_TLBWE 978 37 + 38 + static int emulate_mtdcr(struct kvm_vcpu *vcpu, int rs, int dcrn) 39 + { 40 + /* emulate some access in kernel */ 41 + switch (dcrn) { 42 + case DCRN_CPR0_CONFIG_ADDR: 43 + vcpu->arch.cpr0_cfgaddr = kvmppc_get_gpr(vcpu, rs); 44 + return EMULATE_DONE; 45 + default: 46 + vcpu->run->dcr.dcrn = dcrn; 47 + vcpu->run->dcr.data = kvmppc_get_gpr(vcpu, rs); 48 + vcpu->run->dcr.is_write = 1; 49 + vcpu->arch.dcr_is_write = 1; 50 + vcpu->arch.dcr_needed = 1; 51 + kvmppc_account_exit(vcpu, DCR_EXITS); 52 + return EMULATE_DO_DCR; 53 + } 54 + } 55 + 56 + static int emulate_mfdcr(struct kvm_vcpu *vcpu, int rt, int dcrn) 57 + { 58 + /* The guest may access CPR0 registers to determine the timebase 59 + * frequency, and it must know the real host frequency because it 60 + * can directly access the timebase registers. 61 + * 62 + * It would be possible to emulate those accesses in userspace, 63 + * but userspace can really only figure out the end frequency. 64 + * We could decompose that into the factors that compute it, but 65 + * that's tricky math, and it's easier to just report the real 66 + * CPR0 values. 
67 + */ 68 + switch (dcrn) { 69 + case DCRN_CPR0_CONFIG_ADDR: 70 + kvmppc_set_gpr(vcpu, rt, vcpu->arch.cpr0_cfgaddr); 71 + break; 72 + case DCRN_CPR0_CONFIG_DATA: 73 + local_irq_disable(); 74 + mtdcr(DCRN_CPR0_CONFIG_ADDR, 75 + vcpu->arch.cpr0_cfgaddr); 76 + kvmppc_set_gpr(vcpu, rt, 77 + mfdcr(DCRN_CPR0_CONFIG_DATA)); 78 + local_irq_enable(); 79 + break; 80 + default: 81 + vcpu->run->dcr.dcrn = dcrn; 82 + vcpu->run->dcr.data = 0; 83 + vcpu->run->dcr.is_write = 0; 84 + vcpu->arch.dcr_is_write = 0; 85 + vcpu->arch.io_gpr = rt; 86 + vcpu->arch.dcr_needed = 1; 87 + kvmppc_account_exit(vcpu, DCR_EXITS); 88 + return EMULATE_DO_DCR; 89 + } 90 + 91 + return EMULATE_DONE; 92 + } 35 93 36 94 int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, 37 95 unsigned int inst, int *advance) ··· 108 50 switch (get_xop(inst)) { 109 51 110 52 case XOP_MFDCR: 111 - /* The guest may access CPR0 registers to determine the timebase 112 - * frequency, and it must know the real host frequency because it 113 - * can directly access the timebase registers. 114 - * 115 - * It would be possible to emulate those accesses in userspace, 116 - * but userspace can really only figure out the end frequency. 117 - * We could decompose that into the factors that compute it, but 118 - * that's tricky math, and it's easier to just report the real 119 - * CPR0 values. 
120 - */ 121 - switch (dcrn) { 122 - case DCRN_CPR0_CONFIG_ADDR: 123 - kvmppc_set_gpr(vcpu, rt, vcpu->arch.cpr0_cfgaddr); 124 - break; 125 - case DCRN_CPR0_CONFIG_DATA: 126 - local_irq_disable(); 127 - mtdcr(DCRN_CPR0_CONFIG_ADDR, 128 - vcpu->arch.cpr0_cfgaddr); 129 - kvmppc_set_gpr(vcpu, rt, 130 - mfdcr(DCRN_CPR0_CONFIG_DATA)); 131 - local_irq_enable(); 132 - break; 133 - default: 134 - run->dcr.dcrn = dcrn; 135 - run->dcr.data = 0; 136 - run->dcr.is_write = 0; 137 - vcpu->arch.io_gpr = rt; 138 - vcpu->arch.dcr_needed = 1; 139 - kvmppc_account_exit(vcpu, DCR_EXITS); 140 - emulated = EMULATE_DO_DCR; 141 - } 53 + emulated = emulate_mfdcr(vcpu, rt, dcrn); 54 + break; 142 55 56 + case XOP_MFDCRX: 57 + emulated = emulate_mfdcr(vcpu, rt, 58 + kvmppc_get_gpr(vcpu, ra)); 143 59 break; 144 60 145 61 case XOP_MTDCR: 146 - /* emulate some access in kernel */ 147 - switch (dcrn) { 148 - case DCRN_CPR0_CONFIG_ADDR: 149 - vcpu->arch.cpr0_cfgaddr = kvmppc_get_gpr(vcpu, rs); 150 - break; 151 - default: 152 - run->dcr.dcrn = dcrn; 153 - run->dcr.data = kvmppc_get_gpr(vcpu, rs); 154 - run->dcr.is_write = 1; 155 - vcpu->arch.dcr_needed = 1; 156 - kvmppc_account_exit(vcpu, DCR_EXITS); 157 - emulated = EMULATE_DO_DCR; 158 - } 62 + emulated = emulate_mtdcr(vcpu, rs, dcrn); 63 + break; 159 64 65 + case XOP_MTDCRX: 66 + emulated = emulate_mtdcr(vcpu, rs, 67 + kvmppc_get_gpr(vcpu, ra)); 160 68 break; 161 69 162 70 case XOP_TLBWE:

+4

arch/powerpc/kvm/Kconfig

··· 20 20 bool 21 21 select PREEMPT_NOTIFIERS 22 22 select ANON_INODES 23 + select HAVE_KVM_EVENTFD 23 24 24 25 config KVM_BOOK3S_HANDLER 25 26 bool ··· 37 36 config KVM_BOOK3S_PR 38 37 bool 39 38 select KVM_MMIO 39 + select MMU_NOTIFIER 40 40 41 41 config KVM_BOOK3S_32 42 42 tristate "KVM support for PowerPC book3s_32 processors" ··· 125 123 depends on EXPERIMENTAL && E500 && !PPC_E500MC 126 124 select KVM 127 125 select KVM_MMIO 126 + select MMU_NOTIFIER 128 127 ---help--- 129 128 Support running unmodified E500 guest kernels in virtual machines on 130 129 E500v2 host processors. ··· 141 138 select KVM 142 139 select KVM_MMIO 143 140 select KVM_BOOKE_HV 141 + select MMU_NOTIFIER 144 142 ---help--- 145 143 Support running unmodified E500MC/E5500 (32-bit) guest kernels in 146 144 virtual machines on E500MC/E5500 host processors.

+4 -1

arch/powerpc/kvm/Makefile

··· 6 6 7 7 ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm 8 8 9 - common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o) 9 + common-objs-y = $(addprefix ../../../virt/kvm/, kvm_main.o coalesced_mmio.o \ 10 + eventfd.o) 10 11 11 12 CFLAGS_44x_tlb.o := -I. 12 13 CFLAGS_e500_tlb.o := -I. ··· 73 72 book3s_hv_rmhandlers.o \ 74 73 book3s_hv_rm_mmu.o \ 75 74 book3s_64_vio_hv.o \ 75 + book3s_hv_ras.o \ 76 76 book3s_hv_builtin.o 77 77 78 78 kvm-book3s_64-module-objs := \ 79 79 ../../../virt/kvm/kvm_main.o \ 80 + ../../../virt/kvm/eventfd.o \ 80 81 powerpc.o \ 81 82 emulate.o \ 82 83 book3s.o \

+125

arch/powerpc/kvm/book3s.c

··· 411 411 return 0; 412 412 } 413 413 414 + int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) 415 + { 416 + return 0; 417 + } 418 + 419 + void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) 420 + { 421 + } 422 + 414 423 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 415 424 { 416 425 int i; ··· 483 474 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 484 475 { 485 476 return -ENOTSUPP; 477 + } 478 + 479 + int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) 480 + { 481 + int r; 482 + union kvmppc_one_reg val; 483 + int size; 484 + long int i; 485 + 486 + size = one_reg_size(reg->id); 487 + if (size > sizeof(val)) 488 + return -EINVAL; 489 + 490 + r = kvmppc_get_one_reg(vcpu, reg->id, &val); 491 + 492 + if (r == -EINVAL) { 493 + r = 0; 494 + switch (reg->id) { 495 + case KVM_REG_PPC_DAR: 496 + val = get_reg_val(reg->id, vcpu->arch.shared->dar); 497 + break; 498 + case KVM_REG_PPC_DSISR: 499 + val = get_reg_val(reg->id, vcpu->arch.shared->dsisr); 500 + break; 501 + case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31: 502 + i = reg->id - KVM_REG_PPC_FPR0; 503 + val = get_reg_val(reg->id, vcpu->arch.fpr[i]); 504 + break; 505 + case KVM_REG_PPC_FPSCR: 506 + val = get_reg_val(reg->id, vcpu->arch.fpscr); 507 + break; 508 + #ifdef CONFIG_ALTIVEC 509 + case KVM_REG_PPC_VR0 ... 
KVM_REG_PPC_VR31: 510 + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { 511 + r = -ENXIO; 512 + break; 513 + } 514 + val.vval = vcpu->arch.vr[reg->id - KVM_REG_PPC_VR0]; 515 + break; 516 + case KVM_REG_PPC_VSCR: 517 + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { 518 + r = -ENXIO; 519 + break; 520 + } 521 + val = get_reg_val(reg->id, vcpu->arch.vscr.u[3]); 522 + break; 523 + #endif /* CONFIG_ALTIVEC */ 524 + default: 525 + r = -EINVAL; 526 + break; 527 + } 528 + } 529 + if (r) 530 + return r; 531 + 532 + if (copy_to_user((char __user *)(unsigned long)reg->addr, &val, size)) 533 + r = -EFAULT; 534 + 535 + return r; 536 + } 537 + 538 + int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) 539 + { 540 + int r; 541 + union kvmppc_one_reg val; 542 + int size; 543 + long int i; 544 + 545 + size = one_reg_size(reg->id); 546 + if (size > sizeof(val)) 547 + return -EINVAL; 548 + 549 + if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size)) 550 + return -EFAULT; 551 + 552 + r = kvmppc_set_one_reg(vcpu, reg->id, &val); 553 + 554 + if (r == -EINVAL) { 555 + r = 0; 556 + switch (reg->id) { 557 + case KVM_REG_PPC_DAR: 558 + vcpu->arch.shared->dar = set_reg_val(reg->id, val); 559 + break; 560 + case KVM_REG_PPC_DSISR: 561 + vcpu->arch.shared->dsisr = set_reg_val(reg->id, val); 562 + break; 563 + case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31: 564 + i = reg->id - KVM_REG_PPC_FPR0; 565 + vcpu->arch.fpr[i] = set_reg_val(reg->id, val); 566 + break; 567 + case KVM_REG_PPC_FPSCR: 568 + vcpu->arch.fpscr = set_reg_val(reg->id, val); 569 + break; 570 + #ifdef CONFIG_ALTIVEC 571 + case KVM_REG_PPC_VR0 ... 
KVM_REG_PPC_VR31: 572 + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { 573 + r = -ENXIO; 574 + break; 575 + } 576 + vcpu->arch.vr[reg->id - KVM_REG_PPC_VR0] = val.vval; 577 + break; 578 + case KVM_REG_PPC_VSCR: 579 + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { 580 + r = -ENXIO; 581 + break; 582 + } 583 + vcpu->arch.vscr.u[3] = set_reg_val(reg->id, val); 584 + break; 585 + #endif /* CONFIG_ALTIVEC */ 586 + default: 587 + r = -EINVAL; 588 + break; 589 + } 590 + } 591 + 592 + return r; 486 593 } 487 594 488 595 int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,

+2 -1

arch/powerpc/kvm/book3s_32_mmu_host.c

··· 155 155 156 156 /* Get host physical address for gpa */ 157 157 hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); 158 - if (is_error_pfn(hpaddr)) { 158 + if (is_error_noslot_pfn(hpaddr)) { 159 159 printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", 160 160 orig_pte->eaddr); 161 161 r = -EINVAL; ··· 254 254 255 255 kvmppc_mmu_hpte_cache_map(vcpu, pte); 256 256 257 + kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT); 257 258 out: 258 259 return r; 259 260 }

+2 -1

arch/powerpc/kvm/book3s_64_mmu_host.c

··· 93 93 94 94 /* Get host physical address for gpa */ 95 95 hpaddr = kvmppc_gfn_to_pfn(vcpu, orig_pte->raddr >> PAGE_SHIFT); 96 - if (is_error_pfn(hpaddr)) { 96 + if (is_error_noslot_pfn(hpaddr)) { 97 97 printk(KERN_INFO "Couldn't get guest page for gfn %lx!\n", orig_pte->eaddr); 98 98 r = -EINVAL; 99 99 goto out; ··· 171 171 172 172 kvmppc_mmu_hpte_cache_map(vcpu, pte); 173 173 } 174 + kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT); 174 175 175 176 out: 176 177 return r;

+442 -32

arch/powerpc/kvm/book3s_64_mmu_hv.c

··· 24 24 #include <linux/slab.h> 25 25 #include <linux/hugetlb.h> 26 26 #include <linux/vmalloc.h> 27 + #include <linux/srcu.h> 28 + #include <linux/anon_inodes.h> 29 + #include <linux/file.h> 27 30 28 31 #include <asm/tlbflush.h> 29 32 #include <asm/kvm_ppc.h> ··· 42 39 43 40 /* Power architecture requires HPT is at least 256kB */ 44 41 #define PPC_MIN_HPT_ORDER 18 42 + 43 + static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, 44 + long pte_index, unsigned long pteh, 45 + unsigned long ptel, unsigned long *pte_idx_ret); 46 + static void kvmppc_rmap_reset(struct kvm *kvm); 45 47 46 48 long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) 47 49 { ··· 145 137 /* Set the entire HPT to 0, i.e. invalid HPTEs */ 146 138 memset((void *)kvm->arch.hpt_virt, 0, 1ul << order); 147 139 /* 148 - * Set the whole last_vcpu array to an invalid vcpu number. 149 - * This ensures that each vcpu will flush its TLB on next entry. 140 + * Reset all the reverse-mapping chains for all memslots 150 141 */ 151 - memset(kvm->arch.last_vcpu, 0xff, sizeof(kvm->arch.last_vcpu)); 142 + kvmppc_rmap_reset(kvm); 143 + /* Ensure that each vcpu will flush its TLB on next entry. 
*/ 144 + cpumask_setall(&kvm->arch.need_tlb_flush); 152 145 *htab_orderp = order; 153 146 err = 0; 154 147 } else { ··· 193 184 unsigned long addr, hash; 194 185 unsigned long psize; 195 186 unsigned long hp0, hp1; 187 + unsigned long idx_ret; 196 188 long ret; 197 189 struct kvm *kvm = vcpu->kvm; 198 190 ··· 225 215 hash = (hash << 3) + 7; 226 216 hp_v = hp0 | ((addr >> 16) & ~0x7fUL); 227 217 hp_r = hp1 | addr; 228 - ret = kvmppc_virtmode_h_enter(vcpu, H_EXACT, hash, hp_v, hp_r); 218 + ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r, 219 + &idx_ret); 229 220 if (ret != H_SUCCESS) { 230 221 pr_err("KVM: map_vrma at %lx failed, ret=%ld\n", 231 222 addr, ret); ··· 271 260 272 261 /* 273 262 * This is called to get a reference to a guest page if there isn't 274 - * one already in the kvm->arch.slot_phys[][] arrays. 263 + * one already in the memslot->arch.slot_phys[] array. 275 264 */ 276 265 static long kvmppc_get_guest_page(struct kvm *kvm, unsigned long gfn, 277 266 struct kvm_memory_slot *memslot, ··· 286 275 struct vm_area_struct *vma; 287 276 unsigned long pfn, i, npages; 288 277 289 - physp = kvm->arch.slot_phys[memslot->id]; 278 + physp = memslot->arch.slot_phys; 290 279 if (!physp) 291 280 return -EINVAL; 292 281 if (physp[gfn - memslot->base_gfn]) ··· 364 353 return err; 365 354 } 366 355 367 - /* 368 - * We come here on a H_ENTER call from the guest when we are not 369 - * using mmu notifiers and we don't have the requested page pinned 370 - * already. 
371 - */ 372 - long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, 373 - long pte_index, unsigned long pteh, unsigned long ptel) 356 + long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, 357 + long pte_index, unsigned long pteh, 358 + unsigned long ptel, unsigned long *pte_idx_ret) 374 359 { 375 - struct kvm *kvm = vcpu->kvm; 376 360 unsigned long psize, gpa, gfn; 377 361 struct kvm_memory_slot *memslot; 378 362 long ret; ··· 395 389 do_insert: 396 390 /* Protect linux PTE lookup from page table destruction */ 397 391 rcu_read_lock_sched(); /* this disables preemption too */ 398 - vcpu->arch.pgdir = current->mm->pgd; 399 - ret = kvmppc_h_enter(vcpu, flags, pte_index, pteh, ptel); 392 + ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel, 393 + current->mm->pgd, false, pte_idx_ret); 400 394 rcu_read_unlock_sched(); 401 395 if (ret == H_TOO_HARD) { 402 396 /* this can't happen */ ··· 405 399 } 406 400 return ret; 407 401 402 + } 403 + 404 + /* 405 + * We come here on a H_ENTER call from the guest when we are not 406 + * using mmu notifiers and we don't have the requested page pinned 407 + * already. 
408 + */ 409 + long kvmppc_virtmode_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, 410 + long pte_index, unsigned long pteh, 411 + unsigned long ptel) 412 + { 413 + return kvmppc_virtmode_do_h_enter(vcpu->kvm, flags, pte_index, 414 + pteh, ptel, &vcpu->arch.gpr[4]); 408 415 } 409 416 410 417 static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu, ··· 589 570 struct kvm *kvm = vcpu->kvm; 590 571 unsigned long *hptep, hpte[3], r; 591 572 unsigned long mmu_seq, psize, pte_size; 592 - unsigned long gfn, hva, pfn; 573 + unsigned long gpa, gfn, hva, pfn; 593 574 struct kvm_memory_slot *memslot; 594 575 unsigned long *rmap; 595 576 struct revmap_entry *rev; ··· 627 608 628 609 /* Translate the logical address and get the page */ 629 610 psize = hpte_page_size(hpte[0], r); 630 - gfn = hpte_rpn(r, psize); 611 + gpa = (r & HPTE_R_RPN & ~(psize - 1)) | (ea & (psize - 1)); 612 + gfn = gpa >> PAGE_SHIFT; 631 613 memslot = gfn_to_memslot(kvm, gfn); 632 614 633 615 /* No memslot means it's an emulated MMIO region */ 634 - if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) { 635 - unsigned long gpa = (gfn << PAGE_SHIFT) | (ea & (psize - 1)); 616 + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) 636 617 return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, 637 618 dsisr & DSISR_ISSTORE); 638 - } 639 619 640 620 if (!kvm->arch.using_mmu_notifiers) 641 621 return -EFAULT; /* should never get here */ ··· 728 710 729 711 /* Check if we might have been invalidated; let the guest retry if so */ 730 712 ret = RESUME_GUEST; 731 - if (mmu_notifier_retry(vcpu, mmu_seq)) { 713 + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) { 732 714 unlock_rmap(rmap); 733 715 goto out_unlock; 734 716 } ··· 772 754 hptep[0] &= ~HPTE_V_HVLOCK; 773 755 preempt_enable(); 774 756 goto out_put; 757 + } 758 + 759 + static void kvmppc_rmap_reset(struct kvm *kvm) 760 + { 761 + struct kvm_memslots *slots; 762 + struct kvm_memory_slot *memslot; 763 + int srcu_idx; 764 + 765 + 
srcu_idx = srcu_read_lock(&kvm->srcu); 766 + slots = kvm->memslots; 767 + kvm_for_each_memslot(memslot, slots) { 768 + /* 769 + * This assumes it is acceptable to lose reference and 770 + * change bits across a reset. 771 + */ 772 + memset(memslot->arch.rmap, 0, 773 + memslot->npages * sizeof(*memslot->arch.rmap)); 774 + } 775 + srcu_read_unlock(&kvm->srcu, srcu_idx); 775 776 } 776 777 777 778 static int kvm_handle_hva_range(struct kvm *kvm, ··· 887 850 psize = hpte_page_size(hptep[0], ptel); 888 851 if ((hptep[0] & HPTE_V_VALID) && 889 852 hpte_rpn(ptel, psize) == gfn) { 890 - hptep[0] |= HPTE_V_ABSENT; 853 + if (kvm->arch.using_mmu_notifiers) 854 + hptep[0] |= HPTE_V_ABSENT; 891 855 kvmppc_invalidate_hpte(kvm, hptep, i); 892 856 /* Harvest R and C */ 893 857 rcbits = hptep[1] & (HPTE_R_R | HPTE_R_C); ··· 913 875 if (kvm->arch.using_mmu_notifiers) 914 876 kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp); 915 877 return 0; 878 + } 879 + 880 + void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) 881 + { 882 + unsigned long *rmapp; 883 + unsigned long gfn; 884 + unsigned long n; 885 + 886 + rmapp = memslot->arch.rmap; 887 + gfn = memslot->base_gfn; 888 + for (n = memslot->npages; n; --n) { 889 + /* 890 + * Testing the present bit without locking is OK because 891 + * the memslot has been marked invalid already, and hence 892 + * no new HPTEs referencing this page can be created, 893 + * thus the present bit can't go from 0 to 1. 
894 + */ 895 + if (*rmapp & KVMPPC_RMAP_PRESENT) 896 + kvm_unmap_rmapp(kvm, rmapp, gfn); 897 + ++rmapp; 898 + ++gfn; 899 + } 916 900 } 917 901 918 902 static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, ··· 1090 1030 return ret; 1091 1031 } 1092 1032 1093 - long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) 1033 + long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot, 1034 + unsigned long *map) 1094 1035 { 1095 1036 unsigned long i; 1096 - unsigned long *rmapp, *map; 1037 + unsigned long *rmapp; 1097 1038 1098 1039 preempt_disable(); 1099 1040 rmapp = memslot->arch.rmap; 1100 - map = memslot->dirty_bitmap; 1101 1041 for (i = 0; i < memslot->npages; ++i) { 1102 - if (kvm_test_clear_dirty(kvm, rmapp)) 1042 + if (kvm_test_clear_dirty(kvm, rmapp) && map) 1103 1043 __set_bit_le(i, map); 1104 1044 ++rmapp; 1105 1045 } ··· 1117 1057 unsigned long hva, psize, offset; 1118 1058 unsigned long pa; 1119 1059 unsigned long *physp; 1060 + int srcu_idx; 1120 1061 1062 + srcu_idx = srcu_read_lock(&kvm->srcu); 1121 1063 memslot = gfn_to_memslot(kvm, gfn); 1122 1064 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) 1123 - return NULL; 1065 + goto err; 1124 1066 if (!kvm->arch.using_mmu_notifiers) { 1125 - physp = kvm->arch.slot_phys[memslot->id]; 1067 + physp = memslot->arch.slot_phys; 1126 1068 if (!physp) 1127 - return NULL; 1069 + goto err; 1128 1070 physp += gfn - memslot->base_gfn; 1129 1071 pa = *physp; 1130 1072 if (!pa) { 1131 1073 if (kvmppc_get_guest_page(kvm, gfn, memslot, 1132 1074 PAGE_SIZE) < 0) 1133 - return NULL; 1075 + goto err; 1134 1076 pa = *physp; 1135 1077 } 1136 1078 page = pfn_to_page(pa >> PAGE_SHIFT); ··· 1141 1079 hva = gfn_to_hva_memslot(memslot, gfn); 1142 1080 npages = get_user_pages_fast(hva, 1, 1, pages); 1143 1081 if (npages < 1) 1144 - return NULL; 1082 + goto err; 1145 1083 page = pages[0]; 1146 1084 } 1085 + srcu_read_unlock(&kvm->srcu, srcu_idx); 1086 + 1147 1087 psize = 
PAGE_SIZE; 1148 1088 if (PageHuge(page)) { 1149 1089 page = compound_head(page); ··· 1155 1091 if (nb_ret) 1156 1092 *nb_ret = psize - offset; 1157 1093 return page_address(page) + offset; 1094 + 1095 + err: 1096 + srcu_read_unlock(&kvm->srcu, srcu_idx); 1097 + return NULL; 1158 1098 } 1159 1099 1160 1100 void kvmppc_unpin_guest_page(struct kvm *kvm, void *va) ··· 1166 1098 struct page *page = virt_to_page(va); 1167 1099 1168 1100 put_page(page); 1101 + } 1102 + 1103 + /* 1104 + * Functions for reading and writing the hash table via reads and 1105 + * writes on a file descriptor. 1106 + * 1107 + * Reads return the guest view of the hash table, which has to be 1108 + * pieced together from the real hash table and the guest_rpte 1109 + * values in the revmap array. 1110 + * 1111 + * On writes, each HPTE written is considered in turn, and if it 1112 + * is valid, it is written to the HPT as if an H_ENTER with the 1113 + * exact flag set was done. When the invalid count is non-zero 1114 + * in the header written to the stream, the kernel will make 1115 + * sure that that many HPTEs are invalid, and invalidate them 1116 + * if not. 
1117 + */ 1118 + 1119 + struct kvm_htab_ctx { 1120 + unsigned long index; 1121 + unsigned long flags; 1122 + struct kvm *kvm; 1123 + int first_pass; 1124 + }; 1125 + 1126 + #define HPTE_SIZE (2 * sizeof(unsigned long)) 1127 + 1128 + static long record_hpte(unsigned long flags, unsigned long *hptp, 1129 + unsigned long *hpte, struct revmap_entry *revp, 1130 + int want_valid, int first_pass) 1131 + { 1132 + unsigned long v, r; 1133 + int ok = 1; 1134 + int valid, dirty; 1135 + 1136 + /* Unmodified entries are uninteresting except on the first pass */ 1137 + dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED); 1138 + if (!first_pass && !dirty) 1139 + return 0; 1140 + 1141 + valid = 0; 1142 + if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) { 1143 + valid = 1; 1144 + if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && 1145 + !(hptp[0] & HPTE_V_BOLTED)) 1146 + valid = 0; 1147 + } 1148 + if (valid != want_valid) 1149 + return 0; 1150 + 1151 + v = r = 0; 1152 + if (valid || dirty) { 1153 + /* lock the HPTE so it's stable and read it */ 1154 + preempt_disable(); 1155 + while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) 1156 + cpu_relax(); 1157 + v = hptp[0]; 1158 + if (v & HPTE_V_ABSENT) { 1159 + v &= ~HPTE_V_ABSENT; 1160 + v |= HPTE_V_VALID; 1161 + } 1162 + /* re-evaluate valid and dirty from synchronized HPTE value */ 1163 + valid = !!(v & HPTE_V_VALID); 1164 + if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED)) 1165 + valid = 0; 1166 + r = revp->guest_rpte | (hptp[1] & (HPTE_R_R | HPTE_R_C)); 1167 + dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED); 1168 + /* only clear modified if this is the right sort of entry */ 1169 + if (valid == want_valid && dirty) { 1170 + r &= ~HPTE_GR_MODIFIED; 1171 + revp->guest_rpte = r; 1172 + } 1173 + asm volatile(PPC_RELEASE_BARRIER "" : : : "memory"); 1174 + hptp[0] &= ~HPTE_V_HVLOCK; 1175 + preempt_enable(); 1176 + if (!(valid == want_valid && (first_pass || dirty))) 1177 + ok = 0; 1178 + } 1179 + hpte[0] = v; 1180 + hpte[1] = r; 1181 + return ok; 
1182 + } 1183 + 1184 + static ssize_t kvm_htab_read(struct file *file, char __user *buf, 1185 + size_t count, loff_t *ppos) 1186 + { 1187 + struct kvm_htab_ctx *ctx = file->private_data; 1188 + struct kvm *kvm = ctx->kvm; 1189 + struct kvm_get_htab_header hdr; 1190 + unsigned long *hptp; 1191 + struct revmap_entry *revp; 1192 + unsigned long i, nb, nw; 1193 + unsigned long __user *lbuf; 1194 + struct kvm_get_htab_header __user *hptr; 1195 + unsigned long flags; 1196 + int first_pass; 1197 + unsigned long hpte[2]; 1198 + 1199 + if (!access_ok(VERIFY_WRITE, buf, count)) 1200 + return -EFAULT; 1201 + 1202 + first_pass = ctx->first_pass; 1203 + flags = ctx->flags; 1204 + 1205 + i = ctx->index; 1206 + hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); 1207 + revp = kvm->arch.revmap + i; 1208 + lbuf = (unsigned long __user *)buf; 1209 + 1210 + nb = 0; 1211 + while (nb + sizeof(hdr) + HPTE_SIZE < count) { 1212 + /* Initialize header */ 1213 + hptr = (struct kvm_get_htab_header __user *)buf; 1214 + hdr.n_valid = 0; 1215 + hdr.n_invalid = 0; 1216 + nw = nb; 1217 + nb += sizeof(hdr); 1218 + lbuf = (unsigned long __user *)(buf + sizeof(hdr)); 1219 + 1220 + /* Skip uninteresting entries, i.e. 
clean on not-first pass */ 1221 + if (!first_pass) { 1222 + while (i < kvm->arch.hpt_npte && 1223 + !(revp->guest_rpte & HPTE_GR_MODIFIED)) { 1224 + ++i; 1225 + hptp += 2; 1226 + ++revp; 1227 + } 1228 + } 1229 + hdr.index = i; 1230 + 1231 + /* Grab a series of valid entries */ 1232 + while (i < kvm->arch.hpt_npte && 1233 + hdr.n_valid < 0xffff && 1234 + nb + HPTE_SIZE < count && 1235 + record_hpte(flags, hptp, hpte, revp, 1, first_pass)) { 1236 + /* valid entry, write it out */ 1237 + ++hdr.n_valid; 1238 + if (__put_user(hpte[0], lbuf) || 1239 + __put_user(hpte[1], lbuf + 1)) 1240 + return -EFAULT; 1241 + nb += HPTE_SIZE; 1242 + lbuf += 2; 1243 + ++i; 1244 + hptp += 2; 1245 + ++revp; 1246 + } 1247 + /* Now skip invalid entries while we can */ 1248 + while (i < kvm->arch.hpt_npte && 1249 + hdr.n_invalid < 0xffff && 1250 + record_hpte(flags, hptp, hpte, revp, 0, first_pass)) { 1251 + /* found an invalid entry */ 1252 + ++hdr.n_invalid; 1253 + ++i; 1254 + hptp += 2; 1255 + ++revp; 1256 + } 1257 + 1258 + if (hdr.n_valid || hdr.n_invalid) { 1259 + /* write back the header */ 1260 + if (__copy_to_user(hptr, &hdr, sizeof(hdr))) 1261 + return -EFAULT; 1262 + nw = nb; 1263 + buf = (char __user *)lbuf; 1264 + } else { 1265 + nb = nw; 1266 + } 1267 + 1268 + /* Check if we've wrapped around the hash table */ 1269 + if (i >= kvm->arch.hpt_npte) { 1270 + i = 0; 1271 + ctx->first_pass = 0; 1272 + break; 1273 + } 1274 + } 1275 + 1276 + ctx->index = i; 1277 + 1278 + return nb; 1279 + } 1280 + 1281 + static ssize_t kvm_htab_write(struct file *file, const char __user *buf, 1282 + size_t count, loff_t *ppos) 1283 + { 1284 + struct kvm_htab_ctx *ctx = file->private_data; 1285 + struct kvm *kvm = ctx->kvm; 1286 + struct kvm_get_htab_header hdr; 1287 + unsigned long i, j; 1288 + unsigned long v, r; 1289 + unsigned long __user *lbuf; 1290 + unsigned long *hptp; 1291 + unsigned long tmp[2]; 1292 + ssize_t nb; 1293 + long int err, ret; 1294 + int rma_setup; 1295 + 1296 + if 
(!access_ok(VERIFY_READ, buf, count)) 1297 + return -EFAULT; 1298 + 1299 + /* lock out vcpus from running while we're doing this */ 1300 + mutex_lock(&kvm->lock); 1301 + rma_setup = kvm->arch.rma_setup_done; 1302 + if (rma_setup) { 1303 + kvm->arch.rma_setup_done = 0; /* temporarily */ 1304 + /* order rma_setup_done vs. vcpus_running */ 1305 + smp_mb(); 1306 + if (atomic_read(&kvm->arch.vcpus_running)) { 1307 + kvm->arch.rma_setup_done = 1; 1308 + mutex_unlock(&kvm->lock); 1309 + return -EBUSY; 1310 + } 1311 + } 1312 + 1313 + err = 0; 1314 + for (nb = 0; nb + sizeof(hdr) <= count; ) { 1315 + err = -EFAULT; 1316 + if (__copy_from_user(&hdr, buf, sizeof(hdr))) 1317 + break; 1318 + 1319 + err = 0; 1320 + if (nb + hdr.n_valid * HPTE_SIZE > count) 1321 + break; 1322 + 1323 + nb += sizeof(hdr); 1324 + buf += sizeof(hdr); 1325 + 1326 + err = -EINVAL; 1327 + i = hdr.index; 1328 + if (i >= kvm->arch.hpt_npte || 1329 + i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte) 1330 + break; 1331 + 1332 + hptp = (unsigned long *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); 1333 + lbuf = (unsigned long __user *)buf; 1334 + for (j = 0; j < hdr.n_valid; ++j) { 1335 + err = -EFAULT; 1336 + if (__get_user(v, lbuf) || __get_user(r, lbuf + 1)) 1337 + goto out; 1338 + err = -EINVAL; 1339 + if (!(v & HPTE_V_VALID)) 1340 + goto out; 1341 + lbuf += 2; 1342 + nb += HPTE_SIZE; 1343 + 1344 + if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) 1345 + kvmppc_do_h_remove(kvm, 0, i, 0, tmp); 1346 + err = -EIO; 1347 + ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r, 1348 + tmp); 1349 + if (ret != H_SUCCESS) { 1350 + pr_err("kvm_htab_write ret %ld i=%ld v=%lx " 1351 + "r=%lx\n", ret, i, v, r); 1352 + goto out; 1353 + } 1354 + if (!rma_setup && is_vrma_hpte(v)) { 1355 + unsigned long psize = hpte_page_size(v, r); 1356 + unsigned long senc = slb_pgsize_encoding(psize); 1357 + unsigned long lpcr; 1358 + 1359 + kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | 1360 + (VRMA_VSID << SLB_VSID_SHIFT_1T); 1361 + lpcr 
= kvm->arch.lpcr & ~LPCR_VRMASD; 1362 + lpcr |= senc << (LPCR_VRMASD_SH - 4); 1363 + kvm->arch.lpcr = lpcr; 1364 + rma_setup = 1; 1365 + } 1366 + ++i; 1367 + hptp += 2; 1368 + } 1369 + 1370 + for (j = 0; j < hdr.n_invalid; ++j) { 1371 + if (hptp[0] & (HPTE_V_VALID | HPTE_V_ABSENT)) 1372 + kvmppc_do_h_remove(kvm, 0, i, 0, tmp); 1373 + ++i; 1374 + hptp += 2; 1375 + } 1376 + err = 0; 1377 + } 1378 + 1379 + out: 1380 + /* Order HPTE updates vs. rma_setup_done */ 1381 + smp_wmb(); 1382 + kvm->arch.rma_setup_done = rma_setup; 1383 + mutex_unlock(&kvm->lock); 1384 + 1385 + if (err) 1386 + return err; 1387 + return nb; 1388 + } 1389 + 1390 + static int kvm_htab_release(struct inode *inode, struct file *filp) 1391 + { 1392 + struct kvm_htab_ctx *ctx = filp->private_data; 1393 + 1394 + filp->private_data = NULL; 1395 + if (!(ctx->flags & KVM_GET_HTAB_WRITE)) 1396 + atomic_dec(&ctx->kvm->arch.hpte_mod_interest); 1397 + kvm_put_kvm(ctx->kvm); 1398 + kfree(ctx); 1399 + return 0; 1400 + } 1401 + 1402 + static struct file_operations kvm_htab_fops = { 1403 + .read = kvm_htab_read, 1404 + .write = kvm_htab_write, 1405 + .llseek = default_llseek, 1406 + .release = kvm_htab_release, 1407 + }; 1408 + 1409 + int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf) 1410 + { 1411 + int ret; 1412 + struct kvm_htab_ctx *ctx; 1413 + int rwflag; 1414 + 1415 + /* reject flags we don't recognize */ 1416 + if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE)) 1417 + return -EINVAL; 1418 + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); 1419 + if (!ctx) 1420 + return -ENOMEM; 1421 + kvm_get_kvm(kvm); 1422 + ctx->kvm = kvm; 1423 + ctx->index = ghf->start_index; 1424 + ctx->flags = ghf->flags; 1425 + ctx->first_pass = 1; 1426 + 1427 + rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? 
O_WRONLY : O_RDONLY; 1428 + ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag); 1429 + if (ret < 0) { 1430 + kvm_put_kvm(kvm); 1431 + return ret; 1432 + } 1433 + 1434 + if (rwflag == O_RDONLY) { 1435 + mutex_lock(&kvm->slots_lock); 1436 + atomic_inc(&kvm->arch.hpte_mod_interest); 1437 + /* make sure kvmppc_do_h_enter etc. see the increment */ 1438 + synchronize_srcu_expedited(&kvm->srcu); 1439 + mutex_unlock(&kvm->slots_lock); 1440 + } 1441 + 1442 + return ret; 1169 1443 } 1170 1444 1171 1445 void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)

+15 -1

arch/powerpc/kvm/book3s_emulate.c

··· 22 22 #include <asm/kvm_book3s.h> 23 23 #include <asm/reg.h> 24 24 #include <asm/switch_to.h> 25 + #include <asm/time.h> 25 26 26 27 #define OP_19_XOP_RFID 18 27 28 #define OP_19_XOP_RFI 50 ··· 396 395 (mfmsr() & MSR_HV)) 397 396 vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; 398 397 break; 398 + case SPRN_PURR: 399 + to_book3s(vcpu)->purr_offset = spr_val - get_tb(); 400 + break; 401 + case SPRN_SPURR: 402 + to_book3s(vcpu)->spurr_offset = spr_val - get_tb(); 403 + break; 399 404 case SPRN_GQR0: 400 405 case SPRN_GQR1: 401 406 case SPRN_GQR2: ··· 419 412 case SPRN_CTRLF: 420 413 case SPRN_CTRLT: 421 414 case SPRN_L2CR: 415 + case SPRN_DSCR: 422 416 case SPRN_MMCR0_GEKKO: 423 417 case SPRN_MMCR1_GEKKO: 424 418 case SPRN_PMC1_GEKKO: ··· 491 483 *spr_val = to_book3s(vcpu)->hid[5]; 492 484 break; 493 485 case SPRN_CFAR: 494 - case SPRN_PURR: 486 + case SPRN_DSCR: 495 487 *spr_val = 0; 488 + break; 489 + case SPRN_PURR: 490 + *spr_val = get_tb() + to_book3s(vcpu)->purr_offset; 491 + break; 492 + case SPRN_SPURR: 493 + *spr_val = get_tb() + to_book3s(vcpu)->spurr_offset; 496 494 break; 497 495 case SPRN_GQR0: 498 496 case SPRN_GQR1:

-3

arch/powerpc/kvm/book3s_exports.c

··· 28 28 #ifdef CONFIG_ALTIVEC 29 29 EXPORT_SYMBOL_GPL(kvmppc_load_up_altivec); 30 30 #endif 31 - #ifdef CONFIG_VSX 32 - EXPORT_SYMBOL_GPL(kvmppc_load_up_vsx); 33 - #endif 34 31 #endif 35 32

+501 -174

arch/powerpc/kvm/book3s_hv.c

··· 30 30 #include <linux/cpumask.h> 31 31 #include <linux/spinlock.h> 32 32 #include <linux/page-flags.h> 33 + #include <linux/srcu.h> 33 34 34 35 #include <asm/reg.h> 35 36 #include <asm/cputable.h> ··· 47 46 #include <asm/page.h> 48 47 #include <asm/hvcall.h> 49 48 #include <asm/switch_to.h> 49 + #include <asm/smp.h> 50 50 #include <linux/gfp.h> 51 51 #include <linux/vmalloc.h> 52 52 #include <linux/highmem.h> ··· 57 55 /* #define EXIT_DEBUG_SIMPLE */ 58 56 /* #define EXIT_DEBUG_INT */ 59 57 58 + /* Used to indicate that a guest page fault needs to be handled */ 59 + #define RESUME_PAGE_FAULT (RESUME_GUEST | RESUME_FLAG_ARCH1) 60 + 61 + /* Used as a "null" value for timebase values */ 62 + #define TB_NIL (~(u64)0) 63 + 60 64 static void kvmppc_end_cede(struct kvm_vcpu *vcpu); 61 65 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); 66 + 67 + /* 68 + * We use the vcpu_load/put functions to measure stolen time. 69 + * Stolen time is counted as time when either the vcpu is able to 70 + * run as part of a virtual core, but the task running the vcore 71 + * is preempted or sleeping, or when the vcpu needs something done 72 + * in the kernel by the task running the vcpu, but that task is 73 + * preempted or sleeping. Those two things have to be counted 74 + * separately, since one of the vcpu tasks will take on the job 75 + * of running the core, and the other vcpu tasks in the vcore will 76 + * sleep waiting for it to do that, but that sleep shouldn't count 77 + * as stolen time. 78 + * 79 + * Hence we accumulate stolen time when the vcpu can run as part of 80 + * a vcore using vc->stolen_tb, and the stolen time when the vcpu 81 + * needs its task to do other things in the kernel (for example, 82 + * service a page fault) in busy_stolen. We don't accumulate 83 + * stolen time for a vcore when it is inactive, or for a vcpu 84 + * when it is in state RUNNING or NOTREADY. 
NOTREADY is a bit of 85 + * a misnomer; it means that the vcpu task is not executing in 86 + * the KVM_VCPU_RUN ioctl, i.e. it is in userspace or elsewhere in 87 + * the kernel. We don't have any way of dividing up that time 88 + * between time that the vcpu is genuinely stopped, time that 89 + * the task is actively working on behalf of the vcpu, and time 90 + * that the task is preempted, so we don't count any of it as 91 + * stolen. 92 + * 93 + * Updates to busy_stolen are protected by arch.tbacct_lock; 94 + * updates to vc->stolen_tb are protected by the arch.tbacct_lock 95 + * of the vcpu that has taken responsibility for running the vcore 96 + * (i.e. vc->runner). The stolen times are measured in units of 97 + * timebase ticks. (Note that the != TB_NIL checks below are 98 + * purely defensive; they should never fail.) 99 + */ 62 100 63 101 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 64 102 { 65 103 struct kvmppc_vcore *vc = vcpu->arch.vcore; 66 104 67 - local_paca->kvm_hstate.kvm_vcpu = vcpu; 68 - local_paca->kvm_hstate.kvm_vcore = vc; 69 - if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) 105 + spin_lock(&vcpu->arch.tbacct_lock); 106 + if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE && 107 + vc->preempt_tb != TB_NIL) { 70 108 vc->stolen_tb += mftb() - vc->preempt_tb; 109 + vc->preempt_tb = TB_NIL; 110 + } 111 + if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST && 112 + vcpu->arch.busy_preempt != TB_NIL) { 113 + vcpu->arch.busy_stolen += mftb() - vcpu->arch.busy_preempt; 114 + vcpu->arch.busy_preempt = TB_NIL; 115 + } 116 + spin_unlock(&vcpu->arch.tbacct_lock); 71 117 } 72 118 73 119 void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) 74 120 { 75 121 struct kvmppc_vcore *vc = vcpu->arch.vcore; 76 122 123 + spin_lock(&vcpu->arch.tbacct_lock); 77 124 if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) 78 125 vc->preempt_tb = mftb(); 126 + if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST) 127 + vcpu->arch.busy_preempt = 
mftb(); 128 + spin_unlock(&vcpu->arch.tbacct_lock); 79 129 } 80 130 81 131 void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) ··· 194 140 { 195 141 vpa->shared_proc = 1; 196 142 vpa->yield_count = 1; 143 + } 144 + 145 + static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v, 146 + unsigned long addr, unsigned long len) 147 + { 148 + /* check address is cacheline aligned */ 149 + if (addr & (L1_CACHE_BYTES - 1)) 150 + return -EINVAL; 151 + spin_lock(&vcpu->arch.vpa_update_lock); 152 + if (v->next_gpa != addr || v->len != len) { 153 + v->next_gpa = addr; 154 + v->len = addr ? len : 0; 155 + v->update_pending = 1; 156 + } 157 + spin_unlock(&vcpu->arch.vpa_update_lock); 158 + return 0; 197 159 } 198 160 199 161 /* Length for a per-processor buffer is passed in at offset 4 in the buffer */ ··· 387 317 388 318 static void kvmppc_update_vpas(struct kvm_vcpu *vcpu) 389 319 { 320 + if (!(vcpu->arch.vpa.update_pending || 321 + vcpu->arch.slb_shadow.update_pending || 322 + vcpu->arch.dtl.update_pending)) 323 + return; 324 + 390 325 spin_lock(&vcpu->arch.vpa_update_lock); 391 326 if (vcpu->arch.vpa.update_pending) { 392 327 kvmppc_update_vpa(vcpu, &vcpu->arch.vpa); 393 - init_vpa(vcpu, vcpu->arch.vpa.pinned_addr); 328 + if (vcpu->arch.vpa.pinned_addr) 329 + init_vpa(vcpu, vcpu->arch.vpa.pinned_addr); 394 330 } 395 331 if (vcpu->arch.dtl.update_pending) { 396 332 kvmppc_update_vpa(vcpu, &vcpu->arch.dtl); ··· 408 332 spin_unlock(&vcpu->arch.vpa_update_lock); 409 333 } 410 334 335 + /* 336 + * Return the accumulated stolen time for the vcore up until `now'. 337 + * The caller should hold the vcore lock. 338 + */ 339 + static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now) 340 + { 341 + u64 p; 342 + 343 + /* 344 + * If we are the task running the vcore, then since we hold 345 + * the vcore lock, we can't be preempted, so stolen_tb/preempt_tb 346 + * can't be updated, so we don't need the tbacct_lock. 
347 + * If the vcore is inactive, it can't become active (since we 348 + * hold the vcore lock), so the vcpu load/put functions won't 349 + * update stolen_tb/preempt_tb, and we don't need tbacct_lock. 350 + */ 351 + if (vc->vcore_state != VCORE_INACTIVE && 352 + vc->runner->arch.run_task != current) { 353 + spin_lock(&vc->runner->arch.tbacct_lock); 354 + p = vc->stolen_tb; 355 + if (vc->preempt_tb != TB_NIL) 356 + p += now - vc->preempt_tb; 357 + spin_unlock(&vc->runner->arch.tbacct_lock); 358 + } else { 359 + p = vc->stolen_tb; 360 + } 361 + return p; 362 + } 363 + 411 364 static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, 412 365 struct kvmppc_vcore *vc) 413 366 { 414 367 struct dtl_entry *dt; 415 368 struct lppaca *vpa; 416 - unsigned long old_stolen; 369 + unsigned long stolen; 370 + unsigned long core_stolen; 371 + u64 now; 417 372 418 373 dt = vcpu->arch.dtl_ptr; 419 374 vpa = vcpu->arch.vpa.pinned_addr; 420 - old_stolen = vcpu->arch.stolen_logged; 421 - vcpu->arch.stolen_logged = vc->stolen_tb; 375 + now = mftb(); 376 + core_stolen = vcore_stolen_time(vc, now); 377 + stolen = core_stolen - vcpu->arch.stolen_logged; 378 + vcpu->arch.stolen_logged = core_stolen; 379 + spin_lock(&vcpu->arch.tbacct_lock); 380 + stolen += vcpu->arch.busy_stolen; 381 + vcpu->arch.busy_stolen = 0; 382 + spin_unlock(&vcpu->arch.tbacct_lock); 422 383 if (!dt || !vpa) 423 384 return; 424 385 memset(dt, 0, sizeof(struct dtl_entry)); 425 386 dt->dispatch_reason = 7; 426 387 dt->processor_id = vc->pcpu + vcpu->arch.ptid; 427 - dt->timebase = mftb(); 428 - dt->enqueue_to_dispatch_time = vc->stolen_tb - old_stolen; 388 + dt->timebase = now; 389 + dt->enqueue_to_dispatch_time = stolen; 429 390 dt->srr0 = kvmppc_get_pc(vcpu); 430 391 dt->srr1 = vcpu->arch.shregs.msr; 431 392 ++dt; ··· 479 366 unsigned long req = kvmppc_get_gpr(vcpu, 3); 480 367 unsigned long target, ret = H_SUCCESS; 481 368 struct kvm_vcpu *tvcpu; 369 + int idx; 482 370 483 371 switch (req) { 484 372 case H_ENTER: 
373 + idx = srcu_read_lock(&vcpu->kvm->srcu); 485 374 ret = kvmppc_virtmode_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4), 486 375 kvmppc_get_gpr(vcpu, 5), 487 376 kvmppc_get_gpr(vcpu, 6), 488 377 kvmppc_get_gpr(vcpu, 7)); 378 + srcu_read_unlock(&vcpu->kvm->srcu, idx); 489 379 break; 490 380 case H_CEDE: 491 381 break; ··· 545 429 case BOOK3S_INTERRUPT_PERFMON: 546 430 r = RESUME_GUEST; 547 431 break; 432 + case BOOK3S_INTERRUPT_MACHINE_CHECK: 433 + /* 434 + * Deliver a machine check interrupt to the guest. 435 + * We have to do this, even if the host has handled the 436 + * machine check, because machine checks use SRR0/1 and 437 + * the interrupt might have trashed guest state in them. 438 + */ 439 + kvmppc_book3s_queue_irqprio(vcpu, 440 + BOOK3S_INTERRUPT_MACHINE_CHECK); 441 + r = RESUME_GUEST; 442 + break; 548 443 case BOOK3S_INTERRUPT_PROGRAM: 549 444 { 550 445 ulong flags; ··· 597 470 * have been handled already. 598 471 */ 599 472 case BOOK3S_INTERRUPT_H_DATA_STORAGE: 600 - r = kvmppc_book3s_hv_page_fault(run, vcpu, 601 - vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); 473 + r = RESUME_PAGE_FAULT; 602 474 break; 603 475 case BOOK3S_INTERRUPT_H_INST_STORAGE: 604 - r = kvmppc_book3s_hv_page_fault(run, vcpu, 605 - kvmppc_get_pc(vcpu), 0); 476 + vcpu->arch.fault_dar = kvmppc_get_pc(vcpu); 477 + vcpu->arch.fault_dsisr = 0; 478 + r = RESUME_PAGE_FAULT; 606 479 break; 607 480 /* 608 481 * This occurs if the guest executes an illegal instruction. 
··· 662 535 return 0; 663 536 } 664 537 665 - int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) 538 + int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) 666 539 { 667 - int r = -EINVAL; 540 + int r = 0; 541 + long int i; 668 542 669 - switch (reg->id) { 543 + switch (id) { 670 544 case KVM_REG_PPC_HIOR: 671 - r = put_user(0, (u64 __user *)reg->addr); 545 + *val = get_reg_val(id, 0); 546 + break; 547 + case KVM_REG_PPC_DABR: 548 + *val = get_reg_val(id, vcpu->arch.dabr); 549 + break; 550 + case KVM_REG_PPC_DSCR: 551 + *val = get_reg_val(id, vcpu->arch.dscr); 552 + break; 553 + case KVM_REG_PPC_PURR: 554 + *val = get_reg_val(id, vcpu->arch.purr); 555 + break; 556 + case KVM_REG_PPC_SPURR: 557 + *val = get_reg_val(id, vcpu->arch.spurr); 558 + break; 559 + case KVM_REG_PPC_AMR: 560 + *val = get_reg_val(id, vcpu->arch.amr); 561 + break; 562 + case KVM_REG_PPC_UAMOR: 563 + *val = get_reg_val(id, vcpu->arch.uamor); 564 + break; 565 + case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRA: 566 + i = id - KVM_REG_PPC_MMCR0; 567 + *val = get_reg_val(id, vcpu->arch.mmcr[i]); 568 + break; 569 + case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8: 570 + i = id - KVM_REG_PPC_PMC1; 571 + *val = get_reg_val(id, vcpu->arch.pmc[i]); 572 + break; 573 + #ifdef CONFIG_VSX 574 + case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31: 575 + if (cpu_has_feature(CPU_FTR_VSX)) { 576 + /* VSX => FP reg i is stored in arch.vsr[2*i] */ 577 + long int i = id - KVM_REG_PPC_FPR0; 578 + *val = get_reg_val(id, vcpu->arch.vsr[2 * i]); 579 + } else { 580 + /* let generic code handle it */ 581 + r = -EINVAL; 582 + } 583 + break; 584 + case KVM_REG_PPC_VSR0 ... 
KVM_REG_PPC_VSR31: 585 + if (cpu_has_feature(CPU_FTR_VSX)) { 586 + long int i = id - KVM_REG_PPC_VSR0; 587 + val->vsxval[0] = vcpu->arch.vsr[2 * i]; 588 + val->vsxval[1] = vcpu->arch.vsr[2 * i + 1]; 589 + } else { 590 + r = -ENXIO; 591 + } 592 + break; 593 + #endif /* CONFIG_VSX */ 594 + case KVM_REG_PPC_VPA_ADDR: 595 + spin_lock(&vcpu->arch.vpa_update_lock); 596 + *val = get_reg_val(id, vcpu->arch.vpa.next_gpa); 597 + spin_unlock(&vcpu->arch.vpa_update_lock); 598 + break; 599 + case KVM_REG_PPC_VPA_SLB: 600 + spin_lock(&vcpu->arch.vpa_update_lock); 601 + val->vpaval.addr = vcpu->arch.slb_shadow.next_gpa; 602 + val->vpaval.length = vcpu->arch.slb_shadow.len; 603 + spin_unlock(&vcpu->arch.vpa_update_lock); 604 + break; 605 + case KVM_REG_PPC_VPA_DTL: 606 + spin_lock(&vcpu->arch.vpa_update_lock); 607 + val->vpaval.addr = vcpu->arch.dtl.next_gpa; 608 + val->vpaval.length = vcpu->arch.dtl.len; 609 + spin_unlock(&vcpu->arch.vpa_update_lock); 672 610 break; 673 611 default: 612 + r = -EINVAL; 674 613 break; 675 614 } 676 615 677 616 return r; 678 617 } 679 618 680 - int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) 619 + int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) 681 620 { 682 - int r = -EINVAL; 621 + int r = 0; 622 + long int i; 623 + unsigned long addr, len; 683 624 684 - switch (reg->id) { 625 + switch (id) { 685 626 case KVM_REG_PPC_HIOR: 686 - { 687 - u64 hior; 688 627 /* Only allow this to be set to zero */ 689 - r = get_user(hior, (u64 __user *)reg->addr); 690 - if (!r && (hior != 0)) 628 + if (set_reg_val(id, *val)) 691 629 r = -EINVAL; 692 630 break; 693 - } 631 + case KVM_REG_PPC_DABR: 632 + vcpu->arch.dabr = set_reg_val(id, *val); 633 + break; 634 + case KVM_REG_PPC_DSCR: 635 + vcpu->arch.dscr = set_reg_val(id, *val); 636 + break; 637 + case KVM_REG_PPC_PURR: 638 + vcpu->arch.purr = set_reg_val(id, *val); 639 + break; 640 + case KVM_REG_PPC_SPURR: 641 + vcpu->arch.spurr = set_reg_val(id, *val); 
642 + break; 643 + case KVM_REG_PPC_AMR: 644 + vcpu->arch.amr = set_reg_val(id, *val); 645 + break; 646 + case KVM_REG_PPC_UAMOR: 647 + vcpu->arch.uamor = set_reg_val(id, *val); 648 + break; 649 + case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRA: 650 + i = id - KVM_REG_PPC_MMCR0; 651 + vcpu->arch.mmcr[i] = set_reg_val(id, *val); 652 + break; 653 + case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8: 654 + i = id - KVM_REG_PPC_PMC1; 655 + vcpu->arch.pmc[i] = set_reg_val(id, *val); 656 + break; 657 + #ifdef CONFIG_VSX 658 + case KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31: 659 + if (cpu_has_feature(CPU_FTR_VSX)) { 660 + /* VSX => FP reg i is stored in arch.vsr[2*i] */ 661 + long int i = id - KVM_REG_PPC_FPR0; 662 + vcpu->arch.vsr[2 * i] = set_reg_val(id, *val); 663 + } else { 664 + /* let generic code handle it */ 665 + r = -EINVAL; 666 + } 667 + break; 668 + case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: 669 + if (cpu_has_feature(CPU_FTR_VSX)) { 670 + long int i = id - KVM_REG_PPC_VSR0; 671 + vcpu->arch.vsr[2 * i] = val->vsxval[0]; 672 + vcpu->arch.vsr[2 * i + 1] = val->vsxval[1]; 673 + } else { 674 + r = -ENXIO; 675 + } 676 + break; 677 + #endif /* CONFIG_VSX */ 678 + case KVM_REG_PPC_VPA_ADDR: 679 + addr = set_reg_val(id, *val); 680 + r = -EINVAL; 681 + if (!addr && (vcpu->arch.slb_shadow.next_gpa || 682 + vcpu->arch.dtl.next_gpa)) 683 + break; 684 + r = set_vpa(vcpu, &vcpu->arch.vpa, addr, sizeof(struct lppaca)); 685 + break; 686 + case KVM_REG_PPC_VPA_SLB: 687 + addr = val->vpaval.addr; 688 + len = val->vpaval.length; 689 + r = -EINVAL; 690 + if (addr && !vcpu->arch.vpa.next_gpa) 691 + break; 692 + r = set_vpa(vcpu, &vcpu->arch.slb_shadow, addr, len); 693 + break; 694 + case KVM_REG_PPC_VPA_DTL: 695 + addr = val->vpaval.addr; 696 + len = val->vpaval.length; 697 + r = -EINVAL; 698 + if (addr && (len < sizeof(struct dtl_entry) || 699 + !vcpu->arch.vpa.next_gpa)) 700 + break; 701 + len -= len % sizeof(struct dtl_entry); 702 + r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len); 703 + break; 
694 704 default: 705 + r = -EINVAL; 695 706 break; 696 707 } 697 708 ··· 864 599 goto free_vcpu; 865 600 866 601 vcpu->arch.shared = &vcpu->arch.shregs; 867 - vcpu->arch.last_cpu = -1; 868 602 vcpu->arch.mmcr[0] = MMCR0_FC; 869 603 vcpu->arch.ctrl = CTRL_RUNLATCH; 870 604 /* default to host PVR, since we can't spoof it */ 871 605 vcpu->arch.pvr = mfspr(SPRN_PVR); 872 606 kvmppc_set_pvr(vcpu, vcpu->arch.pvr); 873 607 spin_lock_init(&vcpu->arch.vpa_update_lock); 608 + spin_lock_init(&vcpu->arch.tbacct_lock); 609 + vcpu->arch.busy_preempt = TB_NIL; 874 610 875 611 kvmppc_mmu_book3s_hv_init(vcpu); 876 612 877 - /* 878 - * We consider the vcpu stopped until we see the first run ioctl for it. 879 - */ 880 - vcpu->arch.state = KVMPPC_VCPU_STOPPED; 613 + vcpu->arch.state = KVMPPC_VCPU_NOTREADY; 881 614 882 615 init_waitqueue_head(&vcpu->arch.cpu_run); 883 616 ··· 887 624 INIT_LIST_HEAD(&vcore->runnable_threads); 888 625 spin_lock_init(&vcore->lock); 889 626 init_waitqueue_head(&vcore->wq); 890 - vcore->preempt_tb = mftb(); 627 + vcore->preempt_tb = TB_NIL; 891 628 } 892 629 kvm->arch.vcores[core] = vcore; 630 + kvm->arch.online_vcores++; 893 631 } 894 632 mutex_unlock(&kvm->lock); 895 633 ··· 901 637 ++vcore->num_threads; 902 638 spin_unlock(&vcore->lock); 903 639 vcpu->arch.vcore = vcore; 904 - vcpu->arch.stolen_logged = vcore->stolen_tb; 905 640 906 641 vcpu->arch.cpu_type = KVM_CPU_3S_64; 907 642 kvmppc_sanity_check(vcpu); ··· 960 697 static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, 961 698 struct kvm_vcpu *vcpu) 962 699 { 963 - struct kvm_vcpu *v; 700 + u64 now; 964 701 965 702 if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) 966 703 return; 704 + spin_lock(&vcpu->arch.tbacct_lock); 705 + now = mftb(); 706 + vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) - 707 + vcpu->arch.stolen_logged; 708 + vcpu->arch.busy_preempt = now; 967 709 vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; 710 + spin_unlock(&vcpu->arch.tbacct_lock); 968 711 --vc->n_runnable; 969 - 
++vc->n_busy; 970 - /* decrement the physical thread id of each following vcpu */ 971 - v = vcpu; 972 - list_for_each_entry_continue(v, &vc->runnable_threads, arch.run_list) 973 - --v->arch.ptid; 974 712 list_del(&vcpu->arch.run_list); 975 713 } 976 714 ··· 984 720 985 721 /* Ensure the thread won't go into the kernel if it wakes */ 986 722 tpaca->kvm_hstate.hwthread_req = 1; 723 + tpaca->kvm_hstate.kvm_vcpu = NULL; 987 724 988 725 /* 989 726 * If the thread is already executing in the kernel (e.g. handling ··· 1034 769 smp_wmb(); 1035 770 #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) 1036 771 if (vcpu->arch.ptid) { 1037 - kvmppc_grab_hwthread(cpu); 1038 772 xics_wake_cpu(cpu); 1039 773 ++vc->n_woken; 1040 774 } ··· 1059 795 1060 796 /* 1061 797 * Check that we are on thread 0 and that any other threads in 1062 - * this core are off-line. 798 + * this core are off-line. Then grab the threads so they can't 799 + * enter the kernel. 1063 800 */ 1064 801 static int on_primary_thread(void) 1065 802 { ··· 1072 807 while (++thr < threads_per_core) 1073 808 if (cpu_online(cpu + thr)) 1074 809 return 0; 810 + 811 + /* Grab all hw threads so they can't go into the kernel */ 812 + for (thr = 1; thr < threads_per_core; ++thr) { 813 + if (kvmppc_grab_hwthread(cpu + thr)) { 814 + /* Couldn't grab one; let the others go */ 815 + do { 816 + kvmppc_release_hwthread(cpu + thr); 817 + } while (--thr > 0); 818 + return 0; 819 + } 820 + } 1075 821 return 1; 1076 822 } 1077 823 ··· 1090 814 * Run a set of guest threads on a physical core. 1091 815 * Called with vc->lock held. 
1092 816 */ 1093 - static int kvmppc_run_core(struct kvmppc_vcore *vc) 817 + static void kvmppc_run_core(struct kvmppc_vcore *vc) 1094 818 { 1095 819 struct kvm_vcpu *vcpu, *vcpu0, *vnext; 1096 820 long ret; 1097 821 u64 now; 1098 822 int ptid, i, need_vpa_update; 823 + int srcu_idx; 824 + struct kvm_vcpu *vcpus_to_update[threads_per_core]; 1099 825 1100 826 /* don't start if any threads have a signal pending */ 1101 827 need_vpa_update = 0; 1102 828 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { 1103 829 if (signal_pending(vcpu->arch.run_task)) 1104 - return 0; 1105 - need_vpa_update |= vcpu->arch.vpa.update_pending | 1106 - vcpu->arch.slb_shadow.update_pending | 1107 - vcpu->arch.dtl.update_pending; 830 + return; 831 + if (vcpu->arch.vpa.update_pending || 832 + vcpu->arch.slb_shadow.update_pending || 833 + vcpu->arch.dtl.update_pending) 834 + vcpus_to_update[need_vpa_update++] = vcpu; 1108 835 } 1109 836 1110 837 /* ··· 1117 838 vc->n_woken = 0; 1118 839 vc->nap_count = 0; 1119 840 vc->entry_exit_count = 0; 1120 - vc->vcore_state = VCORE_RUNNING; 841 + vc->vcore_state = VCORE_STARTING; 1121 842 vc->in_guest = 0; 1122 843 vc->napping_threads = 0; 1123 844 ··· 1127 848 */ 1128 849 if (need_vpa_update) { 1129 850 spin_unlock(&vc->lock); 1130 - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) 1131 - kvmppc_update_vpas(vcpu); 851 + for (i = 0; i < need_vpa_update; ++i) 852 + kvmppc_update_vpas(vcpus_to_update[i]); 1132 853 spin_lock(&vc->lock); 1133 - } 1134 - 1135 - /* 1136 - * Make sure we are running on thread 0, and that 1137 - * secondary threads are offline. 1138 - * XXX we should also block attempts to bring any 1139 - * secondary threads online. 
1140 - */ 1141 - if (threads_per_core > 1 && !on_primary_thread()) { 1142 - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) 1143 - vcpu->arch.ret = -EBUSY; 1144 - goto out; 1145 854 } 1146 855 1147 856 /* ··· 1146 879 } 1147 880 } 1148 881 if (!vcpu0) 1149 - return 0; /* nothing to run */ 882 + goto out; /* nothing to run; should never happen */ 1150 883 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) 1151 884 if (vcpu->arch.ceded) 1152 885 vcpu->arch.ptid = ptid++; 1153 886 1154 - vc->stolen_tb += mftb() - vc->preempt_tb; 887 + /* 888 + * Make sure we are running on thread 0, and that 889 + * secondary threads are offline. 890 + */ 891 + if (threads_per_core > 1 && !on_primary_thread()) { 892 + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) 893 + vcpu->arch.ret = -EBUSY; 894 + goto out; 895 + } 896 + 1155 897 vc->pcpu = smp_processor_id(); 1156 898 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { 1157 899 kvmppc_start_thread(vcpu); 1158 900 kvmppc_create_dtl_entry(vcpu, vc); 1159 901 } 1160 - /* Grab any remaining hw threads so they can't go into the kernel */ 1161 - for (i = ptid; i < threads_per_core; ++i) 1162 - kvmppc_grab_hwthread(vc->pcpu + i); 1163 902 903 + vc->vcore_state = VCORE_RUNNING; 1164 904 preempt_disable(); 1165 905 spin_unlock(&vc->lock); 1166 906 1167 907 kvm_guest_enter(); 908 + 909 + srcu_idx = srcu_read_lock(&vcpu0->kvm->srcu); 910 + 1168 911 __kvmppc_vcore_entry(NULL, vcpu0); 1169 - for (i = 0; i < threads_per_core; ++i) 1170 - kvmppc_release_hwthread(vc->pcpu + i); 1171 912 1172 913 spin_lock(&vc->lock); 1173 914 /* disable sending of IPIs on virtual external irqs */ ··· 1184 909 /* wait for secondary threads to finish writing their state to memory */ 1185 910 if (vc->nap_count < vc->n_woken) 1186 911 kvmppc_wait_for_nap(vc); 912 + for (i = 0; i < threads_per_core; ++i) 913 + kvmppc_release_hwthread(vc->pcpu + i); 1187 914 /* prevent other vcpu threads from doing 
kvmppc_start_thread() now */ 1188 915 vc->vcore_state = VCORE_EXITING; 1189 916 spin_unlock(&vc->lock); 917 + 918 + srcu_read_unlock(&vcpu0->kvm->srcu, srcu_idx); 1190 919 1191 920 /* make sure updates to secondary vcpu structs are visible now */ 1192 921 smp_mb(); ··· 1199 920 preempt_enable(); 1200 921 kvm_resched(vcpu); 1201 922 923 + spin_lock(&vc->lock); 1202 924 now = get_tb(); 1203 925 list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { 1204 926 /* cancel pending dec exception if dec is positive */ ··· 1223 943 } 1224 944 } 1225 945 1226 - spin_lock(&vc->lock); 1227 946 out: 1228 947 vc->vcore_state = VCORE_INACTIVE; 1229 - vc->preempt_tb = mftb(); 1230 948 list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, 1231 949 arch.run_list) { 1232 950 if (vcpu->arch.ret != RESUME_GUEST) { ··· 1232 954 wake_up(&vcpu->arch.cpu_run); 1233 955 } 1234 956 } 1235 - 1236 - return 1; 1237 957 } 1238 958 1239 959 /* ··· 1255 979 static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) 1256 980 { 1257 981 DEFINE_WAIT(wait); 1258 - struct kvm_vcpu *v; 1259 - int all_idle = 1; 1260 982 1261 983 prepare_to_wait(&vc->wq, &wait, TASK_INTERRUPTIBLE); 1262 984 vc->vcore_state = VCORE_SLEEPING; 1263 985 spin_unlock(&vc->lock); 1264 - list_for_each_entry(v, &vc->runnable_threads, arch.run_list) { 1265 - if (!v->arch.ceded || v->arch.pending_exceptions) { 1266 - all_idle = 0; 1267 - break; 1268 - } 1269 - } 1270 - if (all_idle) 1271 - schedule(); 986 + schedule(); 1272 987 finish_wait(&vc->wq, &wait); 1273 988 spin_lock(&vc->lock); 1274 989 vc->vcore_state = VCORE_INACTIVE; ··· 1268 1001 static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 1269 1002 { 1270 1003 int n_ceded; 1271 - int prev_state; 1272 1004 struct kvmppc_vcore *vc; 1273 1005 struct kvm_vcpu *v, *vn; 1274 1006 1275 1007 kvm_run->exit_reason = 0; 1276 1008 vcpu->arch.ret = RESUME_GUEST; 1277 1009 vcpu->arch.trap = 0; 1010 + kvmppc_update_vpas(vcpu); 1278 1011 1279 1012 /* 1280 
1013 * Synchronize with other threads in this virtual core ··· 1284 1017 vcpu->arch.ceded = 0; 1285 1018 vcpu->arch.run_task = current; 1286 1019 vcpu->arch.kvm_run = kvm_run; 1287 - prev_state = vcpu->arch.state; 1020 + vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb()); 1288 1021 vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; 1022 + vcpu->arch.busy_preempt = TB_NIL; 1289 1023 list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads); 1290 1024 ++vc->n_runnable; 1291 1025 ··· 1295 1027 * If the vcore is already running, we may be able to start 1296 1028 * this thread straight away and have it join in. 1297 1029 */ 1298 - if (prev_state == KVMPPC_VCPU_STOPPED) { 1030 + if (!signal_pending(current)) { 1299 1031 if (vc->vcore_state == VCORE_RUNNING && 1300 1032 VCORE_EXIT_COUNT(vc) == 0) { 1301 1033 vcpu->arch.ptid = vc->n_runnable - 1; 1034 + kvmppc_create_dtl_entry(vcpu, vc); 1302 1035 kvmppc_start_thread(vcpu); 1036 + } else if (vc->vcore_state == VCORE_SLEEPING) { 1037 + wake_up(&vc->wq); 1303 1038 } 1304 1039 1305 - } else if (prev_state == KVMPPC_VCPU_BUSY_IN_HOST) 1306 - --vc->n_busy; 1040 + } 1307 1041 1308 1042 while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && 1309 1043 !signal_pending(current)) { 1310 - if (vc->n_busy || vc->vcore_state != VCORE_INACTIVE) { 1044 + if (vc->vcore_state != VCORE_INACTIVE) { 1311 1045 spin_unlock(&vc->lock); 1312 1046 kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE); 1313 1047 spin_lock(&vc->lock); 1314 1048 continue; 1315 1049 } 1316 - vc->runner = vcpu; 1317 - n_ceded = 0; 1318 - list_for_each_entry(v, &vc->runnable_threads, arch.run_list) 1319 - n_ceded += v->arch.ceded; 1320 - if (n_ceded == vc->n_runnable) 1321 - kvmppc_vcore_blocked(vc); 1322 - else 1323 - kvmppc_run_core(vc); 1324 - 1325 1050 list_for_each_entry_safe(v, vn, &vc->runnable_threads, 1326 1051 arch.run_list) { 1327 1052 kvmppc_core_prepare_to_enter(v); ··· 1326 1065 wake_up(&v->arch.cpu_run); 1327 1066 } 1328 1067 } 1068 + if (!vc->n_runnable || 
vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) 1069 + break; 1070 + vc->runner = vcpu; 1071 + n_ceded = 0; 1072 + list_for_each_entry(v, &vc->runnable_threads, arch.run_list) 1073 + if (!v->arch.pending_exceptions) 1074 + n_ceded += v->arch.ceded; 1075 + if (n_ceded == vc->n_runnable) 1076 + kvmppc_vcore_blocked(vc); 1077 + else 1078 + kvmppc_run_core(vc); 1329 1079 vc->runner = NULL; 1330 1080 } 1331 1081 1332 - if (signal_pending(current)) { 1333 - if (vc->vcore_state == VCORE_RUNNING || 1334 - vc->vcore_state == VCORE_EXITING) { 1335 - spin_unlock(&vc->lock); 1336 - kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE); 1337 - spin_lock(&vc->lock); 1338 - } 1339 - if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { 1340 - kvmppc_remove_runnable(vc, vcpu); 1341 - vcpu->stat.signal_exits++; 1342 - kvm_run->exit_reason = KVM_EXIT_INTR; 1343 - vcpu->arch.ret = -EINTR; 1344 - } 1082 + while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && 1083 + (vc->vcore_state == VCORE_RUNNING || 1084 + vc->vcore_state == VCORE_EXITING)) { 1085 + spin_unlock(&vc->lock); 1086 + kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE); 1087 + spin_lock(&vc->lock); 1088 + } 1089 + 1090 + if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { 1091 + kvmppc_remove_runnable(vc, vcpu); 1092 + vcpu->stat.signal_exits++; 1093 + kvm_run->exit_reason = KVM_EXIT_INTR; 1094 + vcpu->arch.ret = -EINTR; 1095 + } 1096 + 1097 + if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) { 1098 + /* Wake up some vcpu to run the core */ 1099 + v = list_first_entry(&vc->runnable_threads, 1100 + struct kvm_vcpu, arch.run_list); 1101 + wake_up(&v->arch.cpu_run); 1345 1102 } 1346 1103 1347 1104 spin_unlock(&vc->lock); ··· 1369 1090 int kvmppc_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) 1370 1091 { 1371 1092 int r; 1093 + int srcu_idx; 1372 1094 1373 1095 if (!vcpu->arch.sane) { 1374 1096 run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ··· 1400 1120 flush_vsx_to_thread(current); 1401 1121 vcpu->arch.wqp = &vcpu->arch.vcore->wq; 1402 
1122 vcpu->arch.pgdir = current->mm->pgd; 1123 + vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; 1403 1124 1404 1125 do { 1405 1126 r = kvmppc_run_vcpu(run, vcpu); ··· 1409 1128 !(vcpu->arch.shregs.msr & MSR_PR)) { 1410 1129 r = kvmppc_pseries_do_hcall(vcpu); 1411 1130 kvmppc_core_prepare_to_enter(vcpu); 1131 + } else if (r == RESUME_PAGE_FAULT) { 1132 + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 1133 + r = kvmppc_book3s_hv_page_fault(run, vcpu, 1134 + vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); 1135 + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); 1412 1136 } 1413 1137 } while (r == RESUME_GUEST); 1414 1138 1415 1139 out: 1140 + vcpu->arch.state = KVMPPC_VCPU_NOTREADY; 1416 1141 atomic_dec(&vcpu->kvm->arch.vcpus_running); 1417 1142 return r; 1418 1143 } ··· 1560 1273 n = kvm_dirty_bitmap_bytes(memslot); 1561 1274 memset(memslot->dirty_bitmap, 0, n); 1562 1275 1563 - r = kvmppc_hv_get_dirty_log(kvm, memslot); 1276 + r = kvmppc_hv_get_dirty_log(kvm, memslot, memslot->dirty_bitmap); 1564 1277 if (r) 1565 1278 goto out; 1566 1279 ··· 1574 1287 return r; 1575 1288 } 1576 1289 1577 - static unsigned long slb_pgsize_encoding(unsigned long psize) 1578 - { 1579 - unsigned long senc = 0; 1580 - 1581 - if (psize > 0x1000) { 1582 - senc = SLB_VSID_L; 1583 - if (psize == 0x10000) 1584 - senc |= SLB_VSID_LP_01; 1585 - } 1586 - return senc; 1587 - } 1588 - 1589 - int kvmppc_core_prepare_memory_region(struct kvm *kvm, 1590 - struct kvm_userspace_memory_region *mem) 1591 - { 1592 - unsigned long npages; 1593 - unsigned long *phys; 1594 - 1595 - /* Allocate a slot_phys array */ 1596 - phys = kvm->arch.slot_phys[mem->slot]; 1597 - if (!kvm->arch.using_mmu_notifiers && !phys) { 1598 - npages = mem->memory_size >> PAGE_SHIFT; 1599 - phys = vzalloc(npages * sizeof(unsigned long)); 1600 - if (!phys) 1601 - return -ENOMEM; 1602 - kvm->arch.slot_phys[mem->slot] = phys; 1603 - kvm->arch.slot_npages[mem->slot] = npages; 1604 - } 1605 - 1606 - return 0; 1607 - } 1608 - 1609 - static void 
unpin_slot(struct kvm *kvm, int slot_id) 1290 + static void unpin_slot(struct kvm_memory_slot *memslot) 1610 1291 { 1611 1292 unsigned long *physp; 1612 1293 unsigned long j, npages, pfn; 1613 1294 struct page *page; 1614 1295 1615 - physp = kvm->arch.slot_phys[slot_id]; 1616 - npages = kvm->arch.slot_npages[slot_id]; 1617 - if (physp) { 1618 - spin_lock(&kvm->arch.slot_phys_lock); 1619 - for (j = 0; j < npages; j++) { 1620 - if (!(physp[j] & KVMPPC_GOT_PAGE)) 1621 - continue; 1622 - pfn = physp[j] >> PAGE_SHIFT; 1623 - page = pfn_to_page(pfn); 1624 - SetPageDirty(page); 1625 - put_page(page); 1626 - } 1627 - kvm->arch.slot_phys[slot_id] = NULL; 1628 - spin_unlock(&kvm->arch.slot_phys_lock); 1629 - vfree(physp); 1296 + physp = memslot->arch.slot_phys; 1297 + npages = memslot->npages; 1298 + if (!physp) 1299 + return; 1300 + for (j = 0; j < npages; j++) { 1301 + if (!(physp[j] & KVMPPC_GOT_PAGE)) 1302 + continue; 1303 + pfn = physp[j] >> PAGE_SHIFT; 1304 + page = pfn_to_page(pfn); 1305 + SetPageDirty(page); 1306 + put_page(page); 1630 1307 } 1631 1308 } 1632 1309 1633 - void kvmppc_core_commit_memory_region(struct kvm *kvm, 1634 - struct kvm_userspace_memory_region *mem) 1310 + void kvmppc_core_free_memslot(struct kvm_memory_slot *free, 1311 + struct kvm_memory_slot *dont) 1635 1312 { 1313 + if (!dont || free->arch.rmap != dont->arch.rmap) { 1314 + vfree(free->arch.rmap); 1315 + free->arch.rmap = NULL; 1316 + } 1317 + if (!dont || free->arch.slot_phys != dont->arch.slot_phys) { 1318 + unpin_slot(free); 1319 + vfree(free->arch.slot_phys); 1320 + free->arch.slot_phys = NULL; 1321 + } 1322 + } 1323 + 1324 + int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, 1325 + unsigned long npages) 1326 + { 1327 + slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); 1328 + if (!slot->arch.rmap) 1329 + return -ENOMEM; 1330 + slot->arch.slot_phys = NULL; 1331 + 1332 + return 0; 1333 + } 1334 + 1335 + int kvmppc_core_prepare_memory_region(struct kvm *kvm, 1336 + 
struct kvm_memory_slot *memslot, 1337 + struct kvm_userspace_memory_region *mem) 1338 + { 1339 + unsigned long *phys; 1340 + 1341 + /* Allocate a slot_phys array if needed */ 1342 + phys = memslot->arch.slot_phys; 1343 + if (!kvm->arch.using_mmu_notifiers && !phys && memslot->npages) { 1344 + phys = vzalloc(memslot->npages * sizeof(unsigned long)); 1345 + if (!phys) 1346 + return -ENOMEM; 1347 + memslot->arch.slot_phys = phys; 1348 + } 1349 + 1350 + return 0; 1351 + } 1352 + 1353 + void kvmppc_core_commit_memory_region(struct kvm *kvm, 1354 + struct kvm_userspace_memory_region *mem, 1355 + struct kvm_memory_slot old) 1356 + { 1357 + unsigned long npages = mem->memory_size >> PAGE_SHIFT; 1358 + struct kvm_memory_slot *memslot; 1359 + 1360 + if (npages && old.npages) { 1361 + /* 1362 + * If modifying a memslot, reset all the rmap dirty bits. 1363 + * If this is a new memslot, we don't need to do anything 1364 + * since the rmap array starts out as all zeroes, 1365 + * i.e. no pages are dirty. 
1366 + */ 1367 + memslot = id_to_memslot(kvm->memslots, mem->slot); 1368 + kvmppc_hv_get_dirty_log(kvm, memslot, NULL); 1369 + } 1636 1370 } 1637 1371 1638 1372 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) ··· 1670 1362 unsigned long rmls; 1671 1363 unsigned long *physp; 1672 1364 unsigned long i, npages; 1365 + int srcu_idx; 1673 1366 1674 1367 mutex_lock(&kvm->lock); 1675 1368 if (kvm->arch.rma_setup_done) ··· 1686 1377 } 1687 1378 1688 1379 /* Look up the memslot for guest physical address 0 */ 1380 + srcu_idx = srcu_read_lock(&kvm->srcu); 1689 1381 memslot = gfn_to_memslot(kvm, 0); 1690 1382 1691 1383 /* We must have some memory at 0 by now */ 1692 1384 err = -EINVAL; 1693 1385 if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) 1694 - goto out; 1386 + goto out_srcu; 1695 1387 1696 1388 /* Look up the VMA for the start of this memory slot */ 1697 1389 hva = memslot->userspace_addr; ··· 1716 1406 err = -EPERM; 1717 1407 if (cpu_has_feature(CPU_FTR_ARCH_201)) { 1718 1408 pr_err("KVM: CPU requires an RMO\n"); 1719 - goto out; 1409 + goto out_srcu; 1720 1410 } 1721 1411 1722 1412 /* We can handle 4k, 64k or 16M pages in the VRMA */ 1723 1413 err = -EINVAL; 1724 1414 if (!(psize == 0x1000 || psize == 0x10000 || 1725 1415 psize == 0x1000000)) 1726 - goto out; 1416 + goto out_srcu; 1727 1417 1728 1418 /* Update VRMASD field in the LPCR */ 1729 1419 senc = slb_pgsize_encoding(psize); ··· 1746 1436 err = -EINVAL; 1747 1437 if (rmls < 0) { 1748 1438 pr_err("KVM: Can't use RMA of 0x%lx bytes\n", rma_size); 1749 - goto out; 1439 + goto out_srcu; 1750 1440 } 1751 1441 atomic_inc(&ri->use_count); 1752 1442 kvm->arch.rma = ri; ··· 1775 1465 /* Initialize phys addrs of pages in RMO */ 1776 1466 npages = ri->npages; 1777 1467 porder = __ilog2(npages); 1778 - physp = kvm->arch.slot_phys[memslot->id]; 1779 - spin_lock(&kvm->arch.slot_phys_lock); 1780 - for (i = 0; i < npages; ++i) 1781 - physp[i] = ((ri->base_pfn + i) << PAGE_SHIFT) + porder; 1782 - 
spin_unlock(&kvm->arch.slot_phys_lock); 1468 + physp = memslot->arch.slot_phys; 1469 + if (physp) { 1470 + if (npages > memslot->npages) 1471 + npages = memslot->npages; 1472 + spin_lock(&kvm->arch.slot_phys_lock); 1473 + for (i = 0; i < npages; ++i) 1474 + physp[i] = ((ri->base_pfn + i) << PAGE_SHIFT) + 1475 + porder; 1476 + spin_unlock(&kvm->arch.slot_phys_lock); 1477 + } 1783 1478 } 1784 1479 1785 1480 /* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */ 1786 1481 smp_wmb(); 1787 1482 kvm->arch.rma_setup_done = 1; 1788 1483 err = 0; 1484 + out_srcu: 1485 + srcu_read_unlock(&kvm->srcu, srcu_idx); 1789 1486 out: 1790 1487 mutex_unlock(&kvm->lock); 1791 1488 return err; ··· 1812 1495 if (lpid < 0) 1813 1496 return -ENOMEM; 1814 1497 kvm->arch.lpid = lpid; 1498 + 1499 + /* 1500 + * Since we don't flush the TLB when tearing down a VM, 1501 + * and this lpid might have previously been used, 1502 + * make sure we flush on each core before running the new VM. 1503 + */ 1504 + cpumask_setall(&kvm->arch.need_tlb_flush); 1815 1505 1816 1506 INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); 1817 1507 ··· 1847 1523 1848 1524 kvm->arch.using_mmu_notifiers = !!cpu_has_feature(CPU_FTR_ARCH_206); 1849 1525 spin_lock_init(&kvm->arch.slot_phys_lock); 1526 + 1527 + /* 1528 + * Don't allow secondary CPU threads to come online 1529 + * while any KVM VMs exist. 1530 + */ 1531 + inhibit_secondary_onlining(); 1532 + 1850 1533 return 0; 1851 1534 } 1852 1535 1853 1536 void kvmppc_core_destroy_vm(struct kvm *kvm) 1854 1537 { 1855 - unsigned long i; 1856 - 1857 - if (!kvm->arch.using_mmu_notifiers) 1858 - for (i = 0; i < KVM_MEM_SLOTS_NUM; i++) 1859 - unpin_slot(kvm, i); 1538 + uninhibit_secondary_onlining(); 1860 1539 1861 1540 if (kvm->arch.rma) { 1862 1541 kvm_release_rma(kvm->arch.rma);

+2 -2

arch/powerpc/kvm/book3s_hv_builtin.c

··· 157 157 linear_info = alloc_bootmem(count * sizeof(struct kvmppc_linear_info)); 158 158 for (i = 0; i < count; ++i) { 159 159 linear = alloc_bootmem_align(size, size); 160 - pr_info("Allocated KVM %s at %p (%ld MB)\n", typestr, linear, 161 - size >> 20); 160 + pr_debug("Allocated KVM %s at %p (%ld MB)\n", typestr, linear, 161 + size >> 20); 162 162 linear_info[i].base_virt = linear; 163 163 linear_info[i].base_pfn = __pa(linear) >> PAGE_SHIFT; 164 164 linear_info[i].npages = npages;

+144

arch/powerpc/kvm/book3s_hv_ras.c

··· 1 + /* 2 + * This program is free software; you can redistribute it and/or modify 3 + * it under the terms of the GNU General Public License, version 2, as 4 + * published by the Free Software Foundation. 5 + * 6 + * Copyright 2012 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> 7 + */ 8 + 9 + #include <linux/types.h> 10 + #include <linux/string.h> 11 + #include <linux/kvm.h> 12 + #include <linux/kvm_host.h> 13 + #include <linux/kernel.h> 14 + #include <asm/opal.h> 15 + 16 + /* SRR1 bits for machine check on POWER7 */ 17 + #define SRR1_MC_LDSTERR (1ul << (63-42)) 18 + #define SRR1_MC_IFETCH_SH (63-45) 19 + #define SRR1_MC_IFETCH_MASK 0x7 20 + #define SRR1_MC_IFETCH_SLBPAR 2 /* SLB parity error */ 21 + #define SRR1_MC_IFETCH_SLBMULTI 3 /* SLB multi-hit */ 22 + #define SRR1_MC_IFETCH_SLBPARMULTI 4 /* SLB parity + multi-hit */ 23 + #define SRR1_MC_IFETCH_TLBMULTI 5 /* I-TLB multi-hit */ 24 + 25 + /* DSISR bits for machine check on POWER7 */ 26 + #define DSISR_MC_DERAT_MULTI 0x800 /* D-ERAT multi-hit */ 27 + #define DSISR_MC_TLB_MULTI 0x400 /* D-TLB multi-hit */ 28 + #define DSISR_MC_SLB_PARITY 0x100 /* SLB parity error */ 29 + #define DSISR_MC_SLB_MULTI 0x080 /* SLB multi-hit */ 30 + #define DSISR_MC_SLB_PARMULTI 0x040 /* SLB parity + multi-hit */ 31 + 32 + /* POWER7 SLB flush and reload */ 33 + static void reload_slb(struct kvm_vcpu *vcpu) 34 + { 35 + struct slb_shadow *slb; 36 + unsigned long i, n; 37 + 38 + /* First clear out SLB */ 39 + asm volatile("slbmte %0,%0; slbia" : : "r" (0)); 40 + 41 + /* Do they have an SLB shadow buffer registered? 
*/ 42 + slb = vcpu->arch.slb_shadow.pinned_addr; 43 + if (!slb) 44 + return; 45 + 46 + /* Sanity check */ 47 + n = min_t(u32, slb->persistent, SLB_MIN_SIZE); 48 + if ((void *) &slb->save_area[n] > vcpu->arch.slb_shadow.pinned_end) 49 + return; 50 + 51 + /* Load up the SLB from that */ 52 + for (i = 0; i < n; ++i) { 53 + unsigned long rb = slb->save_area[i].esid; 54 + unsigned long rs = slb->save_area[i].vsid; 55 + 56 + rb = (rb & ~0xFFFul) | i; /* insert entry number */ 57 + asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb)); 58 + } 59 + } 60 + 61 + /* POWER7 TLB flush */ 62 + static void flush_tlb_power7(struct kvm_vcpu *vcpu) 63 + { 64 + unsigned long i, rb; 65 + 66 + rb = TLBIEL_INVAL_SET_LPID; 67 + for (i = 0; i < POWER7_TLB_SETS; ++i) { 68 + asm volatile("tlbiel %0" : : "r" (rb)); 69 + rb += 1 << TLBIEL_INVAL_SET_SHIFT; 70 + } 71 + } 72 + 73 + /* 74 + * On POWER7, see if we can handle a machine check that occurred inside 75 + * the guest in real mode, without switching to the host partition. 76 + * 77 + * Returns: 0 => exit guest, 1 => deliver machine check to guest 78 + */ 79 + static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu) 80 + { 81 + unsigned long srr1 = vcpu->arch.shregs.msr; 82 + struct opal_machine_check_event *opal_evt; 83 + long handled = 1; 84 + 85 + if (srr1 & SRR1_MC_LDSTERR) { 86 + /* error on load/store */ 87 + unsigned long dsisr = vcpu->arch.shregs.dsisr; 88 + 89 + if (dsisr & (DSISR_MC_SLB_PARMULTI | DSISR_MC_SLB_MULTI | 90 + DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI)) { 91 + /* flush and reload SLB; flushes D-ERAT too */ 92 + reload_slb(vcpu); 93 + dsisr &= ~(DSISR_MC_SLB_PARMULTI | DSISR_MC_SLB_MULTI | 94 + DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI); 95 + } 96 + if (dsisr & DSISR_MC_TLB_MULTI) { 97 + flush_tlb_power7(vcpu); 98 + dsisr &= ~DSISR_MC_TLB_MULTI; 99 + } 100 + /* Any other errors we don't understand? 
*/ 101 + if (dsisr & 0xffffffffUL) 102 + handled = 0; 103 + } 104 + 105 + switch ((srr1 >> SRR1_MC_IFETCH_SH) & SRR1_MC_IFETCH_MASK) { 106 + case 0: 107 + break; 108 + case SRR1_MC_IFETCH_SLBPAR: 109 + case SRR1_MC_IFETCH_SLBMULTI: 110 + case SRR1_MC_IFETCH_SLBPARMULTI: 111 + reload_slb(vcpu); 112 + break; 113 + case SRR1_MC_IFETCH_TLBMULTI: 114 + flush_tlb_power7(vcpu); 115 + break; 116 + default: 117 + handled = 0; 118 + } 119 + 120 + /* 121 + * See if OPAL has already handled the condition. 122 + * We assume that if the condition is recovered then OPAL 123 + * will have generated an error log event that we will pick 124 + * up and log later. 125 + */ 126 + opal_evt = local_paca->opal_mc_evt; 127 + if (opal_evt->version == OpalMCE_V1 && 128 + (opal_evt->severity == OpalMCE_SEV_NO_ERROR || 129 + opal_evt->disposition == OpalMCE_DISPOSITION_RECOVERED)) 130 + handled = 1; 131 + 132 + if (handled) 133 + opal_evt->in_use = 0; 134 + 135 + return handled; 136 + } 137 + 138 + long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu) 139 + { 140 + if (cpu_has_feature(CPU_FTR_ARCH_206)) 141 + return kvmppc_realmode_mc_power7(vcpu); 142 + 143 + return 0; 144 + }

+114 -29

arch/powerpc/kvm/book3s_hv_rm_mmu.c

··· 35 35 return __va(addr); 36 36 } 37 37 38 + /* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */ 39 + static int global_invalidates(struct kvm *kvm, unsigned long flags) 40 + { 41 + int global; 42 + 43 + /* 44 + * If there is only one vcore, and it's currently running, 45 + * we can use tlbiel as long as we mark all other physical 46 + * cores as potentially having stale TLB entries for this lpid. 47 + * If we're not using MMU notifiers, we never take pages away 48 + * from the guest, so we can use tlbiel if requested. 49 + * Otherwise, don't use tlbiel. 50 + */ 51 + if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcore) 52 + global = 0; 53 + else if (kvm->arch.using_mmu_notifiers) 54 + global = 1; 55 + else 56 + global = !(flags & H_LOCAL); 57 + 58 + if (!global) { 59 + /* any other core might now have stale TLB entries... */ 60 + smp_wmb(); 61 + cpumask_setall(&kvm->arch.need_tlb_flush); 62 + cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu, 63 + &kvm->arch.need_tlb_flush); 64 + } 65 + 66 + return global; 67 + } 68 + 38 69 /* 39 70 * Add this HPTE into the chain for the real page. 40 71 * Must be called with the chain locked; it unlocks the chain. ··· 90 59 head->back = pte_index; 91 60 } else { 92 61 rev->forw = rev->back = pte_index; 93 - i = pte_index; 62 + *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | 63 + pte_index | KVMPPC_RMAP_PRESENT; 94 64 } 95 - smp_wmb(); 96 - *rmap = i | KVMPPC_RMAP_REFERENCED | KVMPPC_RMAP_PRESENT; /* unlock */ 65 + unlock_rmap(rmap); 97 66 } 98 67 EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); 68 + 69 + /* 70 + * Note modification of an HPTE; set the HPTE modified bit 71 + * if anyone is interested. 
72 + */ 73 + static inline void note_hpte_modification(struct kvm *kvm, 74 + struct revmap_entry *rev) 75 + { 76 + if (atomic_read(&kvm->arch.hpte_mod_interest)) 77 + rev->guest_rpte |= HPTE_GR_MODIFIED; 78 + } 99 79 100 80 /* Remove this HPTE from the chain for a real page */ 101 81 static void remove_revmap_chain(struct kvm *kvm, long pte_index, ··· 123 81 ptel = rev->guest_rpte |= rcbits; 124 82 gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel)); 125 83 memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn); 126 - if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) 84 + if (!memslot) 127 85 return; 128 86 129 87 rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]); ··· 145 103 unlock_rmap(rmap); 146 104 } 147 105 148 - static pte_t lookup_linux_pte(struct kvm_vcpu *vcpu, unsigned long hva, 106 + static pte_t lookup_linux_pte(pgd_t *pgdir, unsigned long hva, 149 107 int writing, unsigned long *pte_sizep) 150 108 { 151 109 pte_t *ptep; 152 110 unsigned long ps = *pte_sizep; 153 111 unsigned int shift; 154 112 155 - ptep = find_linux_pte_or_hugepte(vcpu->arch.pgdir, hva, &shift); 113 + ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift); 156 114 if (!ptep) 157 115 return __pte(0); 158 116 if (shift) ··· 172 130 hpte[0] = hpte_v; 173 131 } 174 132 175 - long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, 176 - long pte_index, unsigned long pteh, unsigned long ptel) 133 + long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, 134 + long pte_index, unsigned long pteh, unsigned long ptel, 135 + pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret) 177 136 { 178 - struct kvm *kvm = vcpu->kvm; 179 137 unsigned long i, pa, gpa, gfn, psize; 180 138 unsigned long slot_fn, hva; 181 139 unsigned long *hpte; 182 140 struct revmap_entry *rev; 183 - unsigned long g_ptel = ptel; 141 + unsigned long g_ptel; 184 142 struct kvm_memory_slot *memslot; 185 143 unsigned long *physp, pte_size; 186 144 unsigned long is_io; ··· 189 147 unsigned int 
writing; 190 148 unsigned long mmu_seq; 191 149 unsigned long rcbits; 192 - bool realmode = vcpu->arch.vcore->vcore_state == VCORE_RUNNING; 193 150 194 151 psize = hpte_page_size(pteh, ptel); 195 152 if (!psize) 196 153 return H_PARAMETER; 197 154 writing = hpte_is_writable(ptel); 198 155 pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID); 156 + ptel &= ~HPTE_GR_RESERVED; 157 + g_ptel = ptel; 199 158 200 159 /* used later to detect if we might have been invalidated */ 201 160 mmu_seq = kvm->mmu_notifier_seq; ··· 226 183 rmap = &memslot->arch.rmap[slot_fn]; 227 184 228 185 if (!kvm->arch.using_mmu_notifiers) { 229 - physp = kvm->arch.slot_phys[memslot->id]; 186 + physp = memslot->arch.slot_phys; 230 187 if (!physp) 231 188 return H_PARAMETER; 232 189 physp += slot_fn; ··· 244 201 245 202 /* Look up the Linux PTE for the backing page */ 246 203 pte_size = psize; 247 - pte = lookup_linux_pte(vcpu, hva, writing, &pte_size); 204 + pte = lookup_linux_pte(pgdir, hva, writing, &pte_size); 248 205 if (pte_present(pte)) { 249 206 if (writing && !pte_write(pte)) 250 207 /* make the actual HPTE be read-only */ ··· 253 210 pa = pte_pfn(pte) << PAGE_SHIFT; 254 211 } 255 212 } 213 + 256 214 if (pte_size < psize) 257 215 return H_PARAMETER; 258 216 if (pa && pte_size > psize) ··· 331 287 rev = &kvm->arch.revmap[pte_index]; 332 288 if (realmode) 333 289 rev = real_vmalloc_addr(rev); 334 - if (rev) 290 + if (rev) { 335 291 rev->guest_rpte = g_ptel; 292 + note_hpte_modification(kvm, rev); 293 + } 336 294 337 295 /* Link HPTE into reverse-map chain */ 338 296 if (pteh & HPTE_V_VALID) { ··· 343 297 lock_rmap(rmap); 344 298 /* Check for pending invalidations under the rmap chain lock */ 345 299 if (kvm->arch.using_mmu_notifiers && 346 - mmu_notifier_retry(vcpu, mmu_seq)) { 300 + mmu_notifier_retry(kvm, mmu_seq)) { 347 301 /* inval in progress, write a non-present HPTE */ 348 302 pteh |= HPTE_V_ABSENT; 349 303 pteh &= ~HPTE_V_VALID; ··· 364 318 hpte[0] = pteh; 365 319 asm 
volatile("ptesync" : : : "memory"); 366 320 367 - vcpu->arch.gpr[4] = pte_index; 321 + *pte_idx_ret = pte_index; 368 322 return H_SUCCESS; 369 323 } 370 - EXPORT_SYMBOL_GPL(kvmppc_h_enter); 324 + EXPORT_SYMBOL_GPL(kvmppc_do_h_enter); 325 + 326 + long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, 327 + long pte_index, unsigned long pteh, unsigned long ptel) 328 + { 329 + return kvmppc_do_h_enter(vcpu->kvm, flags, pte_index, pteh, ptel, 330 + vcpu->arch.pgdir, true, &vcpu->arch.gpr[4]); 331 + } 371 332 372 333 #define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token)) 373 334 ··· 396 343 return old == 0; 397 344 } 398 345 399 - long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, 400 - unsigned long pte_index, unsigned long avpn, 401 - unsigned long va) 346 + long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, 347 + unsigned long pte_index, unsigned long avpn, 348 + unsigned long *hpret) 402 349 { 403 - struct kvm *kvm = vcpu->kvm; 404 350 unsigned long *hpte; 405 351 unsigned long v, r, rb; 406 352 struct revmap_entry *rev; ··· 421 369 if (v & HPTE_V_VALID) { 422 370 hpte[0] &= ~HPTE_V_VALID; 423 371 rb = compute_tlbie_rb(v, hpte[1], pte_index); 424 - if (!(flags & H_LOCAL) && atomic_read(&kvm->online_vcpus) > 1) { 372 + if (global_invalidates(kvm, flags)) { 425 373 while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) 426 374 cpu_relax(); 427 375 asm volatile("ptesync" : : : "memory"); ··· 437 385 /* Read PTE low word after tlbie to get final R/C values */ 438 386 remove_revmap_chain(kvm, pte_index, rev, v, hpte[1]); 439 387 } 440 - r = rev->guest_rpte; 388 + r = rev->guest_rpte & ~HPTE_GR_RESERVED; 389 + note_hpte_modification(kvm, rev); 441 390 unlock_hpte(hpte, 0); 442 391 443 - vcpu->arch.gpr[4] = v; 444 - vcpu->arch.gpr[5] = r; 392 + hpret[0] = v; 393 + hpret[1] = r; 445 394 return H_SUCCESS; 395 + } 396 + EXPORT_SYMBOL_GPL(kvmppc_do_h_remove); 397 + 398 + long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, 399 + unsigned 
long pte_index, unsigned long avpn) 400 + { 401 + return kvmppc_do_h_remove(vcpu->kvm, flags, pte_index, avpn, 402 + &vcpu->arch.gpr[4]); 446 403 } 447 404 448 405 long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) ··· 520 459 521 460 args[j] = ((0x80 | flags) << 56) + pte_index; 522 461 rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); 462 + note_hpte_modification(kvm, rev); 523 463 524 464 if (!(hp[0] & HPTE_V_VALID)) { 525 465 /* insert R and C bits from PTE */ ··· 596 534 return H_NOT_FOUND; 597 535 } 598 536 599 - if (atomic_read(&kvm->online_vcpus) == 1) 600 - flags |= H_LOCAL; 601 537 v = hpte[0]; 602 538 bits = (flags << 55) & HPTE_R_PP0; 603 539 bits |= (flags << 48) & HPTE_R_KEY_HI; ··· 608 548 if (rev) { 609 549 r = (rev->guest_rpte & ~mask) | bits; 610 550 rev->guest_rpte = r; 551 + note_hpte_modification(kvm, rev); 611 552 } 612 553 r = (hpte[1] & ~mask) | bits; 613 554 ··· 616 555 if (v & HPTE_V_VALID) { 617 556 rb = compute_tlbie_rb(v, r, pte_index); 618 557 hpte[0] = v & ~HPTE_V_VALID; 619 - if (!(flags & H_LOCAL)) { 558 + if (global_invalidates(kvm, flags)) { 620 559 while(!try_lock_tlbie(&kvm->arch.tlbie_lock)) 621 560 cpu_relax(); 622 561 asm volatile("ptesync" : : : "memory"); ··· 628 567 asm volatile("ptesync" : : : "memory"); 629 568 asm volatile("tlbiel %0" : : "r" (rb)); 630 569 asm volatile("ptesync" : : : "memory"); 570 + } 571 + /* 572 + * If the host has this page as readonly but the guest 573 + * wants to make it read/write, reduce the permissions. 574 + * Checking the host permissions involves finding the 575 + * memslot and then the Linux PTE for the page. 
576 + */ 577 + if (hpte_is_writable(r) && kvm->arch.using_mmu_notifiers) { 578 + unsigned long psize, gfn, hva; 579 + struct kvm_memory_slot *memslot; 580 + pgd_t *pgdir = vcpu->arch.pgdir; 581 + pte_t pte; 582 + 583 + psize = hpte_page_size(v, r); 584 + gfn = ((r & HPTE_R_RPN) & ~(psize - 1)) >> PAGE_SHIFT; 585 + memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn); 586 + if (memslot) { 587 + hva = __gfn_to_hva_memslot(memslot, gfn); 588 + pte = lookup_linux_pte(pgdir, hva, 1, &psize); 589 + if (pte_present(pte) && !pte_write(pte)) 590 + r = hpte_make_readonly(r); 591 + } 631 592 } 632 593 } 633 594 hpte[1] = r; ··· 682 599 v &= ~HPTE_V_ABSENT; 683 600 v |= HPTE_V_VALID; 684 601 } 685 - if (v & HPTE_V_VALID) 602 + if (v & HPTE_V_VALID) { 686 603 r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C)); 604 + r &= ~HPTE_GR_RESERVED; 605 + } 687 606 vcpu->arch.gpr[4 + i * 2] = v; 688 607 vcpu->arch.gpr[5 + i * 2] = r; 689 608 }

+81 -61

arch/powerpc/kvm/book3s_hv_rmhandlers.S

··· 27 27 #include <asm/asm-offsets.h> 28 28 #include <asm/exception-64s.h> 29 29 #include <asm/kvm_book3s_asm.h> 30 + #include <asm/mmu-hash64.h> 30 31 31 32 /***************************************************************************** 32 33 * * ··· 135 134 136 135 27: /* XXX should handle hypervisor maintenance interrupts etc. here */ 137 136 137 + /* reload vcpu pointer after clearing the IPI */ 138 + ld r4,HSTATE_KVM_VCPU(r13) 139 + cmpdi r4,0 138 140 /* if we have no vcpu to run, go back to sleep */ 139 - beq cr1,kvm_no_guest 141 + beq kvm_no_guest 140 142 141 143 /* were we napping due to cede? */ 142 144 lbz r0,HSTATE_NAPPING(r13) ··· 314 310 mtspr SPRN_SDR1,r6 /* switch to partition page table */ 315 311 mtspr SPRN_LPID,r7 316 312 isync 313 + 314 + /* See if we need to flush the TLB */ 315 + lhz r6,PACAPACAINDEX(r13) /* test_bit(cpu, need_tlb_flush) */ 316 + clrldi r7,r6,64-6 /* extract bit number (6 bits) */ 317 + srdi r6,r6,6 /* doubleword number */ 318 + sldi r6,r6,3 /* address offset */ 319 + add r6,r6,r9 320 + addi r6,r6,KVM_NEED_FLUSH /* dword in kvm->arch.need_tlb_flush */ 317 321 li r0,1 322 + sld r0,r0,r7 323 + ld r7,0(r6) 324 + and. r7,r7,r0 325 + beq 22f 326 + 23: ldarx r7,0,r6 /* if set, clear the bit */ 327 + andc r7,r7,r0 328 + stdcx. r7,0,r6 329 + bne 23b 330 + li r6,128 /* and flush the TLB */ 331 + mtctr r6 332 + li r7,0x800 /* IS field = 0b10 */ 333 + ptesync 334 + 28: tlbiel r7 335 + addi r7,r7,0x1000 336 + bdnz 28b 337 + ptesync 338 + 339 + 22: li r0,1 318 340 stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */ 319 341 b 10f 320 342 ··· 362 332 li r12,BOOK3S_INTERRUPT_HV_DECREMENTER 363 333 mr r9,r4 364 334 blt hdec_soon 365 - 366 - /* 367 - * Invalidate the TLB if we could possibly have stale TLB 368 - * entries for this partition on this core due to the use 369 - * of tlbiel. 370 - * XXX maybe only need this on primary thread? 
371 - */ 372 - ld r9,VCPU_KVM(r4) /* pointer to struct kvm */ 373 - lwz r5,VCPU_VCPUID(r4) 374 - lhz r6,PACAPACAINDEX(r13) 375 - rldimi r6,r5,0,62 /* XXX map as if threads 1:1 p:v */ 376 - lhz r8,VCPU_LAST_CPU(r4) 377 - sldi r7,r6,1 /* see if this is the same vcpu */ 378 - add r7,r7,r9 /* as last ran on this pcpu */ 379 - lhz r0,KVM_LAST_VCPU(r7) 380 - cmpw r6,r8 /* on the same cpu core as last time? */ 381 - bne 3f 382 - cmpw r0,r5 /* same vcpu as this core last ran? */ 383 - beq 1f 384 - 3: sth r6,VCPU_LAST_CPU(r4) /* if not, invalidate partition TLB */ 385 - sth r5,KVM_LAST_VCPU(r7) 386 - li r6,128 387 - mtctr r6 388 - li r7,0x800 /* IS field = 0b10 */ 389 - ptesync 390 - 2: tlbiel r7 391 - addi r7,r7,0x1000 392 - bdnz 2b 393 - ptesync 394 - 1: 395 335 396 336 /* Save purr/spurr */ 397 337 mfspr r5,SPRN_PURR ··· 679 679 1: 680 680 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) 681 681 682 - nohpte_cont: 683 - hcall_real_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ 682 + guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ 684 683 /* Save DEC */ 685 684 mfspr r5,SPRN_DEC 686 685 mftb r6 ··· 699 700 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) 700 701 std r6, VCPU_FAULT_DAR(r9) 701 702 stw r7, VCPU_FAULT_DSISR(r9) 703 + 704 + /* See if it is a machine check */ 705 + cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK 706 + beq machine_check_realmode 707 + mc_cont: 702 708 703 709 /* Save guest CTRL register, set runlatch to 1 */ 704 710 6: mfspr r6,SPRN_CTRLF ··· 1117 1113 /* 1118 1114 * For external and machine check interrupts, we need 1119 1115 * to call the Linux handler to process the interrupt. 1120 - * We do that by jumping to the interrupt vector address 1121 - * which we have in r12. The [h]rfid at the end of the 1116 + * We do that by jumping to absolute address 0x500 for 1117 + * external interrupts, or the machine_check_fwnmi label 1118 + * for machine checks (since firmware might have patched 1119 + * the vector area at 0x200). 
The [h]rfid at the end of the 1122 1120 * handler will return to the book3s_hv_interrupts.S code. 1123 1121 * For other interrupts we do the rfid to get back 1124 - * to the book3s_interrupts.S code here. 1122 + * to the book3s_hv_interrupts.S code here. 1125 1123 */ 1126 1124 ld r8, HSTATE_VMHANDLER(r13) 1127 1125 ld r7, HSTATE_HOST_MSR(r13) 1128 1126 1127 + cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK 1129 1128 cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL 1129 + BEGIN_FTR_SECTION 1130 1130 beq 11f 1131 - cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK 1131 + END_FTR_SECTION_IFSET(CPU_FTR_ARCH_206) 1132 1132 1133 1133 /* RFI into the highmem handler, or branch to interrupt handler */ 1134 - 12: mfmsr r6 1135 - mtctr r12 1134 + mfmsr r6 1136 1135 li r0, MSR_RI 1137 1136 andc r6, r6, r0 1138 1137 mtmsrd r6, 1 /* Clear RI in MSR */ 1139 1138 mtsrr0 r8 1140 1139 mtsrr1 r7 1141 - beqctr 1140 + beqa 0x500 /* external interrupt (PPC970) */ 1141 + beq cr1, 13f /* machine check */ 1142 1142 RFI 1143 1143 1144 - 11: 1145 - BEGIN_FTR_SECTION 1146 - b 12b 1147 - END_FTR_SECTION_IFSET(CPU_FTR_ARCH_201) 1148 - mtspr SPRN_HSRR0, r8 1144 + /* On POWER7, we have external interrupts set to use HSRR0/1 */ 1145 + 11: mtspr SPRN_HSRR0, r8 1149 1146 mtspr SPRN_HSRR1, r7 1150 1147 ba 0x500 1148 + 1149 + 13: b machine_check_fwnmi 1151 1150 1152 1151 /* 1153 1152 * Check whether an HDSI is an HPTE not found fault or something else. ··· 1184 1177 cmpdi r3, 0 /* retry the instruction */ 1185 1178 beq 6f 1186 1179 cmpdi r3, -1 /* handle in kernel mode */ 1187 - beq nohpte_cont 1180 + beq guest_exit_cont 1188 1181 cmpdi r3, -2 /* MMIO emulation; need instr word */ 1189 1182 beq 2f 1190 1183 ··· 1198 1191 li r10, BOOK3S_INTERRUPT_DATA_STORAGE 1199 1192 li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ 1200 1193 rotldi r11, r11, 63 1194 + fast_interrupt_c_return: 1201 1195 6: ld r7, VCPU_CTR(r9) 1202 1196 lwz r8, VCPU_XER(r9) 1203 1197 mtctr r7 ··· 1231 1223 /* Unset guest mode. 
*/ 1232 1224 li r0, KVM_GUEST_MODE_NONE 1233 1225 stb r0, HSTATE_IN_GUEST(r13) 1234 - b nohpte_cont 1226 + b guest_exit_cont 1235 1227 1236 1228 /* 1237 1229 * Similarly for an HISI, reflect it to the guest as an ISI unless ··· 1257 1249 ld r11, VCPU_MSR(r9) 1258 1250 li r12, BOOK3S_INTERRUPT_H_INST_STORAGE 1259 1251 cmpdi r3, 0 /* retry the instruction */ 1260 - beq 6f 1252 + beq fast_interrupt_c_return 1261 1253 cmpdi r3, -1 /* handle in kernel mode */ 1262 - beq nohpte_cont 1254 + beq guest_exit_cont 1263 1255 1264 1256 /* Synthesize an ISI for the guest */ 1265 1257 mr r11, r3 ··· 1268 1260 li r10, BOOK3S_INTERRUPT_INST_STORAGE 1269 1261 li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ 1270 1262 rotldi r11, r11, 63 1271 - 6: ld r7, VCPU_CTR(r9) 1272 - lwz r8, VCPU_XER(r9) 1273 - mtctr r7 1274 - mtxer r8 1275 - mr r4, r9 1276 - b fast_guest_return 1263 + b fast_interrupt_c_return 1277 1264 1278 1265 3: ld r6, VCPU_KVM(r9) /* not relocated, use VRMA */ 1279 1266 ld r5, KVM_VRMA_SLB_V(r6) ··· 1284 1281 hcall_try_real_mode: 1285 1282 ld r3,VCPU_GPR(R3)(r9) 1286 1283 andi. 
r0,r11,MSR_PR 1287 - bne hcall_real_cont 1284 + bne guest_exit_cont 1288 1285 clrrdi r3,r3,2 1289 1286 cmpldi r3,hcall_real_table_end - hcall_real_table 1290 - bge hcall_real_cont 1287 + bge guest_exit_cont 1291 1288 LOAD_REG_ADDR(r4, hcall_real_table) 1292 1289 lwzx r3,r3,r4 1293 1290 cmpwi r3,0 1294 - beq hcall_real_cont 1291 + beq guest_exit_cont 1295 1292 add r3,r3,r4 1296 1293 mtctr r3 1297 1294 mr r3,r9 /* get vcpu pointer */ ··· 1312 1309 li r12,BOOK3S_INTERRUPT_SYSCALL 1313 1310 ld r9, HSTATE_KVM_VCPU(r13) 1314 1311 1315 - b hcall_real_cont 1312 + b guest_exit_cont 1316 1313 1317 1314 .globl hcall_real_table 1318 1315 hcall_real_table: ··· 1571 1568 li r3,H_TOO_HARD 1572 1569 blr 1573 1570 1571 + /* Try to handle a machine check in real mode */ 1572 + machine_check_realmode: 1573 + mr r3, r9 /* get vcpu pointer */ 1574 + bl .kvmppc_realmode_machine_check 1575 + nop 1576 + cmpdi r3, 0 /* continue exiting from guest? */ 1577 + ld r9, HSTATE_KVM_VCPU(r13) 1578 + li r12, BOOK3S_INTERRUPT_MACHINE_CHECK 1579 + beq mc_cont 1580 + /* If not, deliver a machine check. SRR0/1 are already set */ 1581 + li r10, BOOK3S_INTERRUPT_MACHINE_CHECK 1582 + li r11, (MSR_ME << 1) | 1 /* synthesize MSR_SF | MSR_ME */ 1583 + rotldi r11, r11, 63 1584 + b fast_interrupt_c_return 1585 + 1574 1586 secondary_too_late: 1575 1587 ld r5,HSTATE_KVM_VCORE(r13) 1576 1588 HMT_LOW ··· 1605 1587 .endr 1606 1588 1607 1589 secondary_nap: 1590 + /* Clear our vcpu pointer so we don't come back in early */ 1591 + li r0, 0 1592 + std r0, HSTATE_KVM_VCPU(r13) 1593 + lwsync 1608 1594 /* Clear any pending IPI - assume we're a secondary thread */ 1609 1595 ld r5, HSTATE_XICS_PHYS(r13) 1610 1596 li r7, XICS_XIRR ··· 1634 1612 kvm_no_guest: 1635 1613 li r0, KVM_HWTHREAD_IN_NAP 1636 1614 stb r0, HSTATE_HWTHREAD_STATE(r13) 1637 - li r0, 0 1638 - std r0, HSTATE_KVM_VCPU(r13) 1639 1615 1640 1616 li r3, LPCR_PECE0 1641 1617 mfspr r4, SPRN_LPCR

-5

arch/powerpc/kvm/book3s_mmu_hpte.c

··· 114 114 hlist_del_init_rcu(&pte->list_vpte); 115 115 hlist_del_init_rcu(&pte->list_vpte_long); 116 116 117 - if (pte->pte.may_write) 118 - kvm_release_pfn_dirty(pte->pfn); 119 - else 120 - kvm_release_pfn_clean(pte->pfn); 121 - 122 117 spin_unlock(&vcpu3s->mmu_lock); 123 118 124 119 vcpu3s->hpte_cache_count--;

+193 -101

arch/powerpc/kvm/book3s_pr.c

··· 52 52 #define MSR_USER32 MSR_USER 53 53 #define MSR_USER64 MSR_USER 54 54 #define HW_PAGE_SIZE PAGE_SIZE 55 - #define __hard_irq_disable local_irq_disable 56 - #define __hard_irq_enable local_irq_enable 57 55 #endif 58 56 59 57 void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) ··· 64 66 svcpu->slb_max = to_book3s(vcpu)->slb_shadow_max; 65 67 svcpu_put(svcpu); 66 68 #endif 67 - 69 + vcpu->cpu = smp_processor_id(); 68 70 #ifdef CONFIG_PPC_BOOK3S_32 69 71 current->thread.kvm_shadow_vcpu = to_book3s(vcpu)->shadow_vcpu; 70 72 #endif ··· 81 83 svcpu_put(svcpu); 82 84 #endif 83 85 84 - kvmppc_giveup_ext(vcpu, MSR_FP); 85 - kvmppc_giveup_ext(vcpu, MSR_VEC); 86 - kvmppc_giveup_ext(vcpu, MSR_VSX); 86 + kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX); 87 + vcpu->cpu = -1; 87 88 } 89 + 90 + int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) 91 + { 92 + int r = 1; /* Indicate we want to get back into the guest */ 93 + 94 + /* We misuse TLB_FLUSH to indicate that we want to clear 95 + all shadow cache entries */ 96 + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) 97 + kvmppc_mmu_pte_flush(vcpu, 0, 0); 98 + 99 + return r; 100 + } 101 + 102 + /************* MMU Notifiers *************/ 103 + 104 + int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) 105 + { 106 + trace_kvm_unmap_hva(hva); 107 + 108 + /* 109 + * Flush all shadow tlb entries everywhere. 
This is slow, but 110 + * we are 100% sure that we catch the to be unmapped page 111 + */ 112 + kvm_flush_remote_tlbs(kvm); 113 + 114 + return 0; 115 + } 116 + 117 + int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) 118 + { 119 + /* kvm_unmap_hva flushes everything anyways */ 120 + kvm_unmap_hva(kvm, start); 121 + 122 + return 0; 123 + } 124 + 125 + int kvm_age_hva(struct kvm *kvm, unsigned long hva) 126 + { 127 + /* XXX could be more clever ;) */ 128 + return 0; 129 + } 130 + 131 + int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) 132 + { 133 + /* XXX could be more clever ;) */ 134 + return 0; 135 + } 136 + 137 + void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) 138 + { 139 + /* The page will get remapped properly on its next fault */ 140 + kvm_unmap_hva(kvm, hva); 141 + } 142 + 143 + /*****************************************/ 88 144 89 145 static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) 90 146 { 91 147 ulong smsr = vcpu->arch.shared->msr; 92 148 93 149 /* Guest MSR values */ 94 - smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_DE; 150 + smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE; 95 151 /* Process MSR values */ 96 152 smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE; 97 153 /* External providers the guest reserved */ ··· 431 379 432 380 static inline int get_fpr_index(int i) 433 381 { 434 - #ifdef CONFIG_VSX 435 - i *= 2; 436 - #endif 437 - return i; 382 + return i * TS_FPRWIDTH; 438 383 } 439 384 440 385 /* Give up external provider (FPU, Altivec, VSX) */ ··· 445 396 u64 *thread_fpr = (u64*)t->fpr; 446 397 int i; 447 398 448 - if (!(vcpu->arch.guest_owned_ext & msr)) 399 + /* 400 + * VSX instructions can access FP and vector registers, so if 401 + * we are giving up VSX, make sure we give up FP and VMX as well. 
402 + */ 403 + if (msr & MSR_VSX) 404 + msr |= MSR_FP | MSR_VEC; 405 + 406 + msr &= vcpu->arch.guest_owned_ext; 407 + if (!msr) 449 408 return; 450 409 451 410 #ifdef DEBUG_EXT 452 411 printk(KERN_INFO "Giving up ext 0x%lx\n", msr); 453 412 #endif 454 413 455 - switch (msr) { 456 - case MSR_FP: 414 + if (msr & MSR_FP) { 415 + /* 416 + * Note that on CPUs with VSX, giveup_fpu stores 417 + * both the traditional FP registers and the added VSX 418 + * registers into thread.fpr[]. 419 + */ 457 420 giveup_fpu(current); 458 421 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) 459 422 vcpu_fpr[i] = thread_fpr[get_fpr_index(i)]; 460 423 461 424 vcpu->arch.fpscr = t->fpscr.val; 462 - break; 463 - case MSR_VEC: 425 + 426 + #ifdef CONFIG_VSX 427 + if (cpu_has_feature(CPU_FTR_VSX)) 428 + for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr) / 2; i++) 429 + vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1]; 430 + #endif 431 + } 432 + 464 433 #ifdef CONFIG_ALTIVEC 434 + if (msr & MSR_VEC) { 465 435 giveup_altivec(current); 466 436 memcpy(vcpu->arch.vr, t->vr, sizeof(vcpu->arch.vr)); 467 437 vcpu->arch.vscr = t->vscr; 468 - #endif 469 - break; 470 - case MSR_VSX: 471 - #ifdef CONFIG_VSX 472 - __giveup_vsx(current); 473 - for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++) 474 - vcpu_vsx[i] = thread_fpr[get_fpr_index(i) + 1]; 475 - #endif 476 - break; 477 - default: 478 - BUG(); 479 438 } 439 + #endif 480 440 481 - vcpu->arch.guest_owned_ext &= ~msr; 482 - current->thread.regs->msr &= ~msr; 441 + vcpu->arch.guest_owned_ext &= ~(msr | MSR_VSX); 483 442 kvmppc_recalc_shadow_msr(vcpu); 484 443 } 485 444 ··· 547 490 return RESUME_GUEST; 548 491 } 549 492 550 - /* We already own the ext */ 551 - if (vcpu->arch.guest_owned_ext & msr) { 552 - return RESUME_GUEST; 493 + if (msr == MSR_VSX) { 494 + /* No VSX? 
Give an illegal instruction interrupt */ 495 + #ifdef CONFIG_VSX 496 + if (!cpu_has_feature(CPU_FTR_VSX)) 497 + #endif 498 + { 499 + kvmppc_core_queue_program(vcpu, SRR1_PROGILL); 500 + return RESUME_GUEST; 501 + } 502 + 503 + /* 504 + * We have to load up all the FP and VMX registers before 505 + * we can let the guest use VSX instructions. 506 + */ 507 + msr = MSR_FP | MSR_VEC | MSR_VSX; 553 508 } 509 + 510 + /* See if we already own all the ext(s) needed */ 511 + msr &= ~vcpu->arch.guest_owned_ext; 512 + if (!msr) 513 + return RESUME_GUEST; 554 514 555 515 #ifdef DEBUG_EXT 556 516 printk(KERN_INFO "Loading up ext 0x%lx\n", msr); ··· 575 501 576 502 current->thread.regs->msr |= msr; 577 503 578 - switch (msr) { 579 - case MSR_FP: 504 + if (msr & MSR_FP) { 580 505 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) 581 506 thread_fpr[get_fpr_index(i)] = vcpu_fpr[i]; 582 - 507 + #ifdef CONFIG_VSX 508 + for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr) / 2; i++) 509 + thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i]; 510 + #endif 583 511 t->fpscr.val = vcpu->arch.fpscr; 584 512 t->fpexc_mode = 0; 585 513 kvmppc_load_up_fpu(); 586 - break; 587 - case MSR_VEC: 514 + } 515 + 516 + if (msr & MSR_VEC) { 588 517 #ifdef CONFIG_ALTIVEC 589 518 memcpy(t->vr, vcpu->arch.vr, sizeof(vcpu->arch.vr)); 590 519 t->vscr = vcpu->arch.vscr; 591 520 t->vrsave = -1; 592 521 kvmppc_load_up_altivec(); 593 522 #endif 594 - break; 595 - case MSR_VSX: 596 - #ifdef CONFIG_VSX 597 - for (i = 0; i < ARRAY_SIZE(vcpu->arch.vsr); i++) 598 - thread_fpr[get_fpr_index(i) + 1] = vcpu_vsx[i]; 599 - kvmppc_load_up_vsx(); 600 - #endif 601 - break; 602 - default: 603 - BUG(); 604 523 } 605 524 606 525 vcpu->arch.guest_owned_ext |= msr; 607 - 608 526 kvmppc_recalc_shadow_msr(vcpu); 609 527 610 528 return RESUME_GUEST; ··· 606 540 unsigned int exit_nr) 607 541 { 608 542 int r = RESUME_HOST; 543 + int s; 609 544 610 545 vcpu->stat.sum_exits++; 611 546 612 547 run->exit_reason = KVM_EXIT_UNKNOWN; 613 548 
run->ready_for_interrupt_injection = 1; 614 549 615 - /* We get here with MSR.EE=0, so enable it to be a nice citizen */ 616 - __hard_irq_enable(); 550 + /* We get here with MSR.EE=1 */ 617 551 618 - trace_kvm_book3s_exit(exit_nr, vcpu); 619 - preempt_enable(); 620 - kvm_resched(vcpu); 552 + trace_kvm_exit(exit_nr, vcpu); 553 + kvm_guest_exit(); 554 + 621 555 switch (exit_nr) { 622 556 case BOOK3S_INTERRUPT_INST_STORAGE: 623 557 { ··· 868 802 } 869 803 } 870 804 871 - preempt_disable(); 872 805 if (!(r & RESUME_HOST)) { 873 806 /* To avoid clobbering exit_reason, only check for signals if 874 807 * we aren't already exiting to userspace for some other ··· 879 814 * and if we really did time things so badly, then we just exit 880 815 * again due to a host external interrupt. 881 816 */ 882 - __hard_irq_disable(); 883 - if (signal_pending(current)) { 884 - __hard_irq_enable(); 885 - #ifdef EXIT_DEBUG 886 - printk(KERN_EMERG "KVM: Going back to host\n"); 887 - #endif 888 - vcpu->stat.signal_exits++; 889 - run->exit_reason = KVM_EXIT_INTR; 890 - r = -EINTR; 817 + local_irq_disable(); 818 + s = kvmppc_prepare_to_enter(vcpu); 819 + if (s <= 0) { 820 + local_irq_enable(); 821 + r = s; 891 822 } else { 892 - /* In case an interrupt came in that was triggered 893 - * from userspace (like DEC), we need to check what 894 - * to inject now! 
*/ 895 - kvmppc_core_prepare_to_enter(vcpu); 823 + kvmppc_lazy_ee_enable(); 896 824 } 897 825 } 898 826 ··· 957 899 return 0; 958 900 } 959 901 960 - int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) 902 + int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) 961 903 { 962 - int r = -EINVAL; 904 + int r = 0; 963 905 964 - switch (reg->id) { 906 + switch (id) { 965 907 case KVM_REG_PPC_HIOR: 966 - r = copy_to_user((u64 __user *)(long)reg->addr, 967 - &to_book3s(vcpu)->hior, sizeof(u64)); 908 + *val = get_reg_val(id, to_book3s(vcpu)->hior); 968 909 break; 910 + #ifdef CONFIG_VSX 911 + case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: { 912 + long int i = id - KVM_REG_PPC_VSR0; 913 + 914 + if (!cpu_has_feature(CPU_FTR_VSX)) { 915 + r = -ENXIO; 916 + break; 917 + } 918 + val->vsxval[0] = vcpu->arch.fpr[i]; 919 + val->vsxval[1] = vcpu->arch.vsr[i]; 920 + break; 921 + } 922 + #endif /* CONFIG_VSX */ 969 923 default: 924 + r = -EINVAL; 970 925 break; 971 926 } 972 927 973 928 return r; 974 929 } 975 930 976 - int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) 931 + int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, union kvmppc_one_reg *val) 977 932 { 978 - int r = -EINVAL; 933 + int r = 0; 979 934 980 - switch (reg->id) { 935 + switch (id) { 981 936 case KVM_REG_PPC_HIOR: 982 - r = copy_from_user(&to_book3s(vcpu)->hior, 983 - (u64 __user *)(long)reg->addr, sizeof(u64)); 984 - if (!r) 985 - to_book3s(vcpu)->hior_explicit = true; 937 + to_book3s(vcpu)->hior = set_reg_val(id, *val); 938 + to_book3s(vcpu)->hior_explicit = true; 986 939 break; 940 + #ifdef CONFIG_VSX 941 + case KVM_REG_PPC_VSR0 ... 
KVM_REG_PPC_VSR31: { 942 + long int i = id - KVM_REG_PPC_VSR0; 943 + 944 + if (!cpu_has_feature(CPU_FTR_VSX)) { 945 + r = -ENXIO; 946 + break; 947 + } 948 + vcpu->arch.fpr[i] = val->vsxval[0]; 949 + vcpu->arch.vsr[i] = val->vsxval[1]; 950 + break; 951 + } 952 + #endif /* CONFIG_VSX */ 987 953 default: 954 + r = -EINVAL; 988 955 break; 989 956 } 990 957 ··· 1103 1020 #endif 1104 1021 ulong ext_msr; 1105 1022 1106 - preempt_disable(); 1107 - 1108 1023 /* Check if we can run the vcpu at all */ 1109 1024 if (!vcpu->arch.sane) { 1110 1025 kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR; ··· 1110 1029 goto out; 1111 1030 } 1112 1031 1113 - kvmppc_core_prepare_to_enter(vcpu); 1114 - 1115 1032 /* 1116 1033 * Interrupts could be timers for the guest which we have to inject 1117 1034 * again, so let's postpone them until we're in the guest and if we 1118 1035 * really did time things so badly, then we just exit again due to 1119 1036 * a host external interrupt. 1120 1037 */ 1121 - __hard_irq_disable(); 1122 - 1123 - /* No need to go into the guest when all we do is going out */ 1124 - if (signal_pending(current)) { 1125 - __hard_irq_enable(); 1126 - kvm_run->exit_reason = KVM_EXIT_INTR; 1127 - ret = -EINTR; 1038 + local_irq_disable(); 1039 + ret = kvmppc_prepare_to_enter(vcpu); 1040 + if (ret <= 0) { 1041 + local_irq_enable(); 1128 1042 goto out; 1129 1043 } 1130 1044 ··· 1146 1070 /* Save VSX state in stack */ 1147 1071 used_vsr = current->thread.used_vsr; 1148 1072 if (used_vsr && (current->thread.regs->msr & MSR_VSX)) 1149 - __giveup_vsx(current); 1073 + __giveup_vsx(current); 1150 1074 #endif 1151 1075 1152 1076 /* Remember the MSR with disabled extensions */ ··· 1156 1080 if (vcpu->arch.shared->msr & MSR_FP) 1157 1081 kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); 1158 1082 1159 - kvm_guest_enter(); 1083 + kvmppc_lazy_ee_enable(); 1160 1084 1161 1085 ret = __kvmppc_vcpu_run(kvm_run, vcpu); 1162 1086 1163 - kvm_guest_exit(); 1087 + /* No need for 
kvm_guest_exit. It's done in handle_exit. 1088 + We also get here with interrupts enabled. */ 1089 + 1090 + /* Make sure we save the guest FPU/Altivec/VSX state */ 1091 + kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX); 1164 1092 1165 1093 current->thread.regs->msr = ext_msr; 1166 1094 1167 - /* Make sure we save the guest FPU/Altivec/VSX state */ 1168 - kvmppc_giveup_ext(vcpu, MSR_FP); 1169 - kvmppc_giveup_ext(vcpu, MSR_VEC); 1170 - kvmppc_giveup_ext(vcpu, MSR_VSX); 1171 - 1172 - /* Restore FPU state from stack */ 1095 + /* Restore FPU/VSX state from stack */ 1173 1096 memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr)); 1174 1097 current->thread.fpscr.val = fpscr; 1175 1098 current->thread.fpexc_mode = fpexc_mode; ··· 1188 1113 #endif 1189 1114 1190 1115 out: 1191 - preempt_enable(); 1116 + vcpu->mode = OUTSIDE_GUEST_MODE; 1192 1117 return ret; 1193 1118 } 1194 1119 ··· 1256 1181 } 1257 1182 #endif /* CONFIG_PPC64 */ 1258 1183 1184 + void kvmppc_core_free_memslot(struct kvm_memory_slot *free, 1185 + struct kvm_memory_slot *dont) 1186 + { 1187 + } 1188 + 1189 + int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, 1190 + unsigned long npages) 1191 + { 1192 + return 0; 1193 + } 1194 + 1259 1195 int kvmppc_core_prepare_memory_region(struct kvm *kvm, 1196 + struct kvm_memory_slot *memslot, 1260 1197 struct kvm_userspace_memory_region *mem) 1261 1198 { 1262 1199 return 0; 1263 1200 } 1264 1201 1265 1202 void kvmppc_core_commit_memory_region(struct kvm *kvm, 1266 - struct kvm_userspace_memory_region *mem) 1203 + struct kvm_userspace_memory_region *mem, 1204 + struct kvm_memory_slot old) 1205 + { 1206 + } 1207 + 1208 + void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) 1267 1209 { 1268 1210 } 1269 1211

+8 -10

arch/powerpc/kvm/book3s_rmhandlers.S

··· 170 170 * Call kvmppc_handler_trampoline_enter in real mode 171 171 * 172 172 * On entry, r4 contains the guest shadow MSR 173 + * MSR.EE has to be 0 when calling this function 173 174 */ 174 175 _GLOBAL(kvmppc_entry_trampoline) 175 176 mfmsr r5 176 177 LOAD_REG_ADDR(r7, kvmppc_handler_trampoline_enter) 177 178 toreal(r7) 178 179 179 - li r9, MSR_RI 180 - ori r9, r9, MSR_EE 181 - andc r9, r5, r9 /* Clear EE and RI in MSR value */ 182 180 li r6, MSR_IR | MSR_DR 183 - ori r6, r6, MSR_EE 184 - andc r6, r5, r6 /* Clear EE, DR and IR in MSR value */ 185 - MTMSR_EERI(r9) /* Clear EE and RI in MSR */ 186 - mtsrr0 r7 /* before we set srr0/1 */ 181 + andc r6, r5, r6 /* Clear DR and IR in MSR value */ 182 + /* 183 + * Set EE in HOST_MSR so that it's enabled when we get into our 184 + * C exit handler function 185 + */ 186 + ori r5, r5, MSR_EE 187 + mtsrr0 r7 187 188 mtsrr1 r6 188 189 RFI 189 190 ··· 233 232 define_load_up(fpu) 234 233 #ifdef CONFIG_ALTIVEC 235 234 define_load_up(altivec) 236 - #endif 237 - #ifdef CONFIG_VSX 238 - define_load_up(vsx) 239 235 #endif 240 236 241 237 #include "book3s_segment.S"

+296 -50

arch/powerpc/kvm/booke.c

··· 36 36 #include <asm/dbell.h> 37 37 #include <asm/hw_irq.h> 38 38 #include <asm/irq.h> 39 + #include <asm/time.h> 39 40 40 41 #include "timing.h" 41 42 #include "booke.h" 43 + #include "trace.h" 42 44 43 45 unsigned long kvmppc_booke_handlers; 44 46 ··· 64 62 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 65 63 { "doorbell", VCPU_STAT(dbell_exits) }, 66 64 { "guest doorbell", VCPU_STAT(gdbell_exits) }, 65 + { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, 67 66 { NULL } 68 67 }; 69 68 ··· 123 120 } 124 121 #endif 125 122 123 + static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu) 124 + { 125 + #if defined(CONFIG_PPC_FPU) && !defined(CONFIG_KVM_BOOKE_HV) 126 + /* We always treat the FP bit as enabled from the host 127 + perspective, so only need to adjust the shadow MSR */ 128 + vcpu->arch.shadow_msr &= ~MSR_FP; 129 + vcpu->arch.shadow_msr |= vcpu->arch.shared->msr & MSR_FP; 130 + #endif 131 + } 132 + 126 133 /* 127 134 * Helper function for "full" MSR writes. No need to call this if only 128 135 * EE/CE/ME/DE/RI are changing. 
··· 149 136 150 137 kvmppc_mmu_msr_notify(vcpu, old_msr); 151 138 kvmppc_vcpu_sync_spe(vcpu); 139 + kvmppc_vcpu_sync_fpu(vcpu); 152 140 } 153 141 154 142 static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu, 155 143 unsigned int priority) 156 144 { 145 + trace_kvm_booke_queue_irqprio(vcpu, priority); 157 146 set_bit(priority, &vcpu->arch.pending_exceptions); 158 147 } 159 148 ··· 219 204 { 220 205 clear_bit(BOOKE_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions); 221 206 clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); 207 + } 208 + 209 + static void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu) 210 + { 211 + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG); 212 + } 213 + 214 + static void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu) 215 + { 216 + clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions); 222 217 } 223 218 224 219 static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) ··· 312 287 bool crit; 313 288 bool keep_irq = false; 314 289 enum int_class int_class; 290 + ulong new_msr = vcpu->arch.shared->msr; 315 291 316 292 /* Truncate crit indicators in 32 bit mode */ 317 293 if (!(vcpu->arch.shared->msr & MSR_SF)) { ··· 351 325 msr_mask = MSR_CE | MSR_ME | MSR_DE; 352 326 int_class = INT_CLASS_NONCRIT; 353 327 break; 328 + case BOOKE_IRQPRIO_WATCHDOG: 354 329 case BOOKE_IRQPRIO_CRITICAL: 355 330 case BOOKE_IRQPRIO_DBELL_CRIT: 356 331 allowed = vcpu->arch.shared->msr & MSR_CE; ··· 408 381 set_guest_esr(vcpu, vcpu->arch.queued_esr); 409 382 if (update_dear == true) 410 383 set_guest_dear(vcpu, vcpu->arch.queued_dear); 411 - kvmppc_set_msr(vcpu, vcpu->arch.shared->msr & msr_mask); 384 + 385 + new_msr &= msr_mask; 386 + #if defined(CONFIG_64BIT) 387 + if (vcpu->arch.epcr & SPRN_EPCR_ICM) 388 + new_msr |= MSR_CM; 389 + #endif 390 + kvmppc_set_msr(vcpu, new_msr); 412 391 413 392 if (!keep_irq) 414 393 clear_bit(priority, &vcpu->arch.pending_exceptions); ··· 437 404 return 
allowed; 438 405 } 439 406 407 + /* 408 + * Return the number of jiffies until the next timeout. If the timeout is 409 + * longer than the NEXT_TIMER_MAX_DELTA, then return NEXT_TIMER_MAX_DELTA 410 + * because the larger value can break the timer APIs. 411 + */ 412 + static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu) 413 + { 414 + u64 tb, wdt_tb, wdt_ticks = 0; 415 + u64 nr_jiffies = 0; 416 + u32 period = TCR_GET_WP(vcpu->arch.tcr); 417 + 418 + wdt_tb = 1ULL << (63 - period); 419 + tb = get_tb(); 420 + /* 421 + * The watchdog timeout will happen when TB bit corresponding 422 + * to watchdog will toggle from 0 to 1. 423 + */ 424 + if (tb & wdt_tb) 425 + wdt_ticks = wdt_tb; 426 + 427 + wdt_ticks += wdt_tb - (tb & (wdt_tb - 1)); 428 + 429 + /* Convert timebase ticks to jiffies */ 430 + nr_jiffies = wdt_ticks; 431 + 432 + if (do_div(nr_jiffies, tb_ticks_per_jiffy)) 433 + nr_jiffies++; 434 + 435 + return min_t(unsigned long long, nr_jiffies, NEXT_TIMER_MAX_DELTA); 436 + } 437 + 438 + static void arm_next_watchdog(struct kvm_vcpu *vcpu) 439 + { 440 + unsigned long nr_jiffies; 441 + unsigned long flags; 442 + 443 + /* 444 + * If TSR_ENW and TSR_WIS are not set then no need to exit to 445 + * userspace, so clear the KVM_REQ_WATCHDOG request. 446 + */ 447 + if ((vcpu->arch.tsr & (TSR_ENW | TSR_WIS)) != (TSR_ENW | TSR_WIS)) 448 + clear_bit(KVM_REQ_WATCHDOG, &vcpu->requests); 449 + 450 + spin_lock_irqsave(&vcpu->arch.wdt_lock, flags); 451 + nr_jiffies = watchdog_next_timeout(vcpu); 452 + /* 453 + * If the number of jiffies of watchdog timer >= NEXT_TIMER_MAX_DELTA 454 + * then do not run the watchdog timer as this can break timer APIs. 
455 + */ 456 + if (nr_jiffies < NEXT_TIMER_MAX_DELTA) 457 + mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies); 458 + else 459 + del_timer(&vcpu->arch.wdt_timer); 460 + spin_unlock_irqrestore(&vcpu->arch.wdt_lock, flags); 461 + } 462 + 463 + void kvmppc_watchdog_func(unsigned long data) 464 + { 465 + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; 466 + u32 tsr, new_tsr; 467 + int final; 468 + 469 + do { 470 + new_tsr = tsr = vcpu->arch.tsr; 471 + final = 0; 472 + 473 + /* Time out event */ 474 + if (tsr & TSR_ENW) { 475 + if (tsr & TSR_WIS) 476 + final = 1; 477 + else 478 + new_tsr = tsr | TSR_WIS; 479 + } else { 480 + new_tsr = tsr | TSR_ENW; 481 + } 482 + } while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr); 483 + 484 + if (new_tsr & TSR_WIS) { 485 + smp_wmb(); 486 + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); 487 + kvm_vcpu_kick(vcpu); 488 + } 489 + 490 + /* 491 + * If this is final watchdog expiry and some action is required 492 + * then exit to userspace. 493 + */ 494 + if (final && (vcpu->arch.tcr & TCR_WRC_MASK) && 495 + vcpu->arch.watchdog_enabled) { 496 + smp_wmb(); 497 + kvm_make_request(KVM_REQ_WATCHDOG, vcpu); 498 + kvm_vcpu_kick(vcpu); 499 + } 500 + 501 + /* 502 + * Stop running the watchdog timer after final expiration to 503 + * prevent the host from being flooded with timers if the 504 + * guest sets a short period. 505 + * Timers will resume when TSR/TCR is updated next time. 
506 + */ 507 + if (!final) 508 + arm_next_watchdog(vcpu); 509 + } 510 + 440 511 static void update_timer_ints(struct kvm_vcpu *vcpu) 441 512 { 442 513 if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS)) 443 514 kvmppc_core_queue_dec(vcpu); 444 515 else 445 516 kvmppc_core_dequeue_dec(vcpu); 517 + 518 + if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS)) 519 + kvmppc_core_queue_watchdog(vcpu); 520 + else 521 + kvmppc_core_dequeue_watchdog(vcpu); 446 522 } 447 523 448 524 static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu) 449 525 { 450 526 unsigned long *pending = &vcpu->arch.pending_exceptions; 451 527 unsigned int priority; 452 - 453 - if (vcpu->requests) { 454 - if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) { 455 - smp_mb(); 456 - update_timer_ints(vcpu); 457 - } 458 - } 459 528 460 529 priority = __ffs(*pending); 461 530 while (priority < BOOKE_IRQPRIO_MAX) { ··· 594 459 return r; 595 460 } 596 461 597 - /* 598 - * Common checks before entering the guest world. Call with interrupts 599 - * disabled. 
600 - * 601 - * returns !0 if a signal is pending and check_signal is true 602 - */ 603 - static int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) 462 + int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) 604 463 { 605 - int r = 0; 464 + int r = 1; /* Indicate we want to get back into the guest */ 606 465 607 - WARN_ON_ONCE(!irqs_disabled()); 608 - while (true) { 609 - if (need_resched()) { 610 - local_irq_enable(); 611 - cond_resched(); 612 - local_irq_disable(); 613 - continue; 614 - } 466 + if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) 467 + update_timer_ints(vcpu); 468 + #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) 469 + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) 470 + kvmppc_core_flush_tlb(vcpu); 471 + #endif 615 472 616 - if (signal_pending(current)) { 617 - r = 1; 618 - break; 619 - } 620 - 621 - if (kvmppc_core_prepare_to_enter(vcpu)) { 622 - /* interrupts got enabled in between, so we 623 - are back at square 1 */ 624 - continue; 625 - } 626 - 627 - break; 473 + if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu)) { 474 + vcpu->run->exit_reason = KVM_EXIT_WATCHDOG; 475 + r = 0; 628 476 } 629 477 630 478 return r; ··· 615 497 616 498 int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 617 499 { 618 - int ret; 500 + int ret, s; 619 501 #ifdef CONFIG_PPC_FPU 620 502 unsigned int fpscr; 621 503 int fpexc_mode; ··· 628 510 } 629 511 630 512 local_irq_disable(); 631 - if (kvmppc_prepare_to_enter(vcpu)) { 632 - kvm_run->exit_reason = KVM_EXIT_INTR; 633 - ret = -EINTR; 513 + s = kvmppc_prepare_to_enter(vcpu); 514 + if (s <= 0) { 515 + local_irq_enable(); 516 + ret = s; 634 517 goto out; 635 518 } 519 + kvmppc_lazy_ee_enable(); 636 520 637 521 kvm_guest_enter(); 638 522 ··· 662 542 663 543 ret = __kvmppc_vcpu_run(kvm_run, vcpu); 664 544 545 + /* No need for kvm_guest_exit. It's done in handle_exit. 546 + We also get here with interrupts enabled. 
*/ 547 + 665 548 #ifdef CONFIG_PPC_FPU 666 549 kvmppc_save_guest_fp(vcpu); 667 550 ··· 680 557 current->thread.fpexc_mode = fpexc_mode; 681 558 #endif 682 559 683 - kvm_guest_exit(); 684 - 685 560 out: 686 - local_irq_enable(); 561 + vcpu->mode = OUTSIDE_GUEST_MODE; 687 562 return ret; 688 563 } 689 564 ··· 789 668 unsigned int exit_nr) 790 669 { 791 670 int r = RESUME_HOST; 671 + int s; 792 672 793 673 /* update before a new last_exit_type is rewritten */ 794 674 kvmppc_update_timing_stats(vcpu); ··· 798 676 kvmppc_restart_interrupt(vcpu, exit_nr); 799 677 800 678 local_irq_enable(); 679 + 680 + trace_kvm_exit(exit_nr, vcpu); 681 + kvm_guest_exit(); 801 682 802 683 run->exit_reason = KVM_EXIT_UNKNOWN; 803 684 run->ready_for_interrupt_injection = 1; ··· 1096 971 */ 1097 972 if (!(r & RESUME_HOST)) { 1098 973 local_irq_disable(); 1099 - if (kvmppc_prepare_to_enter(vcpu)) { 1100 - run->exit_reason = KVM_EXIT_INTR; 1101 - r = (-EINTR << 2) | RESUME_HOST | (r & RESUME_FLAG_NV); 1102 - kvmppc_account_exit(vcpu, SIGNAL_EXITS); 974 + s = kvmppc_prepare_to_enter(vcpu); 975 + if (s <= 0) { 976 + local_irq_enable(); 977 + r = (s << 2) | RESUME_HOST | (r & RESUME_FLAG_NV); 978 + } else { 979 + kvmppc_lazy_ee_enable(); 1103 980 } 1104 981 } 1105 982 ··· 1136 1009 r = kvmppc_core_vcpu_setup(vcpu); 1137 1010 kvmppc_sanity_check(vcpu); 1138 1011 return r; 1012 + } 1013 + 1014 + int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) 1015 + { 1016 + /* setup watchdog timer once */ 1017 + spin_lock_init(&vcpu->arch.wdt_lock); 1018 + setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func, 1019 + (unsigned long)vcpu); 1020 + 1021 + return 0; 1022 + } 1023 + 1024 + void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) 1025 + { 1026 + del_timer_sync(&vcpu->arch.wdt_timer); 1139 1027 } 1140 1028 1141 1029 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) ··· 1248 1106 } 1249 1107 1250 1108 if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) { 1109 + u32 
old_tsr = vcpu->arch.tsr; 1110 + 1251 1111 vcpu->arch.tsr = sregs->u.e.tsr; 1112 + 1113 + if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS)) 1114 + arm_next_watchdog(vcpu); 1115 + 1252 1116 update_timer_ints(vcpu); 1253 1117 } 1254 1118 ··· 1369 1221 1370 1222 int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) 1371 1223 { 1372 - return -EINVAL; 1224 + int r = -EINVAL; 1225 + 1226 + switch (reg->id) { 1227 + case KVM_REG_PPC_IAC1: 1228 + case KVM_REG_PPC_IAC2: 1229 + case KVM_REG_PPC_IAC3: 1230 + case KVM_REG_PPC_IAC4: { 1231 + int iac = reg->id - KVM_REG_PPC_IAC1; 1232 + r = copy_to_user((u64 __user *)(long)reg->addr, 1233 + &vcpu->arch.dbg_reg.iac[iac], sizeof(u64)); 1234 + break; 1235 + } 1236 + case KVM_REG_PPC_DAC1: 1237 + case KVM_REG_PPC_DAC2: { 1238 + int dac = reg->id - KVM_REG_PPC_DAC1; 1239 + r = copy_to_user((u64 __user *)(long)reg->addr, 1240 + &vcpu->arch.dbg_reg.dac[dac], sizeof(u64)); 1241 + break; 1242 + } 1243 + #if defined(CONFIG_64BIT) 1244 + case KVM_REG_PPC_EPCR: 1245 + r = put_user(vcpu->arch.epcr, (u32 __user *)(long)reg->addr); 1246 + break; 1247 + #endif 1248 + default: 1249 + break; 1250 + } 1251 + return r; 1373 1252 } 1374 1253 1375 1254 int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) 1376 1255 { 1377 - return -EINVAL; 1256 + int r = -EINVAL; 1257 + 1258 + switch (reg->id) { 1259 + case KVM_REG_PPC_IAC1: 1260 + case KVM_REG_PPC_IAC2: 1261 + case KVM_REG_PPC_IAC3: 1262 + case KVM_REG_PPC_IAC4: { 1263 + int iac = reg->id - KVM_REG_PPC_IAC1; 1264 + r = copy_from_user(&vcpu->arch.dbg_reg.iac[iac], 1265 + (u64 __user *)(long)reg->addr, sizeof(u64)); 1266 + break; 1267 + } 1268 + case KVM_REG_PPC_DAC1: 1269 + case KVM_REG_PPC_DAC2: { 1270 + int dac = reg->id - KVM_REG_PPC_DAC1; 1271 + r = copy_from_user(&vcpu->arch.dbg_reg.dac[dac], 1272 + (u64 __user *)(long)reg->addr, sizeof(u64)); 1273 + break; 1274 + } 1275 + #if defined(CONFIG_64BIT) 1276 + case KVM_REG_PPC_EPCR: { 1277 + u32 
new_epcr; 1278 + r = get_user(new_epcr, (u32 __user *)(long)reg->addr); 1279 + if (r == 0) 1280 + kvmppc_set_epcr(vcpu, new_epcr); 1281 + break; 1282 + } 1283 + #endif 1284 + default: 1285 + break; 1286 + } 1287 + return r; 1378 1288 } 1379 1289 1380 1290 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) ··· 1459 1253 return -ENOTSUPP; 1460 1254 } 1461 1255 1256 + void kvmppc_core_free_memslot(struct kvm_memory_slot *free, 1257 + struct kvm_memory_slot *dont) 1258 + { 1259 + } 1260 + 1261 + int kvmppc_core_create_memslot(struct kvm_memory_slot *slot, 1262 + unsigned long npages) 1263 + { 1264 + return 0; 1265 + } 1266 + 1462 1267 int kvmppc_core_prepare_memory_region(struct kvm *kvm, 1268 + struct kvm_memory_slot *memslot, 1463 1269 struct kvm_userspace_memory_region *mem) 1464 1270 { 1465 1271 return 0; 1466 1272 } 1467 1273 1468 1274 void kvmppc_core_commit_memory_region(struct kvm *kvm, 1469 - struct kvm_userspace_memory_region *mem) 1275 + struct kvm_userspace_memory_region *mem, 1276 + struct kvm_memory_slot old) 1470 1277 { 1278 + } 1279 + 1280 + void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) 1281 + { 1282 + } 1283 + 1284 + void kvmppc_set_epcr(struct kvm_vcpu *vcpu, u32 new_epcr) 1285 + { 1286 + #if defined(CONFIG_64BIT) 1287 + vcpu->arch.epcr = new_epcr; 1288 + #ifdef CONFIG_KVM_BOOKE_HV 1289 + vcpu->arch.shadow_epcr &= ~SPRN_EPCR_GICM; 1290 + if (vcpu->arch.epcr & SPRN_EPCR_ICM) 1291 + vcpu->arch.shadow_epcr |= SPRN_EPCR_GICM; 1292 + #endif 1293 + #endif 1471 1294 } 1472 1295 1473 1296 void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr) 1474 1297 { 1475 1298 vcpu->arch.tcr = new_tcr; 1299 + arm_next_watchdog(vcpu); 1476 1300 update_timer_ints(vcpu); 1477 1301 } 1478 1302 ··· 1517 1281 void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits) 1518 1282 { 1519 1283 clear_bits(tsr_bits, &vcpu->arch.tsr); 1284 + 1285 + /* 1286 + * We may have stopped the watchdog due to 1287 + * being stuck on 
final expiration. 1288 + */ 1289 + if (tsr_bits & (TSR_ENW | TSR_WIS)) 1290 + arm_next_watchdog(vcpu); 1291 + 1520 1292 update_timer_ints(vcpu); 1521 1293 } 1522 1294 ··· 1542 1298 1543 1299 void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1544 1300 { 1301 + vcpu->cpu = smp_processor_id(); 1545 1302 current->thread.kvm_vcpu = vcpu; 1546 1303 } 1547 1304 1548 1305 void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu) 1549 1306 { 1550 1307 current->thread.kvm_vcpu = NULL; 1308 + vcpu->cpu = -1; 1551 1309 } 1552 1310 1553 1311 int __init kvmppc_booke_init(void)

+1

arch/powerpc/kvm/booke.h

··· 69 69 void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr); 70 70 void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr); 71 71 72 + void kvmppc_set_epcr(struct kvm_vcpu *vcpu, u32 new_epcr); 72 73 void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr); 73 74 void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits); 74 75 void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits);

+31 -5

arch/powerpc/kvm/booke_emulate.c

··· 133 133 vcpu->arch.csrr1 = spr_val; 134 134 break; 135 135 case SPRN_DBCR0: 136 - vcpu->arch.dbcr0 = spr_val; 136 + vcpu->arch.dbg_reg.dbcr0 = spr_val; 137 137 break; 138 138 case SPRN_DBCR1: 139 - vcpu->arch.dbcr1 = spr_val; 139 + vcpu->arch.dbg_reg.dbcr1 = spr_val; 140 140 break; 141 141 case SPRN_DBSR: 142 142 vcpu->arch.dbsr &= ~spr_val; ··· 145 145 kvmppc_clr_tsr_bits(vcpu, spr_val); 146 146 break; 147 147 case SPRN_TCR: 148 + /* 149 + * WRC is a 2-bit field that is supposed to preserve its 150 + * value once written to non-zero. 151 + */ 152 + if (vcpu->arch.tcr & TCR_WRC_MASK) { 153 + spr_val &= ~TCR_WRC_MASK; 154 + spr_val |= vcpu->arch.tcr & TCR_WRC_MASK; 155 + } 148 156 kvmppc_set_tcr(vcpu, spr_val); 149 157 break; 150 158 ··· 237 229 case SPRN_IVOR15: 238 230 vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = spr_val; 239 231 break; 240 - 232 + case SPRN_MCSR: 233 + vcpu->arch.mcsr &= ~spr_val; 234 + break; 235 + #if defined(CONFIG_64BIT) 236 + case SPRN_EPCR: 237 + kvmppc_set_epcr(vcpu, spr_val); 238 + #ifdef CONFIG_KVM_BOOKE_HV 239 + mtspr(SPRN_EPCR, vcpu->arch.shadow_epcr); 240 + #endif 241 + break; 242 + #endif 241 243 default: 242 244 emulated = EMULATE_FAIL; 243 245 } ··· 276 258 *spr_val = vcpu->arch.csrr1; 277 259 break; 278 260 case SPRN_DBCR0: 279 - *spr_val = vcpu->arch.dbcr0; 261 + *spr_val = vcpu->arch.dbg_reg.dbcr0; 280 262 break; 281 263 case SPRN_DBCR1: 282 - *spr_val = vcpu->arch.dbcr1; 264 + *spr_val = vcpu->arch.dbg_reg.dbcr1; 283 265 break; 284 266 case SPRN_DBSR: 285 267 *spr_val = vcpu->arch.dbsr; ··· 339 321 case SPRN_IVOR15: 340 322 *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]; 341 323 break; 324 + case SPRN_MCSR: 325 + *spr_val = vcpu->arch.mcsr; 326 + break; 327 + #if defined(CONFIG_64BIT) 328 + case SPRN_EPCR: 329 + *spr_val = vcpu->arch.epcr; 330 + break; 331 + #endif 342 332 343 333 default: 344 334 emulated = EMULATE_FAIL;

+132 -13

arch/powerpc/kvm/bookehv_interrupts.S

··· 16 16 * 17 17 * Author: Varun Sethi <varun.sethi@freescale.com> 18 18 * Author: Scott Wood <scotwood@freescale.com> 19 + * Author: Mihai Caraman <mihai.caraman@freescale.com> 19 20 * 20 21 * This file is derived from arch/powerpc/kvm/booke_interrupts.S 21 22 */ ··· 31 30 #include <asm/bitsperlong.h> 32 31 #include <asm/thread_info.h> 33 32 33 + #ifdef CONFIG_64BIT 34 + #include <asm/exception-64e.h> 35 + #else 34 36 #include "../kernel/head_booke.h" /* for THREAD_NORMSAVE() */ 35 - 36 - #define GET_VCPU(vcpu, thread) \ 37 - PPC_LL vcpu, THREAD_KVM_VCPU(thread) 37 + #endif 38 38 39 39 #define LONGBYTES (BITS_PER_LONG / 8) 40 40 41 41 #define VCPU_GUEST_SPRG(n) (VCPU_GUEST_SPRGS + (n * LONGBYTES)) 42 42 43 43 /* The host stack layout: */ 44 - #define HOST_R1 (0 * LONGBYTES) /* Implied by stwu. */ 45 - #define HOST_CALLEE_LR (1 * LONGBYTES) 46 - #define HOST_RUN (2 * LONGBYTES) /* struct kvm_run */ 44 + #define HOST_R1 0 /* Implied by stwu. */ 45 + #define HOST_CALLEE_LR PPC_LR_STKOFF 46 + #define HOST_RUN (HOST_CALLEE_LR + LONGBYTES) 47 47 /* 48 48 * r2 is special: it holds 'current', and it made nonvolatile in the 49 49 * kernel with the -ffixed-r2 gcc option. 50 50 */ 51 - #define HOST_R2 (3 * LONGBYTES) 52 - #define HOST_CR (4 * LONGBYTES) 53 - #define HOST_NV_GPRS (5 * LONGBYTES) 51 + #define HOST_R2 (HOST_RUN + LONGBYTES) 52 + #define HOST_CR (HOST_R2 + LONGBYTES) 53 + #define HOST_NV_GPRS (HOST_CR + LONGBYTES) 54 54 #define __HOST_NV_GPR(n) (HOST_NV_GPRS + ((n - 14) * LONGBYTES)) 55 55 #define HOST_NV_GPR(n) __HOST_NV_GPR(__REG_##n) 56 56 #define HOST_MIN_STACK_SIZE (HOST_NV_GPR(R31) + LONGBYTES) 57 57 #define HOST_STACK_SIZE ((HOST_MIN_STACK_SIZE + 15) & ~15) /* Align. */ 58 - #define HOST_STACK_LR (HOST_STACK_SIZE + LONGBYTES) /* In caller stack frame. */ 58 + /* LR in caller stack frame. 
*/ 59 + #define HOST_STACK_LR (HOST_STACK_SIZE + PPC_LR_STKOFF) 59 60 60 61 #define NEED_EMU 0x00000001 /* emulation -- save nv regs */ 61 62 #define NEED_DEAR 0x00000002 /* save faulting DEAR */ ··· 204 201 b kvmppc_resume_host 205 202 .endm 206 203 204 + #ifdef CONFIG_64BIT 205 + /* Exception types */ 206 + #define EX_GEN 1 207 + #define EX_GDBELL 2 208 + #define EX_DBG 3 209 + #define EX_MC 4 210 + #define EX_CRIT 5 211 + #define EX_TLB 6 212 + 213 + /* 214 + * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h 215 + */ 216 + .macro kvm_handler intno type scratch, paca_ex, ex_r10, ex_r11, srr0, srr1, flags 217 + _GLOBAL(kvmppc_handler_\intno\()_\srr1) 218 + mr r11, r4 219 + /* 220 + * Get vcpu from Paca: paca->__current.thread->kvm_vcpu 221 + */ 222 + PPC_LL r4, PACACURRENT(r13) 223 + PPC_LL r4, (THREAD + THREAD_KVM_VCPU)(r4) 224 + stw r10, VCPU_CR(r4) 225 + PPC_STL r11, VCPU_GPR(R4)(r4) 226 + PPC_STL r5, VCPU_GPR(R5)(r4) 227 + .if \type == EX_CRIT 228 + PPC_LL r5, (\paca_ex + EX_R13)(r13) 229 + .else 230 + mfspr r5, \scratch 231 + .endif 232 + PPC_STL r6, VCPU_GPR(R6)(r4) 233 + PPC_STL r8, VCPU_GPR(R8)(r4) 234 + PPC_STL r9, VCPU_GPR(R9)(r4) 235 + PPC_STL r5, VCPU_GPR(R13)(r4) 236 + PPC_LL r6, (\paca_ex + \ex_r10)(r13) 237 + PPC_LL r8, (\paca_ex + \ex_r11)(r13) 238 + PPC_STL r3, VCPU_GPR(R3)(r4) 239 + PPC_STL r7, VCPU_GPR(R7)(r4) 240 + PPC_STL r12, VCPU_GPR(R12)(r4) 241 + PPC_STL r6, VCPU_GPR(R10)(r4) 242 + PPC_STL r8, VCPU_GPR(R11)(r4) 243 + mfctr r5 244 + PPC_STL r5, VCPU_CTR(r4) 245 + mfspr r5, \srr0 246 + mfspr r6, \srr1 247 + kvm_handler_common \intno, \srr0, \flags 248 + .endm 249 + 250 + #define EX_PARAMS(type) \ 251 + EX_##type, \ 252 + SPRN_SPRG_##type##_SCRATCH, \ 253 + PACA_EX##type, \ 254 + EX_R10, \ 255 + EX_R11 256 + 257 + #define EX_PARAMS_TLB \ 258 + EX_TLB, \ 259 + SPRN_SPRG_GEN_SCRATCH, \ 260 + PACA_EXTLB, \ 261 + EX_TLB_R10, \ 262 + EX_TLB_R11 263 + 264 + kvm_handler BOOKE_INTERRUPT_CRITICAL, EX_PARAMS(CRIT), \ 265 + 
SPRN_CSRR0, SPRN_CSRR1, 0 266 + kvm_handler BOOKE_INTERRUPT_MACHINE_CHECK, EX_PARAMS(MC), \ 267 + SPRN_MCSRR0, SPRN_MCSRR1, 0 268 + kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, EX_PARAMS(GEN), \ 269 + SPRN_SRR0, SPRN_SRR1,(NEED_EMU | NEED_DEAR | NEED_ESR) 270 + kvm_handler BOOKE_INTERRUPT_INST_STORAGE, EX_PARAMS(GEN), \ 271 + SPRN_SRR0, SPRN_SRR1, NEED_ESR 272 + kvm_handler BOOKE_INTERRUPT_EXTERNAL, EX_PARAMS(GEN), \ 273 + SPRN_SRR0, SPRN_SRR1, 0 274 + kvm_handler BOOKE_INTERRUPT_ALIGNMENT, EX_PARAMS(GEN), \ 275 + SPRN_SRR0, SPRN_SRR1,(NEED_DEAR | NEED_ESR) 276 + kvm_handler BOOKE_INTERRUPT_PROGRAM, EX_PARAMS(GEN), \ 277 + SPRN_SRR0, SPRN_SRR1,NEED_ESR 278 + kvm_handler BOOKE_INTERRUPT_FP_UNAVAIL, EX_PARAMS(GEN), \ 279 + SPRN_SRR0, SPRN_SRR1, 0 280 + kvm_handler BOOKE_INTERRUPT_AP_UNAVAIL, EX_PARAMS(GEN), \ 281 + SPRN_SRR0, SPRN_SRR1, 0 282 + kvm_handler BOOKE_INTERRUPT_DECREMENTER, EX_PARAMS(GEN), \ 283 + SPRN_SRR0, SPRN_SRR1, 0 284 + kvm_handler BOOKE_INTERRUPT_FIT, EX_PARAMS(GEN), \ 285 + SPRN_SRR0, SPRN_SRR1, 0 286 + kvm_handler BOOKE_INTERRUPT_WATCHDOG, EX_PARAMS(CRIT),\ 287 + SPRN_CSRR0, SPRN_CSRR1, 0 288 + /* 289 + * Only bolted TLB miss exception handlers are supported for now 290 + */ 291 + kvm_handler BOOKE_INTERRUPT_DTLB_MISS, EX_PARAMS_TLB, \ 292 + SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR) 293 + kvm_handler BOOKE_INTERRUPT_ITLB_MISS, EX_PARAMS_TLB, \ 294 + SPRN_SRR0, SPRN_SRR1, 0 295 + kvm_handler BOOKE_INTERRUPT_SPE_UNAVAIL, EX_PARAMS(GEN), \ 296 + SPRN_SRR0, SPRN_SRR1, 0 297 + kvm_handler BOOKE_INTERRUPT_SPE_FP_DATA, EX_PARAMS(GEN), \ 298 + SPRN_SRR0, SPRN_SRR1, 0 299 + kvm_handler BOOKE_INTERRUPT_SPE_FP_ROUND, EX_PARAMS(GEN), \ 300 + SPRN_SRR0, SPRN_SRR1, 0 301 + kvm_handler BOOKE_INTERRUPT_PERFORMANCE_MONITOR, EX_PARAMS(GEN), \ 302 + SPRN_SRR0, SPRN_SRR1, 0 303 + kvm_handler BOOKE_INTERRUPT_DOORBELL, EX_PARAMS(GEN), \ 304 + SPRN_SRR0, SPRN_SRR1, 0 305 + kvm_handler BOOKE_INTERRUPT_DOORBELL_CRITICAL, EX_PARAMS(CRIT), \ 306 + SPRN_CSRR0, 
SPRN_CSRR1, 0 307 + kvm_handler BOOKE_INTERRUPT_HV_PRIV, EX_PARAMS(GEN), \ 308 + SPRN_SRR0, SPRN_SRR1, NEED_EMU 309 + kvm_handler BOOKE_INTERRUPT_HV_SYSCALL, EX_PARAMS(GEN), \ 310 + SPRN_SRR0, SPRN_SRR1, 0 311 + kvm_handler BOOKE_INTERRUPT_GUEST_DBELL, EX_PARAMS(GDBELL), \ 312 + SPRN_GSRR0, SPRN_GSRR1, 0 313 + kvm_handler BOOKE_INTERRUPT_GUEST_DBELL_CRIT, EX_PARAMS(CRIT), \ 314 + SPRN_CSRR0, SPRN_CSRR1, 0 315 + kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(DBG), \ 316 + SPRN_DSRR0, SPRN_DSRR1, 0 317 + kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(CRIT), \ 318 + SPRN_CSRR0, SPRN_CSRR1, 0 319 + #else 207 320 /* 208 321 * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h 209 322 */ 210 323 .macro kvm_handler intno srr0, srr1, flags 211 324 _GLOBAL(kvmppc_handler_\intno\()_\srr1) 212 - GET_VCPU(r11, r10) 325 + PPC_LL r11, THREAD_KVM_VCPU(r10) 213 326 PPC_STL r3, VCPU_GPR(R3)(r11) 214 327 mfspr r3, SPRN_SPRG_RSCRATCH0 215 328 PPC_STL r4, VCPU_GPR(R4)(r11) ··· 352 233 .macro kvm_lvl_handler intno scratch srr0, srr1, flags 353 234 _GLOBAL(kvmppc_handler_\intno\()_\srr1) 354 235 mfspr r10, SPRN_SPRG_THREAD 355 - GET_VCPU(r11, r10) 236 + PPC_LL r11, THREAD_KVM_VCPU(r10) 356 237 PPC_STL r3, VCPU_GPR(R3)(r11) 357 238 mfspr r3, \scratch 358 239 PPC_STL r4, VCPU_GPR(R4)(r11) ··· 414 295 SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 415 296 kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \ 416 297 SPRN_SPRG_RSCRATCH_DBG, SPRN_DSRR0, SPRN_DSRR1, 0 417 - 298 + #endif 418 299 419 300 /* Registers: 420 301 * SPRG_SCRATCH0: guest r10

+5 -6

arch/powerpc/kvm/e500.h

··· 27 27 #define E500_TLB_NUM 2 28 28 29 29 #define E500_TLB_VALID 1 30 - #define E500_TLB_DIRTY 2 31 - #define E500_TLB_BITMAP 4 30 + #define E500_TLB_BITMAP 2 32 31 33 32 struct tlbe_ref { 34 33 pfn_t pfn; ··· 129 130 ulong value); 130 131 int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu); 131 132 int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu); 132 - int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb); 133 - int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int rt, int ra, int rb); 134 - int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb); 133 + int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea); 134 + int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int type, gva_t ea); 135 + int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea); 135 136 int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500); 136 137 void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500); 137 138 ··· 154 155 155 156 static inline gva_t get_tlb_eaddr(const struct kvm_book3e_206_tlb_entry *tlbe) 156 157 { 157 - return tlbe->mas2 & 0xfffff000; 158 + return tlbe->mas2 & MAS2_EPN; 158 159 } 159 160 160 161 static inline u64 get_tlb_bytes(const struct kvm_book3e_206_tlb_entry *tlbe)

+10 -4

arch/powerpc/kvm/e500_emulate.c

··· 89 89 int ra = get_ra(inst); 90 90 int rb = get_rb(inst); 91 91 int rt = get_rt(inst); 92 + gva_t ea; 92 93 93 94 switch (get_op(inst)) { 94 95 case 31: ··· 114 113 break; 115 114 116 115 case XOP_TLBSX: 117 - emulated = kvmppc_e500_emul_tlbsx(vcpu,rb); 116 + ea = kvmppc_get_ea_indexed(vcpu, ra, rb); 117 + emulated = kvmppc_e500_emul_tlbsx(vcpu, ea); 118 118 break; 119 119 120 - case XOP_TLBILX: 121 - emulated = kvmppc_e500_emul_tlbilx(vcpu, rt, ra, rb); 120 + case XOP_TLBILX: { 121 + int type = rt & 0x3; 122 + ea = kvmppc_get_ea_indexed(vcpu, ra, rb); 123 + emulated = kvmppc_e500_emul_tlbilx(vcpu, type, ea); 122 124 break; 125 + } 123 126 124 127 case XOP_TLBIVAX: 125 - emulated = kvmppc_e500_emul_tlbivax(vcpu, ra, rb); 128 + ea = kvmppc_get_ea_indexed(vcpu, ra, rb); 129 + emulated = kvmppc_e500_emul_tlbivax(vcpu, ea); 126 130 break; 127 131 128 132 default:

+93 -39

arch/powerpc/kvm/e500_tlb.c

··· 304 304 ref->flags = E500_TLB_VALID; 305 305 306 306 if (tlbe_is_writable(gtlbe)) 307 - ref->flags |= E500_TLB_DIRTY; 307 + kvm_set_pfn_dirty(pfn); 308 308 } 309 309 310 310 static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref) 311 311 { 312 312 if (ref->flags & E500_TLB_VALID) { 313 - if (ref->flags & E500_TLB_DIRTY) 314 - kvm_release_pfn_dirty(ref->pfn); 315 - else 316 - kvm_release_pfn_clean(ref->pfn); 317 - 313 + trace_kvm_booke206_ref_release(ref->pfn, ref->flags); 318 314 ref->flags = 0; 319 315 } 320 316 } ··· 351 355 } 352 356 353 357 clear_tlb_privs(vcpu_e500); 358 + } 359 + 360 + void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu) 361 + { 362 + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 363 + clear_tlb_refs(vcpu_e500); 364 + clear_tlb1_bitmap(vcpu_e500); 354 365 } 355 366 356 367 static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, ··· 415 412 struct tlbe_ref *ref) 416 413 { 417 414 struct kvm_memory_slot *slot; 418 - unsigned long pfn, hva; 415 + unsigned long pfn = 0; /* silence GCC warning */ 416 + unsigned long hva; 419 417 int pfnmap = 0; 420 418 int tsize = BOOK3E_PAGESZ_4K; 421 419 ··· 525 521 if (likely(!pfnmap)) { 526 522 unsigned long tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); 527 523 pfn = gfn_to_pfn_memslot(slot, gfn); 528 - if (is_error_pfn(pfn)) { 524 + if (is_error_noslot_pfn(pfn)) { 529 525 printk(KERN_ERR "Couldn't get real page for gfn %lx!\n", 530 526 (long)gfn); 531 527 return; ··· 545 541 546 542 /* Clear i-cache for new pages */ 547 543 kvmppc_mmu_flush_icache(pfn); 544 + 545 + /* Drop refcount on page, so that mmu notifiers can clear it */ 546 + kvm_release_pfn_clean(pfn); 548 547 } 549 548 550 549 /* XXX only map the one-one case, for now use TLB0 */ ··· 689 682 return EMULATE_DONE; 690 683 } 691 684 692 - int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, int ra, int rb) 685 + int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea) 693 686 { 694 687 struct kvmppc_vcpu_e500 
*vcpu_e500 = to_e500(vcpu); 695 688 unsigned int ia; 696 689 int esel, tlbsel; 697 - gva_t ea; 698 - 699 - ea = ((ra) ? kvmppc_get_gpr(vcpu, ra) : 0) + kvmppc_get_gpr(vcpu, rb); 700 690 701 691 ia = (ea >> 2) & 0x1; 702 692 ··· 720 716 } 721 717 722 718 static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, 723 - int pid, int rt) 719 + int pid, int type) 724 720 { 725 721 struct kvm_book3e_206_tlb_entry *tlbe; 726 722 int tid, esel; ··· 729 725 for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries; esel++) { 730 726 tlbe = get_entry(vcpu_e500, tlbsel, esel); 731 727 tid = get_tlb_tid(tlbe); 732 - if (rt == 0 || tid == pid) { 728 + if (type == 0 || tid == pid) { 733 729 inval_gtlbe_on_host(vcpu_e500, tlbsel, esel); 734 730 kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); 735 731 } ··· 737 733 } 738 734 739 735 static void tlbilx_one(struct kvmppc_vcpu_e500 *vcpu_e500, int pid, 740 - int ra, int rb) 736 + gva_t ea) 741 737 { 742 738 int tlbsel, esel; 743 - gva_t ea; 744 - 745 - ea = kvmppc_get_gpr(&vcpu_e500->vcpu, rb); 746 - if (ra) 747 - ea += kvmppc_get_gpr(&vcpu_e500->vcpu, ra); 748 739 749 740 for (tlbsel = 0; tlbsel < 2; tlbsel++) { 750 741 esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, -1); ··· 751 752 } 752 753 } 753 754 754 - int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int rt, int ra, int rb) 755 + int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int type, gva_t ea) 755 756 { 756 757 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 757 758 int pid = get_cur_spid(vcpu); 758 759 759 - if (rt == 0 || rt == 1) { 760 - tlbilx_all(vcpu_e500, 0, pid, rt); 761 - tlbilx_all(vcpu_e500, 1, pid, rt); 762 - } else if (rt == 3) { 763 - tlbilx_one(vcpu_e500, pid, ra, rb); 760 + if (type == 0 || type == 1) { 761 + tlbilx_all(vcpu_e500, 0, pid, type); 762 + tlbilx_all(vcpu_e500, 1, pid, type); 763 + } else if (type == 3) { 764 + tlbilx_one(vcpu_e500, pid, ea); 764 765 } 765 766 766 767 return EMULATE_DONE; ··· 785 786 return 
EMULATE_DONE; 786 787 } 787 788 788 - int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, int rb) 789 + int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea) 789 790 { 790 791 struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); 791 792 int as = !!get_cur_sas(vcpu); 792 793 unsigned int pid = get_cur_spid(vcpu); 793 794 int esel, tlbsel; 794 795 struct kvm_book3e_206_tlb_entry *gtlbe = NULL; 795 - gva_t ea; 796 - 797 - ea = kvmppc_get_gpr(vcpu, rb); 798 796 799 797 for (tlbsel = 0; tlbsel < 2; tlbsel++) { 800 798 esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as); ··· 871 875 872 876 gtlbe->mas1 = vcpu->arch.shared->mas1; 873 877 gtlbe->mas2 = vcpu->arch.shared->mas2; 878 + if (!(vcpu->arch.shared->msr & MSR_CM)) 879 + gtlbe->mas2 &= 0xffffffffUL; 874 880 gtlbe->mas7_3 = vcpu->arch.shared->mas7_3; 875 881 876 882 trace_kvm_booke206_gtlb_write(vcpu->arch.shared->mas0, gtlbe->mas1, ··· 1037 1039 sesel = 0; /* unused */ 1038 1040 priv = &vcpu_e500->gtlb_priv[tlbsel][esel]; 1039 1041 1040 - kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K, 1041 - &priv->ref, eaddr, &stlbe); 1042 + /* Only triggers after clear_tlb_refs */ 1043 + if (unlikely(!(priv->ref.flags & E500_TLB_VALID))) 1044 + kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe); 1045 + else 1046 + kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K, 1047 + &priv->ref, eaddr, &stlbe); 1042 1048 break; 1043 1049 1044 1050 case 1: { ··· 1061 1059 1062 1060 write_stlbe(vcpu_e500, gtlbe, &stlbe, stlbsel, sesel); 1063 1061 } 1062 + 1063 + /************* MMU Notifiers *************/ 1064 + 1065 + int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) 1066 + { 1067 + trace_kvm_unmap_hva(hva); 1068 + 1069 + /* 1070 + * Flush all shadow tlb entries everywhere. 
This is slow, but 1071 + * we are 100% sure that we catch the to be unmapped page 1072 + */ 1073 + kvm_flush_remote_tlbs(kvm); 1074 + 1075 + return 0; 1076 + } 1077 + 1078 + int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) 1079 + { 1080 + /* kvm_unmap_hva flushes everything anyways */ 1081 + kvm_unmap_hva(kvm, start); 1082 + 1083 + return 0; 1084 + } 1085 + 1086 + int kvm_age_hva(struct kvm *kvm, unsigned long hva) 1087 + { 1088 + /* XXX could be more clever ;) */ 1089 + return 0; 1090 + } 1091 + 1092 + int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) 1093 + { 1094 + /* XXX could be more clever ;) */ 1095 + return 0; 1096 + } 1097 + 1098 + void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) 1099 + { 1100 + /* The page will get remapped properly on its next fault */ 1101 + kvm_unmap_hva(kvm, hva); 1102 + } 1103 + 1104 + /*****************************************/ 1064 1105 1065 1106 static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500) 1066 1107 { ··· 1126 1081 } 1127 1082 1128 1083 vcpu_e500->num_shared_tlb_pages = 0; 1084 + 1085 + kfree(vcpu_e500->shared_tlb_pages); 1129 1086 vcpu_e500->shared_tlb_pages = NULL; 1130 1087 } else { 1131 1088 kfree(vcpu_e500->gtlb_arch); ··· 1225 1178 } 1226 1179 1227 1180 virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL); 1228 - if (!virt) 1181 + if (!virt) { 1182 + ret = -ENOMEM; 1229 1183 goto err_put_page; 1184 + } 1230 1185 1231 1186 privs[0] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[0], 1232 1187 GFP_KERNEL); 1233 1188 privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1], 1234 1189 GFP_KERNEL); 1235 1190 1236 - if (!privs[0] || !privs[1]) 1237 - goto err_put_page; 1191 + if (!privs[0] || !privs[1]) { 1192 + ret = -ENOMEM; 1193 + goto err_privs; 1194 + } 1238 1195 1239 1196 g2h_bitmap = kzalloc(sizeof(u64) * params.tlb_sizes[1], 1240 1197 GFP_KERNEL); 1241 - if (!g2h_bitmap) 1242 - goto err_put_page; 1198 + if (!g2h_bitmap) { 1199 + ret = 
-ENOMEM; 1200 + goto err_privs; 1201 + } 1243 1202 1244 1203 free_gtlb(vcpu_e500); 1245 1204 ··· 1285 1232 kvmppc_recalc_tlb1map_range(vcpu_e500); 1286 1233 return 0; 1287 1234 1288 - err_put_page: 1235 + err_privs: 1289 1236 kfree(privs[0]); 1290 1237 kfree(privs[1]); 1291 1238 1239 + err_put_page: 1292 1240 for (i = 0; i < num_pages; i++) 1293 1241 put_page(pages[i]); 1294 1242 ··· 1386 1332 if (!vcpu_e500->gtlb_priv[1]) 1387 1333 goto err; 1388 1334 1389 - vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(unsigned int) * 1335 + vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(u64) * 1390 1336 vcpu_e500->gtlb_params[1].entries, 1391 1337 GFP_KERNEL); 1392 1338 if (!vcpu_e500->g2h_tlb1_map)

+121 -100

arch/powerpc/kvm/emulate.c

··· 131 131 return vcpu->arch.dec - jd; 132 132 } 133 133 134 + static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) 135 + { 136 + enum emulation_result emulated = EMULATE_DONE; 137 + ulong spr_val = kvmppc_get_gpr(vcpu, rs); 138 + 139 + switch (sprn) { 140 + case SPRN_SRR0: 141 + vcpu->arch.shared->srr0 = spr_val; 142 + break; 143 + case SPRN_SRR1: 144 + vcpu->arch.shared->srr1 = spr_val; 145 + break; 146 + 147 + /* XXX We need to context-switch the timebase for 148 + * watchdog and FIT. */ 149 + case SPRN_TBWL: break; 150 + case SPRN_TBWU: break; 151 + 152 + case SPRN_MSSSR0: break; 153 + 154 + case SPRN_DEC: 155 + vcpu->arch.dec = spr_val; 156 + kvmppc_emulate_dec(vcpu); 157 + break; 158 + 159 + case SPRN_SPRG0: 160 + vcpu->arch.shared->sprg0 = spr_val; 161 + break; 162 + case SPRN_SPRG1: 163 + vcpu->arch.shared->sprg1 = spr_val; 164 + break; 165 + case SPRN_SPRG2: 166 + vcpu->arch.shared->sprg2 = spr_val; 167 + break; 168 + case SPRN_SPRG3: 169 + vcpu->arch.shared->sprg3 = spr_val; 170 + break; 171 + 172 + default: 173 + emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, 174 + spr_val); 175 + if (emulated == EMULATE_FAIL) 176 + printk(KERN_INFO "mtspr: unknown spr " 177 + "0x%x\n", sprn); 178 + break; 179 + } 180 + 181 + kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS); 182 + 183 + return emulated; 184 + } 185 + 186 + static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) 187 + { 188 + enum emulation_result emulated = EMULATE_DONE; 189 + ulong spr_val = 0; 190 + 191 + switch (sprn) { 192 + case SPRN_SRR0: 193 + spr_val = vcpu->arch.shared->srr0; 194 + break; 195 + case SPRN_SRR1: 196 + spr_val = vcpu->arch.shared->srr1; 197 + break; 198 + case SPRN_PVR: 199 + spr_val = vcpu->arch.pvr; 200 + break; 201 + case SPRN_PIR: 202 + spr_val = vcpu->vcpu_id; 203 + break; 204 + case SPRN_MSSSR0: 205 + spr_val = 0; 206 + break; 207 + 208 + /* Note: mftb and TBRL/TBWL are user-accessible, so 209 + * the guest can always access the real TB 
anyways. 210 + * In fact, we probably will never see these traps. */ 211 + case SPRN_TBWL: 212 + spr_val = get_tb() >> 32; 213 + break; 214 + case SPRN_TBWU: 215 + spr_val = get_tb(); 216 + break; 217 + 218 + case SPRN_SPRG0: 219 + spr_val = vcpu->arch.shared->sprg0; 220 + break; 221 + case SPRN_SPRG1: 222 + spr_val = vcpu->arch.shared->sprg1; 223 + break; 224 + case SPRN_SPRG2: 225 + spr_val = vcpu->arch.shared->sprg2; 226 + break; 227 + case SPRN_SPRG3: 228 + spr_val = vcpu->arch.shared->sprg3; 229 + break; 230 + /* Note: SPRG4-7 are user-readable, so we don't get 231 + * a trap. */ 232 + 233 + case SPRN_DEC: 234 + spr_val = kvmppc_get_dec(vcpu, get_tb()); 235 + break; 236 + default: 237 + emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, 238 + &spr_val); 239 + if (unlikely(emulated == EMULATE_FAIL)) { 240 + printk(KERN_INFO "mfspr: unknown spr " 241 + "0x%x\n", sprn); 242 + } 243 + break; 244 + } 245 + 246 + if (emulated == EMULATE_DONE) 247 + kvmppc_set_gpr(vcpu, rt, spr_val); 248 + kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS); 249 + 250 + return emulated; 251 + } 252 + 134 253 /* XXX to do: 135 254 * lhax 136 255 * lhaux ··· 275 156 int sprn = get_sprn(inst); 276 157 enum emulation_result emulated = EMULATE_DONE; 277 158 int advance = 1; 278 - ulong spr_val = 0; 279 159 280 160 /* this default type might be overwritten by subcategories */ 281 161 kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS); ··· 354 236 break; 355 237 356 238 case OP_31_XOP_MFSPR: 357 - switch (sprn) { 358 - case SPRN_SRR0: 359 - spr_val = vcpu->arch.shared->srr0; 360 - break; 361 - case SPRN_SRR1: 362 - spr_val = vcpu->arch.shared->srr1; 363 - break; 364 - case SPRN_PVR: 365 - spr_val = vcpu->arch.pvr; 366 - break; 367 - case SPRN_PIR: 368 - spr_val = vcpu->vcpu_id; 369 - break; 370 - case SPRN_MSSSR0: 371 - spr_val = 0; 372 - break; 373 - 374 - /* Note: mftb and TBRL/TBWL are user-accessible, so 375 - * the guest can always access the real TB anyways. 
376 - * In fact, we probably will never see these traps. */ 377 - case SPRN_TBWL: 378 - spr_val = get_tb() >> 32; 379 - break; 380 - case SPRN_TBWU: 381 - spr_val = get_tb(); 382 - break; 383 - 384 - case SPRN_SPRG0: 385 - spr_val = vcpu->arch.shared->sprg0; 386 - break; 387 - case SPRN_SPRG1: 388 - spr_val = vcpu->arch.shared->sprg1; 389 - break; 390 - case SPRN_SPRG2: 391 - spr_val = vcpu->arch.shared->sprg2; 392 - break; 393 - case SPRN_SPRG3: 394 - spr_val = vcpu->arch.shared->sprg3; 395 - break; 396 - /* Note: SPRG4-7 are user-readable, so we don't get 397 - * a trap. */ 398 - 399 - case SPRN_DEC: 400 - spr_val = kvmppc_get_dec(vcpu, get_tb()); 401 - break; 402 - default: 403 - emulated = kvmppc_core_emulate_mfspr(vcpu, sprn, 404 - &spr_val); 405 - if (unlikely(emulated == EMULATE_FAIL)) { 406 - printk(KERN_INFO "mfspr: unknown spr " 407 - "0x%x\n", sprn); 408 - } 409 - break; 410 - } 411 - kvmppc_set_gpr(vcpu, rt, spr_val); 412 - kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS); 239 + emulated = kvmppc_emulate_mfspr(vcpu, sprn, rt); 413 240 break; 414 241 415 242 case OP_31_XOP_STHX: ··· 371 308 break; 372 309 373 310 case OP_31_XOP_MTSPR: 374 - spr_val = kvmppc_get_gpr(vcpu, rs); 375 - switch (sprn) { 376 - case SPRN_SRR0: 377 - vcpu->arch.shared->srr0 = spr_val; 378 - break; 379 - case SPRN_SRR1: 380 - vcpu->arch.shared->srr1 = spr_val; 381 - break; 382 - 383 - /* XXX We need to context-switch the timebase for 384 - * watchdog and FIT. 
*/ 385 - case SPRN_TBWL: break; 386 - case SPRN_TBWU: break; 387 - 388 - case SPRN_MSSSR0: break; 389 - 390 - case SPRN_DEC: 391 - vcpu->arch.dec = spr_val; 392 - kvmppc_emulate_dec(vcpu); 393 - break; 394 - 395 - case SPRN_SPRG0: 396 - vcpu->arch.shared->sprg0 = spr_val; 397 - break; 398 - case SPRN_SPRG1: 399 - vcpu->arch.shared->sprg1 = spr_val; 400 - break; 401 - case SPRN_SPRG2: 402 - vcpu->arch.shared->sprg2 = spr_val; 403 - break; 404 - case SPRN_SPRG3: 405 - vcpu->arch.shared->sprg3 = spr_val; 406 - break; 407 - 408 - default: 409 - emulated = kvmppc_core_emulate_mtspr(vcpu, sprn, 410 - spr_val); 411 - if (emulated == EMULATE_FAIL) 412 - printk(KERN_INFO "mtspr: unknown spr " 413 - "0x%x\n", sprn); 414 - break; 415 - } 416 - kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS); 311 + emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs); 417 312 break; 418 313 419 314 case OP_31_XOP_DCBI:

+163 -24

arch/powerpc/kvm/powerpc.c

··· 30 30 #include <asm/kvm_ppc.h> 31 31 #include <asm/tlbflush.h> 32 32 #include <asm/cputhreads.h> 33 + #include <asm/irqflags.h> 33 34 #include "timing.h" 34 35 #include "../mm/mmu_decl.h" 35 36 ··· 39 38 40 39 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) 41 40 { 42 - return !(v->arch.shared->msr & MSR_WE) || 43 - !!(v->arch.pending_exceptions) || 41 + return !!(v->arch.pending_exceptions) || 44 42 v->requests; 45 43 } 46 44 ··· 47 47 { 48 48 return 1; 49 49 } 50 + 51 + #ifndef CONFIG_KVM_BOOK3S_64_HV 52 + /* 53 + * Common checks before entering the guest world. Call with interrupts 54 + * disabled. 55 + * 56 + * returns: 57 + * 58 + * == 1 if we're ready to go into guest state 59 + * <= 0 if we need to go back to the host with return value 60 + */ 61 + int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) 62 + { 63 + int r = 1; 64 + 65 + WARN_ON_ONCE(!irqs_disabled()); 66 + while (true) { 67 + if (need_resched()) { 68 + local_irq_enable(); 69 + cond_resched(); 70 + local_irq_disable(); 71 + continue; 72 + } 73 + 74 + if (signal_pending(current)) { 75 + kvmppc_account_exit(vcpu, SIGNAL_EXITS); 76 + vcpu->run->exit_reason = KVM_EXIT_INTR; 77 + r = -EINTR; 78 + break; 79 + } 80 + 81 + vcpu->mode = IN_GUEST_MODE; 82 + 83 + /* 84 + * Reading vcpu->requests must happen after setting vcpu->mode, 85 + * so we don't miss a request because the requester sees 86 + * OUTSIDE_GUEST_MODE and assumes we'll be checking requests 87 + * before next entering the guest (and thus doesn't IPI). 
88 + */ 89 + smp_mb(); 90 + 91 + if (vcpu->requests) { 92 + /* Make sure we process requests preemptable */ 93 + local_irq_enable(); 94 + trace_kvm_check_requests(vcpu); 95 + r = kvmppc_core_check_requests(vcpu); 96 + local_irq_disable(); 97 + if (r > 0) 98 + continue; 99 + break; 100 + } 101 + 102 + if (kvmppc_core_prepare_to_enter(vcpu)) { 103 + /* interrupts got enabled in between, so we 104 + are back at square 1 */ 105 + continue; 106 + } 107 + 108 + #ifdef CONFIG_PPC64 109 + /* lazy EE magic */ 110 + hard_irq_disable(); 111 + if (lazy_irq_pending()) { 112 + /* Got an interrupt in between, try again */ 113 + local_irq_enable(); 114 + local_irq_disable(); 115 + kvm_guest_exit(); 116 + continue; 117 + } 118 + 119 + trace_hardirqs_on(); 120 + #endif 121 + 122 + kvm_guest_enter(); 123 + break; 124 + } 125 + 126 + return r; 127 + } 128 + #endif /* CONFIG_KVM_BOOK3S_64_HV */ 50 129 51 130 int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) 52 131 { ··· 146 67 } 147 68 148 69 switch (nr) { 149 - case HC_VENDOR_KVM | KVM_HC_PPC_MAP_MAGIC_PAGE: 70 + case KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE): 150 71 { 151 72 vcpu->arch.magic_page_pa = param1; 152 73 vcpu->arch.magic_page_ea = param2; 153 74 154 75 r2 = KVM_MAGIC_FEAT_SR | KVM_MAGIC_FEAT_MAS0_TO_SPRG7; 155 76 156 - r = HC_EV_SUCCESS; 77 + r = EV_SUCCESS; 157 78 break; 158 79 } 159 - case HC_VENDOR_KVM | KVM_HC_FEATURES: 160 - r = HC_EV_SUCCESS; 80 + case KVM_HCALL_TOKEN(KVM_HC_FEATURES): 81 + r = EV_SUCCESS; 161 82 #if defined(CONFIG_PPC_BOOK3S) || defined(CONFIG_KVM_E500V2) 162 83 /* XXX Missing magic page on 44x */ 163 84 r2 |= (1 << KVM_FEATURE_MAGIC_PAGE); ··· 165 86 166 87 /* Second return value is in r4 */ 167 88 break; 89 + case EV_HCALL_TOKEN(EV_IDLE): 90 + r = EV_SUCCESS; 91 + kvm_vcpu_block(vcpu); 92 + clear_bit(KVM_REQ_UNHALT, &vcpu->requests); 93 + break; 168 94 default: 169 - r = HC_EV_UNIMPLEMENTED; 95 + r = EV_UNIMPLEMENTED; 170 96 break; 171 97 } 172 98 ··· 304 220 switch (ext) { 305 221 #ifdef CONFIG_BOOKE 
306 222 case KVM_CAP_PPC_BOOKE_SREGS: 223 + case KVM_CAP_PPC_BOOKE_WATCHDOG: 307 224 #else 308 225 case KVM_CAP_PPC_SEGSTATE: 309 226 case KVM_CAP_PPC_HIOR: ··· 314 229 case KVM_CAP_PPC_IRQ_LEVEL: 315 230 case KVM_CAP_ENABLE_CAP: 316 231 case KVM_CAP_ONE_REG: 232 + case KVM_CAP_IOEVENTFD: 317 233 r = 1; 318 234 break; 319 235 #ifndef CONFIG_KVM_BOOK3S_64_HV ··· 346 260 if (cpu_has_feature(CPU_FTR_ARCH_201)) 347 261 r = 2; 348 262 break; 263 + #endif 349 264 case KVM_CAP_SYNC_MMU: 265 + #ifdef CONFIG_KVM_BOOK3S_64_HV 350 266 r = cpu_has_feature(CPU_FTR_ARCH_206) ? 1 : 0; 267 + #elif defined(KVM_ARCH_WANT_MMU_NOTIFIER) 268 + r = 1; 269 + #else 270 + r = 0; 351 271 break; 352 272 #endif 273 + #ifdef CONFIG_KVM_BOOK3S_64_HV 274 + case KVM_CAP_PPC_HTAB_FD: 275 + r = 1; 276 + break; 277 + #endif 278 + break; 353 279 case KVM_CAP_NR_VCPUS: 354 280 /* 355 281 * Recommending a number of CPUs is somewhat arbitrary; we ··· 400 302 void kvm_arch_free_memslot(struct kvm_memory_slot *free, 401 303 struct kvm_memory_slot *dont) 402 304 { 403 - if (!dont || free->arch.rmap != dont->arch.rmap) { 404 - vfree(free->arch.rmap); 405 - free->arch.rmap = NULL; 406 - } 305 + kvmppc_core_free_memslot(free, dont); 407 306 } 408 307 409 308 int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) 410 309 { 411 - slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); 412 - if (!slot->arch.rmap) 413 - return -ENOMEM; 414 - 415 - return 0; 310 + return kvmppc_core_create_memslot(slot, npages); 416 311 } 417 312 418 313 int kvm_arch_prepare_memory_region(struct kvm *kvm, ··· 414 323 struct kvm_userspace_memory_region *mem, 415 324 int user_alloc) 416 325 { 417 - return kvmppc_core_prepare_memory_region(kvm, mem); 326 + return kvmppc_core_prepare_memory_region(kvm, memslot, mem); 418 327 } 419 328 420 329 void kvm_arch_commit_memory_region(struct kvm *kvm, ··· 422 331 struct kvm_memory_slot old, 423 332 int user_alloc) 424 333 { 425 - 
kvmppc_core_commit_memory_region(kvm, mem); 334 + kvmppc_core_commit_memory_region(kvm, mem, old); 426 335 } 427 336 428 337 void kvm_arch_flush_shadow_all(struct kvm *kvm) ··· 432 341 void kvm_arch_flush_shadow_memslot(struct kvm *kvm, 433 342 struct kvm_memory_slot *slot) 434 343 { 344 + kvmppc_core_flush_memslot(kvm, slot); 435 345 } 436 346 437 347 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) ··· 444 352 kvmppc_create_vcpu_debugfs(vcpu, id); 445 353 } 446 354 return vcpu; 355 + } 356 + 357 + int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) 358 + { 359 + return 0; 447 360 } 448 361 449 362 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) ··· 487 390 488 391 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) 489 392 { 393 + int ret; 394 + 490 395 hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); 491 396 tasklet_init(&vcpu->arch.tasklet, kvmppc_decrementer_func, (ulong)vcpu); 492 397 vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; ··· 497 398 #ifdef CONFIG_KVM_EXIT_TIMING 498 399 mutex_init(&vcpu->arch.exit_timing_lock); 499 400 #endif 500 - 501 - return 0; 401 + ret = kvmppc_subarch_vcpu_init(vcpu); 402 + return ret; 502 403 } 503 404 504 405 void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) 505 406 { 506 407 kvmppc_mmu_destroy(vcpu); 408 + kvmppc_subarch_vcpu_uninit(vcpu); 507 409 } 508 410 509 411 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) ··· 520 420 mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); 521 421 #endif 522 422 kvmppc_core_vcpu_load(vcpu, cpu); 523 - vcpu->cpu = smp_processor_id(); 524 423 } 525 424 526 425 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) ··· 528 429 #ifdef CONFIG_BOOKE 529 430 vcpu->arch.vrsave = mfspr(SPRN_VRSAVE); 530 431 #endif 531 - vcpu->cpu = -1; 532 432 } 533 433 534 434 int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, ··· 625 527 vcpu->mmio_is_write = 0; 626 528 vcpu->arch.mmio_sign_extend = 0; 627 529 530 + if (!kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, 
run->mmio.phys_addr, 531 + bytes, &run->mmio.data)) { 532 + kvmppc_complete_mmio_load(vcpu, run); 533 + vcpu->mmio_needed = 0; 534 + return EMULATE_DONE; 535 + } 536 + 628 537 return EMULATE_DO_MMIO; 629 538 } 630 539 ··· 641 536 { 642 537 int r; 643 538 644 - r = kvmppc_handle_load(run, vcpu, rt, bytes, is_bigendian); 645 539 vcpu->arch.mmio_sign_extend = 1; 540 + r = kvmppc_handle_load(run, vcpu, rt, bytes, is_bigendian); 646 541 647 542 return r; 648 543 } ··· 678 573 case 2: st_le16(data, val); break; 679 574 case 1: *(u8 *)data = val; break; 680 575 } 576 + } 577 + 578 + if (!kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr, 579 + bytes, &run->mmio.data)) { 580 + kvmppc_complete_mmio_load(vcpu, run); 581 + vcpu->mmio_needed = 0; 582 + return EMULATE_DONE; 681 583 } 682 584 683 585 return EMULATE_DO_MMIO; ··· 761 649 r = 0; 762 650 vcpu->arch.papr_enabled = true; 763 651 break; 652 + #ifdef CONFIG_BOOKE 653 + case KVM_CAP_PPC_BOOKE_WATCHDOG: 654 + r = 0; 655 + vcpu->arch.watchdog_enabled = true; 656 + break; 657 + #endif 764 658 #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) 765 659 case KVM_CAP_SW_TLB: { 766 660 struct kvm_config_tlb cfg; ··· 869 751 870 752 static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo) 871 753 { 754 + u32 inst_nop = 0x60000000; 755 + #ifdef CONFIG_KVM_BOOKE_HV 756 + u32 inst_sc1 = 0x44000022; 757 + pvinfo->hcall[0] = inst_sc1; 758 + pvinfo->hcall[1] = inst_nop; 759 + pvinfo->hcall[2] = inst_nop; 760 + pvinfo->hcall[3] = inst_nop; 761 + #else 872 762 u32 inst_lis = 0x3c000000; 873 763 u32 inst_ori = 0x60000000; 874 - u32 inst_nop = 0x60000000; 875 764 u32 inst_sc = 0x44000002; 876 765 u32 inst_imm_mask = 0xffff; 877 766 ··· 895 770 pvinfo->hcall[1] = inst_ori | (KVM_SC_MAGIC_R0 & inst_imm_mask); 896 771 pvinfo->hcall[2] = inst_sc; 897 772 pvinfo->hcall[3] = inst_nop; 773 + #endif 774 + 775 + pvinfo->flags = KVM_PPC_PVINFO_FLAGS_EV_IDLE; 898 776 899 777 return 0; 900 778 } ··· 958 830 if 
(put_user(htab_order, (u32 __user *)argp)) 959 831 break; 960 832 r = 0; 833 + break; 834 + } 835 + 836 + case KVM_PPC_GET_HTAB_FD: { 837 + struct kvm *kvm = filp->private_data; 838 + struct kvm_get_htab_fd ghf; 839 + 840 + r = -EFAULT; 841 + if (copy_from_user(&ghf, argp, sizeof(ghf))) 842 + break; 843 + r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf); 961 844 break; 962 845 } 963 846 #endif /* CONFIG_KVM_BOOK3S_64_HV */

+176 -28

arch/powerpc/kvm/trace.h

··· 31 31 __entry->inst, __entry->pc, __entry->emulate) 32 32 ); 33 33 34 + #ifdef CONFIG_PPC_BOOK3S 35 + #define kvm_trace_symbol_exit \ 36 + {0x100, "SYSTEM_RESET"}, \ 37 + {0x200, "MACHINE_CHECK"}, \ 38 + {0x300, "DATA_STORAGE"}, \ 39 + {0x380, "DATA_SEGMENT"}, \ 40 + {0x400, "INST_STORAGE"}, \ 41 + {0x480, "INST_SEGMENT"}, \ 42 + {0x500, "EXTERNAL"}, \ 43 + {0x501, "EXTERNAL_LEVEL"}, \ 44 + {0x502, "EXTERNAL_HV"}, \ 45 + {0x600, "ALIGNMENT"}, \ 46 + {0x700, "PROGRAM"}, \ 47 + {0x800, "FP_UNAVAIL"}, \ 48 + {0x900, "DECREMENTER"}, \ 49 + {0x980, "HV_DECREMENTER"}, \ 50 + {0xc00, "SYSCALL"}, \ 51 + {0xd00, "TRACE"}, \ 52 + {0xe00, "H_DATA_STORAGE"}, \ 53 + {0xe20, "H_INST_STORAGE"}, \ 54 + {0xe40, "H_EMUL_ASSIST"}, \ 55 + {0xf00, "PERFMON"}, \ 56 + {0xf20, "ALTIVEC"}, \ 57 + {0xf40, "VSX"} 58 + #else 59 + #define kvm_trace_symbol_exit \ 60 + {0, "CRITICAL"}, \ 61 + {1, "MACHINE_CHECK"}, \ 62 + {2, "DATA_STORAGE"}, \ 63 + {3, "INST_STORAGE"}, \ 64 + {4, "EXTERNAL"}, \ 65 + {5, "ALIGNMENT"}, \ 66 + {6, "PROGRAM"}, \ 67 + {7, "FP_UNAVAIL"}, \ 68 + {8, "SYSCALL"}, \ 69 + {9, "AP_UNAVAIL"}, \ 70 + {10, "DECREMENTER"}, \ 71 + {11, "FIT"}, \ 72 + {12, "WATCHDOG"}, \ 73 + {13, "DTLB_MISS"}, \ 74 + {14, "ITLB_MISS"}, \ 75 + {15, "DEBUG"}, \ 76 + {32, "SPE_UNAVAIL"}, \ 77 + {33, "SPE_FP_DATA"}, \ 78 + {34, "SPE_FP_ROUND"}, \ 79 + {35, "PERFORMANCE_MONITOR"}, \ 80 + {36, "DOORBELL"}, \ 81 + {37, "DOORBELL_CRITICAL"}, \ 82 + {38, "GUEST_DBELL"}, \ 83 + {39, "GUEST_DBELL_CRIT"}, \ 84 + {40, "HV_SYSCALL"}, \ 85 + {41, "HV_PRIV"} 86 + #endif 87 + 88 + TRACE_EVENT(kvm_exit, 89 + TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), 90 + TP_ARGS(exit_nr, vcpu), 91 + 92 + TP_STRUCT__entry( 93 + __field( unsigned int, exit_nr ) 94 + __field( unsigned long, pc ) 95 + __field( unsigned long, msr ) 96 + __field( unsigned long, dar ) 97 + #ifdef CONFIG_KVM_BOOK3S_PR 98 + __field( unsigned long, srr1 ) 99 + #endif 100 + __field( unsigned long, last_inst ) 101 + ), 102 + 103 + 
TP_fast_assign( 104 + #ifdef CONFIG_KVM_BOOK3S_PR 105 + struct kvmppc_book3s_shadow_vcpu *svcpu; 106 + #endif 107 + __entry->exit_nr = exit_nr; 108 + __entry->pc = kvmppc_get_pc(vcpu); 109 + __entry->dar = kvmppc_get_fault_dar(vcpu); 110 + __entry->msr = vcpu->arch.shared->msr; 111 + #ifdef CONFIG_KVM_BOOK3S_PR 112 + svcpu = svcpu_get(vcpu); 113 + __entry->srr1 = svcpu->shadow_srr1; 114 + svcpu_put(svcpu); 115 + #endif 116 + __entry->last_inst = vcpu->arch.last_inst; 117 + ), 118 + 119 + TP_printk("exit=%s" 120 + " | pc=0x%lx" 121 + " | msr=0x%lx" 122 + " | dar=0x%lx" 123 + #ifdef CONFIG_KVM_BOOK3S_PR 124 + " | srr1=0x%lx" 125 + #endif 126 + " | last_inst=0x%lx" 127 + , 128 + __print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit), 129 + __entry->pc, 130 + __entry->msr, 131 + __entry->dar, 132 + #ifdef CONFIG_KVM_BOOK3S_PR 133 + __entry->srr1, 134 + #endif 135 + __entry->last_inst 136 + ) 137 + ); 138 + 139 + TRACE_EVENT(kvm_unmap_hva, 140 + TP_PROTO(unsigned long hva), 141 + TP_ARGS(hva), 142 + 143 + TP_STRUCT__entry( 144 + __field( unsigned long, hva ) 145 + ), 146 + 147 + TP_fast_assign( 148 + __entry->hva = hva; 149 + ), 150 + 151 + TP_printk("unmap hva 0x%lx\n", __entry->hva) 152 + ); 153 + 34 154 TRACE_EVENT(kvm_stlb_inval, 35 155 TP_PROTO(unsigned int stlb_index), 36 156 TP_ARGS(stlb_index), ··· 218 98 __entry->word1, __entry->word2) 219 99 ); 220 100 101 + TRACE_EVENT(kvm_check_requests, 102 + TP_PROTO(struct kvm_vcpu *vcpu), 103 + TP_ARGS(vcpu), 104 + 105 + TP_STRUCT__entry( 106 + __field( __u32, cpu_nr ) 107 + __field( __u32, requests ) 108 + ), 109 + 110 + TP_fast_assign( 111 + __entry->cpu_nr = vcpu->vcpu_id; 112 + __entry->requests = vcpu->requests; 113 + ), 114 + 115 + TP_printk("vcpu=%x requests=%x", 116 + __entry->cpu_nr, __entry->requests) 117 + ); 118 + 221 119 222 120 /************************************************************************* 223 121 * Book3S trace points * 224 122 
*************************************************************************/ 225 123 226 124 #ifdef CONFIG_KVM_BOOK3S_PR 227 - 228 - TRACE_EVENT(kvm_book3s_exit, 229 - TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), 230 - TP_ARGS(exit_nr, vcpu), 231 - 232 - TP_STRUCT__entry( 233 - __field( unsigned int, exit_nr ) 234 - __field( unsigned long, pc ) 235 - __field( unsigned long, msr ) 236 - __field( unsigned long, dar ) 237 - __field( unsigned long, srr1 ) 238 - ), 239 - 240 - TP_fast_assign( 241 - struct kvmppc_book3s_shadow_vcpu *svcpu; 242 - __entry->exit_nr = exit_nr; 243 - __entry->pc = kvmppc_get_pc(vcpu); 244 - __entry->dar = kvmppc_get_fault_dar(vcpu); 245 - __entry->msr = vcpu->arch.shared->msr; 246 - svcpu = svcpu_get(vcpu); 247 - __entry->srr1 = svcpu->shadow_srr1; 248 - svcpu_put(svcpu); 249 - ), 250 - 251 - TP_printk("exit=0x%x | pc=0x%lx | msr=0x%lx | dar=0x%lx | srr1=0x%lx", 252 - __entry->exit_nr, __entry->pc, __entry->msr, __entry->dar, 253 - __entry->srr1) 254 - ); 255 125 256 126 TRACE_EVENT(kvm_book3s_reenter, 257 127 TP_PROTO(int r, struct kvm_vcpu *vcpu), ··· 503 393 TP_printk("mas0=%x mas1=%x mas2=%llx mas7_3=%llx", 504 394 __entry->mas0, __entry->mas1, 505 395 __entry->mas2, __entry->mas7_3) 396 + ); 397 + 398 + TRACE_EVENT(kvm_booke206_ref_release, 399 + TP_PROTO(__u64 pfn, __u32 flags), 400 + TP_ARGS(pfn, flags), 401 + 402 + TP_STRUCT__entry( 403 + __field( __u64, pfn ) 404 + __field( __u32, flags ) 405 + ), 406 + 407 + TP_fast_assign( 408 + __entry->pfn = pfn; 409 + __entry->flags = flags; 410 + ), 411 + 412 + TP_printk("pfn=%llx flags=%x", 413 + __entry->pfn, __entry->flags) 414 + ); 415 + 416 + TRACE_EVENT(kvm_booke_queue_irqprio, 417 + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int priority), 418 + TP_ARGS(vcpu, priority), 419 + 420 + TP_STRUCT__entry( 421 + __field( __u32, cpu_nr ) 422 + __field( __u32, priority ) 423 + __field( unsigned long, pending ) 424 + ), 425 + 426 + TP_fast_assign( 427 + __entry->cpu_nr = vcpu->vcpu_id; 428 + 
__entry->priority = priority; 429 + __entry->pending = vcpu->arch.pending_exceptions; 430 + ), 431 + 432 + TP_printk("vcpu=%x prio=%x pending=%lx", 433 + __entry->cpu_nr, __entry->priority, __entry->pending) 506 434 ); 507 435 508 436 #endif

+1

arch/powerpc/platforms/Kconfig

··· 90 90 config PPC_EPAPR_HV_PIC 91 91 bool 92 92 default n 93 + select EPAPR_PARAVIRT 93 94 94 95 config MPIC_WEIRD 95 96 bool

+7 -2

arch/powerpc/sysdev/fsl_msi.c

··· 236 236 u32 intr_index; 237 237 u32 have_shift = 0; 238 238 struct fsl_msi_cascade_data *cascade_data; 239 - unsigned int ret; 240 239 241 240 cascade_data = irq_get_handler_data(irq); 242 241 msi_data = cascade_data->msi_data; ··· 267 268 case FSL_PIC_IP_IPIC: 268 269 msir_value = fsl_msi_read(msi_data->msi_regs, msir_index * 0x4); 269 270 break; 270 - case FSL_PIC_IP_VMPIC: 271 + #ifdef CONFIG_EPAPR_PARAVIRT 272 + case FSL_PIC_IP_VMPIC: { 273 + unsigned int ret; 271 274 ret = fh_vmpic_get_msir(virq_to_hw(irq), &msir_value); 272 275 if (ret) { 273 276 pr_err("fsl-msi: fh_vmpic_get_msir() failed for " ··· 277 276 msir_value = 0; 278 277 } 279 278 break; 279 + } 280 + #endif 280 281 } 281 282 282 283 while (msir_value) { ··· 511 508 .compatible = "fsl,ipic-msi", 512 509 .data = &ipic_msi_feature, 513 510 }, 511 + #ifdef CONFIG_EPAPR_PARAVIRT 514 512 { 515 513 .compatible = "fsl,vmpic-msi", 516 514 .data = &vmpic_msi_feature, 517 515 }, 516 + #endif 518 517 {} 519 518 }; 520 519

+2

arch/powerpc/sysdev/fsl_soc.c

··· 253 253 EXPORT_SYMBOL(diu_ops); 254 254 #endif 255 255 256 + #ifdef CONFIG_EPAPR_PARAVIRT 256 257 /* 257 258 * Restart the current partition 258 259 * ··· 279 278 pr_info("hv exit\n"); 280 279 fh_partition_stop(-1); 281 280 } 281 + #endif

+19 -2

arch/s390/kvm/interrupt.c

··· 629 629 break; 630 630 case KVM_S390_SIGP_STOP: 631 631 case KVM_S390_RESTART: 632 - case KVM_S390_INT_EXTERNAL_CALL: 633 - case KVM_S390_INT_EMERGENCY: 634 632 VCPU_EVENT(vcpu, 3, "inject: type %x", s390int->type); 635 633 inti->type = s390int->type; 634 + break; 635 + case KVM_S390_INT_EXTERNAL_CALL: 636 + if (s390int->parm & 0xffff0000) { 637 + kfree(inti); 638 + return -EINVAL; 639 + } 640 + VCPU_EVENT(vcpu, 3, "inject: external call source-cpu:%u", 641 + s390int->parm); 642 + inti->type = s390int->type; 643 + inti->extcall.code = s390int->parm; 644 + break; 645 + case KVM_S390_INT_EMERGENCY: 646 + if (s390int->parm & 0xffff0000) { 647 + kfree(inti); 648 + return -EINVAL; 649 + } 650 + VCPU_EVENT(vcpu, 3, "inject: emergency %u\n", s390int->parm); 651 + inti->type = s390int->type; 652 + inti->emerg.code = s390int->parm; 636 653 break; 637 654 case KVM_S390_INT_VIRTIO: 638 655 case KVM_S390_INT_SERVICE:

+6 -1

arch/s390/kvm/kvm-s390.c

··· 355 355 atomic_set_mask(CPUSTAT_STOPPED, &vcpu->arch.sie_block->cpuflags); 356 356 } 357 357 358 + int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) 359 + { 360 + return 0; 361 + } 362 + 358 363 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 359 364 { 360 365 atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH | ··· 998 993 } 999 994 memcpy(facilities, S390_lowcore.stfle_fac_list, 16); 1000 995 facilities[0] &= 0xff00fff3f47c0000ULL; 1001 - facilities[1] &= 0x201c000000000000ULL; 996 + facilities[1] &= 0x001c000000000000ULL; 1002 997 return 0; 1003 998 } 1004 999

+1

arch/x86/include/asm/clocksource.h

··· 8 8 #define VCLOCK_NONE 0 /* No vDSO clock available. */ 9 9 #define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ 10 10 #define VCLOCK_HPET 2 /* vDSO should use vread_hpet. */ 11 + #define VCLOCK_PVCLOCK 3 /* vDSO should use vread_pvclock. */ 11 12 12 13 struct arch_clocksource_data { 13 14 int vclock_mode;

+1

arch/x86/include/asm/cpufeature.h

··· 202 202 203 203 /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ 204 204 #define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ 205 + #define X86_FEATURE_TSC_ADJUST (9*32+ 1) /* TSC adjustment MSR 0x3b */ 205 206 #define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */ 206 207 #define X86_FEATURE_HLE (9*32+ 4) /* Hardware Lock Elision */ 207 208 #define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */

+5

arch/x86/include/asm/fixmap.h

··· 19 19 #include <asm/acpi.h> 20 20 #include <asm/apicdef.h> 21 21 #include <asm/page.h> 22 + #include <asm/pvclock.h> 22 23 #ifdef CONFIG_X86_32 23 24 #include <linux/threads.h> 24 25 #include <asm/kmap_types.h> ··· 81 80 + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1, 82 81 VVAR_PAGE, 83 82 VSYSCALL_HPET, 83 + #endif 84 + #ifdef CONFIG_PARAVIRT_CLOCK 85 + PVCLOCK_FIXMAP_BEGIN, 86 + PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1, 84 87 #endif 85 88 FIX_DBGP_BASE, 86 89 FIX_EARLYCON_MEM_BASE,

+3

arch/x86/include/asm/kexec.h

··· 163 163 }; 164 164 #endif 165 165 166 + typedef void crash_vmclear_fn(void); 167 + extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss; 168 + 166 169 #endif /* __ASSEMBLY__ */ 167 170 168 171 #endif /* _ASM_X86_KEXEC_H */

+6

arch/x86/include/asm/kvm_guest.h

··· 1 + #ifndef _ASM_X86_KVM_GUEST_H 2 + #define _ASM_X86_KVM_GUEST_H 3 + 4 + int kvm_setup_vsyscall_timeinfo(void); 5 + 6 + #endif /* _ASM_X86_KVM_GUEST_H */

+20 -4

arch/x86/include/asm/kvm_host.h

··· 22 22 #include <linux/kvm_para.h> 23 23 #include <linux/kvm_types.h> 24 24 #include <linux/perf_event.h> 25 + #include <linux/pvclock_gtod.h> 26 + #include <linux/clocksource.h> 25 27 26 28 #include <asm/pvclock-abi.h> 27 29 #include <asm/desc.h> ··· 444 442 s8 virtual_tsc_shift; 445 443 u32 virtual_tsc_mult; 446 444 u32 virtual_tsc_khz; 445 + s64 ia32_tsc_adjust_msr; 447 446 448 447 atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ 449 448 unsigned nmi_pending; /* NMI queued after currently running handler */ ··· 562 559 u64 cur_tsc_write; 563 560 u64 cur_tsc_offset; 564 561 u8 cur_tsc_generation; 562 + int nr_vcpus_matched_tsc; 563 + 564 + spinlock_t pvclock_gtod_sync_lock; 565 + bool use_master_clock; 566 + u64 master_kernel_ns; 567 + cycle_t master_cycle_now; 565 568 566 569 struct kvm_xen_hvm_config xen_hvm_config; 567 570 ··· 621 612 622 613 struct x86_instruction_info; 623 614 615 + struct msr_data { 616 + bool host_initiated; 617 + u32 index; 618 + u64 data; 619 + }; 620 + 624 621 struct kvm_x86_ops { 625 622 int (*cpu_has_kvm_support)(void); /* __init */ 626 623 int (*disabled_by_bios)(void); /* __init */ ··· 649 634 650 635 void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu); 651 636 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); 652 - int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); 637 + int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); 653 638 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); 654 639 void (*get_segment)(struct kvm_vcpu *vcpu, 655 640 struct kvm_segment *var, int seg); ··· 712 697 bool (*has_wbinvd_exit)(void); 713 698 714 699 void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale); 700 + u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu); 715 701 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); 716 702 717 703 u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); 718 - u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu); 704 + u64 
(*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc); 719 705 720 706 void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); 721 707 ··· 801 785 802 786 void kvm_enable_efer_bits(u64); 803 787 int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); 804 - int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); 788 + int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); 805 789 806 790 struct x86_emulate_ctxt; 807 791 ··· 828 812 int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); 829 813 830 814 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); 831 - int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); 815 + int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); 832 816 833 817 unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); 834 818 void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);

+1

arch/x86/include/asm/msr-index.h

··· 236 236 #define MSR_IA32_EBL_CR_POWERON 0x0000002a 237 237 #define MSR_EBC_FREQUENCY_ID 0x0000002c 238 238 #define MSR_IA32_FEATURE_CONTROL 0x0000003a 239 + #define MSR_IA32_TSC_ADJUST 0x0000003b 239 240 240 241 #define FEATURE_CONTROL_LOCKED (1<<0) 241 242 #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1)

+47

arch/x86/include/asm/pvclock.h

··· 6 6 7 7 /* some helper functions for xen and kvm pv clock sources */ 8 8 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); 9 + u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src); 9 10 void pvclock_set_flags(u8 flags); 10 11 unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src); 11 12 void pvclock_read_wallclock(struct pvclock_wall_clock *wall, ··· 56 55 57 56 return product; 58 57 } 58 + 59 + static __always_inline 60 + u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src) 61 + { 62 + u64 delta = __native_read_tsc() - src->tsc_timestamp; 63 + return pvclock_scale_delta(delta, src->tsc_to_system_mul, 64 + src->tsc_shift); 65 + } 66 + 67 + static __always_inline 68 + unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, 69 + cycle_t *cycles, u8 *flags) 70 + { 71 + unsigned version; 72 + cycle_t ret, offset; 73 + u8 ret_flags; 74 + 75 + version = src->version; 76 + /* Note: emulated platforms which do not advertise SSE2 support 77 + * result in kvmclock not using the necessary RDTSC barriers. 78 + * Without barriers, it is possible that RDTSC instruction reads from 79 + * the time stamp counter outside rdtsc_barrier protected section 80 + * below, resulting in violation of monotonicity. 
81 + */ 82 + rdtsc_barrier(); 83 + offset = pvclock_get_nsec_offset(src); 84 + ret = src->system_time + offset; 85 + ret_flags = src->flags; 86 + rdtsc_barrier(); 87 + 88 + *cycles = ret; 89 + *flags = ret_flags; 90 + return version; 91 + } 92 + 93 + struct pvclock_vsyscall_time_info { 94 + struct pvclock_vcpu_time_info pvti; 95 + u32 migrate_count; 96 + } __attribute__((__aligned__(SMP_CACHE_BYTES))); 97 + 98 + #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) 99 + #define PVCLOCK_VSYSCALL_NR_PAGES (((NR_CPUS-1)/(PAGE_SIZE/PVTI_SIZE))+1) 100 + 101 + int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, 102 + int size); 103 + struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu); 59 104 60 105 #endif /* _ASM_X86_PVCLOCK_H */

+1 -2

arch/x86/include/asm/vmx.h

··· 445 445 #define VMX_EPTP_WB_BIT (1ull << 14) 446 446 #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) 447 447 #define VMX_EPT_1GB_PAGE_BIT (1ull << 17) 448 - #define VMX_EPT_AD_BIT (1ull << 21) 449 - #define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) 448 + #define VMX_EPT_AD_BIT (1ull << 21) 450 449 #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) 451 450 #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) 452 451

+20

arch/x86/include/asm/vsyscall.h

··· 33 33 */ 34 34 extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); 35 35 36 + #ifdef CONFIG_X86_64 37 + 38 + #define VGETCPU_CPU_MASK 0xfff 39 + 40 + static inline unsigned int __getcpu(void) 41 + { 42 + unsigned int p; 43 + 44 + if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) { 45 + /* Load per CPU data from RDTSCP */ 46 + native_read_tscp(&p); 47 + } else { 48 + /* Load per CPU data from GDT */ 49 + asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); 50 + } 51 + 52 + return p; 53 + } 54 + #endif /* CONFIG_X86_64 */ 55 + 36 56 #endif /* __KERNEL__ */ 37 57 38 58 #endif /* _ASM_X86_VSYSCALL_H */

+32

arch/x86/kernel/crash.c

··· 16 16 #include <linux/delay.h> 17 17 #include <linux/elf.h> 18 18 #include <linux/elfcore.h> 19 + #include <linux/module.h> 19 20 20 21 #include <asm/processor.h> 21 22 #include <asm/hardirq.h> ··· 30 29 #include <asm/virtext.h> 31 30 32 31 int in_crash_kexec; 32 + 33 + /* 34 + * This is used to VMCLEAR all VMCSs loaded on the 35 + * processor. And when loading kvm_intel module, the 36 + * callback function pointer will be assigned. 37 + * 38 + * protected by rcu. 39 + */ 40 + crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL; 41 + EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss); 42 + 43 + static inline void cpu_crash_vmclear_loaded_vmcss(void) 44 + { 45 + crash_vmclear_fn *do_vmclear_operation = NULL; 46 + 47 + rcu_read_lock(); 48 + do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss); 49 + if (do_vmclear_operation) 50 + do_vmclear_operation(); 51 + rcu_read_unlock(); 52 + } 33 53 34 54 #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 35 55 ··· 67 45 } 68 46 #endif 69 47 crash_save_cpu(regs, cpu); 48 + 49 + /* 50 + * VMCLEAR VMCSs loaded on all cpus if needed. 51 + */ 52 + cpu_crash_vmclear_loaded_vmcss(); 70 53 71 54 /* Disable VMX or SVM if needed. 72 55 * ··· 114 87 local_irq_disable(); 115 88 116 89 kdump_nmi_shootdown_cpus(); 90 + 91 + /* 92 + * VMCLEAR VMCSs loaded on this cpu if needed. 93 + */ 94 + cpu_crash_vmclear_loaded_vmcss(); 117 95 118 96 /* Booting kdump kernel with VMX or SVM enabled won't work, 119 97 * because (among other limitations) we can't disable paging

+14 -6

arch/x86/kernel/kvm.c

··· 42 42 #include <asm/apic.h> 43 43 #include <asm/apicdef.h> 44 44 #include <asm/hypervisor.h> 45 + #include <asm/kvm_guest.h> 45 46 46 47 static int kvmapf = 1; 47 48 ··· 62 61 } 63 62 64 63 early_param("no-steal-acc", parse_no_stealacc); 64 + 65 + static int kvmclock_vsyscall = 1; 66 + static int parse_no_kvmclock_vsyscall(char *arg) 67 + { 68 + kvmclock_vsyscall = 0; 69 + return 0; 70 + } 71 + 72 + early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); 65 73 66 74 static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); 67 75 static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); ··· 120 110 struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; 121 111 struct kvm_task_sleep_node n, *e; 122 112 DEFINE_WAIT(wait); 123 - int cpu, idle; 124 - 125 - cpu = get_cpu(); 126 - idle = idle_cpu(cpu); 127 - put_cpu(); 128 113 129 114 spin_lock(&b->lock); 130 115 e = _find_apf_task(b, token); ··· 133 128 134 129 n.token = token; 135 130 n.cpu = smp_processor_id(); 136 - n.halted = idle || preempt_count() > 1; 131 + n.halted = is_idle_task(current) || preempt_count() > 1; 137 132 init_waitqueue_head(&n.wq); 138 133 hlist_add_head(&n.link, &b->list); 139 134 spin_unlock(&b->lock); ··· 475 470 476 471 if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) 477 472 apic_set_eoi_write(kvm_guest_apic_eoi_write); 473 + 474 + if (kvmclock_vsyscall) 475 + kvm_setup_vsyscall_timeinfo(); 478 476 479 477 #ifdef CONFIG_SMP 480 478 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;

+77 -11

arch/x86/kernel/kvmclock.c

··· 23 23 #include <asm/apic.h> 24 24 #include <linux/percpu.h> 25 25 #include <linux/hardirq.h> 26 + #include <linux/memblock.h> 26 27 27 28 #include <asm/x86_init.h> 28 29 #include <asm/reboot.h> ··· 40 39 early_param("no-kvmclock", parse_no_kvmclock); 41 40 42 41 /* The hypervisor will put information about time periodically here */ 43 - static DEFINE_PER_CPU_SHARED_ALIGNED(struct pvclock_vcpu_time_info, hv_clock); 42 + static struct pvclock_vsyscall_time_info *hv_clock; 44 43 static struct pvclock_wall_clock wall_clock; 45 44 46 45 /* ··· 53 52 struct pvclock_vcpu_time_info *vcpu_time; 54 53 struct timespec ts; 55 54 int low, high; 55 + int cpu; 56 56 57 57 low = (int)__pa_symbol(&wall_clock); 58 58 high = ((u64)__pa_symbol(&wall_clock) >> 32); 59 59 60 60 native_write_msr(msr_kvm_wall_clock, low, high); 61 61 62 - vcpu_time = &get_cpu_var(hv_clock); 62 + preempt_disable(); 63 + cpu = smp_processor_id(); 64 + 65 + vcpu_time = &hv_clock[cpu].pvti; 63 66 pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); 64 - put_cpu_var(hv_clock); 67 + 68 + preempt_enable(); 65 69 66 70 return ts.tv_sec; 67 71 } ··· 80 74 { 81 75 struct pvclock_vcpu_time_info *src; 82 76 cycle_t ret; 77 + int cpu; 83 78 84 79 preempt_disable_notrace(); 85 - src = &__get_cpu_var(hv_clock); 80 + cpu = smp_processor_id(); 81 + src = &hv_clock[cpu].pvti; 86 82 ret = pvclock_clocksource_read(src); 87 83 preempt_enable_notrace(); 88 84 return ret; ··· 107 99 static unsigned long kvm_get_tsc_khz(void) 108 100 { 109 101 struct pvclock_vcpu_time_info *src; 110 - src = &per_cpu(hv_clock, 0); 111 - return pvclock_tsc_khz(src); 102 + int cpu; 103 + unsigned long tsc_khz; 104 + 105 + preempt_disable(); 106 + cpu = smp_processor_id(); 107 + src = &hv_clock[cpu].pvti; 108 + tsc_khz = pvclock_tsc_khz(src); 109 + preempt_enable(); 110 + return tsc_khz; 112 111 } 113 112 114 113 static void kvm_get_preset_lpj(void) ··· 134 119 { 135 120 bool ret = false; 136 121 struct pvclock_vcpu_time_info *src; 122 + int cpu 
= smp_processor_id(); 137 123 138 - src = &__get_cpu_var(hv_clock); 124 + if (!hv_clock) 125 + return ret; 126 + 127 + src = &hv_clock[cpu].pvti; 139 128 if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { 140 - __this_cpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED); 129 + src->flags &= ~PVCLOCK_GUEST_STOPPED; 141 130 ret = true; 142 131 } 143 132 ··· 160 141 { 161 142 int cpu = smp_processor_id(); 162 143 int low, high, ret; 144 + struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti; 163 145 164 - low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; 165 - high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); 146 + low = (int)__pa(src) | 1; 147 + high = ((u64)__pa(src) >> 32); 166 148 ret = native_write_msr_safe(msr_kvm_system_time, low, high); 167 149 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", 168 150 cpu, high, low, txt); ··· 217 197 218 198 void __init kvmclock_init(void) 219 199 { 200 + unsigned long mem; 201 + 220 202 if (!kvm_para_available()) 221 203 return; 222 204 ··· 231 209 printk(KERN_INFO "kvm-clock: Using msrs %x and %x", 232 210 msr_kvm_system_time, msr_kvm_wall_clock); 233 211 234 - if (kvm_register_clock("boot clock")) 212 + mem = memblock_alloc(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS, 213 + PAGE_SIZE); 214 + if (!mem) 235 215 return; 216 + hv_clock = __va(mem); 217 + 218 + if (kvm_register_clock("boot clock")) { 219 + hv_clock = NULL; 220 + memblock_free(mem, 221 + sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); 222 + return; 223 + } 236 224 pv_time_ops.sched_clock = kvm_clock_read; 237 225 x86_platform.calibrate_tsc = kvm_get_tsc_khz; 238 226 x86_platform.get_wallclock = kvm_get_wallclock; ··· 264 232 265 233 if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) 266 234 pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); 235 + } 236 + 237 + int __init kvm_setup_vsyscall_timeinfo(void) 238 + { 239 + #ifdef CONFIG_X86_64 240 + int cpu; 241 + int ret; 242 + u8 flags; 243 + struct pvclock_vcpu_time_info *vcpu_time; 244 + unsigned int 
size; 245 + 246 + size = sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS; 247 + 248 + preempt_disable(); 249 + cpu = smp_processor_id(); 250 + 251 + vcpu_time = &hv_clock[cpu].pvti; 252 + flags = pvclock_read_flags(vcpu_time); 253 + 254 + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) { 255 + preempt_enable(); 256 + return 1; 257 + } 258 + 259 + if ((ret = pvclock_init_vsyscall(hv_clock, size))) { 260 + preempt_enable(); 261 + return ret; 262 + } 263 + 264 + preempt_enable(); 265 + 266 + kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; 267 + #endif 268 + return 0; 267 269 }

+93 -54

arch/x86/kernel/pvclock.c

··· 17 17 18 18 #include <linux/kernel.h> 19 19 #include <linux/percpu.h> 20 + #include <linux/notifier.h> 21 + #include <linux/sched.h> 22 + #include <linux/gfp.h> 23 + #include <linux/bootmem.h> 24 + #include <asm/fixmap.h> 20 25 #include <asm/pvclock.h> 21 - 22 - /* 23 - * These are perodically updated 24 - * xen: magic shared_info page 25 - * kvm: gpa registered via msr 26 - * and then copied here. 27 - */ 28 - struct pvclock_shadow_time { 29 - u64 tsc_timestamp; /* TSC at last update of time vals. */ 30 - u64 system_timestamp; /* Time, in nanosecs, since boot. */ 31 - u32 tsc_to_nsec_mul; 32 - int tsc_shift; 33 - u32 version; 34 - u8 flags; 35 - }; 36 26 37 27 static u8 valid_flags __read_mostly = 0; 38 28 39 29 void pvclock_set_flags(u8 flags) 40 30 { 41 31 valid_flags = flags; 42 - } 43 - 44 - static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) 45 - { 46 - u64 delta = native_read_tsc() - shadow->tsc_timestamp; 47 - return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul, 48 - shadow->tsc_shift); 49 - } 50 - 51 - /* 52 - * Reads a consistent set of time-base values from hypervisor, 53 - * into a shadow data area. 
54 - */ 55 - static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, 56 - struct pvclock_vcpu_time_info *src) 57 - { 58 - do { 59 - dst->version = src->version; 60 - rmb(); /* fetch version before data */ 61 - dst->tsc_timestamp = src->tsc_timestamp; 62 - dst->system_timestamp = src->system_time; 63 - dst->tsc_to_nsec_mul = src->tsc_to_system_mul; 64 - dst->tsc_shift = src->tsc_shift; 65 - dst->flags = src->flags; 66 - rmb(); /* test version after fetching data */ 67 - } while ((src->version & 1) || (dst->version != src->version)); 68 - 69 - return dst->version; 70 32 } 71 33 72 34 unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) ··· 50 88 atomic64_set(&last_value, 0); 51 89 } 52 90 53 - cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) 91 + u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src) 54 92 { 55 - struct pvclock_shadow_time shadow; 56 93 unsigned version; 57 - cycle_t ret, offset; 58 - u64 last; 94 + cycle_t ret; 95 + u8 flags; 59 96 60 97 do { 61 - version = pvclock_get_time_values(&shadow, src); 62 - barrier(); 63 - offset = pvclock_get_nsec_offset(&shadow); 64 - ret = shadow.system_timestamp + offset; 65 - barrier(); 66 - } while (version != src->version); 98 + version = __pvclock_read_cycles(src, &ret, &flags); 99 + } while ((src->version & 1) || version != src->version); 100 + 101 + return flags & valid_flags; 102 + } 103 + 104 + cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) 105 + { 106 + unsigned version; 107 + cycle_t ret; 108 + u64 last; 109 + u8 flags; 110 + 111 + do { 112 + version = __pvclock_read_cycles(src, &ret, &flags); 113 + } while ((src->version & 1) || version != src->version); 67 114 68 115 if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) && 69 - (shadow.flags & PVCLOCK_TSC_STABLE_BIT)) 116 + (flags & PVCLOCK_TSC_STABLE_BIT)) 70 117 return ret; 71 118 72 119 /* ··· 127 156 128 157 set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); 129 158 } 159 + 160 + static 
struct pvclock_vsyscall_time_info *pvclock_vdso_info; 161 + 162 + static struct pvclock_vsyscall_time_info * 163 + pvclock_get_vsyscall_user_time_info(int cpu) 164 + { 165 + if (!pvclock_vdso_info) { 166 + BUG(); 167 + return NULL; 168 + } 169 + 170 + return &pvclock_vdso_info[cpu]; 171 + } 172 + 173 + struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu) 174 + { 175 + return &pvclock_get_vsyscall_user_time_info(cpu)->pvti; 176 + } 177 + 178 + #ifdef CONFIG_X86_64 179 + static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l, 180 + void *v) 181 + { 182 + struct task_migration_notifier *mn = v; 183 + struct pvclock_vsyscall_time_info *pvti; 184 + 185 + pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu); 186 + 187 + /* this is NULL when pvclock vsyscall is not initialized */ 188 + if (unlikely(pvti == NULL)) 189 + return NOTIFY_DONE; 190 + 191 + pvti->migrate_count++; 192 + 193 + return NOTIFY_DONE; 194 + } 195 + 196 + static struct notifier_block pvclock_migrate = { 197 + .notifier_call = pvclock_task_migrate, 198 + }; 199 + 200 + /* 201 + * Initialize the generic pvclock vsyscall state. The caller supplies 202 + * the page(s) holding the per-vcpu pvclock information; this sets up a 203 + * fixmap mapping for the page(s) and registers the migration notifier. 204 + */ 205 + 206 + int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, 207 + int size) 208 + { 209 + int idx; 210 + 211 + WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); 212 + 213 + pvclock_vdso_info = i; 214 + 215 + for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { 216 + __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, 217 + __pa_symbol(i) + (idx*PAGE_SIZE), 218 + PAGE_KERNEL_VVAR); 219 + } 220 + 221 + 222 + register_task_migration_notifier(&pvclock_migrate); 223 + 224 + return 0; 225 + } 226 + #endif

+3

arch/x86/kvm/cpuid.c

··· 320 320 if (index == 0) { 321 321 entry->ebx &= kvm_supported_word9_x86_features; 322 322 cpuid_mask(&entry->ebx, 9); 323 + // TSC_ADJUST is emulated 324 + entry->ebx |= F(TSC_ADJUST); 323 325 } else 324 326 entry->ebx = 0; 325 327 entry->eax = 0; ··· 661 659 } else 662 660 *eax = *ebx = *ecx = *edx = 0; 663 661 } 662 + EXPORT_SYMBOL_GPL(kvm_cpuid); 664 663 665 664 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 666 665 {

+8

arch/x86/kvm/cpuid.h

··· 31 31 return best && (best->ecx & bit(X86_FEATURE_XSAVE)); 32 32 } 33 33 34 + static inline bool guest_cpuid_has_tsc_adjust(struct kvm_vcpu *vcpu) 35 + { 36 + struct kvm_cpuid_entry2 *best; 37 + 38 + best = kvm_find_cpuid_entry(vcpu, 7, 0); 39 + return best && (best->ebx & bit(X86_FEATURE_TSC_ADJUST)); 40 + } 41 + 34 42 static inline bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) 35 43 { 36 44 struct kvm_cpuid_entry2 *best;

+3 -2

arch/x86/kvm/emulate.c

··· 676 676 addr.seg); 677 677 if (!usable) 678 678 goto bad; 679 - /* code segment or read-only data segment */ 680 - if (((desc.type & 8) || !(desc.type & 2)) && write) 679 + /* code segment in protected mode or read-only data segment */ 680 + if ((((ctxt->mode != X86EMUL_MODE_REAL) && (desc.type & 8)) 681 + || !(desc.type & 2)) && write) 681 682 goto bad; 682 683 /* unreadable code segment */ 683 684 if (!fetch && (desc.type & 8) && !(desc.type & 2))

+1 -1

arch/x86/kvm/lapic.c

··· 1011 1011 local_irq_save(flags); 1012 1012 1013 1013 now = apic->lapic_timer.timer.base->get_time(); 1014 - guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); 1014 + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); 1015 1015 if (likely(tscdeadline > guest_tsc)) { 1016 1016 ns = (tscdeadline - guest_tsc) * 1000000ULL; 1017 1017 do_div(ns, this_tsc_khz);

+44 -21

arch/x86/kvm/mmu.c

··· 2382 2382 || (!vcpu->arch.mmu.direct_map && write_fault 2383 2383 && !is_write_protection(vcpu) && !user_fault)) { 2384 2384 2385 + /* 2386 + * There are two cases: 2387 + * - in one, another vcpu creates a new sp in the window 2388 + * between mapping_level() and acquiring mmu-lock. 2389 + * - in the other, the new sp is created by the vcpu itself 2390 + * (page-fault path) when the guest uses the target gfn as 2391 + * its page table. 2392 + * Both of these cases can be fixed by allowing the guest to 2393 + * retry the access: it will refault, and then we can establish 2394 + * the mapping using a small page. 2395 + */ 2385 2396 if (level > PT_PAGE_TABLE_LEVEL && 2386 - has_wrprotected_page(vcpu->kvm, gfn, level)) { 2387 - ret = 1; 2388 - drop_spte(vcpu->kvm, sptep); 2397 + has_wrprotected_page(vcpu->kvm, gfn, level)) 2389 2398 goto done; 2390 - } 2391 2399 2392 2400 spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; 2393 2401 ··· 2513 2505 mmu_free_roots(vcpu); 2514 2506 } 2515 2507 2508 + static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) 2509 + { 2510 + int bit7; 2511 + 2512 + bit7 = (gpte >> 7) & 1; 2513 + return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; 2514 + } 2515 + 2516 2516 static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn, 2517 2517 bool no_dirty_log) 2518 2518 { ··· 2531 2515 return KVM_PFN_ERR_FAULT; 2532 2516 2533 2517 return gfn_to_pfn_memslot_atomic(slot, gfn); 2518 + } 2519 + 2520 + static bool prefetch_invalid_gpte(struct kvm_vcpu *vcpu, 2521 + struct kvm_mmu_page *sp, u64 *spte, 2522 + u64 gpte) 2523 + { 2524 + if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) 2525 + goto no_present; 2526 + 2527 + if (!is_present_gpte(gpte)) 2528 + goto no_present; 2529 + 2530 + if (!(gpte & PT_ACCESSED_MASK)) 2531 + goto no_present; 2532 + 2533 + return false; 2534 + 2535 + no_present: 2536 + drop_spte(vcpu->kvm, spte); 2537 + return true; 2534 2538 } 2535 2539 2536 2540 static int 
direct_pte_prefetch_many(struct kvm_vcpu *vcpu, ··· 2707 2671 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done 2708 2672 * here. 2709 2673 */ 2710 - if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && 2674 + if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn) && 2711 2675 level == PT_PAGE_TABLE_LEVEL && 2712 2676 PageTransCompound(pfn_to_page(pfn)) && 2713 2677 !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { ··· 2735 2699 } 2736 2700 } 2737 2701 2738 - static bool mmu_invalid_pfn(pfn_t pfn) 2739 - { 2740 - return unlikely(is_invalid_pfn(pfn)); 2741 - } 2742 - 2743 2702 static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn, 2744 2703 pfn_t pfn, unsigned access, int *ret_val) 2745 2704 { 2746 2705 bool ret = true; 2747 2706 2748 2707 /* The pfn is invalid, report the error! */ 2749 - if (unlikely(is_invalid_pfn(pfn))) { 2708 + if (unlikely(is_error_pfn(pfn))) { 2750 2709 *ret_val = kvm_handle_bad_page(vcpu, gfn, pfn); 2751 2710 goto exit; 2752 2711 } ··· 2893 2862 return r; 2894 2863 2895 2864 spin_lock(&vcpu->kvm->mmu_lock); 2896 - if (mmu_notifier_retry(vcpu, mmu_seq)) 2865 + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 2897 2866 goto out_unlock; 2898 2867 kvm_mmu_free_some_pages(vcpu); 2899 2868 if (likely(!force_pt_level)) ··· 3362 3331 return r; 3363 3332 3364 3333 spin_lock(&vcpu->kvm->mmu_lock); 3365 - if (mmu_notifier_retry(vcpu, mmu_seq)) 3334 + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 3366 3335 goto out_unlock; 3367 3336 kvm_mmu_free_some_pages(vcpu); 3368 3337 if (likely(!force_pt_level)) ··· 3428 3397 static void paging_free(struct kvm_vcpu *vcpu) 3429 3398 { 3430 3399 nonpaging_free(vcpu); 3431 - } 3432 - 3433 - static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) 3434 - { 3435 - int bit7; 3436 - 3437 - bit7 = (gpte >> 7) & 1; 3438 - return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0; 3439 3400 } 3440 3401 3441 3402 static inline void protect_clean_gpte(unsigned *access, unsigned 
gpte)

+41 -74

arch/x86/kvm/paging_tmpl.h

··· 305 305 addr, access); 306 306 } 307 307 308 - static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, 309 - struct kvm_mmu_page *sp, u64 *spte, 310 - pt_element_t gpte) 308 + static bool 309 + FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 310 + u64 *spte, pt_element_t gpte, bool no_dirty_log) 311 311 { 312 - if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) 313 - goto no_present; 312 + unsigned pte_access; 313 + gfn_t gfn; 314 + pfn_t pfn; 314 315 315 - if (!is_present_gpte(gpte)) 316 - goto no_present; 316 + if (prefetch_invalid_gpte(vcpu, sp, spte, gpte)) 317 + return false; 317 318 318 - if (!(gpte & PT_ACCESSED_MASK)) 319 - goto no_present; 319 + pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 320 320 321 - return false; 321 + gfn = gpte_to_gfn(gpte); 322 + pte_access = sp->role.access & gpte_access(vcpu, gpte); 323 + protect_clean_gpte(&pte_access, gpte); 324 + pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, 325 + no_dirty_log && (pte_access & ACC_WRITE_MASK)); 326 + if (is_error_pfn(pfn)) 327 + return false; 322 328 323 - no_present: 324 - drop_spte(vcpu->kvm, spte); 329 + /* 330 + * we call mmu_set_spte() with host_writable = true because 331 + * pte_prefetch_gfn_to_pfn always gets a writable pfn. 
332 + */ 333 + mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 334 + NULL, PT_PAGE_TABLE_LEVEL, gfn, pfn, true, true); 335 + 325 336 return true; 326 337 } 327 338 328 339 static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 329 340 u64 *spte, const void *pte) 330 341 { 331 - pt_element_t gpte; 332 - unsigned pte_access; 333 - pfn_t pfn; 342 + pt_element_t gpte = *(const pt_element_t *)pte; 334 343 335 - gpte = *(const pt_element_t *)pte; 336 - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) 337 - return; 338 - 339 - pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 340 - pte_access = sp->role.access & gpte_access(vcpu, gpte); 341 - protect_clean_gpte(&pte_access, gpte); 342 - pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte)); 343 - if (mmu_invalid_pfn(pfn)) 344 - return; 345 - 346 - /* 347 - * we call mmu_set_spte() with host_writable = true because that 348 - * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). 
349 - */ 350 - mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 351 - NULL, PT_PAGE_TABLE_LEVEL, 352 - gpte_to_gfn(gpte), pfn, true, true); 344 + FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false); 353 345 } 354 346 355 347 static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, ··· 387 395 spte = sp->spt + i; 388 396 389 397 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { 390 - pt_element_t gpte; 391 - unsigned pte_access; 392 - gfn_t gfn; 393 - pfn_t pfn; 394 - 395 398 if (spte == sptep) 396 399 continue; 397 400 398 401 if (is_shadow_present_pte(*spte)) 399 402 continue; 400 403 401 - gpte = gptep[i]; 402 - 403 - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) 404 - continue; 405 - 406 - pte_access = sp->role.access & gpte_access(vcpu, gpte); 407 - protect_clean_gpte(&pte_access, gpte); 408 - gfn = gpte_to_gfn(gpte); 409 - pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, 410 - pte_access & ACC_WRITE_MASK); 411 - if (mmu_invalid_pfn(pfn)) 404 + if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true)) 412 405 break; 413 - 414 - mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 415 - NULL, PT_PAGE_TABLE_LEVEL, gfn, 416 - pfn, true, true); 417 406 } 418 407 } 419 408 420 409 /* 421 410 * Fetch a shadow pte for a specific level in the paging hierarchy. 411 + * If the guest tries to write a write-protected page, we need to 412 + * emulate this operation, return 1 to indicate this case. 
422 413 */ 423 - static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 414 + static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 424 415 struct guest_walker *gw, 425 416 int user_fault, int write_fault, int hlevel, 426 - int *emulate, pfn_t pfn, bool map_writable, 427 - bool prefault) 417 + pfn_t pfn, bool map_writable, bool prefault) 428 418 { 429 - unsigned access = gw->pt_access; 430 419 struct kvm_mmu_page *sp = NULL; 431 - int top_level; 432 - unsigned direct_access; 433 420 struct kvm_shadow_walk_iterator it; 421 + unsigned direct_access, access = gw->pt_access; 422 + int top_level, emulate = 0; 434 423 435 424 if (!is_present_gpte(gw->ptes[gw->level - 1])) 436 - return NULL; 425 + return 0; 437 426 438 427 direct_access = gw->pte_access; 439 428 ··· 478 505 479 506 clear_sp_write_flooding_count(it.sptep); 480 507 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access, 481 - user_fault, write_fault, emulate, it.level, 508 + user_fault, write_fault, &emulate, it.level, 482 509 gw->gfn, pfn, prefault, map_writable); 483 510 FNAME(pte_prefetch)(vcpu, gw, it.sptep); 484 511 485 - return it.sptep; 512 + return emulate; 486 513 487 514 out_gpte_changed: 488 515 if (sp) 489 516 kvm_mmu_put_page(sp, it.sptep); 490 517 kvm_release_pfn_clean(pfn); 491 - return NULL; 518 + return 0; 492 519 } 493 520 494 521 /* ··· 511 538 int write_fault = error_code & PFERR_WRITE_MASK; 512 539 int user_fault = error_code & PFERR_USER_MASK; 513 540 struct guest_walker walker; 514 - u64 *sptep; 515 - int emulate = 0; 516 541 int r; 517 542 pfn_t pfn; 518 543 int level = PT_PAGE_TABLE_LEVEL; ··· 565 594 return r; 566 595 567 596 spin_lock(&vcpu->kvm->mmu_lock); 568 - if (mmu_notifier_retry(vcpu, mmu_seq)) 597 + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) 569 598 goto out_unlock; 570 599 571 600 kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT); 572 601 kvm_mmu_free_some_pages(vcpu); 573 602 if (!force_pt_level) 574 603 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level); 575 - 
sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 576 - level, &emulate, pfn, map_writable, prefault); 577 - (void)sptep; 578 - pgprintk("%s: shadow pte %p %llx emulate %d\n", __func__, 579 - sptep, *sptep, emulate); 580 - 604 + r = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 605 + level, pfn, map_writable, prefault); 581 606 ++vcpu->stat.pf_fixed; 582 607 kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT); 583 608 spin_unlock(&vcpu->kvm->mmu_lock); 584 609 585 - return emulate; 610 + return r; 586 611 587 612 out_unlock: 588 613 spin_unlock(&vcpu->kvm->mmu_lock); ··· 724 757 sizeof(pt_element_t))) 725 758 return -EINVAL; 726 759 727 - if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) { 760 + if (prefetch_invalid_gpte(vcpu, sp, &sp->spt[i], gpte)) { 728 761 vcpu->kvm->tlbs_dirty++; 729 762 continue; 730 763 }

+28 -20

arch/x86/kvm/svm.c

··· 20 20 #include "mmu.h" 21 21 #include "kvm_cache_regs.h" 22 22 #include "x86.h" 23 + #include "cpuid.h" 23 24 24 25 #include <linux/module.h> 25 26 #include <linux/mod_devicetable.h> ··· 631 630 return -EBUSY; 632 631 633 632 if (!has_svm()) { 634 - printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n", 635 - me); 633 + pr_err("%s: err EOPNOTSUPP on %d\n", __func__, me); 636 634 return -EINVAL; 637 635 } 638 636 sd = per_cpu(svm_data, me); 639 - 640 637 if (!sd) { 641 - printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n", 642 - me); 638 + pr_err("%s: svm_data is NULL on %d\n", __func__, me); 643 639 return -EINVAL; 644 640 } 645 641 ··· 1010 1012 svm->tsc_ratio = ratio; 1011 1013 } 1012 1014 1015 + static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu) 1016 + { 1017 + struct vcpu_svm *svm = to_svm(vcpu); 1018 + 1019 + return svm->vmcb->control.tsc_offset; 1020 + } 1021 + 1013 1022 static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) 1014 1023 { 1015 1024 struct vcpu_svm *svm = to_svm(vcpu); ··· 1194 1189 static int svm_vcpu_reset(struct kvm_vcpu *vcpu) 1195 1190 { 1196 1191 struct vcpu_svm *svm = to_svm(vcpu); 1192 + u32 dummy; 1193 + u32 eax = 1; 1197 1194 1198 1195 init_vmcb(svm); 1199 1196 ··· 1204 1197 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; 1205 1198 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; 1206 1199 } 1207 - vcpu->arch.regs_avail = ~0; 1208 - vcpu->arch.regs_dirty = ~0; 1200 + 1201 + kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); 1202 + kvm_register_write(vcpu, VCPU_REGS_RDX, eax); 1209 1203 1210 1204 return 0; 1211 1205 } ··· 1262 1254 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; 1263 1255 svm->asid_generation = 0; 1264 1256 init_vmcb(svm); 1265 - kvm_write_tsc(&svm->vcpu, 0); 1266 - 1267 - err = fx_init(&svm->vcpu); 1268 - if (err) 1269 - goto free_page4; 1270 1257 1271 1258 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 1272 1259 if 
(kvm_vcpu_is_bsp(&svm->vcpu)) ··· 1271 1268 1272 1269 return &svm->vcpu; 1273 1270 1274 - free_page4: 1275 - __free_page(hsave_page); 1276 1271 free_page3: 1277 1272 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); 1278 1273 free_page2: ··· 3009 3008 return 0; 3010 3009 } 3011 3010 3012 - u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu) 3011 + u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) 3013 3012 { 3014 3013 struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); 3015 3014 return vmcb->control.tsc_offset + 3016 - svm_scale_tsc(vcpu, native_read_tsc()); 3015 + svm_scale_tsc(vcpu, host_tsc); 3017 3016 } 3018 3017 3019 3018 static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) ··· 3132 3131 return 0; 3133 3132 } 3134 3133 3135 - static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) 3134 + static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) 3136 3135 { 3137 3136 struct vcpu_svm *svm = to_svm(vcpu); 3138 3137 3138 + u32 ecx = msr->index; 3139 + u64 data = msr->data; 3139 3140 switch (ecx) { 3140 3141 case MSR_IA32_TSC: 3141 - kvm_write_tsc(vcpu, data); 3142 + kvm_write_tsc(vcpu, msr); 3142 3143 break; 3143 3144 case MSR_STAR: 3144 3145 svm->vmcb->save.star = data; ··· 3195 3192 vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); 3196 3193 break; 3197 3194 default: 3198 - return kvm_set_msr_common(vcpu, ecx, data); 3195 + return kvm_set_msr_common(vcpu, msr); 3199 3196 } 3200 3197 return 0; 3201 3198 } 3202 3199 3203 3200 static int wrmsr_interception(struct vcpu_svm *svm) 3204 3201 { 3202 + struct msr_data msr; 3205 3203 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 3206 3204 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) 3207 3205 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); 3208 3206 3207 + msr.data = data; 3208 + msr.index = ecx; 3209 + msr.host_initiated = false; 3209 3210 3210 3211 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 3211 - if (svm_set_msr(&svm->vcpu, ecx, 
data)) { 3212 + if (svm_set_msr(&svm->vcpu, &msr)) { 3212 3213 trace_kvm_msr_write_ex(ecx, data); 3213 3214 kvm_inject_gp(&svm->vcpu, 0); 3214 3215 } else { ··· 4309 4302 .has_wbinvd_exit = svm_has_wbinvd_exit, 4310 4303 4311 4304 .set_tsc_khz = svm_set_tsc_khz, 4305 + .read_tsc_offset = svm_read_tsc_offset, 4312 4306 .write_tsc_offset = svm_write_tsc_offset, 4313 4307 .adjust_tsc_offset = svm_adjust_tsc_offset, 4314 4308 .compute_tsc_offset = svm_compute_tsc_offset,

+63

arch/x86/kvm/trace.h

··· 4 4 #include <linux/tracepoint.h> 5 5 #include <asm/vmx.h> 6 6 #include <asm/svm.h> 7 + #include <asm/clocksource.h> 7 8 8 9 #undef TRACE_SYSTEM 9 10 #define TRACE_SYSTEM kvm ··· 755 754 __entry->write ? "Write" : "Read", 756 755 __entry->gpa_match ? "GPA" : "GVA") 757 756 ); 757 + 758 + #ifdef CONFIG_X86_64 759 + 760 + #define host_clocks \ 761 + {VCLOCK_NONE, "none"}, \ 762 + {VCLOCK_TSC, "tsc"}, \ 763 + {VCLOCK_HPET, "hpet"} \ 764 + 765 + TRACE_EVENT(kvm_update_master_clock, 766 + TP_PROTO(bool use_master_clock, unsigned int host_clock, bool offset_matched), 767 + TP_ARGS(use_master_clock, host_clock, offset_matched), 768 + 769 + TP_STRUCT__entry( 770 + __field( bool, use_master_clock ) 771 + __field( unsigned int, host_clock ) 772 + __field( bool, offset_matched ) 773 + ), 774 + 775 + TP_fast_assign( 776 + __entry->use_master_clock = use_master_clock; 777 + __entry->host_clock = host_clock; 778 + __entry->offset_matched = offset_matched; 779 + ), 780 + 781 + TP_printk("masterclock %d hostclock %s offsetmatched %u", 782 + __entry->use_master_clock, 783 + __print_symbolic(__entry->host_clock, host_clocks), 784 + __entry->offset_matched) 785 + ); 786 + 787 + TRACE_EVENT(kvm_track_tsc, 788 + TP_PROTO(unsigned int vcpu_id, unsigned int nr_matched, 789 + unsigned int online_vcpus, bool use_master_clock, 790 + unsigned int host_clock), 791 + TP_ARGS(vcpu_id, nr_matched, online_vcpus, use_master_clock, 792 + host_clock), 793 + 794 + TP_STRUCT__entry( 795 + __field( unsigned int, vcpu_id ) 796 + __field( unsigned int, nr_vcpus_matched_tsc ) 797 + __field( unsigned int, online_vcpus ) 798 + __field( bool, use_master_clock ) 799 + __field( unsigned int, host_clock ) 800 + ), 801 + 802 + TP_fast_assign( 803 + __entry->vcpu_id = vcpu_id; 804 + __entry->nr_vcpus_matched_tsc = nr_matched; 805 + __entry->online_vcpus = online_vcpus; 806 + __entry->use_master_clock = use_master_clock; 807 + __entry->host_clock = host_clock; 808 + ), 809 + 810 + TP_printk("vcpu_id %u 
masterclock %u offsetmatched %u nr_online %u" 811 + " hostclock %s", 812 + __entry->vcpu_id, __entry->use_master_clock, 813 + __entry->nr_vcpus_matched_tsc, __entry->online_vcpus, 814 + __print_symbolic(__entry->host_clock, host_clocks)) 815 + ); 816 + 817 + #endif /* CONFIG_X86_64 */ 818 + 758 819 #endif /* _TRACE_KVM_H */ 759 820 760 821 #undef TRACE_INCLUDE_PATH

+145 -58

arch/x86/kvm/vmx.c

··· 42 42 #include <asm/i387.h> 43 43 #include <asm/xcr.h> 44 44 #include <asm/perf_event.h> 45 + #include <asm/kexec.h> 45 46 46 47 #include "trace.h" 47 48 ··· 803 802 return vmx_capability.ept & VMX_EPT_AD_BIT; 804 803 } 805 804 806 - static inline bool cpu_has_vmx_invept_individual_addr(void) 807 - { 808 - return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; 809 - } 810 - 811 805 static inline bool cpu_has_vmx_invept_context(void) 812 806 { 813 807 return vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT; ··· 988 992 vmcs, phys_addr); 989 993 } 990 994 995 + #ifdef CONFIG_KEXEC 996 + /* 997 + * This bitmap is used to indicate whether the vmclear 998 + * operation is enabled on all cpus. All disabled by 999 + * default. 1000 + */ 1001 + static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE; 1002 + 1003 + static inline void crash_enable_local_vmclear(int cpu) 1004 + { 1005 + cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap); 1006 + } 1007 + 1008 + static inline void crash_disable_local_vmclear(int cpu) 1009 + { 1010 + cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap); 1011 + } 1012 + 1013 + static inline int crash_local_vmclear_enabled(int cpu) 1014 + { 1015 + return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap); 1016 + } 1017 + 1018 + static void crash_vmclear_local_loaded_vmcss(void) 1019 + { 1020 + int cpu = raw_smp_processor_id(); 1021 + struct loaded_vmcs *v; 1022 + 1023 + if (!crash_local_vmclear_enabled(cpu)) 1024 + return; 1025 + 1026 + list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu), 1027 + loaded_vmcss_on_cpu_link) 1028 + vmcs_clear(v->vmcs); 1029 + } 1030 + #else 1031 + static inline void crash_enable_local_vmclear(int cpu) { } 1032 + static inline void crash_disable_local_vmclear(int cpu) { } 1033 + #endif /* CONFIG_KEXEC */ 1034 + 991 1035 static void __loaded_vmcs_clear(void *arg) 992 1036 { 993 1037 struct loaded_vmcs *loaded_vmcs = arg; ··· 1037 1001 return; /* vcpu migration can race with cpu offline */ 1038 
1002 if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs) 1039 1003 per_cpu(current_vmcs, cpu) = NULL; 1004 + crash_disable_local_vmclear(cpu); 1040 1005 list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link); 1006 + 1007 + /* 1008 + * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link 1009 + * is before setting loaded_vmcs->cpu to -1 which is done in 1010 + * loaded_vmcs_init. Otherwise, other cpu can see cpu = -1 first 1011 + * then adds the vmcs into percpu list before it is deleted. 1012 + */ 1013 + smp_wmb(); 1014 + 1041 1015 loaded_vmcs_init(loaded_vmcs); 1016 + crash_enable_local_vmclear(cpu); 1042 1017 } 1043 1018 1044 1019 static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs) 1045 1020 { 1046 - if (loaded_vmcs->cpu != -1) 1047 - smp_call_function_single( 1048 - loaded_vmcs->cpu, __loaded_vmcs_clear, loaded_vmcs, 1); 1021 + int cpu = loaded_vmcs->cpu; 1022 + 1023 + if (cpu != -1) 1024 + smp_call_function_single(cpu, 1025 + __loaded_vmcs_clear, loaded_vmcs, 1); 1049 1026 } 1050 1027 1051 1028 static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) ··· 1097 1048 __invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0); 1098 1049 else 1099 1050 ept_sync_global(); 1100 - } 1101 - } 1102 - 1103 - static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa) 1104 - { 1105 - if (enable_ept) { 1106 - if (cpu_has_vmx_invept_individual_addr()) 1107 - __invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR, 1108 - eptp, gpa); 1109 - else 1110 - ept_sync_context(eptp); 1111 1051 } 1112 1052 } 1113 1053 ··· 1573 1535 1574 1536 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 1575 1537 local_irq_disable(); 1538 + crash_disable_local_vmclear(cpu); 1539 + 1540 + /* 1541 + * Read loaded_vmcs->cpu should be before fetching 1542 + * loaded_vmcs->loaded_vmcss_on_cpu_link. 1543 + * See the comments in __loaded_vmcs_clear(). 
1544 + */ 1545 + smp_rmb(); 1546 + 1576 1547 list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link, 1577 1548 &per_cpu(loaded_vmcss_on_cpu, cpu)); 1549 + crash_enable_local_vmclear(cpu); 1578 1550 local_irq_enable(); 1579 1551 1580 1552 /* ··· 1887 1839 * Like guest_read_tsc, but always returns L1's notion of the timestamp 1888 1840 * counter, even if a nested guest (L2) is currently running. 1889 1841 */ 1890 - u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu) 1842 + u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) 1891 1843 { 1892 - u64 host_tsc, tsc_offset; 1844 + u64 tsc_offset; 1893 1845 1894 - rdtscll(host_tsc); 1895 1846 tsc_offset = is_guest_mode(vcpu) ? 1896 1847 to_vmx(vcpu)->nested.vmcs01_tsc_offset : 1897 1848 vmcs_read64(TSC_OFFSET); ··· 1911 1864 vcpu->arch.tsc_always_catchup = 1; 1912 1865 } else 1913 1866 WARN(1, "user requested TSC rate below hardware speed\n"); 1867 + } 1868 + 1869 + static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu) 1870 + { 1871 + return vmcs_read64(TSC_OFFSET); 1914 1872 } 1915 1873 1916 1874 /* ··· 2254 2202 * Returns 0 on success, non-0 otherwise. 2255 2203 * Assumes vcpu_load() was already called. 
2256 2204 */ 2257 - static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 2205 + static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 2258 2206 { 2259 2207 struct vcpu_vmx *vmx = to_vmx(vcpu); 2260 2208 struct shared_msr_entry *msr; 2261 2209 int ret = 0; 2210 + u32 msr_index = msr_info->index; 2211 + u64 data = msr_info->data; 2262 2212 2263 2213 switch (msr_index) { 2264 2214 case MSR_EFER: 2265 - ret = kvm_set_msr_common(vcpu, msr_index, data); 2215 + ret = kvm_set_msr_common(vcpu, msr_info); 2266 2216 break; 2267 2217 #ifdef CONFIG_X86_64 2268 2218 case MSR_FS_BASE: ··· 2290 2236 vmcs_writel(GUEST_SYSENTER_ESP, data); 2291 2237 break; 2292 2238 case MSR_IA32_TSC: 2293 - kvm_write_tsc(vcpu, data); 2239 + kvm_write_tsc(vcpu, msr_info); 2294 2240 break; 2295 2241 case MSR_IA32_CR_PAT: 2296 2242 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { ··· 2298 2244 vcpu->arch.pat = data; 2299 2245 break; 2300 2246 } 2301 - ret = kvm_set_msr_common(vcpu, msr_index, data); 2247 + ret = kvm_set_msr_common(vcpu, msr_info); 2248 + break; 2249 + case MSR_IA32_TSC_ADJUST: 2250 + ret = kvm_set_msr_common(vcpu, msr_info); 2302 2251 break; 2303 2252 case MSR_TSC_AUX: 2304 2253 if (!vmx->rdtscp_enabled) ··· 2324 2267 } 2325 2268 break; 2326 2269 } 2327 - ret = kvm_set_msr_common(vcpu, msr_index, data); 2270 + ret = kvm_set_msr_common(vcpu, msr_info); 2328 2271 } 2329 2272 2330 2273 return ret; ··· 2398 2341 return -EBUSY; 2399 2342 2400 2343 INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu)); 2344 + 2345 + /* 2346 + * Now we can enable the vmclear operation in kdump 2347 + * since the loaded_vmcss_on_cpu list on this cpu 2348 + * has been initialized. 2349 + * 2350 + * Though the cpu is not in VMX operation now, there 2351 + * is no problem to enable the vmclear operation 2352 + * for the loaded_vmcss_on_cpu list is empty! 
2353 + */ 2354 + crash_enable_local_vmclear(cpu); 2355 + 2401 2356 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 2402 2357 2403 2358 test_bits = FEATURE_CONTROL_LOCKED; ··· 2766 2697 if (!(vmcs_readl(sf->base) == tmp.base && tmp.s)) { 2767 2698 tmp.base = vmcs_readl(sf->base); 2768 2699 tmp.selector = vmcs_read16(sf->selector); 2700 + tmp.dpl = tmp.selector & SELECTOR_RPL_MASK; 2769 2701 tmp.s = 1; 2770 2702 } 2771 2703 vmx_set_segment(vcpu, &tmp, seg); ··· 3316 3246 * unrestricted guest like Westmere to older host that don't have 3317 3247 * unrestricted guest like Nehelem. 3318 3248 */ 3319 - if (!enable_unrestricted_guest && vmx->rmode.vm86_active) { 3249 + if (vmx->rmode.vm86_active) { 3320 3250 switch (seg) { 3321 3251 case VCPU_SREG_CS: 3322 3252 vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); ··· 3967 3897 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 3968 3898 set_cr4_guest_host_mask(vmx); 3969 3899 3970 - kvm_write_tsc(&vmx->vcpu, 0); 3971 - 3972 3900 return 0; 3973 3901 } 3974 3902 ··· 3975 3907 struct vcpu_vmx *vmx = to_vmx(vcpu); 3976 3908 u64 msr; 3977 3909 int ret; 3978 - 3979 - vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); 3980 3910 3981 3911 vmx->rmode.vm86_active = 0; 3982 3912 ··· 3986 3920 if (kvm_vcpu_is_bsp(&vmx->vcpu)) 3987 3921 msr |= MSR_IA32_APICBASE_BSP; 3988 3922 kvm_set_apic_base(&vmx->vcpu, msr); 3989 - 3990 - ret = fx_init(&vmx->vcpu); 3991 - if (ret != 0) 3992 - goto out; 3993 3923 3994 3924 vmx_segment_cache_clear(vmx); 3995 3925 ··· 4027 3965 kvm_rip_write(vcpu, 0xfff0); 4028 3966 else 4029 3967 kvm_rip_write(vcpu, 0); 4030 - kvm_register_write(vcpu, VCPU_REGS_RSP, 0); 4031 3968 4032 3969 vmcs_writel(GUEST_GDTR_BASE, 0); 4033 3970 vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); ··· 4076 4015 /* HACK: Don't enable emulation on guest boot/reset */ 4077 4016 vmx->emulation_required = 0; 4078 4017 4079 - out: 4080 4018 return ret; 4081 4019 } 4082 4020 ··· 4347 4287 if (is_machine_check(intr_info)) 4348 4288 return 
handle_machine_check(vcpu); 4349 4289 4350 - if ((vect_info & VECTORING_INFO_VALID_MASK) && 4351 - !is_page_fault(intr_info)) { 4352 - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 4353 - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; 4354 - vcpu->run->internal.ndata = 2; 4355 - vcpu->run->internal.data[0] = vect_info; 4356 - vcpu->run->internal.data[1] = intr_info; 4357 - return 0; 4358 - } 4359 - 4360 4290 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) 4361 4291 return 1; /* already handled by vmx_vcpu_run() */ 4362 4292 ··· 4365 4315 error_code = 0; 4366 4316 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 4367 4317 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 4318 + 4319 + /* 4320 + * The #PF with PFEC.RSVD = 1 indicates the guest is accessing 4321 + * MMIO, it is better to report an internal error. 4322 + * See the comments in vmx_handle_exit. 4323 + */ 4324 + if ((vect_info & VECTORING_INFO_VALID_MASK) && 4325 + !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) { 4326 + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 4327 + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; 4328 + vcpu->run->internal.ndata = 2; 4329 + vcpu->run->internal.data[0] = vect_info; 4330 + vcpu->run->internal.data[1] = intr_info; 4331 + return 0; 4332 + } 4333 + 4368 4334 if (is_page_fault(intr_info)) { 4369 4335 /* EPT won't cause page fault directly */ 4370 4336 BUG_ON(enable_ept); ··· 4692 4626 4693 4627 static int handle_wrmsr(struct kvm_vcpu *vcpu) 4694 4628 { 4629 + struct msr_data msr; 4695 4630 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 4696 4631 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) 4697 4632 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); 4698 4633 4699 - if (vmx_set_msr(vcpu, ecx, data) != 0) { 4634 + msr.data = data; 4635 + msr.index = ecx; 4636 + msr.host_initiated = false; 4637 + if (vmx_set_msr(vcpu, &msr) != 0) { 4700 4638 trace_kvm_msr_write_ex(ecx, data); 4701 4639 kvm_inject_gp(vcpu, 
0); 4702 4640 return 1; ··· 4896 4826 int gla_validity; 4897 4827 4898 4828 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 4899 - 4900 - if (exit_qualification & (1 << 6)) { 4901 - printk(KERN_ERR "EPT: GPA exceeds GAW!\n"); 4902 - return -EINVAL; 4903 - } 4904 4829 4905 4830 gla_validity = (exit_qualification >> 7) & 0x3; 4906 4831 if (gla_validity != 0x3 && gla_validity != 0x1 && gla_validity != 0) { ··· 6044 5979 return 0; 6045 5980 } 6046 5981 5982 + /* 5983 + * Note: 5984 + * Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by 5985 + * delivery event since it indicates guest is accessing MMIO. 5986 + * The vm-exit can be triggered again after return to guest that 5987 + * will cause infinite loop. 5988 + */ 6047 5989 if ((vectoring_info & VECTORING_INFO_VALID_MASK) && 6048 5990 (exit_reason != EXIT_REASON_EXCEPTION_NMI && 6049 5991 exit_reason != EXIT_REASON_EPT_VIOLATION && 6050 - exit_reason != EXIT_REASON_TASK_SWITCH)) 6051 - printk(KERN_WARNING "%s: unexpected, valid vectoring info " 6052 - "(0x%x) and exit reason is 0x%x\n", 6053 - __func__, vectoring_info, exit_reason); 5992 + exit_reason != EXIT_REASON_TASK_SWITCH)) { 5993 + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 5994 + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV; 5995 + vcpu->run->internal.ndata = 2; 5996 + vcpu->run->internal.data[0] = vectoring_info; 5997 + vcpu->run->internal.data[1] = exit_reason; 5998 + return 0; 5999 + } 6054 6000 6055 6001 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && 6056 6002 !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( ··· 7385 7309 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, 7386 7310 7387 7311 .set_tsc_khz = vmx_set_tsc_khz, 7312 + .read_tsc_offset = vmx_read_tsc_offset, 7388 7313 .write_tsc_offset = vmx_write_tsc_offset, 7389 7314 .adjust_tsc_offset = vmx_adjust_tsc_offset, 7390 7315 .compute_tsc_offset = vmx_compute_tsc_offset, ··· 7444 7367 if (r) 7445 7368 goto out3; 7446 7369 7370 + #ifdef 
CONFIG_KEXEC 7371 + rcu_assign_pointer(crash_vmclear_loaded_vmcss, 7372 + crash_vmclear_local_loaded_vmcss); 7373 + #endif 7374 + 7447 7375 vmx_disable_intercept_for_msr(MSR_FS_BASE, false); 7448 7376 vmx_disable_intercept_for_msr(MSR_GS_BASE, false); 7449 7377 vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); ··· 7485 7403 free_page((unsigned long)vmx_msr_bitmap_longmode); 7486 7404 free_page((unsigned long)vmx_io_bitmap_b); 7487 7405 free_page((unsigned long)vmx_io_bitmap_a); 7406 + 7407 + #ifdef CONFIG_KEXEC 7408 + rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL); 7409 + synchronize_rcu(); 7410 + #endif 7488 7411 7489 7412 kvm_exit(); 7490 7413 }

+472 -76

arch/x86/kvm/x86.c

··· 46 46 #include <linux/uaccess.h> 47 47 #include <linux/hash.h> 48 48 #include <linux/pci.h> 49 + #include <linux/timekeeper_internal.h> 50 + #include <linux/pvclock_gtod.h> 49 51 #include <trace/events/kvm.h> 50 52 51 53 #define CREATE_TRACE_POINTS ··· 160 158 161 159 u64 __read_mostly host_xcr0; 162 160 163 - int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); 161 + static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); 162 + 163 + static int kvm_vcpu_reset(struct kvm_vcpu *vcpu); 164 164 165 165 static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) 166 166 { ··· 637 633 } 638 634 639 635 if (is_long_mode(vcpu)) { 640 - if (kvm_read_cr4(vcpu) & X86_CR4_PCIDE) { 636 + if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) { 641 637 if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) 642 638 return 1; 643 639 } else ··· 831 827 static unsigned num_msrs_to_save; 832 828 833 829 static const u32 emulated_msrs[] = { 830 + MSR_IA32_TSC_ADJUST, 834 831 MSR_IA32_TSCDEADLINE, 835 832 MSR_IA32_MISC_ENABLE, 836 833 MSR_IA32_MCG_STATUS, ··· 891 886 * Returns 0 on success, non-0 otherwise. 892 887 * Assumes vcpu_load() was already called. 
893 888 */ 894 - int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 889 + int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) 895 890 { 896 - return kvm_x86_ops->set_msr(vcpu, msr_index, data); 891 + return kvm_x86_ops->set_msr(vcpu, msr); 897 892 } 898 893 899 894 /* ··· 901 896 */ 902 897 static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) 903 898 { 904 - return kvm_set_msr(vcpu, index, *data); 899 + struct msr_data msr; 900 + 901 + msr.data = *data; 902 + msr.index = index; 903 + msr.host_initiated = true; 904 + return kvm_set_msr(vcpu, &msr); 905 905 } 906 + 907 + #ifdef CONFIG_X86_64 908 + struct pvclock_gtod_data { 909 + seqcount_t seq; 910 + 911 + struct { /* extract of a clocksource struct */ 912 + int vclock_mode; 913 + cycle_t cycle_last; 914 + cycle_t mask; 915 + u32 mult; 916 + u32 shift; 917 + } clock; 918 + 919 + /* open coded 'struct timespec' */ 920 + u64 monotonic_time_snsec; 921 + time_t monotonic_time_sec; 922 + }; 923 + 924 + static struct pvclock_gtod_data pvclock_gtod_data; 925 + 926 + static void update_pvclock_gtod(struct timekeeper *tk) 927 + { 928 + struct pvclock_gtod_data *vdata = &pvclock_gtod_data; 929 + 930 + write_seqcount_begin(&vdata->seq); 931 + 932 + /* copy pvclock gtod data */ 933 + vdata->clock.vclock_mode = tk->clock->archdata.vclock_mode; 934 + vdata->clock.cycle_last = tk->clock->cycle_last; 935 + vdata->clock.mask = tk->clock->mask; 936 + vdata->clock.mult = tk->mult; 937 + vdata->clock.shift = tk->shift; 938 + 939 + vdata->monotonic_time_sec = tk->xtime_sec 940 + + tk->wall_to_monotonic.tv_sec; 941 + vdata->monotonic_time_snsec = tk->xtime_nsec 942 + + (tk->wall_to_monotonic.tv_nsec 943 + << tk->shift); 944 + while (vdata->monotonic_time_snsec >= 945 + (((u64)NSEC_PER_SEC) << tk->shift)) { 946 + vdata->monotonic_time_snsec -= 947 + ((u64)NSEC_PER_SEC) << tk->shift; 948 + vdata->monotonic_time_sec++; 949 + } 950 + 951 + write_seqcount_end(&vdata->seq); 952 + } 953 + #endif 954 + 
906 955 907 956 static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) 908 957 { ··· 1054 995 return timespec_to_ns(&ts); 1055 996 } 1056 997 998 + #ifdef CONFIG_X86_64 999 + static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); 1000 + #endif 1001 + 1057 1002 static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 1058 1003 unsigned long max_tsc_khz; 1059 1004 ··· 1109 1046 return tsc; 1110 1047 } 1111 1048 1112 - void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) 1049 + void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) 1050 + { 1051 + #ifdef CONFIG_X86_64 1052 + bool vcpus_matched; 1053 + bool do_request = false; 1054 + struct kvm_arch *ka = &vcpu->kvm->arch; 1055 + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; 1056 + 1057 + vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == 1058 + atomic_read(&vcpu->kvm->online_vcpus)); 1059 + 1060 + if (vcpus_matched && gtod->clock.vclock_mode == VCLOCK_TSC) 1061 + if (!ka->use_master_clock) 1062 + do_request = 1; 1063 + 1064 + if (!vcpus_matched && ka->use_master_clock) 1065 + do_request = 1; 1066 + 1067 + if (do_request) 1068 + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); 1069 + 1070 + trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc, 1071 + atomic_read(&vcpu->kvm->online_vcpus), 1072 + ka->use_master_clock, gtod->clock.vclock_mode); 1073 + #endif 1074 + } 1075 + 1076 + static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) 1077 + { 1078 + u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu); 1079 + vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; 1080 + } 1081 + 1082 + void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) 1113 1083 { 1114 1084 struct kvm *kvm = vcpu->kvm; 1115 1085 u64 offset, ns, elapsed; 1116 1086 unsigned long flags; 1117 1087 s64 usdiff; 1088 + bool matched; 1089 + u64 data = msr->data; 1118 1090 1119 1091 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); 1120 1092 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); ··· 
1192 1094 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); 1193 1095 pr_debug("kvm: adjusted tsc offset by %llu\n", delta); 1194 1096 } 1097 + matched = true; 1195 1098 } else { 1196 1099 /* 1197 1100 * We split periods of matched TSC writes into generations. ··· 1207 1108 kvm->arch.cur_tsc_nsec = ns; 1208 1109 kvm->arch.cur_tsc_write = data; 1209 1110 kvm->arch.cur_tsc_offset = offset; 1111 + matched = false; 1210 1112 pr_debug("kvm: new tsc generation %u, clock %llu\n", 1211 1113 kvm->arch.cur_tsc_generation, data); 1212 1114 } ··· 1229 1129 vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; 1230 1130 vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; 1231 1131 1132 + if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated) 1133 + update_ia32_tsc_adjust_msr(vcpu, offset); 1232 1134 kvm_x86_ops->write_tsc_offset(vcpu, offset); 1233 1135 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); 1136 + 1137 + spin_lock(&kvm->arch.pvclock_gtod_sync_lock); 1138 + if (matched) 1139 + kvm->arch.nr_vcpus_matched_tsc++; 1140 + else 1141 + kvm->arch.nr_vcpus_matched_tsc = 0; 1142 + 1143 + kvm_track_tsc_matching(vcpu); 1144 + spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); 1234 1145 } 1235 1146 1236 1147 EXPORT_SYMBOL_GPL(kvm_write_tsc); 1237 1148 1149 + #ifdef CONFIG_X86_64 1150 + 1151 + static cycle_t read_tsc(void) 1152 + { 1153 + cycle_t ret; 1154 + u64 last; 1155 + 1156 + /* 1157 + * Empirically, a fence (of type that depends on the CPU) 1158 + * before rdtsc is enough to ensure that rdtsc is ordered 1159 + * with respect to loads. The various CPU manuals are unclear 1160 + * as to whether rdtsc can be reordered with later loads, 1161 + * but no one has ever seen it happen. 
1162 + */ 1163 + rdtsc_barrier(); 1164 + ret = (cycle_t)vget_cycles(); 1165 + 1166 + last = pvclock_gtod_data.clock.cycle_last; 1167 + 1168 + if (likely(ret >= last)) 1169 + return ret; 1170 + 1171 + /* 1172 + * GCC likes to generate cmov here, but this branch is extremely 1173 + * predictable (it's just a function of time and the likely is 1174 + * very likely) and there's a data dependence, so force GCC 1175 + * to generate a branch instead. I don't barrier() because 1176 + * we don't actually need a barrier, and if this function 1177 + * ever gets inlined it will generate worse code. 1178 + */ 1179 + asm volatile (""); 1180 + return last; 1181 + } 1182 + 1183 + static inline u64 vgettsc(cycle_t *cycle_now) 1184 + { 1185 + long v; 1186 + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; 1187 + 1188 + *cycle_now = read_tsc(); 1189 + 1190 + v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask; 1191 + return v * gtod->clock.mult; 1192 + } 1193 + 1194 + static int do_monotonic(struct timespec *ts, cycle_t *cycle_now) 1195 + { 1196 + unsigned long seq; 1197 + u64 ns; 1198 + int mode; 1199 + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; 1200 + 1201 + ts->tv_nsec = 0; 1202 + do { 1203 + seq = read_seqcount_begin(&gtod->seq); 1204 + mode = gtod->clock.vclock_mode; 1205 + ts->tv_sec = gtod->monotonic_time_sec; 1206 + ns = gtod->monotonic_time_snsec; 1207 + ns += vgettsc(cycle_now); 1208 + ns >>= gtod->clock.shift; 1209 + } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); 1210 + timespec_add_ns(ts, ns); 1211 + 1212 + return mode; 1213 + } 1214 + 1215 + /* returns true if host is using tsc clocksource */ 1216 + static bool kvm_get_time_and_clockread(s64 *kernel_ns, cycle_t *cycle_now) 1217 + { 1218 + struct timespec ts; 1219 + 1220 + /* checked again under seqlock below */ 1221 + if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) 1222 + return false; 1223 + 1224 + if (do_monotonic(&ts, cycle_now) != VCLOCK_TSC) 1225 + return false; 1226 + 1227 + 
monotonic_to_bootbased(&ts); 1228 + *kernel_ns = timespec_to_ns(&ts); 1229 + 1230 + return true; 1231 + } 1232 + #endif 1233 + 1234 + /* 1235 + * 1236 + * Assuming a stable TSC across physical CPUs, and a stable TSC 1237 + * across virtual CPUs, the following condition is possible. 1238 + * Each numbered line represents an event visible to both 1239 + * CPUs at the next numbered event. 1240 + * 1241 + * "timespecX" represents host monotonic time. "tscX" represents 1242 + * RDTSC value. 1243 + * 1244 + * VCPU0 on CPU0 | VCPU1 on CPU1 1245 + * 1246 + * 1. read timespec0,tsc0 1247 + * 2. | timespec1 = timespec0 + N 1248 + * | tsc1 = tsc0 + M 1249 + * 3. transition to guest | transition to guest 1250 + * 4. ret0 = timespec0 + (rdtsc - tsc0) | 1251 + * 5. | ret1 = timespec1 + (rdtsc - tsc1) 1252 + * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M)) 1253 + * 1254 + * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity: 1255 + * 1256 + * - ret0 < ret1 1257 + * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M)) 1258 + * ... 1259 + * - 0 < N - M => M < N 1260 + * 1261 + * That is, when timespec0 != timespec1, M < N. Unfortunately that is not 1262 + * always the case (the difference between two distinct xtime instances 1263 + * might be smaller than the difference between corresponding TSC reads, 1264 + * when updating guest vcpus pvclock areas). 1265 + * 1266 + * To avoid that problem, do not allow visibility of distinct 1267 + * system_timestamp/tsc_timestamp values simultaneously: use a master 1268 + * copy of host monotonic time values. Update that master copy 1269 + * in lockstep. 1270 + * 1271 + * Rely on synchronization of host TSCs and guest TSCs for monotonicity. 
1272 + * 1273 + */ 1274 + 1275 + static void pvclock_update_vm_gtod_copy(struct kvm *kvm) 1276 + { 1277 + #ifdef CONFIG_X86_64 1278 + struct kvm_arch *ka = &kvm->arch; 1279 + int vclock_mode; 1280 + bool host_tsc_clocksource, vcpus_matched; 1281 + 1282 + vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 == 1283 + atomic_read(&kvm->online_vcpus)); 1284 + 1285 + /* 1286 + * If the host uses TSC clock, then passthrough TSC as stable 1287 + * to the guest. 1288 + */ 1289 + host_tsc_clocksource = kvm_get_time_and_clockread( 1290 + &ka->master_kernel_ns, 1291 + &ka->master_cycle_now); 1292 + 1293 + ka->use_master_clock = host_tsc_clocksource & vcpus_matched; 1294 + 1295 + if (ka->use_master_clock) 1296 + atomic_set(&kvm_guest_has_master_clock, 1); 1297 + 1298 + vclock_mode = pvclock_gtod_data.clock.vclock_mode; 1299 + trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode, 1300 + vcpus_matched); 1301 + #endif 1302 + } 1303 + 1238 1304 static int kvm_guest_time_update(struct kvm_vcpu *v) 1239 1305 { 1240 - unsigned long flags; 1306 + unsigned long flags, this_tsc_khz; 1241 1307 struct kvm_vcpu_arch *vcpu = &v->arch; 1308 + struct kvm_arch *ka = &v->kvm->arch; 1242 1309 void *shared_kaddr; 1243 - unsigned long this_tsc_khz; 1244 1310 s64 kernel_ns, max_kernel_ns; 1245 - u64 tsc_timestamp; 1311 + u64 tsc_timestamp, host_tsc; 1312 + struct pvclock_vcpu_time_info *guest_hv_clock; 1246 1313 u8 pvclock_flags; 1314 + bool use_master_clock; 1315 + 1316 + kernel_ns = 0; 1317 + host_tsc = 0; 1247 1318 1248 1319 /* Keep irq disabled to prevent changes to the clock */ 1249 1320 local_irq_save(flags); 1250 - tsc_timestamp = kvm_x86_ops->read_l1_tsc(v); 1251 - kernel_ns = get_kernel_ns(); 1252 1321 this_tsc_khz = __get_cpu_var(cpu_tsc_khz); 1253 1322 if (unlikely(this_tsc_khz == 0)) { 1254 1323 local_irq_restore(flags); 1255 1324 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); 1256 1325 return 1; 1257 1326 } 1327 + 1328 + /* 1329 + * If the host uses TSC clock, then passthrough TSC as 
stable 1330 + * to the guest. 1331 + */ 1332 + spin_lock(&ka->pvclock_gtod_sync_lock); 1333 + use_master_clock = ka->use_master_clock; 1334 + if (use_master_clock) { 1335 + host_tsc = ka->master_cycle_now; 1336 + kernel_ns = ka->master_kernel_ns; 1337 + } 1338 + spin_unlock(&ka->pvclock_gtod_sync_lock); 1339 + if (!use_master_clock) { 1340 + host_tsc = native_read_tsc(); 1341 + kernel_ns = get_kernel_ns(); 1342 + } 1343 + 1344 + tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc); 1258 1345 1259 1346 /* 1260 1347 * We may have to catch up the TSC to match elapsed wall clock ··· 1504 1217 vcpu->hw_tsc_khz = this_tsc_khz; 1505 1218 } 1506 1219 1507 - if (max_kernel_ns > kernel_ns) 1508 - kernel_ns = max_kernel_ns; 1509 - 1220 + /* with a master <monotonic time, tsc value> tuple, 1221 + * pvclock clock reads always increase at the (scaled) rate 1222 + * of guest TSC - no need to deal with sampling errors. 1223 + */ 1224 + if (!use_master_clock) { 1225 + if (max_kernel_ns > kernel_ns) 1226 + kernel_ns = max_kernel_ns; 1227 + } 1510 1228 /* With all the info we got, fill in the values */ 1511 1229 vcpu->hv_clock.tsc_timestamp = tsc_timestamp; 1512 1230 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; 1513 1231 vcpu->last_kernel_ns = kernel_ns; 1514 1232 vcpu->last_guest_tsc = tsc_timestamp; 1515 - 1516 - pvclock_flags = 0; 1517 - if (vcpu->pvclock_set_guest_stopped_request) { 1518 - pvclock_flags |= PVCLOCK_GUEST_STOPPED; 1519 - vcpu->pvclock_set_guest_stopped_request = false; 1520 - } 1521 - 1522 - vcpu->hv_clock.flags = pvclock_flags; 1523 1233 1524 1234 /* 1525 1235 * The interface expects us to write an even number signaling that the ··· 1526 1242 vcpu->hv_clock.version += 2; 1527 1243 1528 1244 shared_kaddr = kmap_atomic(vcpu->time_page); 1245 + 1246 + guest_hv_clock = shared_kaddr + vcpu->time_offset; 1247 + 1248 + /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ 1249 + pvclock_flags = (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); 
1250 + 1251 + if (vcpu->pvclock_set_guest_stopped_request) { 1252 + pvclock_flags |= PVCLOCK_GUEST_STOPPED; 1253 + vcpu->pvclock_set_guest_stopped_request = false; 1254 + } 1255 + 1256 + /* If the host uses TSC clocksource, then it is stable */ 1257 + if (use_master_clock) 1258 + pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; 1259 + 1260 + vcpu->hv_clock.flags = pvclock_flags; 1529 1261 1530 1262 memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, 1531 1263 sizeof(vcpu->hv_clock)); ··· 1872 1572 &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); 1873 1573 } 1874 1574 1875 - int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1575 + int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) 1876 1576 { 1877 1577 bool pr = false; 1578 + u32 msr = msr_info->index; 1579 + u64 data = msr_info->data; 1878 1580 1879 1581 switch (msr) { 1880 1582 case MSR_EFER: ··· 1926 1624 return kvm_x2apic_msr_write(vcpu, msr, data); 1927 1625 case MSR_IA32_TSCDEADLINE: 1928 1626 kvm_set_lapic_tscdeadline_msr(vcpu, data); 1627 + break; 1628 + case MSR_IA32_TSC_ADJUST: 1629 + if (guest_cpuid_has_tsc_adjust(vcpu)) { 1630 + if (!msr_info->host_initiated) { 1631 + u64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; 1632 + kvm_x86_ops->adjust_tsc_offset(vcpu, adj, true); 1633 + } 1634 + vcpu->arch.ia32_tsc_adjust_msr = data; 1635 + } 1929 1636 break; 1930 1637 case MSR_IA32_MISC_ENABLE: 1931 1638 vcpu->arch.ia32_misc_enable_msr = data; ··· 2295 1984 case MSR_IA32_TSCDEADLINE: 2296 1985 data = kvm_get_lapic_tscdeadline_msr(vcpu); 2297 1986 break; 1987 + case MSR_IA32_TSC_ADJUST: 1988 + data = (u64)vcpu->arch.ia32_tsc_adjust_msr; 1989 + break; 2298 1990 case MSR_IA32_MISC_ENABLE: 2299 1991 data = vcpu->arch.ia32_misc_enable_msr; 2300 1992 break; ··· 2656 2342 kvm_x86_ops->write_tsc_offset(vcpu, offset); 2657 2343 vcpu->arch.tsc_catchup = 1; 2658 2344 } 2659 - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 2345 + /* 2346 + * On a host with synchronized TSC, there is no 
need to update 2347 + * kvmclock on vcpu->cpu migration 2348 + */ 2349 + if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) 2350 + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); 2660 2351 if (vcpu->cpu != cpu) 2661 2352 kvm_migrate_timers(vcpu); 2662 2353 vcpu->cpu = cpu; ··· 3010 2691 if (!vcpu->arch.apic) 3011 2692 goto out; 3012 2693 u.lapic = memdup_user(argp, sizeof(*u.lapic)); 3013 - if (IS_ERR(u.lapic)) { 3014 - r = PTR_ERR(u.lapic); 3015 - goto out; 3016 - } 2694 + if (IS_ERR(u.lapic)) 2695 + return PTR_ERR(u.lapic); 3017 2696 3018 2697 r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); 3019 - if (r) 3020 - goto out; 3021 - r = 0; 3022 2698 break; 3023 2699 } 3024 2700 case KVM_INTERRUPT: { ··· 3023 2709 if (copy_from_user(&irq, argp, sizeof irq)) 3024 2710 goto out; 3025 2711 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); 3026 - if (r) 3027 - goto out; 3028 - r = 0; 3029 2712 break; 3030 2713 } 3031 2714 case KVM_NMI: { 3032 2715 r = kvm_vcpu_ioctl_nmi(vcpu); 3033 - if (r) 3034 - goto out; 3035 - r = 0; 3036 2716 break; 3037 2717 } 3038 2718 case KVM_SET_CPUID: { ··· 3037 2729 if (copy_from_user(&cpuid, cpuid_arg, sizeof cpuid)) 3038 2730 goto out; 3039 2731 r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries); 3040 - if (r) 3041 - goto out; 3042 2732 break; 3043 2733 } 3044 2734 case KVM_SET_CPUID2: { ··· 3048 2742 goto out; 3049 2743 r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid, 3050 2744 cpuid_arg->entries); 3051 - if (r) 3052 - goto out; 3053 2745 break; 3054 2746 } 3055 2747 case KVM_GET_CPUID2: { ··· 3179 2875 } 3180 2876 case KVM_SET_XSAVE: { 3181 2877 u.xsave = memdup_user(argp, sizeof(*u.xsave)); 3182 - if (IS_ERR(u.xsave)) { 3183 - r = PTR_ERR(u.xsave); 3184 - goto out; 3185 - } 2878 + if (IS_ERR(u.xsave)) 2879 + return PTR_ERR(u.xsave); 3186 2880 3187 2881 r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); 3188 2882 break; ··· 3202 2900 } 3203 2901 case KVM_SET_XCRS: { 3204 2902 u.xcrs = memdup_user(argp, sizeof(*u.xcrs)); 3205 - if 
(IS_ERR(u.xcrs)) { 3206 - r = PTR_ERR(u.xcrs); 3207 - goto out; 3208 - } 2903 + if (IS_ERR(u.xcrs)) 2904 + return PTR_ERR(u.xcrs); 3209 2905 3210 2906 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); 3211 2907 break; ··· 3251 2951 int ret; 3252 2952 3253 2953 if (addr > (unsigned int)(-3 * PAGE_SIZE)) 3254 - return -1; 2954 + return -EINVAL; 3255 2955 ret = kvm_x86_ops->set_tss_addr(kvm, addr); 3256 2956 return ret; 3257 2957 } ··· 3512 3212 switch (ioctl) { 3513 3213 case KVM_SET_TSS_ADDR: 3514 3214 r = kvm_vm_ioctl_set_tss_addr(kvm, arg); 3515 - if (r < 0) 3516 - goto out; 3517 3215 break; 3518 3216 case KVM_SET_IDENTITY_MAP_ADDR: { 3519 3217 u64 ident_addr; ··· 3520 3222 if (copy_from_user(&ident_addr, argp, sizeof ident_addr)) 3521 3223 goto out; 3522 3224 r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); 3523 - if (r < 0) 3524 - goto out; 3525 3225 break; 3526 3226 } 3527 3227 case KVM_SET_NR_MMU_PAGES: 3528 3228 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 3529 - if (r) 3530 - goto out; 3531 3229 break; 3532 3230 case KVM_GET_NR_MMU_PAGES: 3533 3231 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); ··· 3614 3320 r = 0; 3615 3321 get_irqchip_out: 3616 3322 kfree(chip); 3617 - if (r) 3618 - goto out; 3619 3323 break; 3620 3324 } 3621 3325 case KVM_SET_IRQCHIP: { ··· 3635 3343 r = 0; 3636 3344 set_irqchip_out: 3637 3345 kfree(chip); 3638 - if (r) 3639 - goto out; 3640 3346 break; 3641 3347 } 3642 3348 case KVM_GET_PIT: { ··· 3661 3371 if (!kvm->arch.vpit) 3662 3372 goto out; 3663 3373 r = kvm_vm_ioctl_set_pit(kvm, &u.ps); 3664 - if (r) 3665 - goto out; 3666 - r = 0; 3667 3374 break; 3668 3375 } 3669 3376 case KVM_GET_PIT2: { ··· 3684 3397 if (!kvm->arch.vpit) 3685 3398 goto out; 3686 3399 r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2); 3687 - if (r) 3688 - goto out; 3689 - r = 0; 3690 3400 break; 3691 3401 } 3692 3402 case KVM_REINJECT_CONTROL: { ··· 3692 3408 if (copy_from_user(&control, argp, sizeof(control))) 3693 3409 goto out; 3694 3410 r = kvm_vm_ioctl_reinject(kvm, 
&control); 3695 - if (r) 3696 - goto out; 3697 - r = 0; 3698 3411 break; 3699 3412 } 3700 3413 case KVM_XEN_HVM_CONFIG: { ··· 4554 4273 static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, 4555 4274 u32 msr_index, u64 data) 4556 4275 { 4557 - return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); 4276 + struct msr_data msr; 4277 + 4278 + msr.data = data; 4279 + msr.index = msr_index; 4280 + msr.host_initiated = false; 4281 + return kvm_set_msr(emul_to_vcpu(ctxt), &msr); 4558 4282 } 4559 4283 4560 4284 static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, ··· 4781 4495 * instruction -> ... 4782 4496 */ 4783 4497 pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa)); 4784 - if (!is_error_pfn(pfn)) { 4498 + if (!is_error_noslot_pfn(pfn)) { 4785 4499 kvm_release_pfn_clean(pfn); 4786 4500 return true; 4787 4501 } ··· 5167 4881 kvm_mmu_set_mmio_spte_mask(mask); 5168 4882 } 5169 4883 4884 + #ifdef CONFIG_X86_64 4885 + static void pvclock_gtod_update_fn(struct work_struct *work) 4886 + { 4887 + struct kvm *kvm; 4888 + 4889 + struct kvm_vcpu *vcpu; 4890 + int i; 4891 + 4892 + raw_spin_lock(&kvm_lock); 4893 + list_for_each_entry(kvm, &vm_list, vm_list) 4894 + kvm_for_each_vcpu(i, vcpu, kvm) 4895 + set_bit(KVM_REQ_MASTERCLOCK_UPDATE, &vcpu->requests); 4896 + atomic_set(&kvm_guest_has_master_clock, 0); 4897 + raw_spin_unlock(&kvm_lock); 4898 + } 4899 + 4900 + static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); 4901 + 4902 + /* 4903 + * Notification about pvclock gtod data update. 
4904 + */ 4905 + static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused, 4906 + void *priv) 4907 + { 4908 + struct pvclock_gtod_data *gtod = &pvclock_gtod_data; 4909 + struct timekeeper *tk = priv; 4910 + 4911 + update_pvclock_gtod(tk); 4912 + 4913 + /* disable master clock if host does not trust, or does not 4914 + * use, TSC clocksource 4915 + */ 4916 + if (gtod->clock.vclock_mode != VCLOCK_TSC && 4917 + atomic_read(&kvm_guest_has_master_clock) != 0) 4918 + queue_work(system_long_wq, &pvclock_gtod_work); 4919 + 4920 + return 0; 4921 + } 4922 + 4923 + static struct notifier_block pvclock_gtod_notifier = { 4924 + .notifier_call = pvclock_gtod_notify, 4925 + }; 4926 + #endif 4927 + 5170 4928 int kvm_arch_init(void *opaque) 5171 4929 { 5172 4930 int r; ··· 5252 4922 host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); 5253 4923 5254 4924 kvm_lapic_init(); 4925 + #ifdef CONFIG_X86_64 4926 + pvclock_gtod_register_notifier(&pvclock_gtod_notifier); 4927 + #endif 4928 + 5255 4929 return 0; 5256 4930 5257 4931 out: ··· 5270 4936 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, 5271 4937 CPUFREQ_TRANSITION_NOTIFIER); 5272 4938 unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); 4939 + #ifdef CONFIG_X86_64 4940 + pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier); 4941 + #endif 5273 4942 kvm_x86_ops = NULL; 5274 4943 kvm_mmu_module_exit(); 5275 4944 } ··· 5396 5059 } 5397 5060 EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 5398 5061 5399 - int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) 5062 + static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) 5400 5063 { 5401 5064 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); 5402 5065 char instruction[3]; ··· 5572 5235 kvm_make_request(KVM_REQ_EVENT, vcpu); 5573 5236 } 5574 5237 5238 + static void kvm_gen_update_masterclock(struct kvm *kvm) 5239 + { 5240 + #ifdef CONFIG_X86_64 5241 + int i; 5242 + struct kvm_vcpu *vcpu; 5243 + struct kvm_arch *ka = &kvm->arch; 5244 + 5245 + 
spin_lock(&ka->pvclock_gtod_sync_lock); 5246 + kvm_make_mclock_inprogress_request(kvm); 5247 + /* no guest entries from this point */ 5248 + pvclock_update_vm_gtod_copy(kvm); 5249 + 5250 + kvm_for_each_vcpu(i, vcpu, kvm) 5251 + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); 5252 + 5253 + /* guest entries allowed */ 5254 + kvm_for_each_vcpu(i, vcpu, kvm) 5255 + clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); 5256 + 5257 + spin_unlock(&ka->pvclock_gtod_sync_lock); 5258 + #endif 5259 + } 5260 + 5575 5261 static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 5576 5262 { 5577 5263 int r; ··· 5607 5247 kvm_mmu_unload(vcpu); 5608 5248 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) 5609 5249 __kvm_migrate_timers(vcpu); 5250 + if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu)) 5251 + kvm_gen_update_masterclock(vcpu->kvm); 5610 5252 if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) { 5611 5253 r = kvm_guest_time_update(vcpu); 5612 5254 if (unlikely(r)) ··· 5724 5362 if (hw_breakpoint_active()) 5725 5363 hw_breakpoint_restore(); 5726 5364 5727 - vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); 5365 + vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, 5366 + native_read_tsc()); 5728 5367 5729 5368 vcpu->mode = OUTSIDE_GUEST_MODE; 5730 5369 smp_wmb(); ··· 5782 5419 pr_debug("vcpu %d received sipi with vector # %x\n", 5783 5420 vcpu->vcpu_id, vcpu->arch.sipi_vector); 5784 5421 kvm_lapic_reset(vcpu); 5785 - r = kvm_arch_vcpu_reset(vcpu); 5422 + r = kvm_vcpu_reset(vcpu); 5786 5423 if (r) 5787 5424 return r; 5788 5425 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; ··· 6410 6047 r = vcpu_load(vcpu); 6411 6048 if (r) 6412 6049 return r; 6413 - r = kvm_arch_vcpu_reset(vcpu); 6050 + r = kvm_vcpu_reset(vcpu); 6414 6051 if (r == 0) 6415 6052 r = kvm_mmu_setup(vcpu); 6053 + vcpu_put(vcpu); 6054 + 6055 + return r; 6056 + } 6057 + 6058 + int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) 6059 + { 6060 + int r; 6061 + struct msr_data msr; 6062 + 6063 + r = 
vcpu_load(vcpu); 6064 + if (r) 6065 + return r; 6066 + msr.data = 0x0; 6067 + msr.index = MSR_IA32_TSC; 6068 + msr.host_initiated = true; 6069 + kvm_write_tsc(vcpu, &msr); 6416 6070 vcpu_put(vcpu); 6417 6071 6418 6072 return r; ··· 6449 6069 kvm_x86_ops->vcpu_free(vcpu); 6450 6070 } 6451 6071 6452 - int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) 6072 + static int kvm_vcpu_reset(struct kvm_vcpu *vcpu) 6453 6073 { 6454 6074 atomic_set(&vcpu->arch.nmi_queued, 0); 6455 6075 vcpu->arch.nmi_pending = 0; ··· 6471 6091 vcpu->arch.apf.halted = false; 6472 6092 6473 6093 kvm_pmu_reset(vcpu); 6094 + 6095 + memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); 6096 + vcpu->arch.regs_avail = ~0; 6097 + vcpu->arch.regs_dirty = ~0; 6474 6098 6475 6099 return kvm_x86_ops->vcpu_reset(vcpu); 6476 6100 } ··· 6552 6168 kvm_for_each_vcpu(i, vcpu, kvm) { 6553 6169 vcpu->arch.tsc_offset_adjustment += delta_cyc; 6554 6170 vcpu->arch.last_host_tsc = local_tsc; 6171 + set_bit(KVM_REQ_MASTERCLOCK_UPDATE, 6172 + &vcpu->requests); 6555 6173 } 6556 6174 6557 6175 /* ··· 6644 6258 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) 6645 6259 goto fail_free_mce_banks; 6646 6260 6261 + r = fx_init(vcpu); 6262 + if (r) 6263 + goto fail_free_wbinvd_dirty_mask; 6264 + 6265 + vcpu->arch.ia32_tsc_adjust_msr = 0x0; 6647 6266 kvm_async_pf_hash_reset(vcpu); 6648 6267 kvm_pmu_init(vcpu); 6649 6268 6650 6269 return 0; 6270 + fail_free_wbinvd_dirty_mask: 6271 + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); 6651 6272 fail_free_mce_banks: 6652 6273 kfree(vcpu->arch.mce_banks); 6653 6274 fail_free_lapic: ··· 6698 6305 6699 6306 raw_spin_lock_init(&kvm->arch.tsc_write_lock); 6700 6307 mutex_init(&kvm->arch.apic_map_lock); 6308 + spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock); 6309 + 6310 + pvclock_update_vm_gtod_copy(kvm); 6701 6311 6702 6312 return 0; 6703 6313 }

+1 -1

arch/x86/kvm/x86.h

··· 112 112 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 113 113 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); 114 114 115 - void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); 115 + void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); 116 116 117 117 int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, 118 118 gva_t addr, void *val, unsigned int bytes,

+78 -3

arch/x86/vdso/vclock_gettime.c

··· 22 22 #include <asm/hpet.h> 23 23 #include <asm/unistd.h> 24 24 #include <asm/io.h> 25 + #include <asm/pvclock.h> 25 26 26 27 #define gtod (&VVAR(vsyscall_gtod_data)) 27 28 ··· 63 62 return readl((const void __iomem *)fix_to_virt(VSYSCALL_HPET) + 0xf0); 64 63 } 65 64 65 + #ifdef CONFIG_PARAVIRT_CLOCK 66 + 67 + static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu) 68 + { 69 + const struct pvclock_vsyscall_time_info *pvti_base; 70 + int idx = cpu / (PAGE_SIZE/PVTI_SIZE); 71 + int offset = cpu % (PAGE_SIZE/PVTI_SIZE); 72 + 73 + BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END); 74 + 75 + pvti_base = (struct pvclock_vsyscall_time_info *) 76 + __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx); 77 + 78 + return &pvti_base[offset]; 79 + } 80 + 81 + static notrace cycle_t vread_pvclock(int *mode) 82 + { 83 + const struct pvclock_vsyscall_time_info *pvti; 84 + cycle_t ret; 85 + u64 last; 86 + u32 version; 87 + u32 migrate_count; 88 + u8 flags; 89 + unsigned cpu, cpu1; 90 + 91 + 92 + /* 93 + * When looping to get a consistent (time-info, tsc) pair, we 94 + * also need to deal with the possibility we can switch vcpus, 95 + * so make sure we always re-fetch time-info for the current vcpu. 96 + */ 97 + do { 98 + cpu = __getcpu() & VGETCPU_CPU_MASK; 99 + /* TODO: We can put vcpu id into higher bits of pvti.version. 100 + * This will save a couple of cycles by getting rid of 101 + * __getcpu() calls (Gleb). 102 + */ 103 + 104 + pvti = get_pvti(cpu); 105 + 106 + migrate_count = pvti->migrate_count; 107 + 108 + version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); 109 + 110 + /* 111 + * Test we're still on the cpu as well as the version. 112 + * We could have been migrated just after the first 113 + * vgetcpu but before fetching the version, so we 114 + * wouldn't notice a version change. 
115 + */ 116 + cpu1 = __getcpu() & VGETCPU_CPU_MASK; 117 + } while (unlikely(cpu != cpu1 || 118 + (pvti->pvti.version & 1) || 119 + pvti->pvti.version != version || 120 + pvti->migrate_count != migrate_count)); 121 + 122 + if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) 123 + *mode = VCLOCK_NONE; 124 + 125 + /* refer to tsc.c read_tsc() comment for rationale */ 126 + last = VVAR(vsyscall_gtod_data).clock.cycle_last; 127 + 128 + if (likely(ret >= last)) 129 + return ret; 130 + 131 + return last; 132 + } 133 + #endif 134 + 66 135 notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) 67 136 { 68 137 long ret; ··· 151 80 } 152 81 153 82 154 - notrace static inline u64 vgetsns(void) 83 + notrace static inline u64 vgetsns(int *mode) 155 84 { 156 85 long v; 157 86 cycles_t cycles; ··· 159 88 cycles = vread_tsc(); 160 89 else if (gtod->clock.vclock_mode == VCLOCK_HPET) 161 90 cycles = vread_hpet(); 91 + #ifdef CONFIG_PARAVIRT_CLOCK 92 + else if (gtod->clock.vclock_mode == VCLOCK_PVCLOCK) 93 + cycles = vread_pvclock(mode); 94 + #endif 162 95 else 163 96 return 0; 164 97 v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask; ··· 182 107 mode = gtod->clock.vclock_mode; 183 108 ts->tv_sec = gtod->wall_time_sec; 184 109 ns = gtod->wall_time_snsec; 185 - ns += vgetsns(); 110 + ns += vgetsns(&mode); 186 111 ns >>= gtod->clock.shift; 187 112 } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); 188 113 ··· 202 127 mode = gtod->clock.vclock_mode; 203 128 ts->tv_sec = gtod->monotonic_time_sec; 204 129 ns = gtod->monotonic_time_snsec; 205 - ns += vgetsns(); 130 + ns += vgetsns(&mode); 206 131 ns >>= gtod->clock.shift; 207 132 } while (unlikely(read_seqcount_retry(&gtod->seq, seq))); 208 133 timespec_add_ns(ts, ns);

+3 -8

arch/x86/vdso/vgetcpu.c

··· 17 17 { 18 18 unsigned int p; 19 19 20 - if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) { 21 - /* Load per CPU data from RDTSCP */ 22 - native_read_tscp(&p); 23 - } else { 24 - /* Load per CPU data from GDT */ 25 - asm("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); 26 - } 20 + p = __getcpu(); 21 + 27 22 if (cpu) 28 - *cpu = p & 0xfff; 23 + *cpu = p & VGETCPU_CPU_MASK; 29 24 if (node) 30 25 *node = p >> 12; 31 26 return 0;

+1

drivers/tty/Kconfig

··· 357 357 config PPC_EPAPR_HV_BYTECHAN 358 358 tristate "ePAPR hypervisor byte channel driver" 359 359 depends on PPC 360 + select EPAPR_PARAVIRT 360 361 help 361 362 This driver creates /dev entries for each ePAPR hypervisor byte 362 363 channel, thereby allowing applications to communicate with byte

+1

drivers/virt/Kconfig

··· 15 15 config FSL_HV_MANAGER 16 16 tristate "Freescale hypervisor management driver" 17 17 depends on FSL_SOC 18 + select EPAPR_PARAVIRT 18 19 help 19 20 The Freescale hypervisor management driver provides several services 20 21 to drivers and applications related to the Freescale hypervisor:

+40 -13

include/linux/kvm_host.h

··· 47 47 48 48 /* 49 49 * For the normal pfn, the highest 12 bits should be zero, 50 - * so we can mask these bits to indicate the error. 50 + * so we can mask bit 62 ~ bit 52 to indicate the error pfn, 51 + * mask bit 63 to indicate the noslot pfn. 51 52 */ 52 - #define KVM_PFN_ERR_MASK (0xfffULL << 52) 53 + #define KVM_PFN_ERR_MASK (0x7ffULL << 52) 54 + #define KVM_PFN_ERR_NOSLOT_MASK (0xfffULL << 52) 55 + #define KVM_PFN_NOSLOT (0x1ULL << 63) 53 56 54 57 #define KVM_PFN_ERR_FAULT (KVM_PFN_ERR_MASK) 55 58 #define KVM_PFN_ERR_HWPOISON (KVM_PFN_ERR_MASK + 1) 56 - #define KVM_PFN_ERR_BAD (KVM_PFN_ERR_MASK + 2) 57 - #define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 3) 59 + #define KVM_PFN_ERR_RO_FAULT (KVM_PFN_ERR_MASK + 2) 58 60 61 + /* 62 + * error pfns indicate that the gfn is in slot but failed to 63 + * translate it to pfn on host. 64 + */ 59 65 static inline bool is_error_pfn(pfn_t pfn) 60 66 { 61 67 return !!(pfn & KVM_PFN_ERR_MASK); 62 68 } 63 69 64 - static inline bool is_noslot_pfn(pfn_t pfn) 70 + /* 71 + * error_noslot pfns indicate that the gfn can not be 72 + * translated to pfn - it is not in slot or failed to 73 + * translate it to pfn. 74 + */ 75 + static inline bool is_error_noslot_pfn(pfn_t pfn) 65 76 { 66 - return pfn == KVM_PFN_ERR_BAD; 77 + return !!(pfn & KVM_PFN_ERR_NOSLOT_MASK); 67 78 } 68 79 69 - static inline bool is_invalid_pfn(pfn_t pfn) 80 + /* noslot pfn indicates that the gfn is not in slot. 
*/ 81 + static inline bool is_noslot_pfn(pfn_t pfn) 70 82 { 71 - return !is_noslot_pfn(pfn) && is_error_pfn(pfn); 83 + return pfn == KVM_PFN_NOSLOT; 72 84 } 73 85 74 86 #define KVM_HVA_ERR_BAD (PAGE_OFFSET) ··· 119 107 #define KVM_REQ_IMMEDIATE_EXIT 15 120 108 #define KVM_REQ_PMU 16 121 109 #define KVM_REQ_PMI 17 110 + #define KVM_REQ_WATCHDOG 18 111 + #define KVM_REQ_MASTERCLOCK_UPDATE 19 112 + #define KVM_REQ_MCLOCK_INPROGRESS 20 122 113 123 114 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 124 115 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 ··· 531 516 532 517 void kvm_flush_remote_tlbs(struct kvm *kvm); 533 518 void kvm_reload_remote_mmus(struct kvm *kvm); 519 + void kvm_make_mclock_inprogress_request(struct kvm *kvm); 534 520 535 521 long kvm_arch_dev_ioctl(struct file *filp, 536 522 unsigned int ioctl, unsigned long arg); ··· 585 569 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu); 586 570 struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id); 587 571 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu); 572 + int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu); 588 573 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu); 589 574 590 - int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu); 591 575 int kvm_arch_hardware_enable(void *garbage); 592 576 void kvm_arch_hardware_disable(void *garbage); 593 577 int kvm_arch_hardware_setup(void); ··· 682 666 unsigned long *deliver_bitmask); 683 667 #endif 684 668 int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level); 669 + int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level); 685 670 int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, 686 671 int irq_source_id, int level); 687 672 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin); ··· 855 838 extern struct dentry *kvm_debugfs_dir; 856 839 857 840 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) 858 - static inline int mmu_notifier_retry(struct kvm_vcpu 
*vcpu, unsigned long mmu_seq) 841 + static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) 859 842 { 860 - if (unlikely(vcpu->kvm->mmu_notifier_count)) 843 + if (unlikely(kvm->mmu_notifier_count)) 861 844 return 1; 862 845 /* 863 846 * Ensure the read of mmu_notifier_count happens before the read ··· 870 853 * can't rely on kvm->mmu_lock to keep things ordered. 871 854 */ 872 855 smp_rmb(); 873 - if (vcpu->kvm->mmu_notifier_seq != mmu_seq) 856 + if (kvm->mmu_notifier_seq != mmu_seq) 874 857 return 1; 875 858 return 0; 876 859 } ··· 898 881 #ifdef CONFIG_HAVE_KVM_EVENTFD 899 882 900 883 void kvm_eventfd_init(struct kvm *kvm); 884 + int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); 885 + 886 + #ifdef CONFIG_HAVE_KVM_IRQCHIP 901 887 int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args); 902 888 void kvm_irqfd_release(struct kvm *kvm); 903 889 void kvm_irq_routing_update(struct kvm *, struct kvm_irq_routing_table *); 904 - int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args); 890 + #else 891 + static inline int kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args) 892 + { 893 + return -EINVAL; 894 + } 895 + 896 + static inline void kvm_irqfd_release(struct kvm *kvm) {} 897 + #endif 905 898 906 899 #else 907 900

+9

include/linux/pvclock_gtod.h

··· 1 + #ifndef _PVCLOCK_GTOD_H 2 + #define _PVCLOCK_GTOD_H 3 + 4 + #include <linux/notifier.h> 5 + 6 + extern int pvclock_gtod_register_notifier(struct notifier_block *nb); 7 + extern int pvclock_gtod_unregister_notifier(struct notifier_block *nb); 8 + 9 + #endif /* _PVCLOCK_GTOD_H */

+8

include/linux/sched.h

··· 107 107 extern void calc_global_load(unsigned long ticks); 108 108 extern void update_cpu_load_nohz(void); 109 109 110 + /* Notifier for when a task gets migrated to a new CPU */ 111 + struct task_migration_notifier { 112 + struct task_struct *task; 113 + int from_cpu; 114 + int to_cpu; 115 + }; 116 + extern void register_task_migration_notifier(struct notifier_block *n); 117 + 110 118 extern unsigned long get_parent_ip(unsigned long addr); 111 119 112 120 extern void dump_cpu_task(int cpu);

+16 -5

include/uapi/linux/kvm.h

··· 167 167 #define KVM_EXIT_OSI 18 168 168 #define KVM_EXIT_PAPR_HCALL 19 169 169 #define KVM_EXIT_S390_UCONTROL 20 170 + #define KVM_EXIT_WATCHDOG 21 170 171 171 172 /* For KVM_EXIT_INTERNAL_ERROR */ 172 - #define KVM_INTERNAL_ERROR_EMULATION 1 173 - #define KVM_INTERNAL_ERROR_SIMUL_EX 2 173 + /* Emulate instruction failed. */ 174 + #define KVM_INTERNAL_ERROR_EMULATION 1 175 + /* Encounter unexpected simultaneous exceptions. */ 176 + #define KVM_INTERNAL_ERROR_SIMUL_EX 2 177 + /* Encounter unexpected vm-exit due to delivery event. */ 178 + #define KVM_INTERNAL_ERROR_DELIVERY_EV 3 174 179 175 180 /* for KVM_RUN, returned by mmap(vcpu_fd, offset=0) */ 176 181 struct kvm_run { ··· 482 477 struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ]; 483 478 }; 484 479 480 + #define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0) 481 + 485 482 #define KVMIO 0xAE 486 483 487 484 /* machine type bits, to be used as argument to KVM_CREATE_VM */ ··· 633 626 #define KVM_CAP_READONLY_MEM 81 634 627 #endif 635 628 #define KVM_CAP_IRQFD_RESAMPLE 82 629 + #define KVM_CAP_PPC_BOOKE_WATCHDOG 83 630 + #define KVM_CAP_PPC_HTAB_FD 84 636 631 637 632 #ifdef KVM_CAP_IRQ_ROUTING 638 633 ··· 857 848 #define KVM_PPC_GET_SMMU_INFO _IOR(KVMIO, 0xa6, struct kvm_ppc_smmu_info) 858 849 /* Available with KVM_CAP_PPC_ALLOC_HTAB */ 859 850 #define KVM_PPC_ALLOCATE_HTAB _IOWR(KVMIO, 0xa7, __u32) 851 + #define KVM_CREATE_SPAPR_TCE _IOW(KVMIO, 0xa8, struct kvm_create_spapr_tce) 852 + /* Available with KVM_CAP_RMA */ 853 + #define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma) 854 + /* Available with KVM_CAP_PPC_HTAB_FD */ 855 + #define KVM_PPC_GET_HTAB_FD _IOW(KVMIO, 0xaa, struct kvm_get_htab_fd) 860 856 861 857 /* 862 858 * ioctls for vcpu fds ··· 925 911 /* Available with KVM_CAP_XCRS */ 926 912 #define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs) 927 913 #define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs) 928 - #define KVM_CREATE_SPAPR_TCE _IOW(KVMIO, 0xa8, struct 
kvm_create_spapr_tce) 929 - /* Available with KVM_CAP_RMA */ 930 - #define KVM_ALLOCATE_RMA _IOR(KVMIO, 0xa9, struct kvm_allocate_rma) 931 914 /* Available with KVM_CAP_SW_TLB */ 932 915 #define KVM_DIRTY_TLB _IOW(KVMIO, 0xaa, struct kvm_dirty_tlb) 933 916 /* Available with KVM_CAP_ONE_REG */

+15

kernel/sched/core.c

··· 923 923 rq->skip_clock_update = 1; 924 924 } 925 925 926 + static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); 927 + 928 + void register_task_migration_notifier(struct notifier_block *n) 929 + { 930 + atomic_notifier_chain_register(&task_migration_notifier, n); 931 + } 932 + 926 933 #ifdef CONFIG_SMP 927 934 void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 928 935 { ··· 960 953 trace_sched_migrate_task(p, new_cpu); 961 954 962 955 if (task_cpu(p) != new_cpu) { 956 + struct task_migration_notifier tmn; 957 + 963 958 if (p->sched_class->migrate_task_rq) 964 959 p->sched_class->migrate_task_rq(p, new_cpu); 965 960 p->se.nr_migrations++; 966 961 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); 962 + 963 + tmn.task = p; 964 + tmn.from_cpu = task_cpu(p); 965 + tmn.to_cpu = new_cpu; 966 + 967 + atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); 967 968 } 968 969 969 970 __set_task_cpu(p, new_cpu);

+50

kernel/time/timekeeping.c

··· 21 21 #include <linux/time.h> 22 22 #include <linux/tick.h> 23 23 #include <linux/stop_machine.h> 24 + #include <linux/pvclock_gtod.h> 24 25 25 26 26 27 static struct timekeeper timekeeper; ··· 175 174 return nsec + arch_gettimeoffset(); 176 175 } 177 176 177 + static RAW_NOTIFIER_HEAD(pvclock_gtod_chain); 178 + 179 + static void update_pvclock_gtod(struct timekeeper *tk) 180 + { 181 + raw_notifier_call_chain(&pvclock_gtod_chain, 0, tk); 182 + } 183 + 184 + /** 185 + * pvclock_gtod_register_notifier - register a pvclock timedata update listener 186 + * 187 + * Takes timekeeper.lock for writing; the caller must not hold it. 188 + */ 189 + int pvclock_gtod_register_notifier(struct notifier_block *nb) 190 + { 191 + struct timekeeper *tk = &timekeeper; 192 + unsigned long flags; 193 + int ret; 194 + 195 + write_seqlock_irqsave(&tk->lock, flags); 196 + ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); 197 + /* update timekeeping data */ 198 + update_pvclock_gtod(tk); 199 + write_sequnlock_irqrestore(&tk->lock, flags); 200 + 201 + return ret; 202 + } 203 + EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); 204 + 205 + /** 206 + * pvclock_gtod_unregister_notifier - unregister a pvclock 207 + * timedata update listener 208 + * 209 + * Takes timekeeper.lock for writing; the caller must not hold it. 210 + */ 211 + int pvclock_gtod_unregister_notifier(struct notifier_block *nb) 212 + { 213 + struct timekeeper *tk = &timekeeper; 214 + unsigned long flags; 215 + int ret; 216 + 217 + write_seqlock_irqsave(&tk->lock, flags); 218 + ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); 219 + write_sequnlock_irqrestore(&tk->lock, flags); 220 + 221 + return ret; 222 + } 223 + EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); 224 + 178 225 /* must hold write on timekeeper.lock */ 179 226 static void timekeeping_update(struct timekeeper *tk, bool clearntp) 180 227 { ··· 231 182 ntp_clear(); 232 183 } 233 184 update_vsyscall(tk); 185 + update_pvclock_gtod(tk); 234 186 } 235 187 236 188 /**

+26 -10

virt/kvm/assigned-dev.c

··· 105 105 } 106 106 107 107 #ifdef __KVM_HAVE_MSI 108 + static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) 109 + { 110 + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; 111 + int ret = kvm_set_irq_inatomic(assigned_dev->kvm, 112 + assigned_dev->irq_source_id, 113 + assigned_dev->guest_irq, 1); 114 + return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED; 115 + } 116 + 108 117 static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) 109 118 { 110 119 struct kvm_assigned_dev_kernel *assigned_dev = dev_id; ··· 126 117 #endif 127 118 128 119 #ifdef __KVM_HAVE_MSIX 120 + static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id) 121 + { 122 + struct kvm_assigned_dev_kernel *assigned_dev = dev_id; 123 + int index = find_index_from_host_irq(assigned_dev, irq); 124 + u32 vector; 125 + int ret = 0; 126 + 127 + if (index >= 0) { 128 + vector = assigned_dev->guest_msix_entries[index].vector; 129 + ret = kvm_set_irq_inatomic(assigned_dev->kvm, 130 + assigned_dev->irq_source_id, 131 + vector, 1); 132 + } 133 + 134 + return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED; 135 + } 136 + 129 137 static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) 130 138 { 131 139 struct kvm_assigned_dev_kernel *assigned_dev = dev_id; ··· 360 334 } 361 335 362 336 #ifdef __KVM_HAVE_MSI 363 - static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) 364 - { 365 - return IRQ_WAKE_THREAD; 366 - } 367 - 368 337 static int assigned_device_enable_host_msi(struct kvm *kvm, 369 338 struct kvm_assigned_dev_kernel *dev) 370 339 { ··· 384 363 #endif 385 364 386 365 #ifdef __KVM_HAVE_MSIX 387 - static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id) 388 - { 389 - return IRQ_WAKE_THREAD; 390 - } 391 - 392 366 static int assigned_device_enable_host_msix(struct kvm *kvm, 393 367 struct kvm_assigned_dev_kernel *dev) 394 368 {

+7 -1

virt/kvm/eventfd.c

··· 35 35 36 36 #include "iodev.h" 37 37 38 + #ifdef __KVM_HAVE_IOAPIC 38 39 /* 39 40 * -------------------------------------------------------------------- 40 41 * irqfd: Allows an fd to be used to inject an interrupt to the guest ··· 333 332 mutex_lock(&kvm->irqfds.resampler_lock); 334 333 335 334 list_for_each_entry(resampler, 336 - &kvm->irqfds.resampler_list, list) { 335 + &kvm->irqfds.resampler_list, link) { 337 336 if (resampler->notifier.gsi == irqfd->gsi) { 338 337 irqfd->resampler = resampler; 339 338 break; ··· 426 425 kfree(irqfd); 427 426 return ret; 428 427 } 428 + #endif 429 429 430 430 void 431 431 kvm_eventfd_init(struct kvm *kvm) 432 432 { 433 + #ifdef __KVM_HAVE_IOAPIC 433 434 spin_lock_init(&kvm->irqfds.lock); 434 435 INIT_LIST_HEAD(&kvm->irqfds.items); 435 436 INIT_LIST_HEAD(&kvm->irqfds.resampler_list); 436 437 mutex_init(&kvm->irqfds.resampler_lock); 438 + #endif 437 439 INIT_LIST_HEAD(&kvm->ioeventfds); 438 440 } 439 441 442 + #ifdef __KVM_HAVE_IOAPIC 440 443 /* 441 444 * shutdown any irqfd's that match fd+gsi 442 445 */ ··· 560 555 561 556 module_init(irqfd_module_init); 562 557 module_exit(irqfd_module_exit); 558 + #endif 563 559 564 560 /* 565 561 * --------------------------------------------------------------------

+3 -7

virt/kvm/iommu.c

··· 52 52 end_gfn = gfn + (size >> PAGE_SHIFT); 53 53 gfn += 1; 54 54 55 - if (is_error_pfn(pfn)) 55 + if (is_error_noslot_pfn(pfn)) 56 56 return pfn; 57 57 58 58 while (gfn < end_gfn) ··· 106 106 * important because we unmap and unpin in 4kb steps later. 107 107 */ 108 108 pfn = kvm_pin_pages(slot, gfn, page_size); 109 - if (is_error_pfn(pfn)) { 109 + if (is_error_noslot_pfn(pfn)) { 110 110 gfn += 1; 111 111 continue; 112 112 } ··· 168 168 169 169 r = iommu_attach_device(domain, &pdev->dev); 170 170 if (r) { 171 - printk(KERN_ERR "assign device %x:%x:%x.%x failed", 172 - pci_domain_nr(pdev->bus), 173 - pdev->bus->number, 174 - PCI_SLOT(pdev->devfn), 175 - PCI_FUNC(pdev->devfn)); 171 + dev_err(&pdev->dev, "kvm assign device failed ret %d", r); 176 172 return r; 177 173 } 178 174

+71 -12

virt/kvm/irq_comm.c

··· 102 102 return r; 103 103 } 104 104 105 + static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, 106 + struct kvm_lapic_irq *irq) 107 + { 108 + trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); 109 + 110 + irq->dest_id = (e->msi.address_lo & 111 + MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; 112 + irq->vector = (e->msi.data & 113 + MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT; 114 + irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; 115 + irq->trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data; 116 + irq->delivery_mode = e->msi.data & 0x700; 117 + irq->level = 1; 118 + irq->shorthand = 0; 119 + /* TODO Deal with RH bit of MSI message address */ 120 + } 121 + 105 122 int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, 106 123 struct kvm *kvm, int irq_source_id, int level) 107 124 { ··· 127 110 if (!level) 128 111 return -1; 129 112 130 - trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data); 113 + kvm_set_msi_irq(e, &irq); 131 114 132 - irq.dest_id = (e->msi.address_lo & 133 - MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT; 134 - irq.vector = (e->msi.data & 135 - MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT; 136 - irq.dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo; 137 - irq.trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data; 138 - irq.delivery_mode = e->msi.data & 0x700; 139 - irq.level = 1; 140 - irq.shorthand = 0; 141 - 142 - /* TODO Deal with RH bit of MSI message address */ 143 115 return kvm_irq_delivery_to_apic(kvm, NULL, &irq); 116 + } 117 + 118 + 119 + static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e, 120 + struct kvm *kvm) 121 + { 122 + struct kvm_lapic_irq irq; 123 + int r; 124 + 125 + kvm_set_msi_irq(e, &irq); 126 + 127 + if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r)) 128 + return r; 129 + else 130 + return -EWOULDBLOCK; 144 131 } 145 132 146 133 int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi) ··· 196 175 ret = r + 
((ret < 0) ? 0 : ret); 197 176 } 198 177 178 + return ret; 179 + } 180 + 181 + /* 182 + * Deliver an IRQ in an atomic context if we can, or return a failure, 183 + * user can retry in a process context. 184 + * Return value: 185 + * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context. 186 + * Other values - No need to retry. 187 + */ 188 + int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level) 189 + { 190 + struct kvm_kernel_irq_routing_entry *e; 191 + int ret = -EINVAL; 192 + struct kvm_irq_routing_table *irq_rt; 193 + struct hlist_node *n; 194 + 195 + trace_kvm_set_irq(irq, level, irq_source_id); 196 + 197 + /* 198 + * Injection into either PIC or IOAPIC might need to scan all CPUs, 199 + * which would need to be retried from thread context; when same GSI 200 + * is connected to both PIC and IOAPIC, we'd have to report a 201 + * partial failure here. 202 + * Since there's no easy way to do this, we only support injecting MSI 203 + * which is limited to 1:1 GSI mapping. 204 + */ 205 + rcu_read_lock(); 206 + irq_rt = rcu_dereference(kvm->irq_routing); 207 + if (irq < irq_rt->nr_rt_entries) 208 + hlist_for_each_entry(e, n, &irq_rt->map[irq], link) { 209 + if (likely(e->type == KVM_IRQ_ROUTING_MSI)) 210 + ret = kvm_set_msi_inatomic(e, kvm); 211 + else 212 + ret = -EWOULDBLOCK; 213 + break; 214 + } 215 + rcu_read_unlock(); 199 216 return ret; 200 217 } 201 218

+16 -41

virt/kvm/kvm_main.c

··· 212 212 make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD); 213 213 } 214 214 215 + void kvm_make_mclock_inprogress_request(struct kvm *kvm) 216 + { 217 + make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); 218 + } 219 + 215 220 int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) 216 221 { 217 222 struct page *page; ··· 714 709 int r; 715 710 gfn_t base_gfn; 716 711 unsigned long npages; 717 - unsigned long i; 718 - struct kvm_memory_slot *memslot; 712 + struct kvm_memory_slot *memslot, *slot; 719 713 struct kvm_memory_slot old, new; 720 714 struct kvm_memslots *slots, *old_memslots; 721 715 ··· 765 761 766 762 /* Check for overlaps */ 767 763 r = -EEXIST; 768 - for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 769 - struct kvm_memory_slot *s = &kvm->memslots->memslots[i]; 770 - 771 - if (s == memslot || !s->npages) 764 + kvm_for_each_memslot(slot, kvm->memslots) { 765 + if (slot->id >= KVM_MEMORY_SLOTS || slot == memslot) 772 766 continue; 773 - if (!((base_gfn + npages <= s->base_gfn) || 774 - (base_gfn >= s->base_gfn + s->npages))) 767 + if (!((base_gfn + npages <= slot->base_gfn) || 768 + (base_gfn >= slot->base_gfn + slot->npages))) 775 769 goto out_free; 776 770 } 777 771 ··· 1210 1208 return KVM_PFN_ERR_RO_FAULT; 1211 1209 1212 1210 if (kvm_is_error_hva(addr)) 1213 - return KVM_PFN_ERR_BAD; 1211 + return KVM_PFN_NOSLOT; 1214 1212 1215 1213 /* Do not map writable pfn in the readonly memslot. 
*/ 1216 1214 if (writable && memslot_is_readonly(slot)) { ··· 1292 1290 1293 1291 static struct page *kvm_pfn_to_page(pfn_t pfn) 1294 1292 { 1295 - if (is_error_pfn(pfn)) 1293 + if (is_error_noslot_pfn(pfn)) 1296 1294 return KVM_ERR_PTR_BAD_PAGE; 1297 1295 1298 1296 if (kvm_is_mmio_pfn(pfn)) { ··· 1324 1322 1325 1323 void kvm_release_pfn_clean(pfn_t pfn) 1326 1324 { 1327 - if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn)) 1325 + if (!is_error_noslot_pfn(pfn) && !kvm_is_mmio_pfn(pfn)) 1328 1326 put_page(pfn_to_page(pfn)); 1329 1327 } 1330 1328 EXPORT_SYMBOL_GPL(kvm_release_pfn_clean); ··· 1850 1848 atomic_inc(&kvm->online_vcpus); 1851 1849 1852 1850 mutex_unlock(&kvm->lock); 1851 + kvm_arch_vcpu_postcreate(vcpu); 1853 1852 return r; 1854 1853 1855 1854 unlock_vcpu_destroy: ··· 1932 1929 goto out; 1933 1930 } 1934 1931 r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); 1935 - if (r) 1936 - goto out_free2; 1937 - r = 0; 1938 - out_free2: 1939 1932 kfree(kvm_regs); 1940 1933 break; 1941 1934 } ··· 1953 1954 kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs)); 1954 1955 if (IS_ERR(kvm_sregs)) { 1955 1956 r = PTR_ERR(kvm_sregs); 1957 + kvm_sregs = NULL; 1956 1958 goto out; 1957 1959 } 1958 1960 r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); 1959 - if (r) 1960 - goto out; 1961 - r = 0; 1962 1961 break; 1963 1962 } 1964 1963 case KVM_GET_MP_STATE: { ··· 1978 1981 if (copy_from_user(&mp_state, argp, sizeof mp_state)) 1979 1982 goto out; 1980 1983 r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); 1981 - if (r) 1982 - goto out; 1983 - r = 0; 1984 1984 break; 1985 1985 } 1986 1986 case KVM_TRANSLATE: { ··· 2002 2008 if (copy_from_user(&dbg, argp, sizeof dbg)) 2003 2009 goto out; 2004 2010 r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); 2005 - if (r) 2006 - goto out; 2007 - r = 0; 2008 2011 break; 2009 2012 } 2010 2013 case KVM_SET_SIGNAL_MASK: { ··· 2045 2054 fpu = memdup_user(argp, sizeof(*fpu)); 2046 2055 if (IS_ERR(fpu)) { 2047 2056 r = PTR_ERR(fpu); 2057 + fpu = 
NULL; 2048 2058 goto out; 2049 2059 } 2050 2060 r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); 2051 - if (r) 2052 - goto out; 2053 - r = 0; 2054 2061 break; 2055 2062 } 2056 2063 default: ··· 2118 2129 switch (ioctl) { 2119 2130 case KVM_CREATE_VCPU: 2120 2131 r = kvm_vm_ioctl_create_vcpu(kvm, arg); 2121 - if (r < 0) 2122 - goto out; 2123 2132 break; 2124 2133 case KVM_SET_USER_MEMORY_REGION: { 2125 2134 struct kvm_userspace_memory_region kvm_userspace_mem; ··· 2128 2141 goto out; 2129 2142 2130 2143 r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 1); 2131 - if (r) 2132 - goto out; 2133 2144 break; 2134 2145 } 2135 2146 case KVM_GET_DIRTY_LOG: { ··· 2137 2152 if (copy_from_user(&log, argp, sizeof log)) 2138 2153 goto out; 2139 2154 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 2140 - if (r) 2141 - goto out; 2142 2155 break; 2143 2156 } 2144 2157 #ifdef KVM_COALESCED_MMIO_PAGE_OFFSET ··· 2146 2163 if (copy_from_user(&zone, argp, sizeof zone)) 2147 2164 goto out; 2148 2165 r = kvm_vm_ioctl_register_coalesced_mmio(kvm, &zone); 2149 - if (r) 2150 - goto out; 2151 - r = 0; 2152 2166 break; 2153 2167 } 2154 2168 case KVM_UNREGISTER_COALESCED_MMIO: { ··· 2154 2174 if (copy_from_user(&zone, argp, sizeof zone)) 2155 2175 goto out; 2156 2176 r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, &zone); 2157 - if (r) 2158 - goto out; 2159 - r = 0; 2160 2177 break; 2161 2178 } 2162 2179 #endif ··· 2262 2285 log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap); 2263 2286 2264 2287 r = kvm_vm_ioctl_get_dirty_log(kvm, &log); 2265 - if (r) 2266 - goto out; 2267 2288 break; 2268 2289 } 2269 2290 default: