Merge branch 'kvm-updates/2.6.36' of git://git.kernel.org/pub/scm/virt/kvm/kvm

* 'kvm-updates/2.6.36' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (198 commits)
KVM: VMX: Fix host GDT.LIMIT corruption
KVM: MMU: use __xchg_spte more smartly
KVM: MMU: cleanup spte set and accessed/dirty tracking
KVM: MMU: don't atomically set spte if it's not present
KVM: MMU: fix page dirty tracking lost while sync page
KVM: MMU: fix broken page accessed tracking with ept enabled
KVM: MMU: add missing reserved bits check in speculative path
KVM: MMU: fix mmu notifier invalidate handler for huge spte
KVM: x86 emulator: fix xchg instruction emulation
KVM: x86: Call mask notifiers from pic
KVM: x86: never re-execute instruction with enabled tdp
KVM: Document KVM_GET_SUPPORTED_CPUID2 ioctl
KVM: x86: emulator: inc/dec can have lock prefix
KVM: MMU: Eliminate redundant temporaries in FNAME(fetch)
KVM: MMU: Validate all gptes during fetch, not just those used for new pages
KVM: MMU: Simplify spte fetch() function
KVM: MMU: Add gpte_valid() helper
KVM: MMU: Add validate_direct_spte() helper
KVM: MMU: Add drop_large_spte() helper
KVM: MMU: Use __set_spte to link shadow pages
...

+3336 -2111
-21
Documentation/feature-removal-schedule.txt
··· 487 487 488 488 ---------------------------- 489 489 490 - What: KVM memory aliases support 491 - When: July 2010 492 - Why: Memory aliasing support is used for speeding up guest vga access 493 - through the vga windows. 494 - 495 - Modern userspace no longer uses this feature, so it's just bitrotted 496 - code and can be removed with no impact. 497 - Who: Avi Kivity <avi@redhat.com> 498 - 499 - ---------------------------- 500 - 501 490 What: xtime, wall_to_monotonic 502 491 When: 2.6.36+ 503 492 Files: kernel/time/timekeeping.c include/linux/time.h ··· 494 505 existing timekeeping accessor functions to access 495 506 the equivalent functionality. 496 507 Who: John Stultz <johnstul@us.ibm.com> 497 - 498 - ---------------------------- 499 - 500 - What: KVM kernel-allocated memory slots 501 - When: July 2010 502 - Why: Since 2.6.25, kvm supports user-allocated memory slots, which are 503 - much more flexible than kernel-allocated slots. All current userspace 504 - supports the newer interface and this code can be removed with no 505 - impact. 506 - Who: Avi Kivity <avi@redhat.com> 507 508 508 509 ---------------------------- 509 510
+174 -34
Documentation/kvm/api.txt
··· 126 126 kvm adjusts nmsrs to reflect the actual number of msrs and fills in 127 127 the indices array with their numbers. 128 128 129 + Note: if kvm indicates supports MCE (KVM_CAP_MCE), then the MCE bank MSRs are 130 + not returned in the MSR list, as different vcpus can have a different number 131 + of banks, as set via the KVM_X86_SETUP_MCE ioctl. 132 + 129 133 4.4 KVM_CHECK_EXTENSION 130 134 131 135 Capability: basic ··· 164 160 Parameters: struct kvm_memory_region (in) 165 161 Returns: 0 on success, -1 on error 166 162 167 - struct kvm_memory_region { 168 - __u32 slot; 169 - __u32 flags; 170 - __u64 guest_phys_addr; 171 - __u64 memory_size; /* bytes */ 172 - }; 173 - 174 - /* for kvm_memory_region::flags */ 175 - #define KVM_MEM_LOG_DIRTY_PAGES 1UL 176 - 177 - This ioctl allows the user to create or modify a guest physical memory 178 - slot. When changing an existing slot, it may be moved in the guest 179 - physical memory space, or its flags may be modified. It may not be 180 - resized. Slots may not overlap. 181 - 182 - The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which 183 - instructs kvm to keep track of writes to memory within the slot. See 184 - the KVM_GET_DIRTY_LOG ioctl. 185 - 186 - It is recommended to use the KVM_SET_USER_MEMORY_REGION ioctl instead 187 - of this API, if available. This newer API allows placing guest memory 188 - at specified locations in the host address space, yielding better 189 - control and easy access. 163 + This ioctl is obsolete and has been removed. 190 164 191 165 4.6 KVM_CREATE_VCPU 192 166 ··· 208 226 Parameters: struct kvm_memory_alias (in) 209 227 Returns: 0 (success), -1 (error) 210 228 211 - struct kvm_memory_alias { 212 - __u32 slot; /* this has a different namespace than memory slots */ 213 - __u32 flags; 214 - __u64 guest_phys_addr; 215 - __u64 memory_size; 216 - __u64 target_phys_addr; 217 - }; 218 - 219 - Defines a guest physical address space region as an alias to another 220 - region. Useful for aliased address, for example the VGA low memory 221 - window. Should not be used with userspace memory. 229 + This ioctl is obsolete and has been removed. 222 230 223 231 4.9 KVM_RUN 224 232 ··· 863 891 864 892 This ioctl is only useful after KVM_CREATE_IRQCHIP. Without an in-kernel 865 893 irqchip, the multiprocessing state must be maintained by userspace. 894 + 895 + 4.39 KVM_SET_IDENTITY_MAP_ADDR 896 + 897 + Capability: KVM_CAP_SET_IDENTITY_MAP_ADDR 898 + Architectures: x86 899 + Type: vm ioctl 900 + Parameters: unsigned long identity (in) 901 + Returns: 0 on success, -1 on error 902 + 903 + This ioctl defines the physical address of a one-page region in the guest 904 + physical address space. The region must be within the first 4GB of the 905 + guest physical address space and must not conflict with any memory slot 906 + or any mmio address. The guest may malfunction if it accesses this memory 907 + region. 908 + 909 + This ioctl is required on Intel-based hosts. This is needed on Intel hardware 910 + because of a quirk in the virtualization implementation (see the internals 911 + documentation when it pops into existence). 912 + 913 + 4.40 KVM_SET_BOOT_CPU_ID 914 + 915 + Capability: KVM_CAP_SET_BOOT_CPU_ID 916 + Architectures: x86, ia64 917 + Type: vm ioctl 918 + Parameters: unsigned long vcpu_id 919 + Returns: 0 on success, -1 on error 920 + 921 + Define which vcpu is the Bootstrap Processor (BSP). Values are the same 922 + as the vcpu id in KVM_CREATE_VCPU. 
If this ioctl is not called, the default 923 + is vcpu 0. 924 + 925 + 4.41 KVM_GET_XSAVE 926 + 927 + Capability: KVM_CAP_XSAVE 928 + Architectures: x86 929 + Type: vcpu ioctl 930 + Parameters: struct kvm_xsave (out) 931 + Returns: 0 on success, -1 on error 932 + 933 + struct kvm_xsave { 934 + __u32 region[1024]; 935 + }; 936 + 937 + This ioctl would copy current vcpu's xsave struct to the userspace. 938 + 939 + 4.42 KVM_SET_XSAVE 940 + 941 + Capability: KVM_CAP_XSAVE 942 + Architectures: x86 943 + Type: vcpu ioctl 944 + Parameters: struct kvm_xsave (in) 945 + Returns: 0 on success, -1 on error 946 + 947 + struct kvm_xsave { 948 + __u32 region[1024]; 949 + }; 950 + 951 + This ioctl would copy userspace's xsave struct to the kernel. 952 + 953 + 4.43 KVM_GET_XCRS 954 + 955 + Capability: KVM_CAP_XCRS 956 + Architectures: x86 957 + Type: vcpu ioctl 958 + Parameters: struct kvm_xcrs (out) 959 + Returns: 0 on success, -1 on error 960 + 961 + struct kvm_xcr { 962 + __u32 xcr; 963 + __u32 reserved; 964 + __u64 value; 965 + }; 966 + 967 + struct kvm_xcrs { 968 + __u32 nr_xcrs; 969 + __u32 flags; 970 + struct kvm_xcr xcrs[KVM_MAX_XCRS]; 971 + __u64 padding[16]; 972 + }; 973 + 974 + This ioctl would copy current vcpu's xcrs to the userspace. 975 + 976 + 4.44 KVM_SET_XCRS 977 + 978 + Capability: KVM_CAP_XCRS 979 + Architectures: x86 980 + Type: vcpu ioctl 981 + Parameters: struct kvm_xcrs (in) 982 + Returns: 0 on success, -1 on error 983 + 984 + struct kvm_xcr { 985 + __u32 xcr; 986 + __u32 reserved; 987 + __u64 value; 988 + }; 989 + 990 + struct kvm_xcrs { 991 + __u32 nr_xcrs; 992 + __u32 flags; 993 + struct kvm_xcr xcrs[KVM_MAX_XCRS]; 994 + __u64 padding[16]; 995 + }; 996 + 997 + This ioctl would set vcpu's xcr to the value userspace specified. 998 + 999 + 4.45 KVM_GET_SUPPORTED_CPUID 1000 + 1001 + Capability: KVM_CAP_EXT_CPUID 1002 + Architectures: x86 1003 + Type: system ioctl 1004 + Parameters: struct kvm_cpuid2 (in/out) 1005 + Returns: 0 on success, -1 on error 1006 + 1007 + struct kvm_cpuid2 { 1008 + __u32 nent; 1009 + __u32 padding; 1010 + struct kvm_cpuid_entry2 entries[0]; 1011 + }; 1012 + 1013 + #define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1 1014 + #define KVM_CPUID_FLAG_STATEFUL_FUNC 2 1015 + #define KVM_CPUID_FLAG_STATE_READ_NEXT 4 1016 + 1017 + struct kvm_cpuid_entry2 { 1018 + __u32 function; 1019 + __u32 index; 1020 + __u32 flags; 1021 + __u32 eax; 1022 + __u32 ebx; 1023 + __u32 ecx; 1024 + __u32 edx; 1025 + __u32 padding[3]; 1026 + }; 1027 + 1028 + This ioctl returns x86 cpuid features which are supported by both the hardware 1029 + and kvm. Userspace can use the information returned by this ioctl to 1030 + construct cpuid information (for KVM_SET_CPUID2) that is consistent with 1031 + hardware, kernel, and userspace capabilities, and with user requirements (for 1032 + example, the user may wish to constrain cpuid to emulate older hardware, 1033 + or for feature consistency across a cluster). 1034 + 1035 + Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure 1036 + with the 'nent' field indicating the number of entries in the variable-size 1037 + array 'entries'. If the number of entries is too low to describe the cpu 1038 + capabilities, an error (E2BIG) is returned. If the number is too high, 1039 + the 'nent' field is adjusted and an error (ENOMEM) is returned. If the 1040 + number is just right, the 'nent' field is adjusted to the number of valid 1041 + entries in the 'entries' array, which is then filled. 
1042 + 1043 + The entries returned are the host cpuid as returned by the cpuid instruction, 1044 + with unknown or unsupported features masked out. The fields in each entry 1045 + are defined as follows: 1046 + 1047 + function: the eax value used to obtain the entry 1048 + index: the ecx value used to obtain the entry (for entries that are 1049 + affected by ecx) 1050 + flags: an OR of zero or more of the following: 1051 + KVM_CPUID_FLAG_SIGNIFCANT_INDEX: 1052 + if the index field is valid 1053 + KVM_CPUID_FLAG_STATEFUL_FUNC: 1054 + if cpuid for this function returns different values for successive 1055 + invocations; there will be several entries with the same function, 1056 + all with this flag set 1057 + KVM_CPUID_FLAG_STATE_READ_NEXT: 1058 + for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is 1059 + the first entry to be read by a cpu 1060 + eax, ebx, ecx, edx: the values returned by the cpuid instruction for 1061 + this function/index combination 866 1062 867 1063 5. The kvm_run structure 868 1064
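The E2BIG retry rule for KVM_GET_SUPPORTED_CPUID described above can be exercised from userspace roughly as follows. This is an illustrative sketch only, not part of the merged patches; it assumes nothing beyond the documented structures, the documented error codes, and an open /dev/kvm file descriptor (KVM_GET_SUPPORTED_CPUID is a system ioctl).

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static struct kvm_cpuid2 *get_supported_cpuid(int kvm_fd)
    {
            int nent = 8;   /* deliberately small; forces the E2BIG path */
            struct kvm_cpuid2 *cpuid;

            for (;;) {
                    cpuid = calloc(1, sizeof(*cpuid) +
                                      nent * sizeof(struct kvm_cpuid_entry2));
                    if (!cpuid)
                            return NULL;
                    cpuid->nent = nent;
                    if (ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid) == 0)
                            return cpuid;   /* nent now holds the valid count */
                    if (errno != E2BIG) {
                            free(cpuid);    /* some other failure */
                            return NULL;
                    }
                    free(cpuid);
                    nent *= 2;              /* buffer too small: grow and retry */
            }
    }

    int main(void)
    {
            int kvm_fd = open("/dev/kvm", O_RDWR);
            struct kvm_cpuid2 *cpuid;
            unsigned int i;

            if (kvm_fd < 0)
                    return 1;
            cpuid = get_supported_cpuid(kvm_fd);
            if (!cpuid)
                    return 1;
            for (i = 0; i < cpuid->nent; i++)
                    printf("function 0x%x index 0x%x flags 0x%x\n",
                           cpuid->entries[i].function,
                           cpuid->entries[i].index,
                           cpuid->entries[i].flags);
            free(cpuid);
            return 0;
    }

Starting with a deliberately small 'nent' keeps the sketch short; a real caller would usually start with a larger guess so the first ioctl succeeds.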
+48 -4
Documentation/kvm/mmu.txt
··· 77 77 78 78 Guest memory (gpa) is part of the user address space of the process that is 79 79 using kvm. Userspace defines the translation between guest addresses and user 80 - addresses (gpa->hva); note that two gpas may alias to the same gva, but not 80 + addresses (gpa->hva); note that two gpas may alias to the same hva, but not 81 81 vice versa. 82 82 83 - These gvas may be backed using any method available to the host: anonymous 83 + These hvas may be backed using any method available to the host: anonymous 84 84 memory, file backed memory, and device memory. Memory might be paged by the 85 85 host at any time. 86 86 ··· 161 161 role.cr4_pae: 162 162 Contains the value of cr4.pae for which the page is valid (e.g. whether 163 163 32-bit or 64-bit gptes are in use). 164 - role.cr4_nxe: 164 + role.nxe: 165 165 Contains the value of efer.nxe for which the page is valid. 166 166 role.cr0_wp: 167 167 Contains the value of cr0.wp for which the page is valid. ··· 180 180 guest pages as leaves. 181 181 gfns: 182 182 An array of 512 guest frame numbers, one for each present pte. Used to 183 - perform a reverse map from a pte to a gfn. 183 + perform a reverse map from a pte to a gfn. When role.direct is set, any 184 + element of this array can be calculated from the gfn field when used, in 185 + this case, the array of gfns is not allocated. See role.direct and gfn. 184 186 slot_bitmap: 185 187 A bitmap containing one bit per memory slot. If the page contains a pte 186 188 mapping a page from memory slot n, then bit n of slot_bitmap will be set ··· 297 295 - mmu notifier called with updated hva 298 296 - look up affected sptes through reverse map 299 297 - drop (or update) translations 298 + 299 + Emulating cr0.wp 300 + ================ 301 + 302 + If tdp is not enabled, the host must keep cr0.wp=1 so page write protection 303 + works for the guest kernel, not guest guest userspace. When the guest 304 + cr0.wp=1, this does not present a problem. However when the guest cr0.wp=0, 305 + we cannot map the permissions for gpte.u=1, gpte.w=0 to any spte (the 306 + semantics require allowing any guest kernel access plus user read access). 307 + 308 + We handle this by mapping the permissions to two possible sptes, depending 309 + on fault type: 310 + 311 + - kernel write fault: spte.u=0, spte.w=1 (allows full kernel access, 312 + disallows user access) 313 + - read fault: spte.u=1, spte.w=0 (allows full read access, disallows kernel 314 + write access) 315 + 316 + (user write faults generate a #PF) 317 + 318 + Large pages 319 + =========== 320 + 321 + The mmu supports all combinations of large and small guest and host pages. 322 + Supported page sizes include 4k, 2M, 4M, and 1G. 4M pages are treated as 323 + two separate 2M pages, on both guest and host, since the mmu always uses PAE 324 + paging. 325 + 326 + To instantiate a large spte, four constraints must be satisfied: 327 + 328 + - the spte must point to a large host page 329 + - the guest pte must be a large pte of at least equivalent size (if tdp is 330 + enabled, there is no guest pte and this condition is satisified) 331 + - if the spte will be writeable, the large page frame may not overlap any 332 + write-protected pages 333 + - the guest page must be wholly contained by a single memory slot 334 + 335 + To check the last two conditions, the mmu maintains a ->write_count set of 336 + arrays for each memory slot and large page size. 
Every write protected page 337 + causes its write_count to be incremented, thus preventing instantiation of 338 + a large spte. The frames at the end of an unaligned memory slot have 339 + artificially inflated ->write_counts so they can never be instantiated. 300 340 301 341 Further reading 302 342 ===============
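The cr0.wp=0 handling described above reduces to a small decision on fault type. The sketch below is purely illustrative (the struct and function names are invented for the example, not taken from the kvm sources) and only encodes the two spte shapes and the user-write-fault case given in the text.

    /*
     * Illustrative sketch only -- not kvm code. It encodes the cr0.wp=0
     * mapping described above for a guest pte with gpte.u=1, gpte.w=0.
     */
    struct example_spte {
            unsigned int u : 1;     /* user-accessible */
            unsigned int w : 1;     /* writable */
    };

    static struct example_spte map_wp0_gpte(int write_fault, int user_fault)
    {
            struct example_spte spte = { 0, 0 };

            if (write_fault && !user_fault) {
                    /* kernel write fault: full kernel access, no user access */
                    spte.u = 0;
                    spte.w = 1;
            } else if (!write_fault) {
                    /* read fault: read access for all, kernel writes trapped */
                    spte.u = 1;
                    spte.w = 0;
            }
            /* remaining case (user write fault): inject a #PF instead */
            return spte;
    }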
+153
Documentation/kvm/msr.txt
··· 1 + KVM-specific MSRs. 2 + Glauber Costa <glommer@redhat.com>, Red Hat Inc, 2010 3 + ===================================================== 4 + 5 + KVM makes use of some custom MSRs to service some requests. 6 + At present, this facility is only used by kvmclock. 7 + 8 + Custom MSRs have a range reserved for them, that goes from 9 + 0x4b564d00 to 0x4b564dff. There are MSRs outside this area, 10 + but they are deprecated and their use is discouraged. 11 + 12 + Custom MSR list 13 + -------- 14 + 15 + The current supported Custom MSR list is: 16 + 17 + MSR_KVM_WALL_CLOCK_NEW: 0x4b564d00 18 + 19 + data: 4-byte alignment physical address of a memory area which must be 20 + in guest RAM. This memory is expected to hold a copy of the following 21 + structure: 22 + 23 + struct pvclock_wall_clock { 24 + u32 version; 25 + u32 sec; 26 + u32 nsec; 27 + } __attribute__((__packed__)); 28 + 29 + whose data will be filled in by the hypervisor. The hypervisor is only 30 + guaranteed to update this data at the moment of MSR write. 31 + Users that want to reliably query this information more than once have 32 + to write more than once to this MSR. Fields have the following meanings: 33 + 34 + version: guest has to check version before and after grabbing 35 + time information and check that they are both equal and even. 36 + An odd version indicates an in-progress update. 37 + 38 + sec: number of seconds for wallclock. 39 + 40 + nsec: number of nanoseconds for wallclock. 41 + 42 + Note that although MSRs are per-CPU entities, the effect of this 43 + particular MSR is global. 44 + 45 + Availability of this MSR must be checked via bit 3 in 0x4000001 cpuid 46 + leaf prior to usage. 47 + 48 + MSR_KVM_SYSTEM_TIME_NEW: 0x4b564d01 49 + 50 + data: 4-byte aligned physical address of a memory area which must be in 51 + guest RAM, plus an enable bit in bit 0. This memory is expected to hold 52 + a copy of the following structure: 53 + 54 + struct pvclock_vcpu_time_info { 55 + u32 version; 56 + u32 pad0; 57 + u64 tsc_timestamp; 58 + u64 system_time; 59 + u32 tsc_to_system_mul; 60 + s8 tsc_shift; 61 + u8 flags; 62 + u8 pad[2]; 63 + } __attribute__((__packed__)); /* 32 bytes */ 64 + 65 + whose data will be filled in by the hypervisor periodically. Only one 66 + write, or registration, is needed for each VCPU. The interval between 67 + updates of this structure is arbitrary and implementation-dependent. 68 + The hypervisor may update this structure at any time it sees fit until 69 + anything with bit0 == 0 is written to it. 70 + 71 + Fields have the following meanings: 72 + 73 + version: guest has to check version before and after grabbing 74 + time information and check that they are both equal and even. 75 + An odd version indicates an in-progress update. 76 + 77 + tsc_timestamp: the tsc value at the current VCPU at the time 78 + of the update of this structure. Guests can subtract this value 79 + from current tsc to derive a notion of elapsed time since the 80 + structure update. 81 + 82 + system_time: a host notion of monotonic time, including sleep 83 + time at the time this structure was last updated. Unit is 84 + nanoseconds. 85 + 86 + tsc_to_system_mul: a function of the tsc frequency. One has 87 + to multiply any tsc-related quantity by this value to get 88 + a value in nanoseconds, besides dividing by 2^tsc_shift 89 + 90 + tsc_shift: cycle to nanosecond divider, as a power of two, to 91 + allow for shift rights. 
One has to shift right any tsc-related 92 + quantity by this value to get a value in nanoseconds, besides 93 + multiplying by tsc_to_system_mul. 94 + 95 + With this information, guests can derive per-CPU time by 96 + doing: 97 + 98 + time = (current_tsc - tsc_timestamp) 99 + time = (time * tsc_to_system_mul) >> tsc_shift 100 + time = time + system_time 101 + 102 + flags: bits in this field indicate extended capabilities 103 + coordinated between the guest and the hypervisor. Availability 104 + of specific flags has to be checked in 0x40000001 cpuid leaf. 105 + Current flags are: 106 + 107 + flag bit | cpuid bit | meaning 108 + ------------------------------------------------------------- 109 + | | time measures taken across 110 + 0 | 24 | multiple cpus are guaranteed to 111 + | | be monotonic 112 + ------------------------------------------------------------- 113 + 114 + Availability of this MSR must be checked via bit 3 in 0x40000001 cpuid 115 + leaf prior to usage. 116 + 117 + 118 + MSR_KVM_WALL_CLOCK: 0x11 119 + 120 + data and functioning: same as MSR_KVM_WALL_CLOCK_NEW. Use that instead. 121 + 122 + This MSR falls outside the reserved KVM range and may be removed in the 123 + future. Its usage is deprecated. 124 + 125 + Availability of this MSR must be checked via bit 0 in 0x40000001 cpuid 126 + leaf prior to usage. 127 + 128 + MSR_KVM_SYSTEM_TIME: 0x12 129 + 130 + data and functioning: same as MSR_KVM_SYSTEM_TIME_NEW. Use that instead. 131 + 132 + This MSR falls outside the reserved KVM range and may be removed in the 133 + future. Its usage is deprecated. 134 + 135 + Availability of this MSR must be checked via bit 0 in 0x40000001 cpuid 136 + leaf prior to usage. 137 + 138 + The suggested algorithm for detecting kvmclock presence is then: 139 + 140 + if (!kvm_para_available()) /* refer to cpuid.txt */ 141 + return NON_PRESENT; 142 + 143 + flags = cpuid_eax(0x40000001); 144 + if (flags & (1 << 3)) { 145 + msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; 146 + msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; 147 + return PRESENT; 148 + } else if (flags & (1 << 0)) { 149 + msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; 150 + msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; 151 + return PRESENT; 152 + } else 153 + return NON_PRESENT;
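Putting the version rule and the formula above together, a guest could read the per-vcpu clock roughly as follows. This is a sketch under the assumptions stated in the text only (the struct mirrors the documented layout using stdint types); a production guest would use the kernel's pvclock helpers, which additionally cope with negative tsc_shift values and with overflow of the 64-bit multiplication.

    #include <stdint.h>

    struct pvclock_vcpu_time_info {
            uint32_t version;
            uint32_t pad0;
            uint64_t tsc_timestamp;
            uint64_t system_time;
            uint32_t tsc_to_system_mul;
            int8_t   tsc_shift;
            uint8_t  flags;
            uint8_t  pad[2];
    } __attribute__((__packed__));

    static uint64_t rdtsc_counter(void)
    {
            uint32_t lo, hi;

            asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
            return ((uint64_t)hi << 32) | lo;
    }

    static uint64_t read_kvmclock_ns(volatile struct pvclock_vcpu_time_info *ti)
    {
            uint32_t version;
            uint64_t time;

            do {
                    version = ti->version;
                    asm volatile("" ::: "memory");  /* keep reads in order */
                    time = rdtsc_counter() - ti->tsc_timestamp;
                    /* the formula from the text; assumes tsc_shift >= 0 */
                    time = (time * ti->tsc_to_system_mul) >> ti->tsc_shift;
                    time += ti->system_time;
                    asm volatile("" ::: "memory");
            } while ((version & 1) || version != ti->version);

            return time;
    }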
+38
Documentation/kvm/review-checklist.txt
··· 1 + Review checklist for kvm patches 2 + ================================ 3 + 4 + 1. The patch must follow Documentation/CodingStyle and 5 + Documentation/SubmittingPatches. 6 + 7 + 2. Patches should be against kvm.git master branch. 8 + 9 + 3. If the patch introduces or modifies a new userspace API: 10 + - the API must be documented in Documentation/kvm/api.txt 11 + - the API must be discoverable using KVM_CHECK_EXTENSION 12 + 13 + 4. New state must include support for save/restore. 14 + 15 + 5. New features must default to off (userspace should explicitly request them). 16 + Performance improvements can and should default to on. 17 + 18 + 6. New cpu features should be exposed via KVM_GET_SUPPORTED_CPUID2 19 + 20 + 7. Emulator changes should be accompanied by unit tests for qemu-kvm.git 21 + kvm/test directory. 22 + 23 + 8. Changes should be vendor neutral when possible. Changes to common code 24 + are better than duplicating changes to vendor code. 25 + 26 + 9. Similarly, prefer changes to arch independent code than to arch dependent 27 + code. 28 + 29 + 10. User/kernel interfaces and guest/host interfaces must be 64-bit clean 30 + (all variables and sizes naturally aligned on 64-bit; use specific types 31 + only - u64 rather than ulong). 32 + 33 + 11. New guest visible features must either be documented in a hardware manual 34 + or be accompanied by documentation. 35 + 36 + 12. Features must be robust against reset and kexec - for example, shared 37 + host/guest memory must be unshared to prevent the host from writing to 38 + guest memory that the guest has not reserved for this purpose.
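As a small illustration of item 10, a 64-bit clean ioctl argument uses fixed-width types, explicit padding and natural alignment so that 32-bit and 64-bit userspace see exactly the same layout. The structure below is hypothetical, shown only to make the rule concrete:

    #include <linux/types.h>

    /* hypothetical example, not an existing kvm ioctl argument */
    struct kvm_example_args {
            __u32 flags;            /* 4 bytes */
            __u32 pad;              /* keeps the next field 8-byte aligned */
            __u64 guest_addr;       /* __u64 rather than a pointer or ulong */
            __u64 reserved[4];      /* room for future extensions */
    };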
+1
arch/ia64/include/asm/kvm_host.h
··· 235 235 #define KVM_REQ_PTC_G 32 236 236 #define KVM_REQ_RESUME 33 237 237 238 + #define KVM_HPAGE_GFN_SHIFT(x) 0 238 239 #define KVM_NR_PAGE_SIZES 1 239 240 #define KVM_PAGES_PER_HPAGE(x) 1 240 241
+13 -37
arch/ia64/kvm/kvm-ia64.c
··· 725 725 int r; 726 726 sigset_t sigsaved; 727 727 728 - vcpu_load(vcpu); 729 - 730 728 if (vcpu->sigset_active) 731 729 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 732 730 ··· 746 748 if (vcpu->sigset_active) 747 749 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 748 750 749 - vcpu_put(vcpu); 750 751 return r; 751 752 } 752 753 ··· 880 883 struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd); 881 884 int i; 882 885 883 - vcpu_load(vcpu); 884 - 885 886 for (i = 0; i < 16; i++) { 886 887 vpd->vgr[i] = regs->vpd.vgr[i]; 887 888 vpd->vbgr[i] = regs->vpd.vbgr[i]; ··· 925 930 vcpu->arch.irq_new_pending = 1; 926 931 vcpu->arch.itc_offset = regs->saved_itc - kvm_get_itc(vcpu); 927 932 set_bit(KVM_REQ_RESUME, &vcpu->requests); 928 - 929 - vcpu_put(vcpu); 930 933 931 934 return 0; 932 935 } ··· 1795 1802 kvm_vmm_info = NULL; 1796 1803 } 1797 1804 1798 - static int kvm_ia64_sync_dirty_log(struct kvm *kvm, 1799 - struct kvm_dirty_log *log) 1805 + static void kvm_ia64_sync_dirty_log(struct kvm *kvm, 1806 + struct kvm_memory_slot *memslot) 1800 1807 { 1801 - struct kvm_memory_slot *memslot; 1802 - int r, i; 1808 + int i; 1803 1809 long base; 1804 1810 unsigned long n; 1805 1811 unsigned long *dirty_bitmap = (unsigned long *)(kvm->arch.vm_base + 1806 1812 offsetof(struct kvm_vm_data, kvm_mem_dirty_log)); 1807 1813 1808 - r = -EINVAL; 1809 - if (log->slot >= KVM_MEMORY_SLOTS) 1810 - goto out; 1811 - 1812 - memslot = &kvm->memslots->memslots[log->slot]; 1813 - r = -ENOENT; 1814 - if (!memslot->dirty_bitmap) 1815 - goto out; 1816 - 1817 1814 n = kvm_dirty_bitmap_bytes(memslot); 1818 1815 base = memslot->base_gfn / BITS_PER_LONG; 1819 1816 1817 + spin_lock(&kvm->arch.dirty_log_lock); 1820 1818 for (i = 0; i < n/sizeof(long); ++i) { 1821 1819 memslot->dirty_bitmap[i] = dirty_bitmap[base + i]; 1822 1820 dirty_bitmap[base + i] = 0; 1823 1821 } 1824 - r = 0; 1825 - out: 1826 - return r; 1822 + spin_unlock(&kvm->arch.dirty_log_lock); 1827 1823 } 1828 1824 1829 1825 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, ··· 1824 1842 int is_dirty = 0; 1825 1843 1826 1844 mutex_lock(&kvm->slots_lock); 1827 - spin_lock(&kvm->arch.dirty_log_lock); 1828 1845 1829 - r = kvm_ia64_sync_dirty_log(kvm, log); 1830 - if (r) 1846 + r = -EINVAL; 1847 + if (log->slot >= KVM_MEMORY_SLOTS) 1831 1848 goto out; 1832 1849 1850 + memslot = &kvm->memslots->memslots[log->slot]; 1851 + r = -ENOENT; 1852 + if (!memslot->dirty_bitmap) 1853 + goto out; 1854 + 1855 + kvm_ia64_sync_dirty_log(kvm, memslot); 1833 1856 r = kvm_get_dirty_log(kvm, log, &is_dirty); 1834 1857 if (r) 1835 1858 goto out; ··· 1842 1855 /* If nothing is dirty, don't bother messing with page tables. 
*/ 1843 1856 if (is_dirty) { 1844 1857 kvm_flush_remote_tlbs(kvm); 1845 - memslot = &kvm->memslots->memslots[log->slot]; 1846 1858 n = kvm_dirty_bitmap_bytes(memslot); 1847 1859 memset(memslot->dirty_bitmap, 0, n); 1848 1860 } 1849 1861 r = 0; 1850 1862 out: 1851 1863 mutex_unlock(&kvm->slots_lock); 1852 - spin_unlock(&kvm->arch.dirty_log_lock); 1853 1864 return r; 1854 1865 } 1855 1866 ··· 1938 1953 return vcpu->arch.timer_fired; 1939 1954 } 1940 1955 1941 - gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 1942 - { 1943 - return gfn; 1944 - } 1945 - 1946 1956 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 1947 1957 { 1948 1958 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) || ··· 1947 1967 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 1948 1968 struct kvm_mp_state *mp_state) 1949 1969 { 1950 - vcpu_load(vcpu); 1951 1970 mp_state->mp_state = vcpu->arch.mp_state; 1952 - vcpu_put(vcpu); 1953 1971 return 0; 1954 1972 } 1955 1973 ··· 1978 2000 { 1979 2001 int r = 0; 1980 2002 1981 - vcpu_load(vcpu); 1982 2003 vcpu->arch.mp_state = mp_state->mp_state; 1983 2004 if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED) 1984 2005 r = vcpu_reset(vcpu); 1985 - vcpu_put(vcpu); 1986 2006 return r; 1987 2007 }
+9 -1
arch/powerpc/include/asm/kvm_book3s.h
··· 115 115 extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte); 116 116 extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr); 117 117 extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu); 118 - extern struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data); 118 + 119 + extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte); 120 + extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu); 121 + extern void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu); 122 + extern int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu); 123 + extern void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte); 124 + extern int kvmppc_mmu_hpte_sysinit(void); 125 + extern void kvmppc_mmu_hpte_sysexit(void); 126 + 119 127 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); 120 128 extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); 121 129 extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);
+15 -12
arch/powerpc/include/asm/kvm_fpu.h
··· 22 22 23 23 #include <linux/types.h> 24 24 25 - extern void fps_fres(struct thread_struct *t, u32 *dst, u32 *src1); 26 - extern void fps_frsqrte(struct thread_struct *t, u32 *dst, u32 *src1); 27 - extern void fps_fsqrts(struct thread_struct *t, u32 *dst, u32 *src1); 25 + extern void fps_fres(u64 *fpscr, u32 *dst, u32 *src1); 26 + extern void fps_frsqrte(u64 *fpscr, u32 *dst, u32 *src1); 27 + extern void fps_fsqrts(u64 *fpscr, u32 *dst, u32 *src1); 28 28 29 - extern void fps_fadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2); 30 - extern void fps_fdivs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2); 31 - extern void fps_fmuls(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2); 32 - extern void fps_fsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2); 29 + extern void fps_fadds(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2); 30 + extern void fps_fdivs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2); 31 + extern void fps_fmuls(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2); 32 + extern void fps_fsubs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2); 33 33 34 - extern void fps_fmadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2, 34 + extern void fps_fmadds(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2, 35 35 u32 *src3); 36 - extern void fps_fmsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2, 36 + extern void fps_fmsubs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2, 37 37 u32 *src3); 38 - extern void fps_fnmadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2, 38 + extern void fps_fnmadds(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2, 39 39 u32 *src3); 40 - extern void fps_fnmsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2, 40 + extern void fps_fnmsubs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2, 41 41 u32 *src3); 42 - extern void fps_fsel(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2, 42 + extern void fps_fsel(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2, 43 43 u32 *src3); 44 44 45 45 #define FPD_ONE_IN(name) extern void fpd_ ## name(u64 *fpscr, u32 *cr, \ ··· 81 81 FPD_THREE_IN(fmadd) 82 82 FPD_THREE_IN(fnmsub) 83 83 FPD_THREE_IN(fnmadd) 84 + 85 + extern void kvm_cvt_fd(u32 *from, u64 *to, u64 *fpscr); 86 + extern void kvm_cvt_df(u64 *from, u32 *to, u64 *fpscr); 84 87 85 88 #endif
+15 -3
arch/powerpc/include/asm/kvm_host.h
··· 35 35 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 36 36 37 37 /* We don't currently support large pages. */ 38 + #define KVM_HPAGE_GFN_SHIFT(x) 0 38 39 #define KVM_NR_PAGE_SIZES 1 39 40 #define KVM_PAGES_PER_HPAGE(x) (1UL<<31) 40 41 41 - #define HPTEG_CACHE_NUM 1024 42 + #define HPTEG_CACHE_NUM (1 << 15) 43 + #define HPTEG_HASH_BITS_PTE 13 44 + #define HPTEG_HASH_BITS_VPTE 13 45 + #define HPTEG_HASH_BITS_VPTE_LONG 5 46 + #define HPTEG_HASH_NUM_PTE (1 << HPTEG_HASH_BITS_PTE) 47 + #define HPTEG_HASH_NUM_VPTE (1 << HPTEG_HASH_BITS_VPTE) 48 + #define HPTEG_HASH_NUM_VPTE_LONG (1 << HPTEG_HASH_BITS_VPTE_LONG) 42 49 43 50 struct kvm; 44 51 struct kvm_run; ··· 158 151 }; 159 152 160 153 struct hpte_cache { 154 + struct hlist_node list_pte; 155 + struct hlist_node list_vpte; 156 + struct hlist_node list_vpte_long; 161 157 u64 host_va; 162 158 u64 pfn; 163 159 ulong slot; ··· 292 282 unsigned long pending_exceptions; 293 283 294 284 #ifdef CONFIG_PPC_BOOK3S 295 - struct hpte_cache hpte_cache[HPTEG_CACHE_NUM]; 296 - int hpte_cache_offset; 285 + struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE]; 286 + struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE]; 287 + struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG]; 288 + int hpte_cache_count; 297 289 #endif 298 290 }; 299 291
-4
arch/powerpc/kernel/ppc_ksyms.c
··· 101 101 EXPORT_SYMBOL(start_thread); 102 102 EXPORT_SYMBOL(kernel_thread); 103 103 104 - #ifdef CONFIG_PPC_FPU 105 - EXPORT_SYMBOL_GPL(cvt_df); 106 - EXPORT_SYMBOL_GPL(cvt_fd); 107 - #endif 108 104 EXPORT_SYMBOL(giveup_fpu); 109 105 #ifdef CONFIG_ALTIVEC 110 106 EXPORT_SYMBOL(giveup_altivec);
+2 -1
arch/powerpc/kvm/44x_tlb.c
··· 316 316 gfn = gpaddr >> PAGE_SHIFT; 317 317 new_page = gfn_to_page(vcpu->kvm, gfn); 318 318 if (is_error_page(new_page)) { 319 - printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn); 319 + printk(KERN_ERR "Couldn't get guest page for gfn %llx!\n", 320 + (unsigned long long)gfn); 320 321 kvm_release_page_clean(new_page); 321 322 return; 322 323 }
+2
arch/powerpc/kvm/Makefile
··· 45 45 book3s.o \ 46 46 book3s_emulate.o \ 47 47 book3s_interrupts.o \ 48 + book3s_mmu_hpte.o \ 48 49 book3s_64_mmu_host.o \ 49 50 book3s_64_mmu.o \ 50 51 book3s_32_mmu.o ··· 58 57 book3s.o \ 59 58 book3s_emulate.o \ 60 59 book3s_interrupts.o \ 60 + book3s_mmu_hpte.o \ 61 61 book3s_32_mmu_host.o \ 62 62 book3s_32_mmu.o 63 63 kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs)
+39 -40
arch/powerpc/kvm/book3s.c
··· 1047 1047 { 1048 1048 int i; 1049 1049 1050 - vcpu_load(vcpu); 1051 - 1052 1050 regs->pc = kvmppc_get_pc(vcpu); 1053 1051 regs->cr = kvmppc_get_cr(vcpu); 1054 1052 regs->ctr = kvmppc_get_ctr(vcpu); ··· 1067 1069 for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) 1068 1070 regs->gpr[i] = kvmppc_get_gpr(vcpu, i); 1069 1071 1070 - vcpu_put(vcpu); 1071 - 1072 1072 return 0; 1073 1073 } 1074 1074 1075 1075 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 1076 1076 { 1077 1077 int i; 1078 - 1079 - vcpu_load(vcpu); 1080 1078 1081 1079 kvmppc_set_pc(vcpu, regs->pc); 1082 1080 kvmppc_set_cr(vcpu, regs->cr); ··· 1093 1099 for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) 1094 1100 kvmppc_set_gpr(vcpu, i, regs->gpr[i]); 1095 1101 1096 - vcpu_put(vcpu); 1097 - 1098 1102 return 0; 1099 1103 } 1100 1104 ··· 1101 1109 { 1102 1110 struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); 1103 1111 int i; 1104 - 1105 - vcpu_load(vcpu); 1106 1112 1107 1113 sregs->pvr = vcpu->arch.pvr; 1108 1114 ··· 1121 1131 } 1122 1132 } 1123 1133 1124 - vcpu_put(vcpu); 1125 - 1126 1134 return 0; 1127 1135 } 1128 1136 ··· 1129 1141 { 1130 1142 struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); 1131 1143 int i; 1132 - 1133 - vcpu_load(vcpu); 1134 1144 1135 1145 kvmppc_set_pvr(vcpu, sregs->pvr); 1136 1146 ··· 1156 1170 1157 1171 /* Flush the MMU after messing with the segments */ 1158 1172 kvmppc_mmu_pte_flush(vcpu, 0, 0); 1159 - 1160 - vcpu_put(vcpu); 1161 1173 1162 1174 return 0; 1163 1175 } ··· 1293 1309 int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 1294 1310 { 1295 1311 int ret; 1296 - struct thread_struct ext_bkp; 1312 + double fpr[32][TS_FPRWIDTH]; 1313 + unsigned int fpscr; 1314 + int fpexc_mode; 1297 1315 #ifdef CONFIG_ALTIVEC 1298 - bool save_vec = current->thread.used_vr; 1316 + vector128 vr[32]; 1317 + vector128 vscr; 1318 + unsigned long uninitialized_var(vrsave); 1319 + int used_vr; 1299 1320 #endif 1300 1321 #ifdef CONFIG_VSX 1301 - bool save_vsx = current->thread.used_vsr; 1322 + int used_vsr; 1302 1323 #endif 1303 1324 ulong ext_msr; 1304 1325 ··· 1316 1327 /* Save FPU state in stack */ 1317 1328 if (current->thread.regs->msr & MSR_FP) 1318 1329 giveup_fpu(current); 1319 - memcpy(ext_bkp.fpr, current->thread.fpr, sizeof(current->thread.fpr)); 1320 - ext_bkp.fpscr = current->thread.fpscr; 1321 - ext_bkp.fpexc_mode = current->thread.fpexc_mode; 1330 + memcpy(fpr, current->thread.fpr, sizeof(current->thread.fpr)); 1331 + fpscr = current->thread.fpscr.val; 1332 + fpexc_mode = current->thread.fpexc_mode; 1322 1333 1323 1334 #ifdef CONFIG_ALTIVEC 1324 1335 /* Save Altivec state in stack */ 1325 - if (save_vec) { 1336 + used_vr = current->thread.used_vr; 1337 + if (used_vr) { 1326 1338 if (current->thread.regs->msr & MSR_VEC) 1327 1339 giveup_altivec(current); 1328 - memcpy(ext_bkp.vr, current->thread.vr, sizeof(ext_bkp.vr)); 1329 - ext_bkp.vscr = current->thread.vscr; 1330 - ext_bkp.vrsave = current->thread.vrsave; 1340 + memcpy(vr, current->thread.vr, sizeof(current->thread.vr)); 1341 + vscr = current->thread.vscr; 1342 + vrsave = current->thread.vrsave; 1331 1343 } 1332 - ext_bkp.used_vr = current->thread.used_vr; 1333 1344 #endif 1334 1345 1335 1346 #ifdef CONFIG_VSX 1336 1347 /* Save VSX state in stack */ 1337 - if (save_vsx && (current->thread.regs->msr & MSR_VSX)) 1348 + used_vsr = current->thread.used_vsr; 1349 + if (used_vsr && (current->thread.regs->msr & MSR_VSX)) 1338 1350 __giveup_vsx(current); 1339 - ext_bkp.used_vsr = current->thread.used_vsr; 1340 1351 #endif 
1341 1352 1342 1353 /* Remember the MSR with disabled extensions */ ··· 1361 1372 kvmppc_giveup_ext(vcpu, MSR_VSX); 1362 1373 1363 1374 /* Restore FPU state from stack */ 1364 - memcpy(current->thread.fpr, ext_bkp.fpr, sizeof(ext_bkp.fpr)); 1365 - current->thread.fpscr = ext_bkp.fpscr; 1366 - current->thread.fpexc_mode = ext_bkp.fpexc_mode; 1375 + memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr)); 1376 + current->thread.fpscr.val = fpscr; 1377 + current->thread.fpexc_mode = fpexc_mode; 1367 1378 1368 1379 #ifdef CONFIG_ALTIVEC 1369 1380 /* Restore Altivec state from stack */ 1370 - if (save_vec && current->thread.used_vr) { 1371 - memcpy(current->thread.vr, ext_bkp.vr, sizeof(ext_bkp.vr)); 1372 - current->thread.vscr = ext_bkp.vscr; 1373 - current->thread.vrsave= ext_bkp.vrsave; 1381 + if (used_vr && current->thread.used_vr) { 1382 + memcpy(current->thread.vr, vr, sizeof(current->thread.vr)); 1383 + current->thread.vscr = vscr; 1384 + current->thread.vrsave = vrsave; 1374 1385 } 1375 - current->thread.used_vr = ext_bkp.used_vr; 1386 + current->thread.used_vr = used_vr; 1376 1387 #endif 1377 1388 1378 1389 #ifdef CONFIG_VSX 1379 - current->thread.used_vsr = ext_bkp.used_vsr; 1390 + current->thread.used_vsr = used_vsr; 1380 1391 #endif 1381 1392 1382 1393 return ret; ··· 1384 1395 1385 1396 static int kvmppc_book3s_init(void) 1386 1397 { 1387 - return kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0, 1388 - THIS_MODULE); 1398 + int r; 1399 + 1400 + r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0, 1401 + THIS_MODULE); 1402 + 1403 + if (r) 1404 + return r; 1405 + 1406 + r = kvmppc_mmu_hpte_sysinit(); 1407 + 1408 + return r; 1389 1409 } 1390 1410 1391 1411 static void kvmppc_book3s_exit(void) 1392 1412 { 1413 + kvmppc_mmu_hpte_sysexit(); 1393 1414 kvm_exit(); 1394 1415 } 1395 1416
+4 -4
arch/powerpc/kvm/book3s_32_mmu.c
··· 354 354 *vsid = VSID_REAL_DR | gvsid; 355 355 break; 356 356 case MSR_DR|MSR_IR: 357 - if (!sr->valid) 358 - return -1; 359 - 360 - *vsid = sr->vsid; 357 + if (sr->valid) 358 + *vsid = sr->vsid; 359 + else 360 + *vsid = VSID_BAT | gvsid; 361 361 break; 362 362 default: 363 363 BUG();
+12 -122
arch/powerpc/kvm/book3s_32_mmu_host.c
··· 19 19 */ 20 20 21 21 #include <linux/kvm_host.h> 22 + #include <linux/hash.h> 22 23 23 24 #include <asm/kvm_ppc.h> 24 25 #include <asm/kvm_book3s.h> ··· 58 57 static ulong htab; 59 58 static u32 htabmask; 60 59 61 - static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) 60 + void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) 62 61 { 63 62 volatile u32 *pteg; 64 63 65 - dprintk_mmu("KVM: Flushing SPTE: 0x%llx (0x%llx) -> 0x%llx\n", 66 - pte->pte.eaddr, pte->pte.vpage, pte->host_va); 67 - 64 + /* Remove from host HTAB */ 68 65 pteg = (u32*)pte->slot; 69 - 70 66 pteg[0] = 0; 67 + 68 + /* And make sure it's gone from the TLB too */ 71 69 asm volatile ("sync"); 72 70 asm volatile ("tlbie %0" : : "r" (pte->pte.eaddr) : "memory"); 73 71 asm volatile ("sync"); 74 72 asm volatile ("tlbsync"); 75 - 76 - pte->host_va = 0; 77 - 78 - if (pte->pte.may_write) 79 - kvm_release_pfn_dirty(pte->pfn); 80 - else 81 - kvm_release_pfn_clean(pte->pfn); 82 - } 83 - 84 - void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask) 85 - { 86 - int i; 87 - 88 - dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%x & 0x%x\n", 89 - vcpu->arch.hpte_cache_offset, guest_ea, ea_mask); 90 - BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM); 91 - 92 - guest_ea &= ea_mask; 93 - for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) { 94 - struct hpte_cache *pte; 95 - 96 - pte = &vcpu->arch.hpte_cache[i]; 97 - if (!pte->host_va) 98 - continue; 99 - 100 - if ((pte->pte.eaddr & ea_mask) == guest_ea) { 101 - invalidate_pte(vcpu, pte); 102 - } 103 - } 104 - 105 - /* Doing a complete flush -> start from scratch */ 106 - if (!ea_mask) 107 - vcpu->arch.hpte_cache_offset = 0; 108 - } 109 - 110 - void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask) 111 - { 112 - int i; 113 - 114 - dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n", 115 - vcpu->arch.hpte_cache_offset, guest_vp, vp_mask); 116 - BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM); 117 - 118 - guest_vp &= vp_mask; 119 - for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) { 120 - struct hpte_cache *pte; 121 - 122 - pte = &vcpu->arch.hpte_cache[i]; 123 - if (!pte->host_va) 124 - continue; 125 - 126 - if ((pte->pte.vpage & vp_mask) == guest_vp) { 127 - invalidate_pte(vcpu, pte); 128 - } 129 - } 130 - } 131 - 132 - void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end) 133 - { 134 - int i; 135 - 136 - dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%llx & 0x%llx\n", 137 - vcpu->arch.hpte_cache_offset, pa_start, pa_end); 138 - BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM); 139 - 140 - for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) { 141 - struct hpte_cache *pte; 142 - 143 - pte = &vcpu->arch.hpte_cache[i]; 144 - if (!pte->host_va) 145 - continue; 146 - 147 - if ((pte->pte.raddr >= pa_start) && 148 - (pte->pte.raddr < pa_end)) { 149 - invalidate_pte(vcpu, pte); 150 - } 151 - } 152 - } 153 - 154 - struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data) 155 - { 156 - int i; 157 - u64 guest_vp; 158 - 159 - guest_vp = vcpu->arch.mmu.ea_to_vp(vcpu, ea, false); 160 - for (i=0; i<vcpu->arch.hpte_cache_offset; i++) { 161 - struct hpte_cache *pte; 162 - 163 - pte = &vcpu->arch.hpte_cache[i]; 164 - if (!pte->host_va) 165 - continue; 166 - 167 - if (pte->pte.vpage == guest_vp) 168 - return &pte->pte; 169 - } 170 - 171 - return NULL; 172 - } 173 - 174 - static int kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu) 175 - { 176 - if 
(vcpu->arch.hpte_cache_offset == HPTEG_CACHE_NUM) 177 - kvmppc_mmu_pte_flush(vcpu, 0, 0); 178 - 179 - return vcpu->arch.hpte_cache_offset++; 180 73 } 181 74 182 75 /* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using 183 76 * a hash, so we don't waste cycles on looping */ 184 77 static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid) 185 78 { 186 - return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^ 187 - ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^ 188 - ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^ 189 - ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^ 190 - ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^ 191 - ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^ 192 - ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^ 193 - ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK)); 79 + return hash_64(gvsid, SID_MAP_BITS); 194 80 } 195 81 196 82 ··· 144 256 register int rr = 0; 145 257 bool primary = false; 146 258 bool evict = false; 147 - int hpte_id; 148 259 struct hpte_cache *pte; 149 260 150 261 /* Get host physical address for gpa */ ··· 228 341 229 342 /* Now tell our Shadow PTE code about the new page */ 230 343 231 - hpte_id = kvmppc_mmu_hpte_cache_next(vcpu); 232 - pte = &vcpu->arch.hpte_cache[hpte_id]; 344 + pte = kvmppc_mmu_hpte_cache_next(vcpu); 233 345 234 346 dprintk_mmu("KVM: %c%c Map 0x%llx: [%lx] 0x%llx (0x%llx) -> %lx\n", 235 347 orig_pte->may_write ? 'w' : '-', ··· 240 354 pte->host_va = va; 241 355 pte->pte = *orig_pte; 242 356 pte->pfn = hpaddr >> PAGE_SHIFT; 357 + 358 + kvmppc_mmu_hpte_cache_map(vcpu, pte); 243 359 244 360 return 0; 245 361 } ··· 327 439 328 440 void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) 329 441 { 330 - kvmppc_mmu_pte_flush(vcpu, 0, 0); 442 + kvmppc_mmu_hpte_destroy(vcpu); 331 443 preempt_disable(); 332 444 __destroy_context(to_book3s(vcpu)->context_id); 333 445 preempt_enable(); ··· 366 478 asm ( "mfsdr1 %0" : "=r"(sdr1) ); 367 479 htabmask = ((sdr1 & 0x1FF) << 16) | 0xFFC0; 368 480 htab = (ulong)__va(sdr1 & 0xffff0000); 481 + 482 + kvmppc_mmu_hpte_init(vcpu); 369 483 370 484 return 0; 371 485 }
+9 -120
arch/powerpc/kvm/book3s_64_mmu_host.c
··· 20 20 */ 21 21 22 22 #include <linux/kvm_host.h> 23 + #include <linux/hash.h> 23 24 24 25 #include <asm/kvm_ppc.h> 25 26 #include <asm/kvm_book3s.h> ··· 47 46 #define dprintk_slb(a, ...) do { } while(0) 48 47 #endif 49 48 50 - static void invalidate_pte(struct hpte_cache *pte) 49 + void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) 51 50 { 52 - dprintk_mmu("KVM: Flushing SPT: 0x%lx (0x%llx) -> 0x%llx\n", 53 - pte->pte.eaddr, pte->pte.vpage, pte->host_va); 54 - 55 51 ppc_md.hpte_invalidate(pte->slot, pte->host_va, 56 52 MMU_PAGE_4K, MMU_SEGSIZE_256M, 57 53 false); 58 - pte->host_va = 0; 59 - 60 - if (pte->pte.may_write) 61 - kvm_release_pfn_dirty(pte->pfn); 62 - else 63 - kvm_release_pfn_clean(pte->pfn); 64 - } 65 - 66 - void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask) 67 - { 68 - int i; 69 - 70 - dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%lx & 0x%lx\n", 71 - vcpu->arch.hpte_cache_offset, guest_ea, ea_mask); 72 - BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM); 73 - 74 - guest_ea &= ea_mask; 75 - for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) { 76 - struct hpte_cache *pte; 77 - 78 - pte = &vcpu->arch.hpte_cache[i]; 79 - if (!pte->host_va) 80 - continue; 81 - 82 - if ((pte->pte.eaddr & ea_mask) == guest_ea) { 83 - invalidate_pte(pte); 84 - } 85 - } 86 - 87 - /* Doing a complete flush -> start from scratch */ 88 - if (!ea_mask) 89 - vcpu->arch.hpte_cache_offset = 0; 90 - } 91 - 92 - void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask) 93 - { 94 - int i; 95 - 96 - dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n", 97 - vcpu->arch.hpte_cache_offset, guest_vp, vp_mask); 98 - BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM); 99 - 100 - guest_vp &= vp_mask; 101 - for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) { 102 - struct hpte_cache *pte; 103 - 104 - pte = &vcpu->arch.hpte_cache[i]; 105 - if (!pte->host_va) 106 - continue; 107 - 108 - if ((pte->pte.vpage & vp_mask) == guest_vp) { 109 - invalidate_pte(pte); 110 - } 111 - } 112 - } 113 - 114 - void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end) 115 - { 116 - int i; 117 - 118 - dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%lx & 0x%lx\n", 119 - vcpu->arch.hpte_cache_offset, pa_start, pa_end); 120 - BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM); 121 - 122 - for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) { 123 - struct hpte_cache *pte; 124 - 125 - pte = &vcpu->arch.hpte_cache[i]; 126 - if (!pte->host_va) 127 - continue; 128 - 129 - if ((pte->pte.raddr >= pa_start) && 130 - (pte->pte.raddr < pa_end)) { 131 - invalidate_pte(pte); 132 - } 133 - } 134 - } 135 - 136 - struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data) 137 - { 138 - int i; 139 - u64 guest_vp; 140 - 141 - guest_vp = vcpu->arch.mmu.ea_to_vp(vcpu, ea, false); 142 - for (i=0; i<vcpu->arch.hpte_cache_offset; i++) { 143 - struct hpte_cache *pte; 144 - 145 - pte = &vcpu->arch.hpte_cache[i]; 146 - if (!pte->host_va) 147 - continue; 148 - 149 - if (pte->pte.vpage == guest_vp) 150 - return &pte->pte; 151 - } 152 - 153 - return NULL; 154 - } 155 - 156 - static int kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu) 157 - { 158 - if (vcpu->arch.hpte_cache_offset == HPTEG_CACHE_NUM) 159 - kvmppc_mmu_pte_flush(vcpu, 0, 0); 160 - 161 - return vcpu->arch.hpte_cache_offset++; 162 54 } 163 55 164 56 /* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using 165 57 * a hash, so we don't waste cycles 
on looping */ 166 58 static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid) 167 59 { 168 - return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^ 169 - ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^ 170 - ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^ 171 - ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^ 172 - ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^ 173 - ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^ 174 - ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^ 175 - ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK)); 60 + return hash_64(gvsid, SID_MAP_BITS); 176 61 } 177 - 178 62 179 63 static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid) 180 64 { ··· 159 273 attempt++; 160 274 goto map_again; 161 275 } else { 162 - int hpte_id = kvmppc_mmu_hpte_cache_next(vcpu); 163 - struct hpte_cache *pte = &vcpu->arch.hpte_cache[hpte_id]; 276 + struct hpte_cache *pte = kvmppc_mmu_hpte_cache_next(vcpu); 164 277 165 278 dprintk_mmu("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx\n", 166 279 ((rflags & HPTE_R_PP) == 3) ? '-' : 'w', ··· 177 292 pte->host_va = va; 178 293 pte->pte = *orig_pte; 179 294 pte->pfn = hpaddr >> PAGE_SHIFT; 295 + 296 + kvmppc_mmu_hpte_cache_map(vcpu, pte); 180 297 } 181 298 182 299 return 0; ··· 305 418 306 419 void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) 307 420 { 308 - kvmppc_mmu_pte_flush(vcpu, 0, 0); 421 + kvmppc_mmu_hpte_destroy(vcpu); 309 422 __destroy_context(to_book3s(vcpu)->context_id); 310 423 } 311 424 ··· 322 435 vcpu3s->vsid_max = ((vcpu3s->context_id + 1) << USER_ESID_BITS) - 1; 323 436 vcpu3s->vsid_first = vcpu3s->context_id << USER_ESID_BITS; 324 437 vcpu3s->vsid_next = vcpu3s->vsid_first; 438 + 439 + kvmppc_mmu_hpte_init(vcpu); 325 440 326 441 return 0; 327 442 }
+277
arch/powerpc/kvm/book3s_mmu_hpte.c
··· 1 + /* 2 + * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved. 3 + * 4 + * Authors: 5 + * Alexander Graf <agraf@suse.de> 6 + * 7 + * This program is free software; you can redistribute it and/or modify 8 + * it under the terms of the GNU General Public License, version 2, as 9 + * published by the Free Software Foundation. 10 + * 11 + * This program is distributed in the hope that it will be useful, 12 + * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 + * GNU General Public License for more details. 15 + * 16 + * You should have received a copy of the GNU General Public License 17 + * along with this program; if not, write to the Free Software 18 + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 19 + */ 20 + 21 + #include <linux/kvm_host.h> 22 + #include <linux/hash.h> 23 + #include <linux/slab.h> 24 + 25 + #include <asm/kvm_ppc.h> 26 + #include <asm/kvm_book3s.h> 27 + #include <asm/machdep.h> 28 + #include <asm/mmu_context.h> 29 + #include <asm/hw_irq.h> 30 + 31 + #define PTE_SIZE 12 32 + 33 + /* #define DEBUG_MMU */ 34 + 35 + #ifdef DEBUG_MMU 36 + #define dprintk_mmu(a, ...) printk(KERN_INFO a, __VA_ARGS__) 37 + #else 38 + #define dprintk_mmu(a, ...) do { } while(0) 39 + #endif 40 + 41 + static struct kmem_cache *hpte_cache; 42 + 43 + static inline u64 kvmppc_mmu_hash_pte(u64 eaddr) 44 + { 45 + return hash_64(eaddr >> PTE_SIZE, HPTEG_HASH_BITS_PTE); 46 + } 47 + 48 + static inline u64 kvmppc_mmu_hash_vpte(u64 vpage) 49 + { 50 + return hash_64(vpage & 0xfffffffffULL, HPTEG_HASH_BITS_VPTE); 51 + } 52 + 53 + static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage) 54 + { 55 + return hash_64((vpage & 0xffffff000ULL) >> 12, 56 + HPTEG_HASH_BITS_VPTE_LONG); 57 + } 58 + 59 + void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte) 60 + { 61 + u64 index; 62 + 63 + /* Add to ePTE list */ 64 + index = kvmppc_mmu_hash_pte(pte->pte.eaddr); 65 + hlist_add_head(&pte->list_pte, &vcpu->arch.hpte_hash_pte[index]); 66 + 67 + /* Add to vPTE list */ 68 + index = kvmppc_mmu_hash_vpte(pte->pte.vpage); 69 + hlist_add_head(&pte->list_vpte, &vcpu->arch.hpte_hash_vpte[index]); 70 + 71 + /* Add to vPTE_long list */ 72 + index = kvmppc_mmu_hash_vpte_long(pte->pte.vpage); 73 + hlist_add_head(&pte->list_vpte_long, 74 + &vcpu->arch.hpte_hash_vpte_long[index]); 75 + } 76 + 77 + static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) 78 + { 79 + dprintk_mmu("KVM: Flushing SPT: 0x%lx (0x%llx) -> 0x%llx\n", 80 + pte->pte.eaddr, pte->pte.vpage, pte->host_va); 81 + 82 + /* Different for 32 and 64 bit */ 83 + kvmppc_mmu_invalidate_pte(vcpu, pte); 84 + 85 + if (pte->pte.may_write) 86 + kvm_release_pfn_dirty(pte->pfn); 87 + else 88 + kvm_release_pfn_clean(pte->pfn); 89 + 90 + hlist_del(&pte->list_pte); 91 + hlist_del(&pte->list_vpte); 92 + hlist_del(&pte->list_vpte_long); 93 + 94 + vcpu->arch.hpte_cache_count--; 95 + kmem_cache_free(hpte_cache, pte); 96 + } 97 + 98 + static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu) 99 + { 100 + struct hpte_cache *pte; 101 + struct hlist_node *node, *tmp; 102 + int i; 103 + 104 + for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) { 105 + struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i]; 106 + 107 + hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long) 108 + invalidate_pte(vcpu, pte); 109 + } 110 + } 111 + 112 + static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea) 113 + { 114 
+ struct hlist_head *list; 115 + struct hlist_node *node, *tmp; 116 + struct hpte_cache *pte; 117 + 118 + /* Find the list of entries in the map */ 119 + list = &vcpu->arch.hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)]; 120 + 121 + /* Check the list for matching entries and invalidate */ 122 + hlist_for_each_entry_safe(pte, node, tmp, list, list_pte) 123 + if ((pte->pte.eaddr & ~0xfffUL) == guest_ea) 124 + invalidate_pte(vcpu, pte); 125 + } 126 + 127 + void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask) 128 + { 129 + u64 i; 130 + 131 + dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%lx & 0x%lx\n", 132 + vcpu->arch.hpte_cache_count, guest_ea, ea_mask); 133 + 134 + guest_ea &= ea_mask; 135 + 136 + switch (ea_mask) { 137 + case ~0xfffUL: 138 + kvmppc_mmu_pte_flush_page(vcpu, guest_ea); 139 + break; 140 + case 0x0ffff000: 141 + /* 32-bit flush w/o segment, go through all possible segments */ 142 + for (i = 0; i < 0x100000000ULL; i += 0x10000000ULL) 143 + kvmppc_mmu_pte_flush(vcpu, guest_ea | i, ~0xfffUL); 144 + break; 145 + case 0: 146 + /* Doing a complete flush -> start from scratch */ 147 + kvmppc_mmu_pte_flush_all(vcpu); 148 + break; 149 + default: 150 + WARN_ON(1); 151 + break; 152 + } 153 + } 154 + 155 + /* Flush with mask 0xfffffffff */ 156 + static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp) 157 + { 158 + struct hlist_head *list; 159 + struct hlist_node *node, *tmp; 160 + struct hpte_cache *pte; 161 + u64 vp_mask = 0xfffffffffULL; 162 + 163 + list = &vcpu->arch.hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)]; 164 + 165 + /* Check the list for matching entries and invalidate */ 166 + hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte) 167 + if ((pte->pte.vpage & vp_mask) == guest_vp) 168 + invalidate_pte(vcpu, pte); 169 + } 170 + 171 + /* Flush with mask 0xffffff000 */ 172 + static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp) 173 + { 174 + struct hlist_head *list; 175 + struct hlist_node *node, *tmp; 176 + struct hpte_cache *pte; 177 + u64 vp_mask = 0xffffff000ULL; 178 + 179 + list = &vcpu->arch.hpte_hash_vpte_long[ 180 + kvmppc_mmu_hash_vpte_long(guest_vp)]; 181 + 182 + /* Check the list for matching entries and invalidate */ 183 + hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long) 184 + if ((pte->pte.vpage & vp_mask) == guest_vp) 185 + invalidate_pte(vcpu, pte); 186 + } 187 + 188 + void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask) 189 + { 190 + dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n", 191 + vcpu->arch.hpte_cache_count, guest_vp, vp_mask); 192 + guest_vp &= vp_mask; 193 + 194 + switch(vp_mask) { 195 + case 0xfffffffffULL: 196 + kvmppc_mmu_pte_vflush_short(vcpu, guest_vp); 197 + break; 198 + case 0xffffff000ULL: 199 + kvmppc_mmu_pte_vflush_long(vcpu, guest_vp); 200 + break; 201 + default: 202 + WARN_ON(1); 203 + return; 204 + } 205 + } 206 + 207 + void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end) 208 + { 209 + struct hlist_node *node, *tmp; 210 + struct hpte_cache *pte; 211 + int i; 212 + 213 + dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%lx - 0x%lx\n", 214 + vcpu->arch.hpte_cache_count, pa_start, pa_end); 215 + 216 + for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) { 217 + struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i]; 218 + 219 + hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long) 220 + if ((pte->pte.raddr >= pa_start) && 221 + (pte->pte.raddr < pa_end)) 222 + invalidate_pte(vcpu, pte); 
223 + } 224 + } 225 + 226 + struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu) 227 + { 228 + struct hpte_cache *pte; 229 + 230 + pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL); 231 + vcpu->arch.hpte_cache_count++; 232 + 233 + if (vcpu->arch.hpte_cache_count == HPTEG_CACHE_NUM) 234 + kvmppc_mmu_pte_flush_all(vcpu); 235 + 236 + return pte; 237 + } 238 + 239 + void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu) 240 + { 241 + kvmppc_mmu_pte_flush(vcpu, 0, 0); 242 + } 243 + 244 + static void kvmppc_mmu_hpte_init_hash(struct hlist_head *hash_list, int len) 245 + { 246 + int i; 247 + 248 + for (i = 0; i < len; i++) 249 + INIT_HLIST_HEAD(&hash_list[i]); 250 + } 251 + 252 + int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu) 253 + { 254 + /* init hpte lookup hashes */ 255 + kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte, 256 + ARRAY_SIZE(vcpu->arch.hpte_hash_pte)); 257 + kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte, 258 + ARRAY_SIZE(vcpu->arch.hpte_hash_vpte)); 259 + kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte_long, 260 + ARRAY_SIZE(vcpu->arch.hpte_hash_vpte_long)); 261 + 262 + return 0; 263 + } 264 + 265 + int kvmppc_mmu_hpte_sysinit(void) 266 + { 267 + /* init hpte slab cache */ 268 + hpte_cache = kmem_cache_create("kvm-spt", sizeof(struct hpte_cache), 269 + sizeof(struct hpte_cache), 0, NULL); 270 + 271 + return 0; 272 + } 273 + 274 + void kvmppc_mmu_hpte_sysexit(void) 275 + { 276 + kmem_cache_destroy(hpte_cache); 277 + }
+37 -57
arch/powerpc/kvm/book3s_paired_singles.c
··· 159 159 160 160 static inline void kvmppc_sync_qpr(struct kvm_vcpu *vcpu, int rt) 161 161 { 162 - struct thread_struct t; 163 - 164 - t.fpscr.val = vcpu->arch.fpscr; 165 - cvt_df((double*)&vcpu->arch.fpr[rt], (float*)&vcpu->arch.qpr[rt], &t); 162 + kvm_cvt_df(&vcpu->arch.fpr[rt], &vcpu->arch.qpr[rt], &vcpu->arch.fpscr); 166 163 } 167 164 168 165 static void kvmppc_inject_pf(struct kvm_vcpu *vcpu, ulong eaddr, bool is_store) ··· 180 183 int rs, ulong addr, int ls_type) 181 184 { 182 185 int emulated = EMULATE_FAIL; 183 - struct thread_struct t; 184 186 int r; 185 187 char tmp[8]; 186 188 int len = sizeof(u32); 187 189 188 190 if (ls_type == FPU_LS_DOUBLE) 189 191 len = sizeof(u64); 190 - 191 - t.fpscr.val = vcpu->arch.fpscr; 192 192 193 193 /* read from memory */ 194 194 r = kvmppc_ld(vcpu, &addr, len, tmp, true); ··· 204 210 /* put in registers */ 205 211 switch (ls_type) { 206 212 case FPU_LS_SINGLE: 207 - cvt_fd((float*)tmp, (double*)&vcpu->arch.fpr[rs], &t); 213 + kvm_cvt_fd((u32*)tmp, &vcpu->arch.fpr[rs], &vcpu->arch.fpscr); 208 214 vcpu->arch.qpr[rs] = *((u32*)tmp); 209 215 break; 210 216 case FPU_LS_DOUBLE: ··· 223 229 int rs, ulong addr, int ls_type) 224 230 { 225 231 int emulated = EMULATE_FAIL; 226 - struct thread_struct t; 227 232 int r; 228 233 char tmp[8]; 229 234 u64 val; 230 235 int len; 231 236 232 - t.fpscr.val = vcpu->arch.fpscr; 233 - 234 237 switch (ls_type) { 235 238 case FPU_LS_SINGLE: 236 - cvt_df((double*)&vcpu->arch.fpr[rs], (float*)tmp, &t); 239 + kvm_cvt_df(&vcpu->arch.fpr[rs], (u32*)tmp, &vcpu->arch.fpscr); 237 240 val = *((u32*)tmp); 238 241 len = sizeof(u32); 239 242 break; ··· 269 278 int rs, ulong addr, bool w, int i) 270 279 { 271 280 int emulated = EMULATE_FAIL; 272 - struct thread_struct t; 273 281 int r; 274 282 float one = 1.0; 275 283 u32 tmp[2]; 276 - 277 - t.fpscr.val = vcpu->arch.fpscr; 278 284 279 285 /* read from memory */ 280 286 if (w) { ··· 296 308 emulated = EMULATE_DONE; 297 309 298 310 /* put in registers */ 299 - cvt_fd((float*)&tmp[0], (double*)&vcpu->arch.fpr[rs], &t); 311 + kvm_cvt_fd(&tmp[0], &vcpu->arch.fpr[rs], &vcpu->arch.fpscr); 300 312 vcpu->arch.qpr[rs] = tmp[1]; 301 313 302 314 dprintk(KERN_INFO "KVM: PSQ_LD [0x%x, 0x%x] at 0x%lx (%d)\n", tmp[0], ··· 310 322 int rs, ulong addr, bool w, int i) 311 323 { 312 324 int emulated = EMULATE_FAIL; 313 - struct thread_struct t; 314 325 int r; 315 326 u32 tmp[2]; 316 327 int len = w ? 
sizeof(u32) : sizeof(u64); 317 328 318 - t.fpscr.val = vcpu->arch.fpscr; 319 - 320 - cvt_df((double*)&vcpu->arch.fpr[rs], (float*)&tmp[0], &t); 329 + kvm_cvt_df(&vcpu->arch.fpr[rs], &tmp[0], &vcpu->arch.fpscr); 321 330 tmp[1] = vcpu->arch.qpr[rs]; 322 331 323 332 r = kvmppc_st(vcpu, &addr, len, tmp, true); ··· 502 517 static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc, 503 518 int reg_out, int reg_in1, int reg_in2, 504 519 int reg_in3, int scalar, 505 - void (*func)(struct thread_struct *t, 520 + void (*func)(u64 *fpscr, 506 521 u32 *dst, u32 *src1, 507 522 u32 *src2, u32 *src3)) 508 523 { ··· 511 526 u32 ps0_out; 512 527 u32 ps0_in1, ps0_in2, ps0_in3; 513 528 u32 ps1_in1, ps1_in2, ps1_in3; 514 - struct thread_struct t; 515 - t.fpscr.val = vcpu->arch.fpscr; 516 529 517 530 /* RC */ 518 531 WARN_ON(rc); 519 532 520 533 /* PS0 */ 521 - cvt_df((double*)&fpr[reg_in1], (float*)&ps0_in1, &t); 522 - cvt_df((double*)&fpr[reg_in2], (float*)&ps0_in2, &t); 523 - cvt_df((double*)&fpr[reg_in3], (float*)&ps0_in3, &t); 534 + kvm_cvt_df(&fpr[reg_in1], &ps0_in1, &vcpu->arch.fpscr); 535 + kvm_cvt_df(&fpr[reg_in2], &ps0_in2, &vcpu->arch.fpscr); 536 + kvm_cvt_df(&fpr[reg_in3], &ps0_in3, &vcpu->arch.fpscr); 524 537 525 538 if (scalar & SCALAR_LOW) 526 539 ps0_in2 = qpr[reg_in2]; 527 540 528 - func(&t, &ps0_out, &ps0_in1, &ps0_in2, &ps0_in3); 541 + func(&vcpu->arch.fpscr, &ps0_out, &ps0_in1, &ps0_in2, &ps0_in3); 529 542 530 543 dprintk(KERN_INFO "PS3 ps0 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n", 531 544 ps0_in1, ps0_in2, ps0_in3, ps0_out); 532 545 533 546 if (!(scalar & SCALAR_NO_PS0)) 534 - cvt_fd((float*)&ps0_out, (double*)&fpr[reg_out], &t); 547 + kvm_cvt_fd(&ps0_out, &fpr[reg_out], &vcpu->arch.fpscr); 535 548 536 549 /* PS1 */ 537 550 ps1_in1 = qpr[reg_in1]; ··· 540 557 ps1_in2 = ps0_in2; 541 558 542 559 if (!(scalar & SCALAR_NO_PS1)) 543 - func(&t, &qpr[reg_out], &ps1_in1, &ps1_in2, &ps1_in3); 560 + func(&vcpu->arch.fpscr, &qpr[reg_out], &ps1_in1, &ps1_in2, &ps1_in3); 544 561 545 562 dprintk(KERN_INFO "PS3 ps1 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n", 546 563 ps1_in1, ps1_in2, ps1_in3, qpr[reg_out]); ··· 551 568 static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc, 552 569 int reg_out, int reg_in1, int reg_in2, 553 570 int scalar, 554 - void (*func)(struct thread_struct *t, 571 + void (*func)(u64 *fpscr, 555 572 u32 *dst, u32 *src1, 556 573 u32 *src2)) 557 574 { ··· 561 578 u32 ps0_in1, ps0_in2; 562 579 u32 ps1_out; 563 580 u32 ps1_in1, ps1_in2; 564 - struct thread_struct t; 565 - t.fpscr.val = vcpu->arch.fpscr; 566 581 567 582 /* RC */ 568 583 WARN_ON(rc); 569 584 570 585 /* PS0 */ 571 - cvt_df((double*)&fpr[reg_in1], (float*)&ps0_in1, &t); 586 + kvm_cvt_df(&fpr[reg_in1], &ps0_in1, &vcpu->arch.fpscr); 572 587 573 588 if (scalar & SCALAR_LOW) 574 589 ps0_in2 = qpr[reg_in2]; 575 590 else 576 - cvt_df((double*)&fpr[reg_in2], (float*)&ps0_in2, &t); 591 + kvm_cvt_df(&fpr[reg_in2], &ps0_in2, &vcpu->arch.fpscr); 577 592 578 - func(&t, &ps0_out, &ps0_in1, &ps0_in2); 593 + func(&vcpu->arch.fpscr, &ps0_out, &ps0_in1, &ps0_in2); 579 594 580 595 if (!(scalar & SCALAR_NO_PS0)) { 581 596 dprintk(KERN_INFO "PS2 ps0 -> f(0x%x, 0x%x) = 0x%x\n", 582 597 ps0_in1, ps0_in2, ps0_out); 583 598 584 - cvt_fd((float*)&ps0_out, (double*)&fpr[reg_out], &t); 599 + kvm_cvt_fd(&ps0_out, &fpr[reg_out], &vcpu->arch.fpscr); 585 600 } 586 601 587 602 /* PS1 */ ··· 589 608 if (scalar & SCALAR_HIGH) 590 609 ps1_in2 = ps0_in2; 591 610 592 - func(&t, &ps1_out, &ps1_in1, &ps1_in2); 611 + func(&vcpu->arch.fpscr, &ps1_out, &ps1_in1, &ps1_in2); 
593 612 594 613 if (!(scalar & SCALAR_NO_PS1)) { 595 614 qpr[reg_out] = ps1_out; ··· 603 622 604 623 static int kvmppc_ps_one_in(struct kvm_vcpu *vcpu, bool rc, 605 624 int reg_out, int reg_in, 606 - void (*func)(struct thread_struct *t, 625 + void (*func)(u64 *t, 607 626 u32 *dst, u32 *src1)) 608 627 { 609 628 u32 *qpr = vcpu->arch.qpr; 610 629 u64 *fpr = vcpu->arch.fpr; 611 630 u32 ps0_out, ps0_in; 612 631 u32 ps1_in; 613 - struct thread_struct t; 614 - t.fpscr.val = vcpu->arch.fpscr; 615 632 616 633 /* RC */ 617 634 WARN_ON(rc); 618 635 619 636 /* PS0 */ 620 - cvt_df((double*)&fpr[reg_in], (float*)&ps0_in, &t); 621 - func(&t, &ps0_out, &ps0_in); 637 + kvm_cvt_df(&fpr[reg_in], &ps0_in, &vcpu->arch.fpscr); 638 + func(&vcpu->arch.fpscr, &ps0_out, &ps0_in); 622 639 623 640 dprintk(KERN_INFO "PS1 ps0 -> f(0x%x) = 0x%x\n", 624 641 ps0_in, ps0_out); 625 642 626 - cvt_fd((float*)&ps0_out, (double*)&fpr[reg_out], &t); 643 + kvm_cvt_fd(&ps0_out, &fpr[reg_out], &vcpu->arch.fpscr); 627 644 628 645 /* PS1 */ 629 646 ps1_in = qpr[reg_in]; 630 - func(&t, &qpr[reg_out], &ps1_in); 647 + func(&vcpu->arch.fpscr, &qpr[reg_out], &ps1_in); 631 648 632 649 dprintk(KERN_INFO "PS1 ps1 -> f(0x%x) = 0x%x\n", 633 650 ps1_in, qpr[reg_out]); ··· 651 672 652 673 bool rcomp = (inst & 1) ? true : false; 653 674 u32 cr = kvmppc_get_cr(vcpu); 654 - struct thread_struct t; 655 675 #ifdef DEBUG 656 676 int i; 657 677 #endif 658 - 659 - t.fpscr.val = vcpu->arch.fpscr; 660 678 661 679 if (!kvmppc_inst_is_paired_single(vcpu, inst)) 662 680 return EMULATE_FAIL; ··· 671 695 #ifdef DEBUG 672 696 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) { 673 697 u32 f; 674 - cvt_df((double*)&vcpu->arch.fpr[i], (float*)&f, &t); 698 + kvm_cvt_df(&vcpu->arch.fpr[i], &f, &vcpu->arch.fpscr); 675 699 dprintk(KERN_INFO "FPR[%d] = 0x%x / 0x%llx QPR[%d] = 0x%x\n", 676 700 i, f, vcpu->arch.fpr[i], i, vcpu->arch.qpr[i]); 677 701 } ··· 795 819 WARN_ON(rcomp); 796 820 vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_ra]; 797 821 /* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */ 798 - cvt_df((double*)&vcpu->arch.fpr[ax_rb], 799 - (float*)&vcpu->arch.qpr[ax_rd], &t); 822 + kvm_cvt_df(&vcpu->arch.fpr[ax_rb], 823 + &vcpu->arch.qpr[ax_rd], 824 + &vcpu->arch.fpscr); 800 825 break; 801 826 case OP_4X_PS_MERGE01: 802 827 WARN_ON(rcomp); ··· 807 830 case OP_4X_PS_MERGE10: 808 831 WARN_ON(rcomp); 809 832 /* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */ 810 - cvt_fd((float*)&vcpu->arch.qpr[ax_ra], 811 - (double*)&vcpu->arch.fpr[ax_rd], &t); 833 + kvm_cvt_fd(&vcpu->arch.qpr[ax_ra], 834 + &vcpu->arch.fpr[ax_rd], 835 + &vcpu->arch.fpscr); 812 836 /* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */ 813 - cvt_df((double*)&vcpu->arch.fpr[ax_rb], 814 - (float*)&vcpu->arch.qpr[ax_rd], &t); 837 + kvm_cvt_df(&vcpu->arch.fpr[ax_rb], 838 + &vcpu->arch.qpr[ax_rd], 839 + &vcpu->arch.fpscr); 815 840 break; 816 841 case OP_4X_PS_MERGE11: 817 842 WARN_ON(rcomp); 818 843 /* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */ 819 - cvt_fd((float*)&vcpu->arch.qpr[ax_ra], 820 - (double*)&vcpu->arch.fpr[ax_rd], &t); 844 + kvm_cvt_fd(&vcpu->arch.qpr[ax_ra], 845 + &vcpu->arch.fpr[ax_rd], 846 + &vcpu->arch.fpscr); 821 847 vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb]; 822 848 break; 823 849 } ··· 1255 1275 #ifdef DEBUG 1256 1276 for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) { 1257 1277 u32 f; 1258 - cvt_df((double*)&vcpu->arch.fpr[i], (float*)&f, &t); 1278 + kvm_cvt_df(&vcpu->arch.fpr[i], &f, &vcpu->arch.fpscr); 1259 1279 dprintk(KERN_INFO "FPR[%d] = 0x%x\n", i, f); 1260 1280 } 1261 
1281 #endif
+1 -11
arch/powerpc/kvm/booke.c
··· 144 144 unsigned int priority) 145 145 { 146 146 int allowed = 0; 147 - ulong msr_mask; 147 + ulong uninitialized_var(msr_mask); 148 148 bool update_esr = false, update_dear = false; 149 149 150 150 switch (priority) { ··· 485 485 { 486 486 int i; 487 487 488 - vcpu_load(vcpu); 489 - 490 488 regs->pc = vcpu->arch.pc; 491 489 regs->cr = kvmppc_get_cr(vcpu); 492 490 regs->ctr = vcpu->arch.ctr; ··· 505 507 for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) 506 508 regs->gpr[i] = kvmppc_get_gpr(vcpu, i); 507 509 508 - vcpu_put(vcpu); 509 - 510 510 return 0; 511 511 } 512 512 513 513 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 514 514 { 515 515 int i; 516 - 517 - vcpu_load(vcpu); 518 516 519 517 vcpu->arch.pc = regs->pc; 520 518 kvmppc_set_cr(vcpu, regs->cr); ··· 530 536 531 537 for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) 532 538 kvmppc_set_gpr(vcpu, i, regs->gpr[i]); 533 - 534 - vcpu_put(vcpu); 535 539 536 540 return 0; 537 541 } ··· 561 569 { 562 570 int r; 563 571 564 - vcpu_load(vcpu); 565 572 r = kvmppc_core_vcpu_translate(vcpu, tr); 566 - vcpu_put(vcpu); 567 573 return r; 568 574 } 569 575
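The vcpu_load()/vcpu_put() pairs dropped from these handlers (the same pattern repeats in powerpc.c and the s390 files below) suggest the locking now happens once in the generic vcpu ioctl path before the arch callback is invoked. A rough sketch of that assumed common-code shape, not taken from this diff:

    /* sketch: assumed generic dispatch, with the vcpu loaded around the arch callback */
    vcpu_load(vcpu);
    r = kvm_arch_vcpu_ioctl_get_regs(vcpu, &regs);   /* arch code runs with the vcpu loaded */
    vcpu_put(vcpu);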
+18
arch/powerpc/kvm/fpu.S
··· 271 271 FPD_THREE_IN(fmadd) 272 272 FPD_THREE_IN(fnmsub) 273 273 FPD_THREE_IN(fnmadd) 274 + 275 + _GLOBAL(kvm_cvt_fd) 276 + lfd 0,0(r5) /* load up fpscr value */ 277 + MTFSF_L(0) 278 + lfs 0,0(r3) 279 + stfd 0,0(r4) 280 + mffs 0 281 + stfd 0,0(r5) /* save new fpscr value */ 282 + blr 283 + 284 + _GLOBAL(kvm_cvt_df) 285 + lfd 0,0(r5) /* load up fpscr value */ 286 + MTFSF_L(0) 287 + lfd 0,0(r3) 288 + stfs 0,0(r4) 289 + mffs 0 290 + stfd 0,0(r5) /* save new fpscr value */ 291 + blr
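The two routines added here implement the kvm_cvt_fd()/kvm_cvt_df() calls used throughout book3s_paired_singles.c above: r3 is the source, r4 the destination, and r5 points at the guest FPSCR image, which is loaded before the conversion and written back afterwards so rounding and exception bits track the guest value. The C prototypes are not part of this hunk; what the call sites imply, as a sketch:

    /* sketch: prototypes implied by the asm and the call sites (declared elsewhere) */
    extern void kvm_cvt_fd(u32 *from, u64 *to, u64 *fpscr);   /* single -> double */
    extern void kvm_cvt_df(u64 *from, u32 *to, u64 *fpscr);   /* double -> single */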
+3 -11
arch/powerpc/kvm/powerpc.c
··· 36 36 #define CREATE_TRACE_POINTS 37 37 #include "trace.h" 38 38 39 - gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 40 - { 41 - return gfn; 42 - } 43 - 44 39 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) 45 40 { 46 41 return !(v->arch.msr & MSR_WE) || !!(v->arch.pending_exceptions); ··· 282 287 static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, 283 288 struct kvm_run *run) 284 289 { 285 - u64 gpr; 290 + u64 uninitialized_var(gpr); 286 291 287 292 if (run->mmio.len > sizeof(gpr)) { 288 293 printk(KERN_ERR "bad MMIO length: %d\n", run->mmio.len); ··· 418 423 int r; 419 424 sigset_t sigsaved; 420 425 421 - vcpu_load(vcpu); 422 - 423 426 if (vcpu->sigset_active) 424 427 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 425 428 ··· 448 455 449 456 if (vcpu->sigset_active) 450 457 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 451 - 452 - vcpu_put(vcpu); 453 458 454 459 return r; 455 460 } ··· 514 523 if (copy_from_user(&irq, argp, sizeof(irq))) 515 524 goto out; 516 525 r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); 517 - break; 526 + goto out; 518 527 } 528 + 519 529 case KVM_ENABLE_CAP: 520 530 { 521 531 struct kvm_enable_cap cap;
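The uninitialized_var() annotation used here for gpr (and for msr_mask in booke.c above) only silences a false "may be used uninitialized" warning for a variable that is read solely on paths where it has been assigned; in kernels of this vintage the macro is roughly a self-assignment:

    /* sketch: approximate definition from the gcc compiler headers (paraphrased) */
    #define uninitialized_var(x) x = x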
+3 -2
arch/s390/include/asm/kvm_host.h
··· 26 26 27 27 struct sca_entry { 28 28 atomic_t scn; 29 - __u64 reserved; 29 + __u32 reserved; 30 30 __u64 sda; 31 31 __u64 reserved2[2]; 32 32 } __attribute__((packed)); ··· 41 41 } __attribute__((packed)); 42 42 43 43 #define KVM_NR_PAGE_SIZES 2 44 - #define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + ((x) - 1) * 8) 44 + #define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 8) 45 + #define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x)) 45 46 #define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) 46 47 #define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) 47 48 #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE)
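Splitting KVM_HPAGE_GFN_SHIFT() out of KVM_HPAGE_SHIFT() keeps the per-level arithmetic in one place; the same refactoring appears in the x86 header further down. For s390, assuming the usual 4 KB base page (PAGE_SHIFT == 12), the numbers work out as:

    /* worked example for the s390 definitions above */
    KVM_HPAGE_GFN_SHIFT(2) = (2 - 1) * 8 = 8
    KVM_HPAGE_SHIFT(2)     = 12 + 8      = 20        /* 1 MB large pages */
    KVM_PAGES_PER_HPAGE(2) = 2^20 / 2^12 = 256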
+1 -1
arch/s390/kvm/intercept.c
··· 135 135 spin_lock_bh(&vcpu->arch.local_int.lock); 136 136 if (vcpu->arch.local_int.action_bits & ACTION_STORE_ON_STOP) { 137 137 vcpu->arch.local_int.action_bits &= ~ACTION_STORE_ON_STOP; 138 - rc = __kvm_s390_vcpu_store_status(vcpu, 138 + rc = kvm_s390_vcpu_store_status(vcpu, 139 139 KVM_S390_STORE_STATUS_NOADDR); 140 140 if (rc >= 0) 141 141 rc = -EOPNOTSUPP;
+19 -45
arch/s390/kvm/kvm-s390.c
··· 207 207 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 208 208 { 209 209 VCPU_EVENT(vcpu, 3, "%s", "free cpu"); 210 + clear_bit(63 - vcpu->vcpu_id, (unsigned long *) &vcpu->kvm->arch.sca->mcn); 210 211 if (vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda == 211 212 (__u64) vcpu->arch.sie_block) 212 213 vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda = 0; ··· 297 296 { 298 297 atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH); 299 298 set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests); 300 - vcpu->arch.sie_block->ecb = 2; 299 + vcpu->arch.sie_block->ecb = 6; 301 300 vcpu->arch.sie_block->eca = 0xC1002001U; 302 301 vcpu->arch.sie_block->fac = (int) (long) facilities; 303 302 hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); ··· 330 329 kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block; 331 330 vcpu->arch.sie_block->scaoh = (__u32)(((__u64)kvm->arch.sca) >> 32); 332 331 vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca; 332 + set_bit(63 - id, (unsigned long *) &kvm->arch.sca->mcn); 333 333 334 334 spin_lock_init(&vcpu->arch.local_int.lock); 335 335 INIT_LIST_HEAD(&vcpu->arch.local_int.list); ··· 365 363 366 364 static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu) 367 365 { 368 - vcpu_load(vcpu); 369 366 kvm_s390_vcpu_initial_reset(vcpu); 370 - vcpu_put(vcpu); 371 367 return 0; 372 368 } 373 369 374 370 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 375 371 { 376 - vcpu_load(vcpu); 377 372 memcpy(&vcpu->arch.guest_gprs, &regs->gprs, sizeof(regs->gprs)); 378 - vcpu_put(vcpu); 379 373 return 0; 380 374 } 381 375 382 376 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 383 377 { 384 - vcpu_load(vcpu); 385 378 memcpy(&regs->gprs, &vcpu->arch.guest_gprs, sizeof(regs->gprs)); 386 - vcpu_put(vcpu); 387 379 return 0; 388 380 } 389 381 390 382 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, 391 383 struct kvm_sregs *sregs) 392 384 { 393 - vcpu_load(vcpu); 394 385 memcpy(&vcpu->arch.guest_acrs, &sregs->acrs, sizeof(sregs->acrs)); 395 386 memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs)); 396 - vcpu_put(vcpu); 397 387 return 0; 398 388 } 399 389 400 390 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, 401 391 struct kvm_sregs *sregs) 402 392 { 403 - vcpu_load(vcpu); 404 393 memcpy(&sregs->acrs, &vcpu->arch.guest_acrs, sizeof(sregs->acrs)); 405 394 memcpy(&sregs->crs, &vcpu->arch.sie_block->gcr, sizeof(sregs->crs)); 406 - vcpu_put(vcpu); 407 395 return 0; 408 396 } 409 397 410 398 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 411 399 { 412 - vcpu_load(vcpu); 413 400 memcpy(&vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs)); 414 401 vcpu->arch.guest_fpregs.fpc = fpu->fpc; 415 - vcpu_put(vcpu); 416 402 return 0; 417 403 } 418 404 419 405 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 420 406 { 421 - vcpu_load(vcpu); 422 407 memcpy(&fpu->fprs, &vcpu->arch.guest_fpregs.fprs, sizeof(fpu->fprs)); 423 408 fpu->fpc = vcpu->arch.guest_fpregs.fpc; 424 - vcpu_put(vcpu); 425 409 return 0; 426 410 } 427 411 ··· 415 427 { 416 428 int rc = 0; 417 429 418 - vcpu_load(vcpu); 419 430 if (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_RUNNING) 420 431 rc = -EBUSY; 421 432 else { 422 433 vcpu->run->psw_mask = psw.mask; 423 434 vcpu->run->psw_addr = psw.addr; 424 435 } 425 - vcpu_put(vcpu); 426 436 return rc; 427 437 } 428 438 ··· 483 497 { 484 498 int rc; 485 499 sigset_t sigsaved; 486 - 487 - 
vcpu_load(vcpu); 488 500 489 501 rerun_vcpu: 490 502 if (vcpu->requests) ··· 552 568 if (vcpu->sigset_active) 553 569 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 554 570 555 - vcpu_put(vcpu); 556 - 557 571 vcpu->stat.exit_userspace++; 558 572 return rc; 559 573 } ··· 571 589 * KVM_S390_STORE_STATUS_NOADDR: -> 0x1200 on 64 bit 572 590 * KVM_S390_STORE_STATUS_PREFIXED: -> prefix 573 591 */ 574 - int __kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) 592 + int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) 575 593 { 576 594 const unsigned char archmode = 1; 577 595 int prefix; ··· 633 651 return 0; 634 652 } 635 653 636 - static int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) 637 - { 638 - int rc; 639 - 640 - vcpu_load(vcpu); 641 - rc = __kvm_s390_vcpu_store_status(vcpu, addr); 642 - vcpu_put(vcpu); 643 - return rc; 644 - } 645 - 646 654 long kvm_arch_vcpu_ioctl(struct file *filp, 647 655 unsigned int ioctl, unsigned long arg) 648 656 { 649 657 struct kvm_vcpu *vcpu = filp->private_data; 650 658 void __user *argp = (void __user *)arg; 659 + long r; 651 660 652 661 switch (ioctl) { 653 662 case KVM_S390_INTERRUPT: { 654 663 struct kvm_s390_interrupt s390int; 655 664 665 + r = -EFAULT; 656 666 if (copy_from_user(&s390int, argp, sizeof(s390int))) 657 - return -EFAULT; 658 - return kvm_s390_inject_vcpu(vcpu, &s390int); 667 + break; 668 + r = kvm_s390_inject_vcpu(vcpu, &s390int); 669 + break; 659 670 } 660 671 case KVM_S390_STORE_STATUS: 661 - return kvm_s390_vcpu_store_status(vcpu, arg); 672 + r = kvm_s390_vcpu_store_status(vcpu, arg); 673 + break; 662 674 case KVM_S390_SET_INITIAL_PSW: { 663 675 psw_t psw; 664 676 677 + r = -EFAULT; 665 678 if (copy_from_user(&psw, argp, sizeof(psw))) 666 - return -EFAULT; 667 - return kvm_arch_vcpu_ioctl_set_initial_psw(vcpu, psw); 679 + break; 680 + r = kvm_arch_vcpu_ioctl_set_initial_psw(vcpu, psw); 681 + break; 668 682 } 669 683 case KVM_S390_INITIAL_RESET: 670 - return kvm_arch_vcpu_ioctl_initial_reset(vcpu); 684 + r = kvm_arch_vcpu_ioctl_initial_reset(vcpu); 685 + break; 671 686 default: 672 - ; 687 + r = -EINVAL; 673 688 } 674 - return -EINVAL; 689 + return r; 675 690 } 676 691 677 692 /* Section: memory related */ ··· 721 742 722 743 void kvm_arch_flush_shadow(struct kvm *kvm) 723 744 { 724 - } 725 - 726 - gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 727 - { 728 - return gfn; 729 745 } 730 746 731 747 static int __init kvm_s390_init(void)
+1 -1
arch/s390/kvm/kvm-s390.h
··· 92 92 int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); 93 93 94 94 /* implemented in kvm-s390.c */ 95 - int __kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, 95 + int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, 96 96 unsigned long addr); 97 97 /* implemented in diag.c */ 98 98 int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
+2
arch/x86/include/asm/i387.h
··· 482 482 memcpy(dst->state, src->state, xstate_size); 483 483 } 484 484 485 + extern void fpu_finit(struct fpu *fpu); 486 + 485 487 #endif /* __ASSEMBLY__ */ 486 488 487 489 #define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
+22
arch/x86/include/asm/kvm.h
··· 22 22 #define __KVM_HAVE_XEN_HVM 23 23 #define __KVM_HAVE_VCPU_EVENTS 24 24 #define __KVM_HAVE_DEBUGREGS 25 + #define __KVM_HAVE_XSAVE 26 + #define __KVM_HAVE_XCRS 25 27 26 28 /* Architectural interrupt line count. */ 27 29 #define KVM_NR_INTERRUPTS 256 ··· 299 297 __u64 dr7; 300 298 __u64 flags; 301 299 __u64 reserved[9]; 300 + }; 301 + 302 + /* for KVM_CAP_XSAVE */ 303 + struct kvm_xsave { 304 + __u32 region[1024]; 305 + }; 306 + 307 + #define KVM_MAX_XCRS 16 308 + 309 + struct kvm_xcr { 310 + __u32 xcr; 311 + __u32 reserved; 312 + __u64 value; 313 + }; 314 + 315 + struct kvm_xcrs { 316 + __u32 nr_xcrs; 317 + __u32 flags; 318 + struct kvm_xcr xcrs[KVM_MAX_XCRS]; 319 + __u64 padding[16]; 302 320 }; 303 321 304 322 #endif /* _ASM_X86_KVM_H */
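struct kvm_xsave and struct kvm_xcrs are the userspace-visible formats behind the new XSAVE/XCR support; the 4 KB region[] blob mirrors the processor's xsave image. A hedged sketch of the expected userspace usage (the ioctl names are assumed from the capability names, they are not defined in this hunk):

    /* sketch: assumed userspace handling of the new extended-state ioctls */
    struct kvm_xsave xsave;
    struct kvm_xcrs xcrs;

    if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_XSAVE) > 0) {
            ioctl(vcpu_fd, KVM_GET_XSAVE, &xsave);   /* save FPU/SSE/AVX state ...      */
            ioctl(vcpu_fd, KVM_SET_XSAVE, &xsave);   /* ... and restore it on migration */
    }
    if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_XCRS) > 0)
            ioctl(vcpu_fd, KVM_GET_XCRS, &xcrs);     /* nr_xcrs entries of {xcr, value} */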
+25 -5
arch/x86/include/asm/kvm_emulate.h
··· 51 51 #define X86EMUL_UNHANDLEABLE 1 52 52 /* Terminate emulation but return success to the caller. */ 53 53 #define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */ 54 - #define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */ 55 - #define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */ 54 + #define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */ 55 + #define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */ 56 + #define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */ 57 + 56 58 struct x86_emulate_ops { 57 59 /* 58 60 * read_std: Read bytes of standard (non-emulated/special) memory. ··· 94 92 int (*read_emulated)(unsigned long addr, 95 93 void *val, 96 94 unsigned int bytes, 95 + unsigned int *error, 97 96 struct kvm_vcpu *vcpu); 98 97 99 98 /* ··· 107 104 int (*write_emulated)(unsigned long addr, 108 105 const void *val, 109 106 unsigned int bytes, 107 + unsigned int *error, 110 108 struct kvm_vcpu *vcpu); 111 109 112 110 /* ··· 122 118 const void *old, 123 119 const void *new, 124 120 unsigned int bytes, 121 + unsigned int *error, 125 122 struct kvm_vcpu *vcpu); 126 123 127 124 int (*pio_in_emulated)(int size, unsigned short port, void *val, ··· 137 132 int seg, struct kvm_vcpu *vcpu); 138 133 u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); 139 134 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); 135 + unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu); 140 136 void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); 141 137 ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); 142 - void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); 138 + int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); 143 139 int (*cpl)(struct kvm_vcpu *vcpu); 144 - void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); 140 + int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu); 141 + int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu); 142 + int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); 143 + int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); 145 144 }; 146 145 147 146 /* Type, address-of, and value of an instruction's operand. */ 148 147 struct operand { 149 148 enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; 150 149 unsigned int bytes; 151 - unsigned long val, orig_val, *ptr; 150 + unsigned long orig_val, *ptr; 151 + union { 152 + unsigned long val; 153 + char valptr[sizeof(unsigned long) + 2]; 154 + }; 152 155 }; 153 156 154 157 struct fetch_cache { ··· 199 186 unsigned long modrm_val; 200 187 struct fetch_cache fetch; 201 188 struct read_cache io_read; 189 + struct read_cache mem_read; 202 190 }; 203 191 204 192 struct x86_emulate_ctxt { ··· 216 202 int interruptibility; 217 203 218 204 bool restart; /* restart string instruction after writeback */ 205 + 206 + int exception; /* exception that happens during emulation or -1 */ 207 + u32 error_code; /* error code for exception */ 208 + bool error_code_valid; 209 + unsigned long cr2; /* faulted address in case of #PF */ 210 + 219 211 /* decode cache */ 220 212 struct decode_cache decode; 221 213 };
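Two details worth calling out in this header: the X86EMUL_* result codes become distinct values instead of all aliasing 2, and the memory callbacks gain an error out-parameter so the emulator can forward page faults itself. The valptr[] arm of the operand union exists because a far-pointer operand (SrcImmFAddr/SrcMemFAddr) is op_bytes + 2 bytes wide and no longer fits in an unsigned long. The calling pattern the emulator follows, as a sketch (see read_emulated() in emulate.c below):

    /* sketch: consuming the new error out-parameter */
    u32 err;
    int rc = ops->read_emulated(addr, buf, bytes, &err, vcpu);
    if (rc == X86EMUL_PROPAGATE_FAULT)
            emulate_pf(ctxt, addr, err);    /* queue #PF with the reported error code */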
+17 -53
arch/x86/include/asm/kvm_host.h
··· 15 15 #include <linux/mm.h> 16 16 #include <linux/mmu_notifier.h> 17 17 #include <linux/tracepoint.h> 18 + #include <linux/cpumask.h> 18 19 19 20 #include <linux/kvm.h> 20 21 #include <linux/kvm_para.h> ··· 40 39 0xFFFFFF0000000000ULL) 41 40 42 41 #define INVALID_PAGE (~(hpa_t)0) 42 + #define VALID_PAGE(x) ((x) != INVALID_PAGE) 43 + 43 44 #define UNMAPPED_GVA (~(gpa_t)0) 44 45 45 46 /* KVM Hugepage definitions for x86 */ 46 47 #define KVM_NR_PAGE_SIZES 3 47 - #define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + (((x) - 1) * 9)) 48 + #define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9) 49 + #define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x)) 48 50 #define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) 49 51 #define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) 50 52 #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) ··· 72 68 #define SELECTOR_RPL_MASK 0x03 73 69 74 70 #define IOPL_SHIFT 12 75 - 76 - #define KVM_ALIAS_SLOTS 4 77 71 78 72 #define KVM_PERMILLE_MMU_PAGES 20 79 73 #define KVM_MIN_ALLOC_MMU_PAGES 64 ··· 243 241 void (*prefetch_page)(struct kvm_vcpu *vcpu, 244 242 struct kvm_mmu_page *page); 245 243 int (*sync_page)(struct kvm_vcpu *vcpu, 246 - struct kvm_mmu_page *sp); 244 + struct kvm_mmu_page *sp, bool clear_unsync); 247 245 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); 248 246 hpa_t root_hpa; 249 247 int root_level; ··· 303 301 unsigned long mmu_seq; 304 302 } update_pte; 305 303 306 - struct i387_fxsave_struct host_fx_image; 307 - struct i387_fxsave_struct guest_fx_image; 304 + struct fpu guest_fpu; 305 + u64 xcr0; 308 306 309 307 gva_t mmio_fault_cr2; 310 308 struct kvm_pio_request pio; ··· 362 360 363 361 /* fields used by HYPER-V emulation */ 364 362 u64 hv_vapic; 365 - }; 366 363 367 - struct kvm_mem_alias { 368 - gfn_t base_gfn; 369 - unsigned long npages; 370 - gfn_t target_gfn; 371 - #define KVM_ALIAS_INVALID 1UL 372 - unsigned long flags; 373 - }; 374 - 375 - #define KVM_ARCH_HAS_UNALIAS_INSTANTIATION 376 - 377 - struct kvm_mem_aliases { 378 - struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; 379 - int naliases; 364 + cpumask_var_t wbinvd_dirty_mask; 380 365 }; 381 366 382 367 struct kvm_arch { 383 - struct kvm_mem_aliases *aliases; 384 - 385 368 unsigned int n_free_mmu_pages; 386 369 unsigned int n_requested_mmu_pages; 387 370 unsigned int n_alloc_mmu_pages; ··· 520 533 521 534 void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); 522 535 536 + bool (*has_wbinvd_exit)(void); 537 + 523 538 const struct trace_print_flags *exit_reasons_str; 524 539 }; 525 540 ··· 565 576 #define EMULTYPE_SKIP (1 << 2) 566 577 int emulate_instruction(struct kvm_vcpu *vcpu, 567 578 unsigned long cr2, u16 error_code, int emulation_type); 568 - void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); 569 579 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 570 580 void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 571 581 ··· 579 591 int kvm_emulate_halt(struct kvm_vcpu *vcpu); 580 592 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); 581 593 int emulate_clts(struct kvm_vcpu *vcpu); 582 - int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, 583 - unsigned long *dest); 584 - int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, 585 - unsigned long value); 594 + int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); 586 595 587 596 void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); 588 597 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 
selector, int seg); ··· 587 602 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, 588 603 bool has_error_code, u32 error_code); 589 604 590 - void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); 591 - void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); 592 - void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); 605 + int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); 606 + int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); 607 + int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); 593 608 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); 594 609 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); 595 610 int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); 596 611 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); 597 612 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); 598 613 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); 614 + int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); 599 615 600 616 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); 601 617 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); ··· 616 630 617 631 void kvm_inject_nmi(struct kvm_vcpu *vcpu); 618 632 619 - void fx_init(struct kvm_vcpu *vcpu); 620 - 621 - int emulator_write_emulated(unsigned long addr, 622 - const void *val, 623 - unsigned int bytes, 624 - struct kvm_vcpu *vcpu); 633 + int fx_init(struct kvm_vcpu *vcpu); 625 634 626 635 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu); 627 636 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, ··· 644 663 645 664 int complete_pio(struct kvm_vcpu *vcpu); 646 665 bool kvm_check_iopl(struct kvm_vcpu *vcpu); 647 - 648 - struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); 649 666 650 667 static inline struct kvm_mmu_page *page_header(hpa_t shadow_page) 651 668 { ··· 697 718 return value; 698 719 } 699 720 #endif 700 - 701 - static inline void kvm_fx_save(struct i387_fxsave_struct *image) 702 - { 703 - asm("fxsave (%0)":: "r" (image)); 704 - } 705 - 706 - static inline void kvm_fx_restore(struct i387_fxsave_struct *image) 707 - { 708 - asm("fxrstor (%0)":: "r" (image)); 709 - } 710 - 711 - static inline void kvm_fx_finit(void) 712 - { 713 - asm("finit"); 714 - } 715 721 716 722 static inline u32 get_rdx_init_val(void) 717 723 {
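kvm_set_cr0/cr3/cr4() switching from void to int is what lets callers treat a rejected control-register load as a guest fault instead of silently ignoring it; the task-switch code in emulate.c below already relies on the new contract:

    /* as used by the CR3 load during a 32-bit task switch (emulate.c, this series) */
    if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) {
            emulate_gp(ctxt, 0);             /* non-zero return -> inject #GP(0) */
            return X86EMUL_PROPAGATE_FAULT;
    }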
+2
arch/x86/include/asm/msr-index.h
··· 20 20 #define _EFER_LMA 10 /* Long mode active (read-only) */ 21 21 #define _EFER_NX 11 /* No execute enable */ 22 22 #define _EFER_SVME 12 /* Enable virtualization */ 23 + #define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */ 23 24 #define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ 24 25 25 26 #define EFER_SCE (1<<_EFER_SCE) ··· 28 27 #define EFER_LMA (1<<_EFER_LMA) 29 28 #define EFER_NX (1<<_EFER_NX) 30 29 #define EFER_SVME (1<<_EFER_SVME) 30 + #define EFER_LMSLE (1<<_EFER_LMSLE) 31 31 #define EFER_FFXSR (1<<_EFER_FFXSR) 32 32 33 33 /* Intel MSRs. Some also available on other CPUs */
+5
arch/x86/include/asm/vmx.h
··· 257 257 #define EXIT_REASON_IO_INSTRUCTION 30 258 258 #define EXIT_REASON_MSR_READ 31 259 259 #define EXIT_REASON_MSR_WRITE 32 260 + #define EXIT_REASON_INVALID_STATE 33 260 261 #define EXIT_REASON_MWAIT_INSTRUCTION 36 261 262 #define EXIT_REASON_MONITOR_INSTRUCTION 39 262 263 #define EXIT_REASON_PAUSE_INSTRUCTION 40 ··· 267 266 #define EXIT_REASON_EPT_VIOLATION 48 268 267 #define EXIT_REASON_EPT_MISCONFIG 49 269 268 #define EXIT_REASON_WBINVD 54 269 + #define EXIT_REASON_XSETBV 55 270 270 271 271 /* 272 272 * Interruption-information format ··· 376 374 #define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) 377 375 #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) 378 376 #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) 377 + 378 + #define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ 379 + #define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ 379 380 380 381 #define VMX_EPT_DEFAULT_GAW 3 381 382 #define VMX_EPT_MAX_GAW 0x4
+6
arch/x86/include/asm/xsave.h
··· 13 13 14 14 #define FXSAVE_SIZE 512 15 15 16 + #define XSAVE_HDR_SIZE 64 17 + #define XSAVE_HDR_OFFSET FXSAVE_SIZE 18 + 19 + #define XSAVE_YMM_SIZE 256 20 + #define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) 21 + 16 22 /* 17 23 * These are the features that the OS can handle currently. 18 24 */
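The new constants pin down the layout of an xsave image as KVM consumes it: the legacy FXSAVE area fills the first 512 bytes, the xsave header the next 64, and the YMM state the 256 bytes after that, so XSAVE_YMM_OFFSET works out to 512 + 64 = 576:

    /* layout implied by the constants above */
    bytes   0 .. 511   legacy FXSAVE area   (FXSAVE_SIZE)
    bytes 512 .. 575   xsave header         (XSAVE_HDR_OFFSET, XSAVE_HDR_SIZE)
    bytes 576 .. 831   YMM state            (XSAVE_YMM_OFFSET, XSAVE_YMM_SIZE)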
+2 -1
arch/x86/kernel/i387.c
··· 107 107 } 108 108 #endif /* CONFIG_X86_64 */ 109 109 110 - static void fpu_finit(struct fpu *fpu) 110 + void fpu_finit(struct fpu *fpu) 111 111 { 112 112 #ifdef CONFIG_X86_32 113 113 if (!HAVE_HWFP) { ··· 132 132 fp->fos = 0xffff0000u; 133 133 } 134 134 } 135 + EXPORT_SYMBOL_GPL(fpu_finit); 135 136 136 137 /* 137 138 * The _current_ task is using the FPU for the first time
+1
arch/x86/kernel/process.c
··· 28 28 EXPORT_SYMBOL(idle_nomwait); 29 29 30 30 struct kmem_cache *task_xstate_cachep; 31 + EXPORT_SYMBOL_GPL(task_xstate_cachep); 31 32 32 33 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 33 34 {
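Exporting fpu_finit() (in i387.c above) and task_xstate_cachep gives the kvm module what it needs to manage guest FPU state through the generic struct fpu helpers instead of the hand-rolled fxsave images removed from kvm_host.h. A minimal sketch of what the reworked fx_init() can look like, assuming the fpu_alloc() helper from i387.h (not part of this hunk):

    /* sketch: guest FPU setup via the generic struct fpu API (assumed helpers) */
    int fx_init(struct kvm_vcpu *vcpu)
    {
            int err = fpu_alloc(&vcpu->arch.guest_fpu);  /* backed by task_xstate_cachep */
            if (err)
                    return err;

            fpu_finit(&vcpu->arch.guest_fpu);            /* sane initial FPU state */
            return 0;
    }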
+422 -327
arch/x86/kvm/emulate.c
··· 9 9 * privileged instructions: 10 10 * 11 11 * Copyright (C) 2006 Qumranet 12 + * Copyright 2010 Red Hat, Inc. and/or its affilates. 12 13 * 13 14 * Avi Kivity <avi@qumranet.com> 14 15 * Yaniv Kamay <yaniv@qumranet.com> ··· 68 67 #define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ 69 68 #define SrcImmU (9<<4) /* Immediate operand, unsigned */ 70 69 #define SrcSI (0xa<<4) /* Source is in the DS:RSI */ 70 + #define SrcImmFAddr (0xb<<4) /* Source is immediate far address */ 71 + #define SrcMemFAddr (0xc<<4) /* Source is far address in memory */ 72 + #define SrcAcc (0xd<<4) /* Source Accumulator */ 71 73 #define SrcMask (0xf<<4) 72 74 /* Generic ModRM decode. */ 73 75 #define ModRM (1<<8) ··· 92 88 #define Src2CL (1<<29) 93 89 #define Src2ImmByte (2<<29) 94 90 #define Src2One (3<<29) 95 - #define Src2Imm16 (4<<29) 96 - #define Src2Mem16 (5<<29) /* Used for Ep encoding. First argument has to be 97 - in memory and second argument is located 98 - immediately after the first one in memory. */ 99 91 #define Src2Mask (7<<29) 100 92 101 93 enum { ··· 124 124 /* 0x20 - 0x27 */ 125 125 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 126 126 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 127 - DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, 127 + ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, 128 128 /* 0x28 - 0x2F */ 129 129 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 130 130 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 131 - 0, 0, 0, 0, 131 + ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, 132 132 /* 0x30 - 0x37 */ 133 133 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 134 134 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 135 - 0, 0, 0, 0, 135 + ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, 136 136 /* 0x38 - 0x3F */ 137 137 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 138 138 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, ··· 170 170 /* 0x88 - 0x8F */ 171 171 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, 172 172 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 173 - DstMem | SrcReg | ModRM | Mov, ModRM | DstReg, 174 - DstReg | SrcMem | ModRM | Mov, Group | Group1A, 173 + DstMem | SrcNone | ModRM | Mov, ModRM | DstReg, 174 + ImplicitOps | SrcMem16 | ModRM, Group | Group1A, 175 175 /* 0x90 - 0x97 */ 176 176 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, 177 177 /* 0x98 - 0x9F */ 178 - 0, 0, SrcImm | Src2Imm16 | No64, 0, 178 + 0, 0, SrcImmFAddr | No64, 0, 179 179 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, 180 180 /* 0xA0 - 0xA7 */ 181 - ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 182 - ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs, 181 + ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs, 182 + ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs, 183 183 ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String, 184 184 ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String, 185 185 /* 0xA8 - 0xAF */ 186 - 0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String, 186 + DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String, 187 187 ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String, 188 188 ByteOp | DstDI | String, DstDI | String, 189 189 /* 0xB0 - 0xB7 */ ··· 215 215 ByteOp | SrcImmUByte | 
DstAcc, SrcImmUByte | DstAcc, 216 216 /* 0xE8 - 0xEF */ 217 217 SrcImm | Stack, SrcImm | ImplicitOps, 218 - SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps, 218 + SrcImmFAddr | No64, SrcImmByte | ImplicitOps, 219 219 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, 220 220 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc, 221 221 /* 0xF0 - 0xF7 */ ··· 337 337 [Group1A*8] = 338 338 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, 339 339 [Group3_Byte*8] = 340 - ByteOp | SrcImm | DstMem | ModRM, 0, 340 + ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM, 341 341 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 342 342 0, 0, 0, 0, 343 343 [Group3*8] = 344 - DstMem | SrcImm | ModRM, 0, 344 + DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, 345 345 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 346 346 0, 0, 0, 0, 347 347 [Group4*8] = 348 - ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 348 + ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock, 349 349 0, 0, 0, 0, 0, 0, 350 350 [Group5*8] = 351 - DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 351 + DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock, 352 352 SrcMem | ModRM | Stack, 0, 353 - SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps, 353 + SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps, 354 354 SrcMem | ModRM | Stack, 0, 355 355 [Group7*8] = 356 356 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, ··· 576 576 (_type)_x; \ 577 577 }) 578 578 579 + #define insn_fetch_arr(_arr, _size, _eip) \ 580 + ({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \ 581 + if (rc != X86EMUL_CONTINUE) \ 582 + goto done; \ 583 + (_eip) += (_size); \ 584 + }) 585 + 579 586 static inline unsigned long ad_mask(struct decode_cache *c) 580 587 { 581 588 return (1UL << (c->ad_bytes << 3)) - 1; ··· 624 617 c->seg_override = seg; 625 618 } 626 619 627 - static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) 620 + static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, 621 + struct x86_emulate_ops *ops, int seg) 628 622 { 629 623 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) 630 624 return 0; 631 625 632 - return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg); 626 + return ops->get_cached_segment_base(seg, ctxt->vcpu); 633 627 } 634 628 635 629 static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, 630 + struct x86_emulate_ops *ops, 636 631 struct decode_cache *c) 637 632 { 638 633 if (!c->has_seg_override) 639 634 return 0; 640 635 641 - return seg_base(ctxt, c->seg_override); 636 + return seg_base(ctxt, ops, c->seg_override); 642 637 } 643 638 644 - static unsigned long es_base(struct x86_emulate_ctxt *ctxt) 639 + static unsigned long es_base(struct x86_emulate_ctxt *ctxt, 640 + struct x86_emulate_ops *ops) 645 641 { 646 - return seg_base(ctxt, VCPU_SREG_ES); 642 + return seg_base(ctxt, ops, VCPU_SREG_ES); 647 643 } 648 644 649 - static unsigned long ss_base(struct x86_emulate_ctxt *ctxt) 645 + static unsigned long ss_base(struct x86_emulate_ctxt *ctxt, 646 + struct x86_emulate_ops *ops) 650 647 { 651 - return seg_base(ctxt, VCPU_SREG_SS); 648 + return seg_base(ctxt, ops, VCPU_SREG_SS); 649 + } 650 + 651 + static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, 652 + u32 error, bool valid) 653 + { 654 + ctxt->exception = vec; 655 + ctxt->error_code = error; 656 + ctxt->error_code_valid = valid; 657 + ctxt->restart = false; 658 + } 659 + 660 + 
static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) 661 + { 662 + emulate_exception(ctxt, GP_VECTOR, err, true); 663 + } 664 + 665 + static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr, 666 + int err) 667 + { 668 + ctxt->cr2 = addr; 669 + emulate_exception(ctxt, PF_VECTOR, err, true); 670 + } 671 + 672 + static void emulate_ud(struct x86_emulate_ctxt *ctxt) 673 + { 674 + emulate_exception(ctxt, UD_VECTOR, 0, false); 675 + } 676 + 677 + static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err) 678 + { 679 + emulate_exception(ctxt, TS_VECTOR, err, true); 652 680 } 653 681 654 682 static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, ··· 974 932 /* we cannot decode insn before we complete previous rep insn */ 975 933 WARN_ON(ctxt->restart); 976 934 977 - /* Shadow copy of register state. Committed on successful emulation. */ 978 - memset(c, 0, sizeof(struct decode_cache)); 979 935 c->eip = ctxt->eip; 980 936 c->fetch.start = c->fetch.end = c->eip; 981 - ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); 982 - memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 937 + ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); 983 938 984 939 switch (mode) { 985 940 case X86EMUL_MODE_REAL: ··· 1099 1060 set_seg_override(c, VCPU_SREG_DS); 1100 1061 1101 1062 if (!(!c->twobyte && c->b == 0x8d)) 1102 - c->modrm_ea += seg_override_base(ctxt, c); 1063 + c->modrm_ea += seg_override_base(ctxt, ops, c); 1103 1064 1104 1065 if (c->ad_bytes != 8) 1105 1066 c->modrm_ea = (u32)c->modrm_ea; ··· 1187 1148 else 1188 1149 c->src.val = insn_fetch(u8, 1, c->eip); 1189 1150 break; 1151 + case SrcAcc: 1152 + c->src.type = OP_REG; 1153 + c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1154 + c->src.ptr = &c->regs[VCPU_REGS_RAX]; 1155 + switch (c->src.bytes) { 1156 + case 1: 1157 + c->src.val = *(u8 *)c->src.ptr; 1158 + break; 1159 + case 2: 1160 + c->src.val = *(u16 *)c->src.ptr; 1161 + break; 1162 + case 4: 1163 + c->src.val = *(u32 *)c->src.ptr; 1164 + break; 1165 + case 8: 1166 + c->src.val = *(u64 *)c->src.ptr; 1167 + break; 1168 + } 1169 + break; 1190 1170 case SrcOne: 1191 1171 c->src.bytes = 1; 1192 1172 c->src.val = 1; ··· 1214 1156 c->src.type = OP_MEM; 1215 1157 c->src.bytes = (c->d & ByteOp) ? 
1 : c->op_bytes; 1216 1158 c->src.ptr = (unsigned long *) 1217 - register_address(c, seg_override_base(ctxt, c), 1159 + register_address(c, seg_override_base(ctxt, ops, c), 1218 1160 c->regs[VCPU_REGS_RSI]); 1219 1161 c->src.val = 0; 1162 + break; 1163 + case SrcImmFAddr: 1164 + c->src.type = OP_IMM; 1165 + c->src.ptr = (unsigned long *)c->eip; 1166 + c->src.bytes = c->op_bytes + 2; 1167 + insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip); 1168 + break; 1169 + case SrcMemFAddr: 1170 + c->src.type = OP_MEM; 1171 + c->src.ptr = (unsigned long *)c->modrm_ea; 1172 + c->src.bytes = c->op_bytes + 2; 1220 1173 break; 1221 1174 } 1222 1175 ··· 1248 1179 c->src2.bytes = 1; 1249 1180 c->src2.val = insn_fetch(u8, 1, c->eip); 1250 1181 break; 1251 - case Src2Imm16: 1252 - c->src2.type = OP_IMM; 1253 - c->src2.ptr = (unsigned long *)c->eip; 1254 - c->src2.bytes = 2; 1255 - c->src2.val = insn_fetch(u16, 2, c->eip); 1256 - break; 1257 1182 case Src2One: 1258 1183 c->src2.bytes = 1; 1259 1184 c->src2.val = 1; 1260 - break; 1261 - case Src2Mem16: 1262 - c->src2.type = OP_MEM; 1263 - c->src2.bytes = 2; 1264 - c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes); 1265 - c->src2.val = 0; 1266 1185 break; 1267 1186 } 1268 1187 ··· 1310 1253 c->dst.type = OP_MEM; 1311 1254 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1312 1255 c->dst.ptr = (unsigned long *) 1313 - register_address(c, es_base(ctxt), 1256 + register_address(c, es_base(ctxt, ops), 1314 1257 c->regs[VCPU_REGS_RDI]); 1315 1258 c->dst.val = 0; 1316 1259 break; ··· 1318 1261 1319 1262 done: 1320 1263 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 1264 + } 1265 + 1266 + static int read_emulated(struct x86_emulate_ctxt *ctxt, 1267 + struct x86_emulate_ops *ops, 1268 + unsigned long addr, void *dest, unsigned size) 1269 + { 1270 + int rc; 1271 + struct read_cache *mc = &ctxt->decode.mem_read; 1272 + u32 err; 1273 + 1274 + while (size) { 1275 + int n = min(size, 8u); 1276 + size -= n; 1277 + if (mc->pos < mc->end) 1278 + goto read_cached; 1279 + 1280 + rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, 1281 + ctxt->vcpu); 1282 + if (rc == X86EMUL_PROPAGATE_FAULT) 1283 + emulate_pf(ctxt, addr, err); 1284 + if (rc != X86EMUL_CONTINUE) 1285 + return rc; 1286 + mc->end += n; 1287 + 1288 + read_cached: 1289 + memcpy(dest, mc->data + mc->pos, n); 1290 + mc->pos += n; 1291 + dest += n; 1292 + addr += n; 1293 + } 1294 + return X86EMUL_CONTINUE; 1321 1295 } 1322 1296 1323 1297 static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, ··· 1418 1330 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 1419 1331 1420 1332 if (dt.size < index * 8 + 7) { 1421 - kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); 1333 + emulate_gp(ctxt, selector & 0xfffc); 1422 1334 return X86EMUL_PROPAGATE_FAULT; 1423 1335 } 1424 1336 addr = dt.address + index * 8; 1425 1337 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 1426 1338 if (ret == X86EMUL_PROPAGATE_FAULT) 1427 - kvm_inject_page_fault(ctxt->vcpu, addr, err); 1339 + emulate_pf(ctxt, addr, err); 1428 1340 1429 1341 return ret; 1430 1342 } ··· 1443 1355 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 1444 1356 1445 1357 if (dt.size < index * 8 + 7) { 1446 - kvm_inject_gp(ctxt->vcpu, selector & 0xfffc); 1358 + emulate_gp(ctxt, selector & 0xfffc); 1447 1359 return X86EMUL_PROPAGATE_FAULT; 1448 1360 } 1449 1361 1450 1362 addr = dt.address + index * 8; 1451 1363 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 1452 1364 if (ret == X86EMUL_PROPAGATE_FAULT) 1453 - 
kvm_inject_page_fault(ctxt->vcpu, addr, err); 1365 + emulate_pf(ctxt, addr, err); 1454 1366 1455 1367 return ret; 1456 1368 } ··· 1569 1481 ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu); 1570 1482 return X86EMUL_CONTINUE; 1571 1483 exception: 1572 - kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code); 1484 + emulate_exception(ctxt, err_vec, err_code, true); 1573 1485 return X86EMUL_PROPAGATE_FAULT; 1574 1486 } 1575 1487 1576 - static inline void emulate_push(struct x86_emulate_ctxt *ctxt) 1488 + static inline int writeback(struct x86_emulate_ctxt *ctxt, 1489 + struct x86_emulate_ops *ops) 1490 + { 1491 + int rc; 1492 + struct decode_cache *c = &ctxt->decode; 1493 + u32 err; 1494 + 1495 + switch (c->dst.type) { 1496 + case OP_REG: 1497 + /* The 4-byte case *is* correct: 1498 + * in 64-bit mode we zero-extend. 1499 + */ 1500 + switch (c->dst.bytes) { 1501 + case 1: 1502 + *(u8 *)c->dst.ptr = (u8)c->dst.val; 1503 + break; 1504 + case 2: 1505 + *(u16 *)c->dst.ptr = (u16)c->dst.val; 1506 + break; 1507 + case 4: 1508 + *c->dst.ptr = (u32)c->dst.val; 1509 + break; /* 64b: zero-ext */ 1510 + case 8: 1511 + *c->dst.ptr = c->dst.val; 1512 + break; 1513 + } 1514 + break; 1515 + case OP_MEM: 1516 + if (c->lock_prefix) 1517 + rc = ops->cmpxchg_emulated( 1518 + (unsigned long)c->dst.ptr, 1519 + &c->dst.orig_val, 1520 + &c->dst.val, 1521 + c->dst.bytes, 1522 + &err, 1523 + ctxt->vcpu); 1524 + else 1525 + rc = ops->write_emulated( 1526 + (unsigned long)c->dst.ptr, 1527 + &c->dst.val, 1528 + c->dst.bytes, 1529 + &err, 1530 + ctxt->vcpu); 1531 + if (rc == X86EMUL_PROPAGATE_FAULT) 1532 + emulate_pf(ctxt, 1533 + (unsigned long)c->dst.ptr, err); 1534 + if (rc != X86EMUL_CONTINUE) 1535 + return rc; 1536 + break; 1537 + case OP_NONE: 1538 + /* no writeback */ 1539 + break; 1540 + default: 1541 + break; 1542 + } 1543 + return X86EMUL_CONTINUE; 1544 + } 1545 + 1546 + static inline void emulate_push(struct x86_emulate_ctxt *ctxt, 1547 + struct x86_emulate_ops *ops) 1577 1548 { 1578 1549 struct decode_cache *c = &ctxt->decode; 1579 1550 ··· 1640 1493 c->dst.bytes = c->op_bytes; 1641 1494 c->dst.val = c->src.val; 1642 1495 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); 1643 - c->dst.ptr = (void *) register_address(c, ss_base(ctxt), 1496 + c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops), 1644 1497 c->regs[VCPU_REGS_RSP]); 1645 1498 } 1646 1499 ··· 1651 1504 struct decode_cache *c = &ctxt->decode; 1652 1505 int rc; 1653 1506 1654 - rc = ops->read_emulated(register_address(c, ss_base(ctxt), 1655 - c->regs[VCPU_REGS_RSP]), 1656 - dest, len, ctxt->vcpu); 1507 + rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops), 1508 + c->regs[VCPU_REGS_RSP]), 1509 + dest, len); 1657 1510 if (rc != X86EMUL_CONTINUE) 1658 1511 return rc; 1659 1512 ··· 1688 1541 break; 1689 1542 case X86EMUL_MODE_VM86: 1690 1543 if (iopl < 3) { 1691 - kvm_inject_gp(ctxt->vcpu, 0); 1544 + emulate_gp(ctxt, 0); 1692 1545 return X86EMUL_PROPAGATE_FAULT; 1693 1546 } 1694 1547 change_mask |= EFLG_IF; ··· 1704 1557 return rc; 1705 1558 } 1706 1559 1707 - static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) 1560 + static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, 1561 + struct x86_emulate_ops *ops, int seg) 1708 1562 { 1709 1563 struct decode_cache *c = &ctxt->decode; 1710 - struct kvm_segment segment; 1711 1564 1712 - kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg); 1565 + c->src.val = ops->get_segment_selector(seg, ctxt->vcpu); 1713 1566 1714 - c->src.val = 
segment.selector; 1715 - emulate_push(ctxt); 1567 + emulate_push(ctxt, ops); 1716 1568 } 1717 1569 1718 1570 static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, ··· 1729 1583 return rc; 1730 1584 } 1731 1585 1732 - static void emulate_pusha(struct x86_emulate_ctxt *ctxt) 1586 + static int emulate_pusha(struct x86_emulate_ctxt *ctxt, 1587 + struct x86_emulate_ops *ops) 1733 1588 { 1734 1589 struct decode_cache *c = &ctxt->decode; 1735 1590 unsigned long old_esp = c->regs[VCPU_REGS_RSP]; 1591 + int rc = X86EMUL_CONTINUE; 1736 1592 int reg = VCPU_REGS_RAX; 1737 1593 1738 1594 while (reg <= VCPU_REGS_RDI) { 1739 1595 (reg == VCPU_REGS_RSP) ? 1740 1596 (c->src.val = old_esp) : (c->src.val = c->regs[reg]); 1741 1597 1742 - emulate_push(ctxt); 1598 + emulate_push(ctxt, ops); 1599 + 1600 + rc = writeback(ctxt, ops); 1601 + if (rc != X86EMUL_CONTINUE) 1602 + return rc; 1603 + 1743 1604 ++reg; 1744 1605 } 1606 + 1607 + /* Disable writeback. */ 1608 + c->dst.type = OP_NONE; 1609 + 1610 + return rc; 1745 1611 } 1746 1612 1747 1613 static int emulate_popa(struct x86_emulate_ctxt *ctxt, ··· 1853 1695 old_eip = c->eip; 1854 1696 c->eip = c->src.val; 1855 1697 c->src.val = old_eip; 1856 - emulate_push(ctxt); 1698 + emulate_push(ctxt, ops); 1857 1699 break; 1858 1700 } 1859 1701 case 4: /* jmp abs */ 1860 1702 c->eip = c->src.val; 1861 1703 break; 1862 1704 case 6: /* push */ 1863 - emulate_push(ctxt); 1705 + emulate_push(ctxt, ops); 1864 1706 break; 1865 1707 } 1866 1708 return X86EMUL_CONTINUE; ··· 1906 1748 return rc; 1907 1749 } 1908 1750 1909 - static inline int writeback(struct x86_emulate_ctxt *ctxt, 1910 - struct x86_emulate_ops *ops) 1911 - { 1912 - int rc; 1913 - struct decode_cache *c = &ctxt->decode; 1914 - 1915 - switch (c->dst.type) { 1916 - case OP_REG: 1917 - /* The 4-byte case *is* correct: 1918 - * in 64-bit mode we zero-extend. 1919 - */ 1920 - switch (c->dst.bytes) { 1921 - case 1: 1922 - *(u8 *)c->dst.ptr = (u8)c->dst.val; 1923 - break; 1924 - case 2: 1925 - *(u16 *)c->dst.ptr = (u16)c->dst.val; 1926 - break; 1927 - case 4: 1928 - *c->dst.ptr = (u32)c->dst.val; 1929 - break; /* 64b: zero-ext */ 1930 - case 8: 1931 - *c->dst.ptr = c->dst.val; 1932 - break; 1933 - } 1934 - break; 1935 - case OP_MEM: 1936 - if (c->lock_prefix) 1937 - rc = ops->cmpxchg_emulated( 1938 - (unsigned long)c->dst.ptr, 1939 - &c->dst.orig_val, 1940 - &c->dst.val, 1941 - c->dst.bytes, 1942 - ctxt->vcpu); 1943 - else 1944 - rc = ops->write_emulated( 1945 - (unsigned long)c->dst.ptr, 1946 - &c->dst.val, 1947 - c->dst.bytes, 1948 - ctxt->vcpu); 1949 - if (rc != X86EMUL_CONTINUE) 1950 - return rc; 1951 - break; 1952 - case OP_NONE: 1953 - /* no writeback */ 1954 - break; 1955 - default: 1956 - break; 1957 - } 1958 - return X86EMUL_CONTINUE; 1959 - } 1960 - 1961 - static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) 1962 - { 1963 - u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask); 1964 - /* 1965 - * an sti; sti; sequence only disable interrupts for the first 1966 - * instruction. So, if the last instruction, be it emulated or 1967 - * not, left the system with the INT_STI flag enabled, it 1968 - * means that the last instruction is an sti. We should not 1969 - * leave the flag on in this case. 
The same goes for mov ss 1970 - */ 1971 - if (!(int_shadow & mask)) 1972 - ctxt->interruptibility = mask; 1973 - } 1974 - 1975 1751 static inline void 1976 1752 setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, 1977 - struct kvm_segment *cs, struct kvm_segment *ss) 1753 + struct x86_emulate_ops *ops, struct desc_struct *cs, 1754 + struct desc_struct *ss) 1978 1755 { 1979 - memset(cs, 0, sizeof(struct kvm_segment)); 1980 - kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS); 1981 - memset(ss, 0, sizeof(struct kvm_segment)); 1756 + memset(cs, 0, sizeof(struct desc_struct)); 1757 + ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu); 1758 + memset(ss, 0, sizeof(struct desc_struct)); 1982 1759 1983 1760 cs->l = 0; /* will be adjusted later */ 1984 - cs->base = 0; /* flat segment */ 1761 + set_desc_base(cs, 0); /* flat segment */ 1985 1762 cs->g = 1; /* 4kb granularity */ 1986 - cs->limit = 0xffffffff; /* 4GB limit */ 1763 + set_desc_limit(cs, 0xfffff); /* 4GB limit */ 1987 1764 cs->type = 0x0b; /* Read, Execute, Accessed */ 1988 1765 cs->s = 1; 1989 1766 cs->dpl = 0; /* will be adjusted later */ 1990 - cs->present = 1; 1991 - cs->db = 1; 1767 + cs->p = 1; 1768 + cs->d = 1; 1992 1769 1993 - ss->unusable = 0; 1994 - ss->base = 0; /* flat segment */ 1995 - ss->limit = 0xffffffff; /* 4GB limit */ 1770 + set_desc_base(ss, 0); /* flat segment */ 1771 + set_desc_limit(ss, 0xfffff); /* 4GB limit */ 1996 1772 ss->g = 1; /* 4kb granularity */ 1997 1773 ss->s = 1; 1998 1774 ss->type = 0x03; /* Read/Write, Accessed */ 1999 - ss->db = 1; /* 32bit stack segment */ 1775 + ss->d = 1; /* 32bit stack segment */ 2000 1776 ss->dpl = 0; 2001 - ss->present = 1; 1777 + ss->p = 1; 2002 1778 } 2003 1779 2004 1780 static int 2005 - emulate_syscall(struct x86_emulate_ctxt *ctxt) 1781 + emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) 2006 1782 { 2007 1783 struct decode_cache *c = &ctxt->decode; 2008 - struct kvm_segment cs, ss; 1784 + struct desc_struct cs, ss; 2009 1785 u64 msr_data; 1786 + u16 cs_sel, ss_sel; 2010 1787 2011 1788 /* syscall is not available in real mode */ 2012 1789 if (ctxt->mode == X86EMUL_MODE_REAL || 2013 1790 ctxt->mode == X86EMUL_MODE_VM86) { 2014 - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 1791 + emulate_ud(ctxt); 2015 1792 return X86EMUL_PROPAGATE_FAULT; 2016 1793 } 2017 1794 2018 - setup_syscalls_segments(ctxt, &cs, &ss); 1795 + setup_syscalls_segments(ctxt, ops, &cs, &ss); 2019 1796 2020 - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); 1797 + ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); 2021 1798 msr_data >>= 32; 2022 - cs.selector = (u16)(msr_data & 0xfffc); 2023 - ss.selector = (u16)(msr_data + 8); 1799 + cs_sel = (u16)(msr_data & 0xfffc); 1800 + ss_sel = (u16)(msr_data + 8); 2024 1801 2025 1802 if (is_long_mode(ctxt->vcpu)) { 2026 - cs.db = 0; 1803 + cs.d = 0; 2027 1804 cs.l = 1; 2028 1805 } 2029 - kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); 2030 - kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); 1806 + ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); 1807 + ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 1808 + ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); 1809 + ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); 2031 1810 2032 1811 c->regs[VCPU_REGS_RCX] = c->eip; 2033 1812 if (is_long_mode(ctxt->vcpu)) { 2034 1813 #ifdef CONFIG_X86_64 2035 1814 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; 2036 1815 2037 - kvm_x86_ops->get_msr(ctxt->vcpu, 2038 - ctxt->mode == 
X86EMUL_MODE_PROT64 ? 2039 - MSR_LSTAR : MSR_CSTAR, &msr_data); 1816 + ops->get_msr(ctxt->vcpu, 1817 + ctxt->mode == X86EMUL_MODE_PROT64 ? 1818 + MSR_LSTAR : MSR_CSTAR, &msr_data); 2040 1819 c->eip = msr_data; 2041 1820 2042 - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); 1821 + ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); 2043 1822 ctxt->eflags &= ~(msr_data | EFLG_RF); 2044 1823 #endif 2045 1824 } else { 2046 1825 /* legacy mode */ 2047 - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); 1826 + ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); 2048 1827 c->eip = (u32)msr_data; 2049 1828 2050 1829 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); ··· 1991 1896 } 1992 1897 1993 1898 static int 1994 - emulate_sysenter(struct x86_emulate_ctxt *ctxt) 1899 + emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) 1995 1900 { 1996 1901 struct decode_cache *c = &ctxt->decode; 1997 - struct kvm_segment cs, ss; 1902 + struct desc_struct cs, ss; 1998 1903 u64 msr_data; 1904 + u16 cs_sel, ss_sel; 1999 1905 2000 1906 /* inject #GP if in real mode */ 2001 1907 if (ctxt->mode == X86EMUL_MODE_REAL) { 2002 - kvm_inject_gp(ctxt->vcpu, 0); 1908 + emulate_gp(ctxt, 0); 2003 1909 return X86EMUL_PROPAGATE_FAULT; 2004 1910 } 2005 1911 ··· 2008 1912 * Therefore, we inject an #UD. 2009 1913 */ 2010 1914 if (ctxt->mode == X86EMUL_MODE_PROT64) { 2011 - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 1915 + emulate_ud(ctxt); 2012 1916 return X86EMUL_PROPAGATE_FAULT; 2013 1917 } 2014 1918 2015 - setup_syscalls_segments(ctxt, &cs, &ss); 1919 + setup_syscalls_segments(ctxt, ops, &cs, &ss); 2016 1920 2017 - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); 1921 + ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); 2018 1922 switch (ctxt->mode) { 2019 1923 case X86EMUL_MODE_PROT32: 2020 1924 if ((msr_data & 0xfffc) == 0x0) { 2021 - kvm_inject_gp(ctxt->vcpu, 0); 1925 + emulate_gp(ctxt, 0); 2022 1926 return X86EMUL_PROPAGATE_FAULT; 2023 1927 } 2024 1928 break; 2025 1929 case X86EMUL_MODE_PROT64: 2026 1930 if (msr_data == 0x0) { 2027 - kvm_inject_gp(ctxt->vcpu, 0); 1931 + emulate_gp(ctxt, 0); 2028 1932 return X86EMUL_PROPAGATE_FAULT; 2029 1933 } 2030 1934 break; 2031 1935 } 2032 1936 2033 1937 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); 2034 - cs.selector = (u16)msr_data; 2035 - cs.selector &= ~SELECTOR_RPL_MASK; 2036 - ss.selector = cs.selector + 8; 2037 - ss.selector &= ~SELECTOR_RPL_MASK; 1938 + cs_sel = (u16)msr_data; 1939 + cs_sel &= ~SELECTOR_RPL_MASK; 1940 + ss_sel = cs_sel + 8; 1941 + ss_sel &= ~SELECTOR_RPL_MASK; 2038 1942 if (ctxt->mode == X86EMUL_MODE_PROT64 2039 1943 || is_long_mode(ctxt->vcpu)) { 2040 - cs.db = 0; 1944 + cs.d = 0; 2041 1945 cs.l = 1; 2042 1946 } 2043 1947 2044 - kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); 2045 - kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); 1948 + ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); 1949 + ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 1950 + ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); 1951 + ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); 2046 1952 2047 - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); 1953 + ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); 2048 1954 c->eip = msr_data; 2049 1955 2050 - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); 1956 + ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); 2051 1957 c->regs[VCPU_REGS_RSP] = msr_data; 2052 1958 
2053 1959 return X86EMUL_CONTINUE; 2054 1960 } 2055 1961 2056 1962 static int 2057 - emulate_sysexit(struct x86_emulate_ctxt *ctxt) 1963 + emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) 2058 1964 { 2059 1965 struct decode_cache *c = &ctxt->decode; 2060 - struct kvm_segment cs, ss; 1966 + struct desc_struct cs, ss; 2061 1967 u64 msr_data; 2062 1968 int usermode; 1969 + u16 cs_sel, ss_sel; 2063 1970 2064 1971 /* inject #GP if in real mode or Virtual 8086 mode */ 2065 1972 if (ctxt->mode == X86EMUL_MODE_REAL || 2066 1973 ctxt->mode == X86EMUL_MODE_VM86) { 2067 - kvm_inject_gp(ctxt->vcpu, 0); 1974 + emulate_gp(ctxt, 0); 2068 1975 return X86EMUL_PROPAGATE_FAULT; 2069 1976 } 2070 1977 2071 - setup_syscalls_segments(ctxt, &cs, &ss); 1978 + setup_syscalls_segments(ctxt, ops, &cs, &ss); 2072 1979 2073 1980 if ((c->rex_prefix & 0x8) != 0x0) 2074 1981 usermode = X86EMUL_MODE_PROT64; ··· 2080 1981 2081 1982 cs.dpl = 3; 2082 1983 ss.dpl = 3; 2083 - kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); 1984 + ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); 2084 1985 switch (usermode) { 2085 1986 case X86EMUL_MODE_PROT32: 2086 - cs.selector = (u16)(msr_data + 16); 1987 + cs_sel = (u16)(msr_data + 16); 2087 1988 if ((msr_data & 0xfffc) == 0x0) { 2088 - kvm_inject_gp(ctxt->vcpu, 0); 1989 + emulate_gp(ctxt, 0); 2089 1990 return X86EMUL_PROPAGATE_FAULT; 2090 1991 } 2091 - ss.selector = (u16)(msr_data + 24); 1992 + ss_sel = (u16)(msr_data + 24); 2092 1993 break; 2093 1994 case X86EMUL_MODE_PROT64: 2094 - cs.selector = (u16)(msr_data + 32); 1995 + cs_sel = (u16)(msr_data + 32); 2095 1996 if (msr_data == 0x0) { 2096 - kvm_inject_gp(ctxt->vcpu, 0); 1997 + emulate_gp(ctxt, 0); 2097 1998 return X86EMUL_PROPAGATE_FAULT; 2098 1999 } 2099 - ss.selector = cs.selector + 8; 2100 - cs.db = 0; 2000 + ss_sel = cs_sel + 8; 2001 + cs.d = 0; 2101 2002 cs.l = 1; 2102 2003 break; 2103 2004 } 2104 - cs.selector |= SELECTOR_RPL_MASK; 2105 - ss.selector |= SELECTOR_RPL_MASK; 2005 + cs_sel |= SELECTOR_RPL_MASK; 2006 + ss_sel |= SELECTOR_RPL_MASK; 2106 2007 2107 - kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS); 2108 - kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS); 2008 + ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); 2009 + ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 2010 + ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu); 2011 + ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); 2109 2012 2110 - c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; 2111 - c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; 2013 + c->eip = c->regs[VCPU_REGS_RDX]; 2014 + c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX]; 2112 2015 2113 2016 return X86EMUL_CONTINUE; 2114 2017 } ··· 2131 2030 struct x86_emulate_ops *ops, 2132 2031 u16 port, u16 len) 2133 2032 { 2134 - struct kvm_segment tr_seg; 2033 + struct desc_struct tr_seg; 2135 2034 int r; 2136 2035 u16 io_bitmap_ptr; 2137 2036 u8 perm, bit_idx = port & 0x7; 2138 2037 unsigned mask = (1 << len) - 1; 2139 2038 2140 - kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR); 2141 - if (tr_seg.unusable) 2039 + ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu); 2040 + if (!tr_seg.p) 2142 2041 return false; 2143 - if (tr_seg.limit < 103) 2042 + if (desc_limit_scaled(&tr_seg) < 103) 2144 2043 return false; 2145 - r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, 2146 - NULL); 2044 + r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2, 2045 + 
ctxt->vcpu, NULL); 2147 2046 if (r != X86EMUL_CONTINUE) 2148 2047 return false; 2149 - if (io_bitmap_ptr + port/8 > tr_seg.limit) 2048 + if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) 2150 2049 return false; 2151 - r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1, 2152 - ctxt->vcpu, NULL); 2050 + r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8, 2051 + &perm, 1, ctxt->vcpu, NULL); 2153 2052 if (r != X86EMUL_CONTINUE) 2154 2053 return false; 2155 2054 if ((perm >> bit_idx) & mask) ··· 2165 2064 if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) 2166 2065 return false; 2167 2066 return true; 2168 - } 2169 - 2170 - static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt, 2171 - struct x86_emulate_ops *ops, 2172 - int seg) 2173 - { 2174 - struct desc_struct desc; 2175 - if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu)) 2176 - return get_desc_base(&desc); 2177 - else 2178 - return ~0; 2179 2067 } 2180 2068 2181 2069 static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, ··· 2255 2165 &err); 2256 2166 if (ret == X86EMUL_PROPAGATE_FAULT) { 2257 2167 /* FIXME: need to provide precise fault address */ 2258 - kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); 2168 + emulate_pf(ctxt, old_tss_base, err); 2259 2169 return ret; 2260 2170 } 2261 2171 ··· 2265 2175 &err); 2266 2176 if (ret == X86EMUL_PROPAGATE_FAULT) { 2267 2177 /* FIXME: need to provide precise fault address */ 2268 - kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); 2178 + emulate_pf(ctxt, old_tss_base, err); 2269 2179 return ret; 2270 2180 } 2271 2181 ··· 2273 2183 &err); 2274 2184 if (ret == X86EMUL_PROPAGATE_FAULT) { 2275 2185 /* FIXME: need to provide precise fault address */ 2276 - kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); 2186 + emulate_pf(ctxt, new_tss_base, err); 2277 2187 return ret; 2278 2188 } 2279 2189 ··· 2286 2196 ctxt->vcpu, &err); 2287 2197 if (ret == X86EMUL_PROPAGATE_FAULT) { 2288 2198 /* FIXME: need to provide precise fault address */ 2289 - kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); 2199 + emulate_pf(ctxt, new_tss_base, err); 2290 2200 return ret; 2291 2201 } 2292 2202 } ··· 2328 2238 struct decode_cache *c = &ctxt->decode; 2329 2239 int ret; 2330 2240 2331 - ops->set_cr(3, tss->cr3, ctxt->vcpu); 2241 + if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) { 2242 + emulate_gp(ctxt, 0); 2243 + return X86EMUL_PROPAGATE_FAULT; 2244 + } 2332 2245 c->eip = tss->eip; 2333 2246 ctxt->eflags = tss->eflags | 2; 2334 2247 c->regs[VCPU_REGS_RAX] = tss->eax; ··· 2397 2304 &err); 2398 2305 if (ret == X86EMUL_PROPAGATE_FAULT) { 2399 2306 /* FIXME: need to provide precise fault address */ 2400 - kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); 2307 + emulate_pf(ctxt, old_tss_base, err); 2401 2308 return ret; 2402 2309 } 2403 2310 ··· 2407 2314 &err); 2408 2315 if (ret == X86EMUL_PROPAGATE_FAULT) { 2409 2316 /* FIXME: need to provide precise fault address */ 2410 - kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err); 2317 + emulate_pf(ctxt, old_tss_base, err); 2411 2318 return ret; 2412 2319 } 2413 2320 ··· 2415 2322 &err); 2416 2323 if (ret == X86EMUL_PROPAGATE_FAULT) { 2417 2324 /* FIXME: need to provide precise fault address */ 2418 - kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); 2325 + emulate_pf(ctxt, new_tss_base, err); 2419 2326 return ret; 2420 2327 } 2421 2328 ··· 2428 2335 ctxt->vcpu, &err); 2429 2336 if (ret == X86EMUL_PROPAGATE_FAULT) { 2430 2337 /* FIXME: need to provide precise fault address */ 2431 - 
kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err); 2338 + emulate_pf(ctxt, new_tss_base, err); 2432 2339 return ret; 2433 2340 } 2434 2341 } ··· 2445 2352 int ret; 2446 2353 u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu); 2447 2354 ulong old_tss_base = 2448 - get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR); 2355 + ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu); 2449 2356 u32 desc_limit; 2450 2357 2451 2358 /* FIXME: old_tss_base == ~0 ? */ ··· 2462 2369 if (reason != TASK_SWITCH_IRET) { 2463 2370 if ((tss_selector & 3) > next_tss_desc.dpl || 2464 2371 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) { 2465 - kvm_inject_gp(ctxt->vcpu, 0); 2372 + emulate_gp(ctxt, 0); 2466 2373 return X86EMUL_PROPAGATE_FAULT; 2467 2374 } 2468 2375 } ··· 2471 2378 if (!next_tss_desc.p || 2472 2379 ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || 2473 2380 desc_limit < 0x2b)) { 2474 - kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR, 2475 - tss_selector & 0xfffc); 2381 + emulate_ts(ctxt, tss_selector & 0xfffc); 2476 2382 return X86EMUL_PROPAGATE_FAULT; 2477 2383 } 2478 2384 ··· 2517 2425 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; 2518 2426 c->lock_prefix = 0; 2519 2427 c->src.val = (unsigned long) error_code; 2520 - emulate_push(ctxt); 2428 + emulate_push(ctxt, ops); 2521 2429 } 2522 2430 2523 2431 return ret; ··· 2531 2439 struct decode_cache *c = &ctxt->decode; 2532 2440 int rc; 2533 2441 2534 - memset(c, 0, sizeof(struct decode_cache)); 2535 2442 c->eip = ctxt->eip; 2536 - memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 2537 2443 c->dst.type = OP_NONE; 2538 2444 2539 2445 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, 2540 2446 has_error_code, error_code); 2541 2447 2542 2448 if (rc == X86EMUL_CONTINUE) { 2543 - memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); 2544 - kvm_rip_write(ctxt->vcpu, c->eip); 2545 2449 rc = writeback(ctxt, ops); 2450 + if (rc == X86EMUL_CONTINUE) 2451 + ctxt->eip = c->eip; 2546 2452 } 2547 2453 2548 2454 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; ··· 2564 2474 int rc = X86EMUL_CONTINUE; 2565 2475 int saved_dst_type = c->dst.type; 2566 2476 2567 - ctxt->interruptibility = 0; 2568 - 2569 - /* Shadow copy of register state. Committed on successful emulation. 2570 - * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't 2571 - * modify them. 
2572 - */ 2573 - 2574 - memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 2477 + ctxt->decode.mem_read.pos = 0; 2575 2478 2576 2479 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { 2577 - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2480 + emulate_ud(ctxt); 2578 2481 goto done; 2579 2482 } 2580 2483 2581 2484 /* LOCK prefix is allowed only with some instructions */ 2582 2485 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { 2583 - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2486 + emulate_ud(ctxt); 2584 2487 goto done; 2585 2488 } 2586 2489 2587 2490 /* Privileged instruction can be executed only in CPL=0 */ 2588 2491 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { 2589 - kvm_inject_gp(ctxt->vcpu, 0); 2492 + emulate_gp(ctxt, 0); 2590 2493 goto done; 2591 2494 } 2592 2495 ··· 2589 2506 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { 2590 2507 string_done: 2591 2508 ctxt->restart = false; 2592 - kvm_rip_write(ctxt->vcpu, c->eip); 2509 + ctxt->eip = c->eip; 2593 2510 goto done; 2594 2511 } 2595 2512 /* The second termination condition only applies for REPE ··· 2612 2529 } 2613 2530 2614 2531 if (c->src.type == OP_MEM) { 2615 - rc = ops->read_emulated((unsigned long)c->src.ptr, 2616 - &c->src.val, 2617 - c->src.bytes, 2618 - ctxt->vcpu); 2532 + rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr, 2533 + c->src.valptr, c->src.bytes); 2619 2534 if (rc != X86EMUL_CONTINUE) 2620 2535 goto done; 2621 2536 c->src.orig_val = c->src.val; 2622 2537 } 2623 2538 2624 2539 if (c->src2.type == OP_MEM) { 2625 - rc = ops->read_emulated((unsigned long)c->src2.ptr, 2626 - &c->src2.val, 2627 - c->src2.bytes, 2628 - ctxt->vcpu); 2540 + rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr, 2541 + &c->src2.val, c->src2.bytes); 2629 2542 if (rc != X86EMUL_CONTINUE) 2630 2543 goto done; 2631 2544 } ··· 2632 2553 2633 2554 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { 2634 2555 /* optimisation - avoid slow emulated read if Mov */ 2635 - rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val, 2636 - c->dst.bytes, ctxt->vcpu); 2556 + rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr, 2557 + &c->dst.val, c->dst.bytes); 2637 2558 if (rc != X86EMUL_CONTINUE) 2638 2559 goto done; 2639 2560 } ··· 2650 2571 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); 2651 2572 break; 2652 2573 case 0x06: /* push es */ 2653 - emulate_push_sreg(ctxt, VCPU_SREG_ES); 2574 + emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); 2654 2575 break; 2655 2576 case 0x07: /* pop es */ 2656 2577 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); ··· 2662 2583 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); 2663 2584 break; 2664 2585 case 0x0e: /* push cs */ 2665 - emulate_push_sreg(ctxt, VCPU_SREG_CS); 2586 + emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); 2666 2587 break; 2667 2588 case 0x10 ... 
0x15: 2668 2589 adc: /* adc */ 2669 2590 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); 2670 2591 break; 2671 2592 case 0x16: /* push ss */ 2672 - emulate_push_sreg(ctxt, VCPU_SREG_SS); 2593 + emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); 2673 2594 break; 2674 2595 case 0x17: /* pop ss */ 2675 2596 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); ··· 2681 2602 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); 2682 2603 break; 2683 2604 case 0x1e: /* push ds */ 2684 - emulate_push_sreg(ctxt, VCPU_SREG_DS); 2605 + emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); 2685 2606 break; 2686 2607 case 0x1f: /* pop ds */ 2687 2608 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); ··· 2711 2632 emulate_1op("dec", c->dst, ctxt->eflags); 2712 2633 break; 2713 2634 case 0x50 ... 0x57: /* push reg */ 2714 - emulate_push(ctxt); 2635 + emulate_push(ctxt, ops); 2715 2636 break; 2716 2637 case 0x58 ... 0x5f: /* pop reg */ 2717 2638 pop_instruction: ··· 2720 2641 goto done; 2721 2642 break; 2722 2643 case 0x60: /* pusha */ 2723 - emulate_pusha(ctxt); 2644 + rc = emulate_pusha(ctxt, ops); 2645 + if (rc != X86EMUL_CONTINUE) 2646 + goto done; 2724 2647 break; 2725 2648 case 0x61: /* popa */ 2726 2649 rc = emulate_popa(ctxt, ops); ··· 2736 2655 break; 2737 2656 case 0x68: /* push imm */ 2738 2657 case 0x6a: /* push imm8 */ 2739 - emulate_push(ctxt); 2658 + emulate_push(ctxt, ops); 2740 2659 break; 2741 2660 case 0x6c: /* insb */ 2742 2661 case 0x6d: /* insw/insd */ 2743 2662 c->dst.bytes = min(c->dst.bytes, 4u); 2744 2663 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 2745 2664 c->dst.bytes)) { 2746 - kvm_inject_gp(ctxt->vcpu, 0); 2665 + emulate_gp(ctxt, 0); 2747 2666 goto done; 2748 2667 } 2749 2668 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, ··· 2755 2674 c->src.bytes = min(c->src.bytes, 4u); 2756 2675 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 2757 2676 c->src.bytes)) { 2758 - kvm_inject_gp(ctxt->vcpu, 0); 2677 + emulate_gp(ctxt, 0); 2759 2678 goto done; 2760 2679 } 2761 2680 ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX], ··· 2788 2707 } 2789 2708 break; 2790 2709 case 0x84 ... 0x85: 2710 + test: 2791 2711 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); 2792 2712 break; 2793 2713 case 0x86 ... 0x87: /* xchg */ ··· 2817 2735 break; 2818 2736 case 0x88 ... 
0x8b: /* mov */ 2819 2737 goto mov; 2820 - case 0x8c: { /* mov r/m, sreg */ 2821 - struct kvm_segment segreg; 2822 - 2823 - if (c->modrm_reg <= VCPU_SREG_GS) 2824 - kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg); 2825 - else { 2826 - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2738 + case 0x8c: /* mov r/m, sreg */ 2739 + if (c->modrm_reg > VCPU_SREG_GS) { 2740 + emulate_ud(ctxt); 2827 2741 goto done; 2828 2742 } 2829 - c->dst.val = segreg.selector; 2743 + c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); 2830 2744 break; 2831 - } 2832 2745 case 0x8d: /* lea r16/r32, m */ 2833 2746 c->dst.val = c->modrm_ea; 2834 2747 break; ··· 2834 2757 2835 2758 if (c->modrm_reg == VCPU_SREG_CS || 2836 2759 c->modrm_reg > VCPU_SREG_GS) { 2837 - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 2760 + emulate_ud(ctxt); 2838 2761 goto done; 2839 2762 } 2840 2763 2841 2764 if (c->modrm_reg == VCPU_SREG_SS) 2842 - toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS); 2765 + ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; 2843 2766 2844 2767 rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg); 2845 2768 ··· 2852 2775 goto done; 2853 2776 break; 2854 2777 case 0x90: /* nop / xchg r8,rax */ 2855 - if (!(c->rex_prefix & 1)) { /* nop */ 2856 - c->dst.type = OP_NONE; 2778 + if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) { 2779 + c->dst.type = OP_NONE; /* nop */ 2857 2780 break; 2858 2781 } 2859 2782 case 0x91 ... 0x97: /* xchg reg,rax */ 2860 - c->src.type = c->dst.type = OP_REG; 2861 - c->src.bytes = c->dst.bytes = c->op_bytes; 2783 + c->src.type = OP_REG; 2784 + c->src.bytes = c->op_bytes; 2862 2785 c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX]; 2863 2786 c->src.val = *(c->src.ptr); 2864 2787 goto xchg; 2865 2788 case 0x9c: /* pushf */ 2866 2789 c->src.val = (unsigned long) ctxt->eflags; 2867 - emulate_push(ctxt); 2790 + emulate_push(ctxt, ops); 2868 2791 break; 2869 2792 case 0x9d: /* popf */ 2870 2793 c->dst.type = OP_REG; ··· 2874 2797 if (rc != X86EMUL_CONTINUE) 2875 2798 goto done; 2876 2799 break; 2877 - case 0xa0 ... 0xa1: /* mov */ 2878 - c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 2879 - c->dst.val = c->src.val; 2880 - break; 2881 - case 0xa2 ... 0xa3: /* mov */ 2882 - c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX]; 2883 - break; 2800 + case 0xa0 ... 0xa3: /* mov */ 2884 2801 case 0xa4 ... 0xa5: /* movs */ 2885 2802 goto mov; 2886 2803 case 0xa6 ... 0xa7: /* cmps */ 2887 2804 c->dst.type = OP_NONE; /* Disable writeback. */ 2888 2805 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); 2889 2806 goto cmp; 2807 + case 0xa8 ... 0xa9: /* test ax, imm */ 2808 + goto test; 2890 2809 case 0xaa ... 
0xab: /* stos */ 2891 2810 c->dst.val = c->regs[VCPU_REGS_RAX]; 2892 2811 break; ··· 2928 2855 long int rel = c->src.val; 2929 2856 c->src.val = (unsigned long) c->eip; 2930 2857 jmp_rel(c, rel); 2931 - emulate_push(ctxt); 2858 + emulate_push(ctxt, ops); 2932 2859 break; 2933 2860 } 2934 2861 case 0xe9: /* jmp rel */ 2935 2862 goto jmp; 2936 - case 0xea: /* jmp far */ 2863 + case 0xea: { /* jmp far */ 2864 + unsigned short sel; 2937 2865 jump_far: 2938 - if (load_segment_descriptor(ctxt, ops, c->src2.val, 2939 - VCPU_SREG_CS)) 2866 + memcpy(&sel, c->src.valptr + c->op_bytes, 2); 2867 + 2868 + if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS)) 2940 2869 goto done; 2941 2870 2942 - c->eip = c->src.val; 2871 + c->eip = 0; 2872 + memcpy(&c->eip, c->src.valptr, c->op_bytes); 2943 2873 break; 2874 + } 2944 2875 case 0xeb: 2945 2876 jmp: /* jmp rel short */ 2946 2877 jmp_rel(c, c->src.val); ··· 2956 2879 do_io_in: 2957 2880 c->dst.bytes = min(c->dst.bytes, 4u); 2958 2881 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { 2959 - kvm_inject_gp(ctxt->vcpu, 0); 2882 + emulate_gp(ctxt, 0); 2960 2883 goto done; 2961 2884 } 2962 2885 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, 2963 2886 &c->dst.val)) 2964 2887 goto done; /* IO is needed */ 2965 2888 break; 2966 - case 0xee: /* out al,dx */ 2967 - case 0xef: /* out (e/r)ax,dx */ 2889 + case 0xee: /* out dx,al */ 2890 + case 0xef: /* out dx,(e/r)ax */ 2968 2891 c->src.val = c->regs[VCPU_REGS_RDX]; 2969 2892 do_io_out: 2970 2893 c->dst.bytes = min(c->dst.bytes, 4u); 2971 2894 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { 2972 - kvm_inject_gp(ctxt->vcpu, 0); 2895 + emulate_gp(ctxt, 0); 2973 2896 goto done; 2974 2897 } 2975 2898 ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1, ··· 2993 2916 c->dst.type = OP_NONE; /* Disable writeback. */ 2994 2917 break; 2995 2918 case 0xfa: /* cli */ 2996 - if (emulator_bad_iopl(ctxt, ops)) 2997 - kvm_inject_gp(ctxt->vcpu, 0); 2998 - else { 2919 + if (emulator_bad_iopl(ctxt, ops)) { 2920 + emulate_gp(ctxt, 0); 2921 + goto done; 2922 + } else { 2999 2923 ctxt->eflags &= ~X86_EFLAGS_IF; 3000 2924 c->dst.type = OP_NONE; /* Disable writeback. */ 3001 2925 } 3002 2926 break; 3003 2927 case 0xfb: /* sti */ 3004 - if (emulator_bad_iopl(ctxt, ops)) 3005 - kvm_inject_gp(ctxt->vcpu, 0); 3006 - else { 3007 - toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI); 2928 + if (emulator_bad_iopl(ctxt, ops)) { 2929 + emulate_gp(ctxt, 0); 2930 + goto done; 2931 + } else { 2932 + ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; 3008 2933 ctxt->eflags |= X86_EFLAGS_IF; 3009 2934 c->dst.type = OP_NONE; /* Disable writeback. */ 3010 2935 } ··· 3043 2964 c->dst.type = saved_dst_type; 3044 2965 3045 2966 if ((c->d & SrcMask) == SrcSI) 3046 - string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI, 3047 - &c->src); 2967 + string_addr_inc(ctxt, seg_override_base(ctxt, ops, c), 2968 + VCPU_REGS_RSI, &c->src); 3048 2969 3049 2970 if ((c->d & DstMask) == DstDI) 3050 - string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst); 2971 + string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI, 2972 + &c->dst); 3051 2973 3052 2974 if (c->rep_prefix && (c->d & String)) { 3053 2975 struct read_cache *rc = &ctxt->decode.io_read; ··· 3061 2981 (rc->end != 0 && rc->end == rc->pos)) 3062 2982 ctxt->restart = false; 3063 2983 } 3064 - 3065 - /* Commit shadow register state. 
*/ 3066 - memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); 3067 - kvm_rip_write(ctxt->vcpu, c->eip); 3068 - ops->set_rflags(ctxt->vcpu, ctxt->eflags); 2984 + /* 2985 + * reset read cache here in case string instruction is restared 2986 + * without decoding 2987 + */ 2988 + ctxt->decode.mem_read.end = 0; 2989 + ctxt->eip = c->eip; 3069 2990 3070 2991 done: 3071 2992 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; ··· 3132 3051 c->dst.type = OP_NONE; 3133 3052 break; 3134 3053 case 5: /* not defined */ 3135 - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 3054 + emulate_ud(ctxt); 3136 3055 goto done; 3137 3056 case 7: /* invlpg*/ 3138 3057 emulate_invlpg(ctxt->vcpu, c->modrm_ea); ··· 3144 3063 } 3145 3064 break; 3146 3065 case 0x05: /* syscall */ 3147 - rc = emulate_syscall(ctxt); 3066 + rc = emulate_syscall(ctxt, ops); 3148 3067 if (rc != X86EMUL_CONTINUE) 3149 3068 goto done; 3150 3069 else ··· 3154 3073 emulate_clts(ctxt->vcpu); 3155 3074 c->dst.type = OP_NONE; 3156 3075 break; 3157 - case 0x08: /* invd */ 3158 3076 case 0x09: /* wbinvd */ 3077 + kvm_emulate_wbinvd(ctxt->vcpu); 3078 + c->dst.type = OP_NONE; 3079 + break; 3080 + case 0x08: /* invd */ 3159 3081 case 0x0d: /* GrpP (prefetch) */ 3160 3082 case 0x18: /* Grp16 (prefetch/nop) */ 3161 3083 c->dst.type = OP_NONE; ··· 3168 3084 case 1: 3169 3085 case 5 ... 7: 3170 3086 case 9 ... 15: 3171 - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 3087 + emulate_ud(ctxt); 3172 3088 goto done; 3173 3089 } 3174 3090 c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu); ··· 3177 3093 case 0x21: /* mov from dr to reg */ 3178 3094 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 3179 3095 (c->modrm_reg == 4 || c->modrm_reg == 5)) { 3180 - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 3096 + emulate_ud(ctxt); 3181 3097 goto done; 3182 3098 } 3183 - emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); 3099 + ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu); 3184 3100 c->dst.type = OP_NONE; /* no writeback */ 3185 3101 break; 3186 3102 case 0x22: /* mov reg, cr */ 3187 - ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu); 3103 + if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) { 3104 + emulate_gp(ctxt, 0); 3105 + goto done; 3106 + } 3188 3107 c->dst.type = OP_NONE; 3189 3108 break; 3190 3109 case 0x23: /* mov from reg to dr */ 3191 3110 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 3192 3111 (c->modrm_reg == 4 || c->modrm_reg == 5)) { 3193 - kvm_queue_exception(ctxt->vcpu, UD_VECTOR); 3112 + emulate_ud(ctxt); 3194 3113 goto done; 3195 3114 } 3196 - emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]); 3115 + 3116 + if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] & 3117 + ((ctxt->mode == X86EMUL_MODE_PROT64) ? 
3118 + ~0ULL : ~0U), ctxt->vcpu) < 0) { 3119 + /* #UD condition is already handled by the code above */ 3120 + emulate_gp(ctxt, 0); 3121 + goto done; 3122 + } 3123 + 3197 3124 c->dst.type = OP_NONE; /* no writeback */ 3198 3125 break; 3199 3126 case 0x30: 3200 3127 /* wrmsr */ 3201 3128 msr_data = (u32)c->regs[VCPU_REGS_RAX] 3202 3129 | ((u64)c->regs[VCPU_REGS_RDX] << 32); 3203 - if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { 3204 - kvm_inject_gp(ctxt->vcpu, 0); 3130 + if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { 3131 + emulate_gp(ctxt, 0); 3205 3132 goto done; 3206 3133 } 3207 3134 rc = X86EMUL_CONTINUE; ··· 3220 3125 break; 3221 3126 case 0x32: 3222 3127 /* rdmsr */ 3223 - if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { 3224 - kvm_inject_gp(ctxt->vcpu, 0); 3128 + if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { 3129 + emulate_gp(ctxt, 0); 3225 3130 goto done; 3226 3131 } else { 3227 3132 c->regs[VCPU_REGS_RAX] = (u32)msr_data; ··· 3231 3136 c->dst.type = OP_NONE; 3232 3137 break; 3233 3138 case 0x34: /* sysenter */ 3234 - rc = emulate_sysenter(ctxt); 3139 + rc = emulate_sysenter(ctxt, ops); 3235 3140 if (rc != X86EMUL_CONTINUE) 3236 3141 goto done; 3237 3142 else 3238 3143 goto writeback; 3239 3144 break; 3240 3145 case 0x35: /* sysexit */ 3241 - rc = emulate_sysexit(ctxt); 3146 + rc = emulate_sysexit(ctxt, ops); 3242 3147 if (rc != X86EMUL_CONTINUE) 3243 3148 goto done; 3244 3149 else ··· 3255 3160 c->dst.type = OP_NONE; 3256 3161 break; 3257 3162 case 0xa0: /* push fs */ 3258 - emulate_push_sreg(ctxt, VCPU_SREG_FS); 3163 + emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); 3259 3164 break; 3260 3165 case 0xa1: /* pop fs */ 3261 3166 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); ··· 3274 3179 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); 3275 3180 break; 3276 3181 case 0xa8: /* push gs */ 3277 - emulate_push_sreg(ctxt, VCPU_SREG_GS); 3182 + emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); 3278 3183 break; 3279 3184 case 0xa9: /* pop gs */ 3280 3185 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
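The emulator hunks above all follow one decoupling pattern: direct calls into kvm_x86_ops and kvm_inject_*()/kvm_queue_exception() become calls through the x86_emulate_ops table and the emulate_gp()/emulate_ud()/emulate_pf() helpers, segments are handled as a desc_struct plus a separate selector, and register/RIP state stays in the decode cache (c->regs, c->eip) instead of being copied to and from the vcpu on every invocation. A minimal sketch of that shape, with illustrative names rather than the kernel's actual definitions:

/*
 * Illustrative sketch only (simplified names, not the kernel's real
 * x86_emulate_ops): the emulator asks for host state through a callback
 * table and records faults in its own context instead of injecting them
 * into the vcpu directly.
 */
struct emu_ctxt;

struct emu_ops {
	int (*get_msr)(struct emu_ctxt *ctxt, unsigned int msr,
		       unsigned long long *data);
};

struct emu_ctxt {
	int pending_vector;		/* e.g. 13 for #GP */
	unsigned long long eip;
};

static void emu_gp(struct emu_ctxt *ctxt)
{
	ctxt->pending_vector = 13;	/* queue #GP; the caller propagates it */
}

static int emu_load_branch_target(struct emu_ctxt *ctxt,
				  const struct emu_ops *ops, unsigned int msr)
{
	unsigned long long msr_data;

	if (ops->get_msr(ctxt, msr, &msr_data)) {
		emu_gp(ctxt);		/* no kvm_inject_gp(): host state is only
					 * reached through the ops table */
		return -1;
	}
	ctxt->eip = msr_data;		/* new RIP stays in the emulator context */
	return 0;
}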
+90 -56
arch/x86/kvm/i8254.c
··· 5 5 * Copyright (c) 2006 Intel Corporation 6 6 * Copyright (c) 2007 Keir Fraser, XenSource Inc 7 7 * Copyright (c) 2008 Intel Corporation 8 + * Copyright 2009 Red Hat, Inc. and/or its affilates. 8 9 * 9 10 * Permission is hereby granted, free of charge, to any person obtaining a copy 10 11 * of this software and associated documentation files (the "Software"), to deal ··· 34 33 35 34 #include <linux/kvm_host.h> 36 35 #include <linux/slab.h> 36 + #include <linux/workqueue.h> 37 37 38 38 #include "irq.h" 39 39 #include "i8254.h" ··· 245 243 { 246 244 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, 247 245 irq_ack_notifier); 248 - raw_spin_lock(&ps->inject_lock); 249 - if (atomic_dec_return(&ps->pit_timer.pending) < 0) 246 + int value; 247 + 248 + spin_lock(&ps->inject_lock); 249 + value = atomic_dec_return(&ps->pit_timer.pending); 250 + if (value < 0) 251 + /* spurious acks can be generated if, for example, the 252 + * PIC is being reset. Handle it gracefully here 253 + */ 250 254 atomic_inc(&ps->pit_timer.pending); 255 + else if (value > 0) 256 + /* in this case, we had multiple outstanding pit interrupts 257 + * that we needed to inject. Reinject 258 + */ 259 + queue_work(ps->pit->wq, &ps->pit->expired); 251 260 ps->irq_ack = 1; 252 - raw_spin_unlock(&ps->inject_lock); 261 + spin_unlock(&ps->inject_lock); 253 262 } 254 263 255 264 void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) ··· 276 263 hrtimer_start_expires(timer, HRTIMER_MODE_ABS); 277 264 } 278 265 279 - static void destroy_pit_timer(struct kvm_timer *pt) 266 + static void destroy_pit_timer(struct kvm_pit *pit) 280 267 { 281 - pr_debug("execute del timer!\n"); 282 - hrtimer_cancel(&pt->timer); 268 + hrtimer_cancel(&pit->pit_state.pit_timer.timer); 269 + cancel_work_sync(&pit->expired); 283 270 } 284 271 285 272 static bool kpit_is_periodic(struct kvm_timer *ktimer) ··· 293 280 .is_periodic = kpit_is_periodic, 294 281 }; 295 282 283 + static void pit_do_work(struct work_struct *work) 284 + { 285 + struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); 286 + struct kvm *kvm = pit->kvm; 287 + struct kvm_vcpu *vcpu; 288 + int i; 289 + struct kvm_kpit_state *ps = &pit->pit_state; 290 + int inject = 0; 291 + 292 + /* Try to inject pending interrupts when 293 + * last one has been acked. 294 + */ 295 + spin_lock(&ps->inject_lock); 296 + if (ps->irq_ack) { 297 + ps->irq_ack = 0; 298 + inject = 1; 299 + } 300 + spin_unlock(&ps->inject_lock); 301 + if (inject) { 302 + kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); 303 + kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); 304 + 305 + /* 306 + * Provides NMI watchdog support via Virtual Wire mode. 307 + * The route is: PIT -> PIC -> LVT0 in NMI mode. 308 + * 309 + * Note: Our Virtual Wire implementation is simplified, only 310 + * propagating PIT interrupts to all VCPUs when they have set 311 + * LVT0 to NMI delivery. Other PIC interrupts are just sent to 312 + * VCPU0, and only if its LVT0 is in EXTINT mode. 
313 + */ 314 + if (kvm->arch.vapics_in_nmi_mode > 0) 315 + kvm_for_each_vcpu(i, vcpu, kvm) 316 + kvm_apic_nmi_wd_deliver(vcpu); 317 + } 318 + } 319 + 320 + static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) 321 + { 322 + struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); 323 + struct kvm_pit *pt = ktimer->kvm->arch.vpit; 324 + 325 + if (ktimer->reinject || !atomic_read(&ktimer->pending)) { 326 + atomic_inc(&ktimer->pending); 327 + queue_work(pt->wq, &pt->expired); 328 + } 329 + 330 + if (ktimer->t_ops->is_periodic(ktimer)) { 331 + hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); 332 + return HRTIMER_RESTART; 333 + } else 334 + return HRTIMER_NORESTART; 335 + } 336 + 296 337 static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) 297 338 { 298 339 struct kvm_timer *pt = &ps->pit_timer; ··· 358 291 359 292 /* TODO The new value only affected after the retriggered */ 360 293 hrtimer_cancel(&pt->timer); 294 + cancel_work_sync(&ps->pit->expired); 361 295 pt->period = interval; 362 296 ps->is_periodic = is_period; 363 297 364 - pt->timer.function = kvm_timer_fn; 298 + pt->timer.function = pit_timer_fn; 365 299 pt->t_ops = &kpit_ops; 366 300 pt->kvm = ps->pit->kvm; 367 - pt->vcpu = pt->kvm->bsp_vcpu; 368 301 369 302 atomic_set(&pt->pending, 0); 370 303 ps->irq_ack = 1; ··· 413 346 } 414 347 break; 415 348 default: 416 - destroy_pit_timer(&ps->pit_timer); 349 + destroy_pit_timer(kvm->arch.vpit); 417 350 } 418 351 } 419 352 ··· 692 625 693 626 mutex_init(&pit->pit_state.lock); 694 627 mutex_lock(&pit->pit_state.lock); 695 - raw_spin_lock_init(&pit->pit_state.inject_lock); 628 + spin_lock_init(&pit->pit_state.inject_lock); 629 + 630 + pit->wq = create_singlethread_workqueue("kvm-pit-wq"); 631 + if (!pit->wq) { 632 + mutex_unlock(&pit->pit_state.lock); 633 + kfree(pit); 634 + return NULL; 635 + } 636 + INIT_WORK(&pit->expired, pit_do_work); 696 637 697 638 kvm->arch.vpit = pit; 698 639 pit->kvm = kvm; ··· 752 677 struct hrtimer *timer; 753 678 754 679 if (kvm->arch.vpit) { 680 + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev); 681 + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, 682 + &kvm->arch.vpit->speaker_dev); 755 683 kvm_unregister_irq_mask_notifier(kvm, 0, 756 684 &kvm->arch.vpit->mask_notifier); 757 685 kvm_unregister_irq_ack_notifier(kvm, ··· 762 684 mutex_lock(&kvm->arch.vpit->pit_state.lock); 763 685 timer = &kvm->arch.vpit->pit_state.pit_timer.timer; 764 686 hrtimer_cancel(timer); 687 + cancel_work_sync(&kvm->arch.vpit->expired); 765 688 kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); 766 689 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 690 + destroy_workqueue(kvm->arch.vpit->wq); 767 691 kfree(kvm->arch.vpit); 768 - } 769 - } 770 - 771 - static void __inject_pit_timer_intr(struct kvm *kvm) 772 - { 773 - struct kvm_vcpu *vcpu; 774 - int i; 775 - 776 - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); 777 - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); 778 - 779 - /* 780 - * Provides NMI watchdog support via Virtual Wire mode. 781 - * The route is: PIT -> PIC -> LVT0 in NMI mode. 782 - * 783 - * Note: Our Virtual Wire implementation is simplified, only 784 - * propagating PIT interrupts to all VCPUs when they have set 785 - * LVT0 to NMI delivery. Other PIC interrupts are just sent to 786 - * VCPU0, and only if its LVT0 is in EXTINT mode. 
787 - */ 788 - if (kvm->arch.vapics_in_nmi_mode > 0) 789 - kvm_for_each_vcpu(i, vcpu, kvm) 790 - kvm_apic_nmi_wd_deliver(vcpu); 791 - } 792 - 793 - void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) 794 - { 795 - struct kvm_pit *pit = vcpu->kvm->arch.vpit; 796 - struct kvm *kvm = vcpu->kvm; 797 - struct kvm_kpit_state *ps; 798 - 799 - if (pit) { 800 - int inject = 0; 801 - ps = &pit->pit_state; 802 - 803 - /* Try to inject pending interrupts when 804 - * last one has been acked. 805 - */ 806 - raw_spin_lock(&ps->inject_lock); 807 - if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { 808 - ps->irq_ack = 0; 809 - inject = 1; 810 - } 811 - raw_spin_unlock(&ps->inject_lock); 812 - if (inject) 813 - __inject_pit_timer_intr(kvm); 814 692 } 815 693 }
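The reworked PIT above stops injecting from the vcpu entry path (kvm_inject_pit_timer_irqs() is gone): the hrtimer callback now only bumps the pending counter and queues a work item on the new "kvm-pit-wq" workqueue, and pit_do_work() performs the actual kvm_set_irq() calls in process context (inject_lock correspondingly drops from raw_spinlock_t to spinlock_t). A stripped-down sketch of that hrtimer-to-workqueue hand-off, with illustrative names and assuming current kernel headers:

#include <linux/hrtimer.h>
#include <linux/workqueue.h>
#include <linux/atomic.h>
#include <linux/kernel.h>

struct demo_timer {
	struct hrtimer timer;
	atomic_t pending;
	struct workqueue_struct *wq;
	struct work_struct expired;
	u64 period_ns;
};

static void demo_do_work(struct work_struct *work)
{
	/* runs in process context: safe to take sleeping locks and inject */
}

static enum hrtimer_restart demo_timer_fn(struct hrtimer *t)
{
	struct demo_timer *dt = container_of(t, struct demo_timer, timer);

	atomic_inc(&dt->pending);		/* remember the expiry ...      */
	queue_work(dt->wq, &dt->expired);	/* ... and defer the injection  */

	hrtimer_add_expires_ns(&dt->timer, dt->period_ns);
	return HRTIMER_RESTART;			/* periodic re-arm */
}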
+3 -1
arch/x86/kvm/i8254.h
··· 27 27 u32 speaker_data_on; 28 28 struct mutex lock; 29 29 struct kvm_pit *pit; 30 - raw_spinlock_t inject_lock; 30 + spinlock_t inject_lock; 31 31 unsigned long irq_ack; 32 32 struct kvm_irq_ack_notifier irq_ack_notifier; 33 33 }; ··· 40 40 struct kvm_kpit_state pit_state; 41 41 int irq_source_id; 42 42 struct kvm_irq_mask_notifier mask_notifier; 43 + struct workqueue_struct *wq; 44 + struct work_struct expired; 43 45 }; 44 46 45 47 #define KVM_PIT_BASE_ADDRESS 0x40
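The two new fields pair up with the i8254.c changes: expired is the work item queued by the timer callback and wq is the dedicated single-threaded workqueue it runs on, with cancel_work_sync() called before the queue is destroyed on teardown. A minimal sketch of that embed/init/teardown pattern, names illustrative:

#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <linux/errno.h>

struct demo_pit {
	struct workqueue_struct *wq;
	struct work_struct expired;
};

static void demo_expired_fn(struct work_struct *work)
{
	struct demo_pit *pit = container_of(work, struct demo_pit, expired);

	(void)pit;	/* inject the pending interrupt for this PIT */
}

static int demo_pit_init(struct demo_pit *pit)
{
	pit->wq = create_singlethread_workqueue("demo-pit-wq");
	if (!pit->wq)
		return -ENOMEM;
	INIT_WORK(&pit->expired, demo_expired_fn);
	return 0;
}

static void demo_pit_destroy(struct demo_pit *pit)
{
	cancel_work_sync(&pit->expired);	/* wait for a running handler  */
	destroy_workqueue(pit->wq);		/* then tear the queue down    */
}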
+31 -17
arch/x86/kvm/i8259.c
··· 3 3 * 4 4 * Copyright (c) 2003-2004 Fabrice Bellard 5 5 * Copyright (c) 2007 Intel Corporation 6 + * Copyright 2009 Red Hat, Inc. and/or its affilates. 6 7 * 7 8 * Permission is hereby granted, free of charge, to any person obtaining a copy 8 9 * of this software and associated documentation files (the "Software"), to deal ··· 34 33 #include <linux/kvm_host.h> 35 34 #include "trace.h" 36 35 36 + static void pic_irq_request(struct kvm *kvm, int level); 37 + 37 38 static void pic_lock(struct kvm_pic *s) 38 39 __acquires(&s->lock) 39 40 { ··· 46 43 __releases(&s->lock) 47 44 { 48 45 bool wakeup = s->wakeup_needed; 49 - struct kvm_vcpu *vcpu; 46 + struct kvm_vcpu *vcpu, *found = NULL; 47 + int i; 50 48 51 49 s->wakeup_needed = false; 52 50 53 51 raw_spin_unlock(&s->lock); 54 52 55 53 if (wakeup) { 56 - vcpu = s->kvm->bsp_vcpu; 57 - if (vcpu) 58 - kvm_vcpu_kick(vcpu); 54 + kvm_for_each_vcpu(i, vcpu, s->kvm) { 55 + if (kvm_apic_accept_pic_intr(vcpu)) { 56 + found = vcpu; 57 + break; 58 + } 59 + } 60 + 61 + if (!found) 62 + found = s->kvm->bsp_vcpu; 63 + 64 + kvm_vcpu_kick(found); 59 65 } 60 66 } 61 67 ··· 185 173 pic_set_irq1(&s->pics[0], 2, 0); 186 174 } 187 175 irq = pic_get_irq(&s->pics[0]); 188 - if (irq >= 0) 189 - s->irq_request(s->irq_request_opaque, 1); 190 - else 191 - s->irq_request(s->irq_request_opaque, 0); 176 + pic_irq_request(s->kvm, irq >= 0); 192 177 } 193 178 194 179 void kvm_pic_update_irq(struct kvm_pic *s) ··· 270 261 void kvm_pic_reset(struct kvm_kpic_state *s) 271 262 { 272 263 int irq; 273 - struct kvm *kvm = s->pics_state->irq_request_opaque; 274 - struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu; 264 + struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu; 275 265 u8 irr = s->irr, isr = s->imr; 276 266 277 267 s->last_irr = 0; ··· 309 301 /* 310 302 * deassert a pending interrupt 311 303 */ 312 - s->pics_state->irq_request(s->pics_state-> 313 - irq_request_opaque, 0); 304 + pic_irq_request(s->pics_state->kvm, 0); 314 305 s->init_state = 1; 315 306 s->init4 = val & 1; 316 307 if (val & 0x02) ··· 363 356 } 364 357 } else 365 358 switch (s->init_state) { 366 - case 0: /* normal mode */ 359 + case 0: { /* normal mode */ 360 + u8 imr_diff = s->imr ^ val, 361 + off = (s == &s->pics_state->pics[0]) ? 0 : 8; 367 362 s->imr = val; 363 + for (irq = 0; irq < PIC_NUM_PINS/2; irq++) 364 + if (imr_diff & (1 << irq)) 365 + kvm_fire_mask_notifiers( 366 + s->pics_state->kvm, 367 + SELECT_PIC(irq + off), 368 + irq + off, 369 + !!(s->imr & (1 << irq))); 368 370 pic_update_irq(s->pics_state); 369 371 break; 372 + } 370 373 case 1: 371 374 s->irq_base = val & 0xf8; 372 375 s->init_state = 2; ··· 535 518 /* 536 519 * callback when PIC0 irq status changed 537 520 */ 538 - static void pic_irq_request(void *opaque, int level) 521 + static void pic_irq_request(struct kvm *kvm, int level) 539 522 { 540 - struct kvm *kvm = opaque; 541 523 struct kvm_vcpu *vcpu = kvm->bsp_vcpu; 542 524 struct kvm_pic *s = pic_irqchip(kvm); 543 525 int irq = pic_get_irq(&s->pics[0]); ··· 565 549 s->kvm = kvm; 566 550 s->pics[0].elcr_mask = 0xf8; 567 551 s->pics[1].elcr_mask = 0xde; 568 - s->irq_request = pic_irq_request; 569 - s->irq_request_opaque = kvm; 570 552 s->pics[0].pics_state = s; 571 553 s->pics[1].pics_state = s; 572 554
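Two behavioural changes sit in the PIC code above: pic_unlock() now kicks the first vcpu that can actually accept a PIC interrupt (falling back to the BSP) instead of always the BSP, and an OCW1 write diffs the old and new IMR so kvm_fire_mask_notifiers() runs only for lines whose mask bit actually flipped. The diff-and-walk step in isolation (standalone sketch, not the kernel code; the notifier type is illustrative):

#include <stdint.h>

typedef void (*mask_notify_fn)(int irq, int masked);

static void notify_changed_lines(uint8_t old_imr, uint8_t new_imr,
				 int base, mask_notify_fn notify)
{
	uint8_t diff = old_imr ^ new_imr;	/* bits that flipped */
	int irq;

	for (irq = 0; irq < 8; irq++)
		if (diff & (1u << irq))
			notify(base + irq, !!(new_imr & (1u << irq)));
}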
+1 -1
arch/x86/kvm/irq.c
··· 1 1 /* 2 2 * irq.c: API for in kernel interrupt controller 3 3 * Copyright (c) 2007, Intel Corporation. 4 + * Copyright 2009 Red Hat, Inc. and/or its affilates. 4 5 * 5 6 * This program is free software; you can redistribute it and/or modify it 6 7 * under the terms and conditions of the GNU General Public License, ··· 90 89 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) 91 90 { 92 91 kvm_inject_apic_timer_irqs(vcpu); 93 - kvm_inject_pit_timer_irqs(vcpu); 94 92 /* TODO: PIT, RTC etc. */ 95 93 } 96 94 EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
-4
arch/x86/kvm/irq.h
··· 38 38 struct kvm; 39 39 struct kvm_vcpu; 40 40 41 - typedef void irq_request_func(void *opaque, int level); 42 - 43 41 struct kvm_kpic_state { 44 42 u8 last_irr; /* edge detection */ 45 43 u8 irr; /* interrupt request register */ ··· 65 67 unsigned pending_acks; 66 68 struct kvm *kvm; 67 69 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ 68 - irq_request_func *irq_request; 69 - void *irq_request_opaque; 70 70 int output; /* intr from master PIC */ 71 71 struct kvm_io_device dev; 72 72 void (*ack_notifier)(void *opaque, int irq);
+8
arch/x86/kvm/kvm_cache_regs.h
··· 36 36 37 37 static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) 38 38 { 39 + might_sleep(); /* on svm */ 40 + 39 41 if (!test_bit(VCPU_EXREG_PDPTR, 40 42 (unsigned long *)&vcpu->arch.regs_avail)) 41 43 kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); ··· 69 67 static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) 70 68 { 71 69 return kvm_read_cr4_bits(vcpu, ~0UL); 70 + } 71 + 72 + static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) 73 + { 74 + return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u) 75 + | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); 72 76 } 73 77 74 78 #endif
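The new kvm_read_edx_eax() helper packs the guest's EDX:EAX pair into a single u64, the convention x86 uses for 64-bit results of instructions such as RDMSR and RDTSC (and for the WRMSR input). The mapping, shown standalone:

#include <stdint.h>

/* EDX holds the high 32 bits, EAX the low 32 bits. */
static uint64_t combine_edx_eax(uint32_t edx, uint32_t eax)
{
	return ((uint64_t)edx << 32) | eax;
}

static void split_edx_eax(uint64_t val, uint32_t *eax, uint32_t *edx)
{
	*eax = (uint32_t)val;		/* low half  */
	*edx = (uint32_t)(val >> 32);	/* high half */
}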
+8 -9
arch/x86/kvm/lapic.c
··· 5 5 * Copyright (C) 2006 Qumranet, Inc. 6 6 * Copyright (C) 2007 Novell 7 7 * Copyright (C) 2007 Intel 8 + * Copyright 2009 Red Hat, Inc. and/or its affilates. 8 9 * 9 10 * Authors: 10 11 * Dor Laor <dor.laor@qumranet.com> ··· 329 328 "dest_mode 0x%x, short_hand 0x%x\n", 330 329 target, source, dest, dest_mode, short_hand); 331 330 332 - ASSERT(!target); 331 + ASSERT(target); 333 332 switch (short_hand) { 334 333 case APIC_DEST_NOSHORT: 335 334 if (dest_mode == 0) ··· 534 533 struct kvm_vcpu *vcpu = apic->vcpu; 535 534 struct kvm_run *run = vcpu->run; 536 535 537 - set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); 536 + kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu); 538 537 run->tpr_access.rip = kvm_rip_read(vcpu); 539 538 run->tpr_access.is_write = write; 540 539 } ··· 1107 1106 u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); 1108 1107 int r = 0; 1109 1108 1110 - if (kvm_vcpu_is_bsp(vcpu)) { 1111 - if (!apic_hw_enabled(vcpu->arch.apic)) 1112 - r = 1; 1113 - if ((lvt0 & APIC_LVT_MASKED) == 0 && 1114 - GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) 1115 - r = 1; 1116 - } 1109 + if (!apic_hw_enabled(vcpu->arch.apic)) 1110 + r = 1; 1111 + if ((lvt0 & APIC_LVT_MASKED) == 0 && 1112 + GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) 1113 + r = 1; 1117 1114 return r; 1118 1115 } 1119 1116
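Besides the switch to kvm_make_request() and the corrected ASSERT (the target pointer must be non-NULL), the last hunk drops the BSP-only restriction from kvm_apic_accept_pic_intr(): together with the i8259 change above, any vcpu whose local APIC is hardware-disabled, or whose LVT0 is unmasked and programmed for ExtINT delivery, may now take PIC interrupts. Restated as a standalone predicate, with illustrative field names rather than the kernel's:

/* Simplified restatement of the acceptance check above. */
struct demo_lapic {
	int hw_enabled;		/* LAPIC enabled in IA32_APIC_BASE	*/
	int lvt0_masked;	/* APIC_LVT_MASKED bit of LVT0		*/
	int lvt0_extint;	/* LVT0 delivery mode is ExtINT		*/
};

static int demo_accept_pic_intr(const struct demo_lapic *apic)
{
	if (!apic->hw_enabled)
		return 1;	/* no usable LAPIC: PIC output goes here */
	return !apic->lvt0_masked && apic->lvt0_extint;
}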
+499 -312
arch/x86/kvm/mmu.c
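The MMU hunks below introduce __xchg_spte() and update_spte(), which swap a shadow PTE atomically so that accessed/dirty bits the hardware may set concurrently are captured in the returned old value; on 32-bit hosts, where no native 64-bit xchg exists, that is an explicit cmpxchg64 loop. The shape of that loop as a hedged standalone sketch (GCC builtins stand in for the kernel's cmpxchg64):

#include <stdint.h>

/* Atomically exchange a 64-bit value using compare-and-swap, returning the
 * previous contents so the caller can inspect bits set concurrently (here:
 * the accessed/dirty bits of a shadow PTE). */
static uint64_t xchg64_via_cas(uint64_t *p, uint64_t new_val)
{
	uint64_t old = *p;

	while (!__atomic_compare_exchange_n(p, &old, new_val, 0,
					    __ATOMIC_SEQ_CST,
					    __ATOMIC_SEQ_CST))
		;	/* 'old' was refreshed with the observed value; retry */

	return old;
}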
··· 7 7 * MMU support 8 8 * 9 9 * Copyright (C) 2006 Qumranet, Inc. 10 + * Copyright 2010 Red Hat, Inc. and/or its affilates. 10 11 * 11 12 * Authors: 12 13 * Yaniv Kamay <yaniv@qumranet.com> ··· 33 32 #include <linux/compiler.h> 34 33 #include <linux/srcu.h> 35 34 #include <linux/slab.h> 35 + #include <linux/uaccess.h> 36 36 37 37 #include <asm/page.h> 38 38 #include <asm/cmpxchg.h> ··· 91 89 92 90 #define PT_FIRST_AVAIL_BITS_SHIFT 9 93 91 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 94 - 95 - #define VALID_PAGE(x) ((x) != INVALID_PAGE) 96 92 97 93 #define PT64_LEVEL_BITS 9 98 94 ··· 173 173 shadow_walk_okay(&(_walker)); \ 174 174 shadow_walk_next(&(_walker))) 175 175 176 - typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp); 176 + typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte); 177 177 178 178 static struct kmem_cache *pte_chain_cache; 179 179 static struct kmem_cache *rmap_desc_cache; ··· 288 288 #endif 289 289 } 290 290 291 + static u64 __xchg_spte(u64 *sptep, u64 new_spte) 292 + { 293 + #ifdef CONFIG_X86_64 294 + return xchg(sptep, new_spte); 295 + #else 296 + u64 old_spte; 297 + 298 + do { 299 + old_spte = *sptep; 300 + } while (cmpxchg64(sptep, old_spte, new_spte) != old_spte); 301 + 302 + return old_spte; 303 + #endif 304 + } 305 + 306 + static void update_spte(u64 *sptep, u64 new_spte) 307 + { 308 + u64 old_spte; 309 + 310 + if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) || 311 + !is_rmap_spte(*sptep)) 312 + __set_spte(sptep, new_spte); 313 + else { 314 + old_spte = __xchg_spte(sptep, new_spte); 315 + if (old_spte & shadow_accessed_mask) 316 + mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); 317 + } 318 + } 319 + 291 320 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 292 321 struct kmem_cache *base_cache, int min) 293 322 { ··· 333 304 return 0; 334 305 } 335 306 336 - static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc) 307 + static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc, 308 + struct kmem_cache *cache) 337 309 { 338 310 while (mc->nobjs) 339 - kfree(mc->objects[--mc->nobjs]); 311 + kmem_cache_free(cache, mc->objects[--mc->nobjs]); 340 312 } 341 313 342 314 static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, ··· 385 355 386 356 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu) 387 357 { 388 - mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache); 389 - mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache); 358 + mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache); 359 + mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache); 390 360 mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache); 391 - mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache); 361 + mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache, 362 + mmu_page_header_cache); 392 363 } 393 364 394 365 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc, ··· 410 379 411 380 static void mmu_free_pte_chain(struct kvm_pte_chain *pc) 412 381 { 413 - kfree(pc); 382 + kmem_cache_free(pte_chain_cache, pc); 414 383 } 415 384 416 385 static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu) ··· 421 390 422 391 static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd) 423 392 { 424 - kfree(rd); 393 + kmem_cache_free(rmap_desc_cache, rd); 394 + } 395 + 396 + static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index) 397 + { 398 + if (!sp->role.direct) 399 + return sp->gfns[index]; 400 + 401 + return sp->gfn + 
(index << ((sp->role.level - 1) * PT64_LEVEL_BITS)); 402 + } 403 + 404 + static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn) 405 + { 406 + if (sp->role.direct) 407 + BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index)); 408 + else 409 + sp->gfns[index] = gfn; 425 410 } 426 411 427 412 /* ··· 450 403 { 451 404 unsigned long idx; 452 405 453 - idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - 454 - (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); 406 + idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - 407 + (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); 455 408 return &slot->lpage_info[level - 2][idx].write_count; 456 409 } 457 410 ··· 461 414 int *write_count; 462 415 int i; 463 416 464 - gfn = unalias_gfn(kvm, gfn); 465 - 466 - slot = gfn_to_memslot_unaliased(kvm, gfn); 417 + slot = gfn_to_memslot(kvm, gfn); 467 418 for (i = PT_DIRECTORY_LEVEL; 468 419 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 469 420 write_count = slot_largepage_idx(gfn, slot, i); ··· 475 430 int *write_count; 476 431 int i; 477 432 478 - gfn = unalias_gfn(kvm, gfn); 479 - slot = gfn_to_memslot_unaliased(kvm, gfn); 433 + slot = gfn_to_memslot(kvm, gfn); 480 434 for (i = PT_DIRECTORY_LEVEL; 481 435 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 482 436 write_count = slot_largepage_idx(gfn, slot, i); ··· 491 447 struct kvm_memory_slot *slot; 492 448 int *largepage_idx; 493 449 494 - gfn = unalias_gfn(kvm, gfn); 495 - slot = gfn_to_memslot_unaliased(kvm, gfn); 450 + slot = gfn_to_memslot(kvm, gfn); 496 451 if (slot) { 497 452 largepage_idx = slot_largepage_idx(gfn, slot, level); 498 453 return *largepage_idx; ··· 544 501 545 502 /* 546 503 * Take gfn and return the reverse mapping to it. 547 - * Note: gfn must be unaliased before this function get called 548 504 */ 549 505 550 506 static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) ··· 555 513 if (likely(level == PT_PAGE_TABLE_LEVEL)) 556 514 return &slot->rmap[gfn - slot->base_gfn]; 557 515 558 - idx = (gfn / KVM_PAGES_PER_HPAGE(level)) - 559 - (slot->base_gfn / KVM_PAGES_PER_HPAGE(level)); 516 + idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - 517 + (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); 560 518 561 519 return &slot->lpage_info[level - 2][idx].rmap_pde; 562 520 } ··· 583 541 584 542 if (!is_rmap_spte(*spte)) 585 543 return count; 586 - gfn = unalias_gfn(vcpu->kvm, gfn); 587 544 sp = page_header(__pa(spte)); 588 - sp->gfns[spte - sp->spt] = gfn; 545 + kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); 589 546 rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); 590 547 if (!*rmapp) { 591 548 rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte); ··· 641 600 struct kvm_rmap_desc *desc; 642 601 struct kvm_rmap_desc *prev_desc; 643 602 struct kvm_mmu_page *sp; 644 - pfn_t pfn; 603 + gfn_t gfn; 645 604 unsigned long *rmapp; 646 605 int i; 647 606 648 - if (!is_rmap_spte(*spte)) 649 - return; 650 607 sp = page_header(__pa(spte)); 651 - pfn = spte_to_pfn(*spte); 652 - if (*spte & shadow_accessed_mask) 653 - kvm_set_pfn_accessed(pfn); 654 - if (is_writable_pte(*spte)) 655 - kvm_set_pfn_dirty(pfn); 656 - rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); 608 + gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); 609 + rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); 657 610 if (!*rmapp) { 658 611 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); 659 612 BUG(); ··· 677 642 pr_err("rmap_remove: %p %llx many->many\n", spte, *spte); 678 643 BUG(); 679 644 } 645 + } 646 + 647 + static void set_spte_track_bits(u64 *sptep, 
u64 new_spte) 648 + { 649 + pfn_t pfn; 650 + u64 old_spte = *sptep; 651 + 652 + if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) || 653 + old_spte & shadow_accessed_mask) { 654 + __set_spte(sptep, new_spte); 655 + } else 656 + old_spte = __xchg_spte(sptep, new_spte); 657 + 658 + if (!is_rmap_spte(old_spte)) 659 + return; 660 + pfn = spte_to_pfn(old_spte); 661 + if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) 662 + kvm_set_pfn_accessed(pfn); 663 + if (is_writable_pte(old_spte)) 664 + kvm_set_pfn_dirty(pfn); 665 + } 666 + 667 + static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) 668 + { 669 + set_spte_track_bits(sptep, new_spte); 670 + rmap_remove(kvm, sptep); 680 671 } 681 672 682 673 static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) ··· 737 676 u64 *spte; 738 677 int i, write_protected = 0; 739 678 740 - gfn = unalias_gfn(kvm, gfn); 741 679 rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL); 742 680 743 681 spte = rmap_next(kvm, rmapp, NULL); ··· 745 685 BUG_ON(!(*spte & PT_PRESENT_MASK)); 746 686 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); 747 687 if (is_writable_pte(*spte)) { 748 - __set_spte(spte, *spte & ~PT_WRITABLE_MASK); 688 + update_spte(spte, *spte & ~PT_WRITABLE_MASK); 749 689 write_protected = 1; 750 690 } 751 691 spte = rmap_next(kvm, rmapp, spte); ··· 769 709 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); 770 710 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); 771 711 if (is_writable_pte(*spte)) { 772 - rmap_remove(kvm, spte); 712 + drop_spte(kvm, spte, 713 + shadow_trap_nonpresent_pte); 773 714 --kvm->stat.lpages; 774 - __set_spte(spte, shadow_trap_nonpresent_pte); 775 715 spte = NULL; 776 716 write_protected = 1; 777 717 } ··· 791 731 while ((spte = rmap_next(kvm, rmapp, NULL))) { 792 732 BUG_ON(!(*spte & PT_PRESENT_MASK)); 793 733 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); 794 - rmap_remove(kvm, spte); 795 - __set_spte(spte, shadow_trap_nonpresent_pte); 734 + drop_spte(kvm, spte, shadow_trap_nonpresent_pte); 796 735 need_tlb_flush = 1; 797 736 } 798 737 return need_tlb_flush; ··· 813 754 rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); 814 755 need_flush = 1; 815 756 if (pte_write(*ptep)) { 816 - rmap_remove(kvm, spte); 817 - __set_spte(spte, shadow_trap_nonpresent_pte); 757 + drop_spte(kvm, spte, shadow_trap_nonpresent_pte); 818 758 spte = rmap_next(kvm, rmapp, NULL); 819 759 } else { 820 760 new_spte = *spte &~ (PT64_BASE_ADDR_MASK); ··· 821 763 822 764 new_spte &= ~PT_WRITABLE_MASK; 823 765 new_spte &= ~SPTE_HOST_WRITEABLE; 824 - if (is_writable_pte(*spte)) 825 - kvm_set_pfn_dirty(spte_to_pfn(*spte)); 826 - __set_spte(spte, new_spte); 766 + new_spte &= ~shadow_accessed_mask; 767 + set_spte_track_bits(spte, new_spte); 827 768 spte = rmap_next(kvm, rmapp, spte); 828 769 } 829 770 } ··· 856 799 ret = handler(kvm, &memslot->rmap[gfn_offset], data); 857 800 858 801 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { 859 - int idx = gfn_offset; 860 - idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); 802 + unsigned long idx; 803 + int sh; 804 + 805 + sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j); 806 + idx = ((memslot->base_gfn+gfn_offset) >> sh) - 807 + (memslot->base_gfn >> sh); 861 808 ret |= handler(kvm, 862 809 &memslot->lpage_info[j][idx].rmap_pde, 863 810 data); ··· 924 863 925 864 sp = page_header(__pa(spte)); 926 865 927 - gfn = unalias_gfn(vcpu->kvm, gfn); 928 866 rmapp = 
gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); 929 867 930 868 kvm_unmap_rmapp(vcpu->kvm, rmapp, 0); ··· 954 894 static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) 955 895 { 956 896 ASSERT(is_empty_shadow_page(sp->spt)); 897 + hlist_del(&sp->hash_link); 957 898 list_del(&sp->link); 958 899 __free_page(virt_to_page(sp->spt)); 959 - __free_page(virt_to_page(sp->gfns)); 960 - kfree(sp); 900 + if (!sp->role.direct) 901 + __free_page(virt_to_page(sp->gfns)); 902 + kmem_cache_free(mmu_page_header_cache, sp); 961 903 ++kvm->arch.n_free_mmu_pages; 962 904 } 963 905 ··· 969 907 } 970 908 971 909 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu, 972 - u64 *parent_pte) 910 + u64 *parent_pte, int direct) 973 911 { 974 912 struct kvm_mmu_page *sp; 975 913 976 914 sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp); 977 915 sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); 978 - sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE); 916 + if (!direct) 917 + sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, 918 + PAGE_SIZE); 979 919 set_page_private(virt_to_page(sp->spt), (unsigned long)sp); 980 920 list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages); 981 921 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); ··· 1062 998 BUG(); 1063 999 } 1064 1000 1065 - 1066 1001 static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn) 1067 1002 { 1068 1003 struct kvm_pte_chain *pte_chain; ··· 1071 1008 1072 1009 if (!sp->multimapped && sp->parent_pte) { 1073 1010 parent_sp = page_header(__pa(sp->parent_pte)); 1074 - fn(parent_sp); 1075 - mmu_parent_walk(parent_sp, fn); 1076 - return; 1077 - } 1078 - hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) 1079 - for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { 1080 - if (!pte_chain->parent_ptes[i]) 1081 - break; 1082 - parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); 1083 - fn(parent_sp); 1084 - mmu_parent_walk(parent_sp, fn); 1085 - } 1086 - } 1087 - 1088 - static void kvm_mmu_update_unsync_bitmap(u64 *spte) 1089 - { 1090 - unsigned int index; 1091 - struct kvm_mmu_page *sp = page_header(__pa(spte)); 1092 - 1093 - index = spte - sp->spt; 1094 - if (!__test_and_set_bit(index, sp->unsync_child_bitmap)) 1095 - sp->unsync_children++; 1096 - WARN_ON(!sp->unsync_children); 1097 - } 1098 - 1099 - static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) 1100 - { 1101 - struct kvm_pte_chain *pte_chain; 1102 - struct hlist_node *node; 1103 - int i; 1104 - 1105 - if (!sp->parent_pte) 1106 - return; 1107 - 1108 - if (!sp->multimapped) { 1109 - kvm_mmu_update_unsync_bitmap(sp->parent_pte); 1011 + fn(parent_sp, sp->parent_pte); 1110 1012 return; 1111 1013 } 1112 1014 1113 1015 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) 1114 1016 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { 1115 - if (!pte_chain->parent_ptes[i]) 1017 + u64 *spte = pte_chain->parent_ptes[i]; 1018 + 1019 + if (!spte) 1116 1020 break; 1117 - kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]); 1021 + parent_sp = page_header(__pa(spte)); 1022 + fn(parent_sp, spte); 1118 1023 } 1119 1024 } 1120 1025 1121 - static int unsync_walk_fn(struct kvm_mmu_page *sp) 1122 - { 1123 - kvm_mmu_update_parents_unsync(sp); 1124 - return 1; 1125 - } 1126 - 1026 + static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte); 1127 1027 static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp) 1128 1028 { 1129 - mmu_parent_walk(sp, 
unsync_walk_fn); 1130 - kvm_mmu_update_parents_unsync(sp); 1029 + mmu_parent_walk(sp, mark_unsync); 1030 + } 1031 + 1032 + static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte) 1033 + { 1034 + unsigned int index; 1035 + 1036 + index = spte - sp->spt; 1037 + if (__test_and_set_bit(index, sp->unsync_child_bitmap)) 1038 + return; 1039 + if (sp->unsync_children++) 1040 + return; 1041 + kvm_mmu_mark_parents_unsync(sp); 1131 1042 } 1132 1043 1133 1044 static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, ··· 1114 1077 } 1115 1078 1116 1079 static int nonpaging_sync_page(struct kvm_vcpu *vcpu, 1117 - struct kvm_mmu_page *sp) 1080 + struct kvm_mmu_page *sp, bool clear_unsync) 1118 1081 { 1119 1082 return 1; 1120 1083 } ··· 1160 1123 int i, ret, nr_unsync_leaf = 0; 1161 1124 1162 1125 for_each_unsync_children(sp->unsync_child_bitmap, i) { 1126 + struct kvm_mmu_page *child; 1163 1127 u64 ent = sp->spt[i]; 1164 1128 1165 - if (is_shadow_present_pte(ent) && !is_large_pte(ent)) { 1166 - struct kvm_mmu_page *child; 1167 - child = page_header(ent & PT64_BASE_ADDR_MASK); 1129 + if (!is_shadow_present_pte(ent) || is_large_pte(ent)) 1130 + goto clear_child_bitmap; 1168 1131 1169 - if (child->unsync_children) { 1170 - if (mmu_pages_add(pvec, child, i)) 1171 - return -ENOSPC; 1132 + child = page_header(ent & PT64_BASE_ADDR_MASK); 1172 1133 1173 - ret = __mmu_unsync_walk(child, pvec); 1174 - if (!ret) 1175 - __clear_bit(i, sp->unsync_child_bitmap); 1176 - else if (ret > 0) 1177 - nr_unsync_leaf += ret; 1178 - else 1179 - return ret; 1180 - } 1134 + if (child->unsync_children) { 1135 + if (mmu_pages_add(pvec, child, i)) 1136 + return -ENOSPC; 1181 1137 1182 - if (child->unsync) { 1183 - nr_unsync_leaf++; 1184 - if (mmu_pages_add(pvec, child, i)) 1185 - return -ENOSPC; 1186 - } 1187 - } 1138 + ret = __mmu_unsync_walk(child, pvec); 1139 + if (!ret) 1140 + goto clear_child_bitmap; 1141 + else if (ret > 0) 1142 + nr_unsync_leaf += ret; 1143 + else 1144 + return ret; 1145 + } else if (child->unsync) { 1146 + nr_unsync_leaf++; 1147 + if (mmu_pages_add(pvec, child, i)) 1148 + return -ENOSPC; 1149 + } else 1150 + goto clear_child_bitmap; 1151 + 1152 + continue; 1153 + 1154 + clear_child_bitmap: 1155 + __clear_bit(i, sp->unsync_child_bitmap); 1156 + sp->unsync_children--; 1157 + WARN_ON((int)sp->unsync_children < 0); 1188 1158 } 1189 1159 1190 - if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) 1191 - sp->unsync_children = 0; 1192 1160 1193 1161 return nr_unsync_leaf; 1194 1162 } ··· 1208 1166 return __mmu_unsync_walk(sp, pvec); 1209 1167 } 1210 1168 1211 - static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) 1212 - { 1213 - unsigned index; 1214 - struct hlist_head *bucket; 1215 - struct kvm_mmu_page *sp; 1216 - struct hlist_node *node; 1217 - 1218 - pgprintk("%s: looking for gfn %lx\n", __func__, gfn); 1219 - index = kvm_page_table_hashfn(gfn); 1220 - bucket = &kvm->arch.mmu_page_hash[index]; 1221 - hlist_for_each_entry(sp, node, bucket, hash_link) 1222 - if (sp->gfn == gfn && !sp->role.direct 1223 - && !sp->role.invalid) { 1224 - pgprintk("%s: found role %x\n", 1225 - __func__, sp->role.word); 1226 - return sp; 1227 - } 1228 - return NULL; 1229 - } 1230 - 1231 1169 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1232 1170 { 1233 1171 WARN_ON(!sp->unsync); ··· 1216 1194 --kvm->stat.mmu_unsync; 1217 1195 } 1218 1196 1219 - static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); 1197 + static int kvm_mmu_prepare_zap_page(struct kvm *kvm, 
struct kvm_mmu_page *sp, 1198 + struct list_head *invalid_list); 1199 + static void kvm_mmu_commit_zap_page(struct kvm *kvm, 1200 + struct list_head *invalid_list); 1220 1201 1221 - static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1202 + #define for_each_gfn_sp(kvm, sp, gfn, pos) \ 1203 + hlist_for_each_entry(sp, pos, \ 1204 + &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ 1205 + if ((sp)->gfn != (gfn)) {} else 1206 + 1207 + #define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos) \ 1208 + hlist_for_each_entry(sp, pos, \ 1209 + &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link) \ 1210 + if ((sp)->gfn != (gfn) || (sp)->role.direct || \ 1211 + (sp)->role.invalid) {} else 1212 + 1213 + /* @sp->gfn should be write-protected at the call site */ 1214 + static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 1215 + struct list_head *invalid_list, bool clear_unsync) 1222 1216 { 1223 1217 if (sp->role.cr4_pae != !!is_pae(vcpu)) { 1224 - kvm_mmu_zap_page(vcpu->kvm, sp); 1218 + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); 1225 1219 return 1; 1226 1220 } 1227 1221 1228 - if (rmap_write_protect(vcpu->kvm, sp->gfn)) 1229 - kvm_flush_remote_tlbs(vcpu->kvm); 1230 - kvm_unlink_unsync_page(vcpu->kvm, sp); 1231 - if (vcpu->arch.mmu.sync_page(vcpu, sp)) { 1232 - kvm_mmu_zap_page(vcpu->kvm, sp); 1222 + if (clear_unsync) 1223 + kvm_unlink_unsync_page(vcpu->kvm, sp); 1224 + 1225 + if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) { 1226 + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); 1233 1227 return 1; 1234 1228 } 1235 1229 1236 1230 kvm_mmu_flush_tlb(vcpu); 1237 1231 return 0; 1232 + } 1233 + 1234 + static int kvm_sync_page_transient(struct kvm_vcpu *vcpu, 1235 + struct kvm_mmu_page *sp) 1236 + { 1237 + LIST_HEAD(invalid_list); 1238 + int ret; 1239 + 1240 + ret = __kvm_sync_page(vcpu, sp, &invalid_list, false); 1241 + if (ret) 1242 + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 1243 + 1244 + return ret; 1245 + } 1246 + 1247 + static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 1248 + struct list_head *invalid_list) 1249 + { 1250 + return __kvm_sync_page(vcpu, sp, invalid_list, true); 1251 + } 1252 + 1253 + /* @gfn should be write-protected at the call site */ 1254 + static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) 1255 + { 1256 + struct kvm_mmu_page *s; 1257 + struct hlist_node *node; 1258 + LIST_HEAD(invalid_list); 1259 + bool flush = false; 1260 + 1261 + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { 1262 + if (!s->unsync) 1263 + continue; 1264 + 1265 + WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); 1266 + if ((s->role.cr4_pae != !!is_pae(vcpu)) || 1267 + (vcpu->arch.mmu.sync_page(vcpu, s, true))) { 1268 + kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); 1269 + continue; 1270 + } 1271 + kvm_unlink_unsync_page(vcpu->kvm, s); 1272 + flush = true; 1273 + } 1274 + 1275 + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 1276 + if (flush) 1277 + kvm_mmu_flush_tlb(vcpu); 1238 1278 } 1239 1279 1240 1280 struct mmu_page_path { ··· 1365 1281 struct kvm_mmu_page *sp; 1366 1282 struct mmu_page_path parents; 1367 1283 struct kvm_mmu_pages pages; 1284 + LIST_HEAD(invalid_list); 1368 1285 1369 1286 kvm_mmu_pages_init(parent, &parents, &pages); 1370 1287 while (mmu_unsync_walk(parent, &pages)) { ··· 1378 1293 kvm_flush_remote_tlbs(vcpu->kvm); 1379 1294 1380 1295 for_each_sp(pages, sp, parents, i) { 1381 - kvm_sync_page(vcpu, sp); 1296 + kvm_sync_page(vcpu, sp, 
&invalid_list); 1382 1297 mmu_pages_clear_parents(&parents); 1383 1298 } 1299 + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 1384 1300 cond_resched_lock(&vcpu->kvm->mmu_lock); 1385 1301 kvm_mmu_pages_init(parent, &parents, &pages); 1386 1302 } ··· 1396 1310 u64 *parent_pte) 1397 1311 { 1398 1312 union kvm_mmu_page_role role; 1399 - unsigned index; 1400 1313 unsigned quadrant; 1401 - struct hlist_head *bucket; 1402 1314 struct kvm_mmu_page *sp; 1403 - struct hlist_node *node, *tmp; 1315 + struct hlist_node *node; 1316 + bool need_sync = false; 1404 1317 1405 1318 role = vcpu->arch.mmu.base_role; 1406 1319 role.level = level; ··· 1407 1322 if (role.direct) 1408 1323 role.cr4_pae = 0; 1409 1324 role.access = access; 1410 - if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { 1325 + if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { 1411 1326 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); 1412 1327 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; 1413 1328 role.quadrant = quadrant; 1414 1329 } 1415 - index = kvm_page_table_hashfn(gfn); 1416 - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 1417 - hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) 1418 - if (sp->gfn == gfn) { 1419 - if (sp->unsync) 1420 - if (kvm_sync_page(vcpu, sp)) 1421 - continue; 1330 + for_each_gfn_sp(vcpu->kvm, sp, gfn, node) { 1331 + if (!need_sync && sp->unsync) 1332 + need_sync = true; 1422 1333 1423 - if (sp->role.word != role.word) 1424 - continue; 1334 + if (sp->role.word != role.word) 1335 + continue; 1425 1336 1426 - mmu_page_add_parent_pte(vcpu, sp, parent_pte); 1427 - if (sp->unsync_children) { 1428 - set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); 1429 - kvm_mmu_mark_parents_unsync(sp); 1430 - } 1431 - trace_kvm_mmu_get_page(sp, false); 1432 - return sp; 1433 - } 1337 + if (sp->unsync && kvm_sync_page_transient(vcpu, sp)) 1338 + break; 1339 + 1340 + mmu_page_add_parent_pte(vcpu, sp, parent_pte); 1341 + if (sp->unsync_children) { 1342 + kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); 1343 + kvm_mmu_mark_parents_unsync(sp); 1344 + } else if (sp->unsync) 1345 + kvm_mmu_mark_parents_unsync(sp); 1346 + 1347 + trace_kvm_mmu_get_page(sp, false); 1348 + return sp; 1349 + } 1434 1350 ++vcpu->kvm->stat.mmu_cache_miss; 1435 - sp = kvm_mmu_alloc_page(vcpu, parent_pte); 1351 + sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct); 1436 1352 if (!sp) 1437 1353 return sp; 1438 1354 sp->gfn = gfn; 1439 1355 sp->role = role; 1440 - hlist_add_head(&sp->hash_link, bucket); 1356 + hlist_add_head(&sp->hash_link, 1357 + &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); 1441 1358 if (!direct) { 1442 1359 if (rmap_write_protect(vcpu->kvm, gfn)) 1443 1360 kvm_flush_remote_tlbs(vcpu->kvm); 1361 + if (level > PT_PAGE_TABLE_LEVEL && need_sync) 1362 + kvm_sync_pages(vcpu, gfn); 1363 + 1444 1364 account_shadowed(vcpu->kvm, gfn); 1445 1365 } 1446 1366 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) ··· 1492 1402 --iterator->level; 1493 1403 } 1494 1404 1405 + static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp) 1406 + { 1407 + u64 spte; 1408 + 1409 + spte = __pa(sp->spt) 1410 + | PT_PRESENT_MASK | PT_ACCESSED_MASK 1411 + | PT_WRITABLE_MASK | PT_USER_MASK; 1412 + __set_spte(sptep, spte); 1413 + } 1414 + 1415 + static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) 1416 + { 1417 + if (is_large_pte(*sptep)) { 1418 + drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); 1419 + kvm_flush_remote_tlbs(vcpu->kvm); 1420 + } 1421 + } 1422 + 1423 + static 
void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep, 1424 + unsigned direct_access) 1425 + { 1426 + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) { 1427 + struct kvm_mmu_page *child; 1428 + 1429 + /* 1430 + * For the direct sp, if the guest pte's dirty bit 1431 + * changed form clean to dirty, it will corrupt the 1432 + * sp's access: allow writable in the read-only sp, 1433 + * so we should update the spte at this point to get 1434 + * a new sp with the correct access. 1435 + */ 1436 + child = page_header(*sptep & PT64_BASE_ADDR_MASK); 1437 + if (child->role.access == direct_access) 1438 + return; 1439 + 1440 + mmu_page_remove_parent_pte(child, sptep); 1441 + __set_spte(sptep, shadow_trap_nonpresent_pte); 1442 + kvm_flush_remote_tlbs(vcpu->kvm); 1443 + } 1444 + } 1445 + 1495 1446 static void kvm_mmu_page_unlink_children(struct kvm *kvm, 1496 1447 struct kvm_mmu_page *sp) 1497 1448 { ··· 1553 1422 } else { 1554 1423 if (is_large_pte(ent)) 1555 1424 --kvm->stat.lpages; 1556 - rmap_remove(kvm, &pt[i]); 1425 + drop_spte(kvm, &pt[i], 1426 + shadow_trap_nonpresent_pte); 1557 1427 } 1558 1428 } 1559 1429 pt[i] = shadow_trap_nonpresent_pte; ··· 1596 1464 } 1597 1465 1598 1466 static int mmu_zap_unsync_children(struct kvm *kvm, 1599 - struct kvm_mmu_page *parent) 1467 + struct kvm_mmu_page *parent, 1468 + struct list_head *invalid_list) 1600 1469 { 1601 1470 int i, zapped = 0; 1602 1471 struct mmu_page_path parents; ··· 1611 1478 struct kvm_mmu_page *sp; 1612 1479 1613 1480 for_each_sp(pages, sp, parents, i) { 1614 - kvm_mmu_zap_page(kvm, sp); 1481 + kvm_mmu_prepare_zap_page(kvm, sp, invalid_list); 1615 1482 mmu_pages_clear_parents(&parents); 1616 1483 zapped++; 1617 1484 } ··· 1621 1488 return zapped; 1622 1489 } 1623 1490 1624 - static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1491 + static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, 1492 + struct list_head *invalid_list) 1625 1493 { 1626 1494 int ret; 1627 1495 1628 - trace_kvm_mmu_zap_page(sp); 1496 + trace_kvm_mmu_prepare_zap_page(sp); 1629 1497 ++kvm->stat.mmu_shadow_zapped; 1630 - ret = mmu_zap_unsync_children(kvm, sp); 1498 + ret = mmu_zap_unsync_children(kvm, sp, invalid_list); 1631 1499 kvm_mmu_page_unlink_children(kvm, sp); 1632 1500 kvm_mmu_unlink_parents(kvm, sp); 1633 - kvm_flush_remote_tlbs(kvm); 1634 1501 if (!sp->role.invalid && !sp->role.direct) 1635 1502 unaccount_shadowed(kvm, sp->gfn); 1636 1503 if (sp->unsync) 1637 1504 kvm_unlink_unsync_page(kvm, sp); 1638 1505 if (!sp->root_count) { 1639 - hlist_del(&sp->hash_link); 1640 - kvm_mmu_free_page(kvm, sp); 1506 + /* Count self */ 1507 + ret++; 1508 + list_move(&sp->link, invalid_list); 1641 1509 } else { 1642 - sp->role.invalid = 1; 1643 1510 list_move(&sp->link, &kvm->arch.active_mmu_pages); 1644 1511 kvm_reload_remote_mmus(kvm); 1645 1512 } 1513 + 1514 + sp->role.invalid = 1; 1646 1515 kvm_mmu_reset_last_pte_updated(kvm); 1647 1516 return ret; 1517 + } 1518 + 1519 + static void kvm_mmu_commit_zap_page(struct kvm *kvm, 1520 + struct list_head *invalid_list) 1521 + { 1522 + struct kvm_mmu_page *sp; 1523 + 1524 + if (list_empty(invalid_list)) 1525 + return; 1526 + 1527 + kvm_flush_remote_tlbs(kvm); 1528 + 1529 + do { 1530 + sp = list_first_entry(invalid_list, struct kvm_mmu_page, link); 1531 + WARN_ON(!sp->role.invalid || sp->root_count); 1532 + kvm_mmu_free_page(kvm, sp); 1533 + } while (!list_empty(invalid_list)); 1534 + 1648 1535 } 1649 1536 1650 1537 /* ··· 1674 1521 void kvm_mmu_change_mmu_pages(struct kvm 
*kvm, unsigned int kvm_nr_mmu_pages) 1675 1522 { 1676 1523 int used_pages; 1524 + LIST_HEAD(invalid_list); 1677 1525 1678 1526 used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; 1679 1527 used_pages = max(0, used_pages); ··· 1692 1538 1693 1539 page = container_of(kvm->arch.active_mmu_pages.prev, 1694 1540 struct kvm_mmu_page, link); 1695 - used_pages -= kvm_mmu_zap_page(kvm, page); 1696 - used_pages--; 1541 + used_pages -= kvm_mmu_prepare_zap_page(kvm, page, 1542 + &invalid_list); 1697 1543 } 1544 + kvm_mmu_commit_zap_page(kvm, &invalid_list); 1698 1545 kvm_nr_mmu_pages = used_pages; 1699 1546 kvm->arch.n_free_mmu_pages = 0; 1700 1547 } ··· 1708 1553 1709 1554 static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) 1710 1555 { 1711 - unsigned index; 1712 - struct hlist_head *bucket; 1713 1556 struct kvm_mmu_page *sp; 1714 - struct hlist_node *node, *n; 1557 + struct hlist_node *node; 1558 + LIST_HEAD(invalid_list); 1715 1559 int r; 1716 1560 1717 1561 pgprintk("%s: looking for gfn %lx\n", __func__, gfn); 1718 1562 r = 0; 1719 - index = kvm_page_table_hashfn(gfn); 1720 - bucket = &kvm->arch.mmu_page_hash[index]; 1721 - restart: 1722 - hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) 1723 - if (sp->gfn == gfn && !sp->role.direct) { 1724 - pgprintk("%s: gfn %lx role %x\n", __func__, gfn, 1725 - sp->role.word); 1726 - r = 1; 1727 - if (kvm_mmu_zap_page(kvm, sp)) 1728 - goto restart; 1729 - } 1563 + 1564 + for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { 1565 + pgprintk("%s: gfn %lx role %x\n", __func__, gfn, 1566 + sp->role.word); 1567 + r = 1; 1568 + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 1569 + } 1570 + kvm_mmu_commit_zap_page(kvm, &invalid_list); 1730 1571 return r; 1731 1572 } 1732 1573 1733 1574 static void mmu_unshadow(struct kvm *kvm, gfn_t gfn) 1734 1575 { 1735 - unsigned index; 1736 - struct hlist_head *bucket; 1737 1576 struct kvm_mmu_page *sp; 1738 - struct hlist_node *node, *nn; 1577 + struct hlist_node *node; 1578 + LIST_HEAD(invalid_list); 1739 1579 1740 - index = kvm_page_table_hashfn(gfn); 1741 - bucket = &kvm->arch.mmu_page_hash[index]; 1742 - restart: 1743 - hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) { 1744 - if (sp->gfn == gfn && !sp->role.direct 1745 - && !sp->role.invalid) { 1746 - pgprintk("%s: zap %lx %x\n", 1747 - __func__, gfn, sp->role.word); 1748 - if (kvm_mmu_zap_page(kvm, sp)) 1749 - goto restart; 1750 - } 1580 + for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { 1581 + pgprintk("%s: zap %lx %x\n", 1582 + __func__, gfn, sp->role.word); 1583 + kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 1751 1584 } 1585 + kvm_mmu_commit_zap_page(kvm, &invalid_list); 1752 1586 } 1753 1587 1754 1588 static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) ··· 1867 1723 } 1868 1724 EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); 1869 1725 1870 - static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1726 + static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 1871 1727 { 1872 - unsigned index; 1873 - struct hlist_head *bucket; 1874 - struct kvm_mmu_page *s; 1875 - struct hlist_node *node, *n; 1876 - 1877 - index = kvm_page_table_hashfn(sp->gfn); 1878 - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 1879 - /* don't unsync if pagetable is shadowed with multiple roles */ 1880 - hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { 1881 - if (s->gfn != sp->gfn || s->role.direct) 1882 - continue; 1883 - if (s->role.word != sp->role.word) 1884 - return 
1; 1885 - } 1886 1728 trace_kvm_mmu_unsync_page(sp); 1887 1729 ++vcpu->kvm->stat.mmu_unsync; 1888 1730 sp->unsync = 1; 1889 1731 1890 1732 kvm_mmu_mark_parents_unsync(sp); 1891 - 1892 1733 mmu_convert_notrap(sp); 1893 - return 0; 1734 + } 1735 + 1736 + static void kvm_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn) 1737 + { 1738 + struct kvm_mmu_page *s; 1739 + struct hlist_node *node; 1740 + 1741 + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { 1742 + if (s->unsync) 1743 + continue; 1744 + WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); 1745 + __kvm_unsync_page(vcpu, s); 1746 + } 1894 1747 } 1895 1748 1896 1749 static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, 1897 1750 bool can_unsync) 1898 1751 { 1899 - struct kvm_mmu_page *shadow; 1752 + struct kvm_mmu_page *s; 1753 + struct hlist_node *node; 1754 + bool need_unsync = false; 1900 1755 1901 - shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); 1902 - if (shadow) { 1903 - if (shadow->role.level != PT_PAGE_TABLE_LEVEL) 1756 + for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) { 1757 + if (!can_unsync) 1904 1758 return 1; 1905 - if (shadow->unsync) 1906 - return 0; 1907 - if (can_unsync && oos_shadow) 1908 - return kvm_unsync_page(vcpu, shadow); 1909 - return 1; 1759 + 1760 + if (s->role.level != PT_PAGE_TABLE_LEVEL) 1761 + return 1; 1762 + 1763 + if (!need_unsync && !s->unsync) { 1764 + if (!oos_shadow) 1765 + return 1; 1766 + need_unsync = true; 1767 + } 1910 1768 } 1769 + if (need_unsync) 1770 + kvm_unsync_pages(vcpu, gfn); 1911 1771 return 0; 1912 1772 } 1913 1773 ··· 1952 1804 spte |= (u64)pfn << PAGE_SHIFT; 1953 1805 1954 1806 if ((pte_access & ACC_WRITE_MASK) 1955 - || (write_fault && !is_write_protection(vcpu) && !user_fault)) { 1807 + || (!tdp_enabled && write_fault && !is_write_protection(vcpu) 1808 + && !user_fault)) { 1956 1809 1957 1810 if (level > PT_PAGE_TABLE_LEVEL && 1958 1811 has_wrprotected_page(vcpu->kvm, gfn, level)) { 1959 1812 ret = 1; 1960 - spte = shadow_trap_nonpresent_pte; 1961 - goto set_pte; 1813 + drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); 1814 + goto done; 1962 1815 } 1963 1816 1964 1817 spte |= PT_WRITABLE_MASK; ··· 1990 1841 mark_page_dirty(vcpu->kvm, gfn); 1991 1842 1992 1843 set_pte: 1993 - __set_spte(sptep, spte); 1844 + if (is_writable_pte(*sptep) && !is_writable_pte(spte)) 1845 + kvm_set_pfn_dirty(pfn); 1846 + update_spte(sptep, spte); 1847 + done: 1994 1848 return ret; 1995 1849 } 1996 1850 ··· 2005 1853 bool reset_host_protection) 2006 1854 { 2007 1855 int was_rmapped = 0; 2008 - int was_writable = is_writable_pte(*sptep); 2009 1856 int rmap_count; 2010 1857 2011 1858 pgprintk("%s: spte %llx access %x write_fault %d" ··· 2029 1878 } else if (pfn != spte_to_pfn(*sptep)) { 2030 1879 pgprintk("hfn old %lx new %lx\n", 2031 1880 spte_to_pfn(*sptep), pfn); 2032 - rmap_remove(vcpu->kvm, sptep); 2033 - __set_spte(sptep, shadow_trap_nonpresent_pte); 1881 + drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); 2034 1882 kvm_flush_remote_tlbs(vcpu->kvm); 2035 1883 } else 2036 1884 was_rmapped = 1; ··· 2040 1890 reset_host_protection)) { 2041 1891 if (write_fault) 2042 1892 *ptwrite = 1; 2043 - kvm_x86_ops->tlb_flush(vcpu); 1893 + kvm_mmu_flush_tlb(vcpu); 2044 1894 } 2045 1895 2046 1896 pgprintk("%s: setting spte %llx\n", __func__, *sptep); ··· 2054 1904 page_header_update_slot(vcpu->kvm, sptep, gfn); 2055 1905 if (!was_rmapped) { 2056 1906 rmap_count = rmap_add(vcpu, sptep, gfn); 2057 - kvm_release_pfn_clean(pfn); 2058 1907 if (rmap_count > RMAP_RECYCLE_THRESHOLD) 2059 
1908 rmap_recycle(vcpu, sptep, gfn); 2060 - } else { 2061 - if (was_writable) 2062 - kvm_release_pfn_dirty(pfn); 2063 - else 2064 - kvm_release_pfn_clean(pfn); 2065 1909 } 1910 + kvm_release_pfn_clean(pfn); 2066 1911 if (speculative) { 2067 1912 vcpu->arch.last_pte_updated = sptep; 2068 1913 vcpu->arch.last_pte_gfn = gfn; ··· 2086 1941 } 2087 1942 2088 1943 if (*iterator.sptep == shadow_trap_nonpresent_pte) { 2089 - pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; 1944 + u64 base_addr = iterator.addr; 1945 + 1946 + base_addr &= PT64_LVL_ADDR_MASK(iterator.level); 1947 + pseudo_gfn = base_addr >> PAGE_SHIFT; 2090 1948 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, 2091 1949 iterator.level - 1, 2092 1950 1, ACC_ALL, iterator.sptep); ··· 2106 1958 } 2107 1959 } 2108 1960 return pt_write; 1961 + } 1962 + 1963 + static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn) 1964 + { 1965 + char buf[1]; 1966 + void __user *hva; 1967 + int r; 1968 + 1969 + /* Touch the page, so send SIGBUS */ 1970 + hva = (void __user *)gfn_to_hva(kvm, gfn); 1971 + r = copy_from_user(buf, hva, 1); 1972 + } 1973 + 1974 + static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) 1975 + { 1976 + kvm_release_pfn_clean(pfn); 1977 + if (is_hwpoison_pfn(pfn)) { 1978 + kvm_send_hwpoison_signal(kvm, gfn); 1979 + return 0; 1980 + } else if (is_fault_pfn(pfn)) 1981 + return -EFAULT; 1982 + 1983 + return 1; 2109 1984 } 2110 1985 2111 1986 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) ··· 2154 1983 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2155 1984 2156 1985 /* mmio */ 2157 - if (is_error_pfn(pfn)) { 2158 - kvm_release_pfn_clean(pfn); 2159 - return 1; 2160 - } 1986 + if (is_error_pfn(pfn)) 1987 + return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); 2161 1988 2162 1989 spin_lock(&vcpu->kvm->mmu_lock); 2163 1990 if (mmu_notifier_retry(vcpu, mmu_seq)) ··· 2178 2009 { 2179 2010 int i; 2180 2011 struct kvm_mmu_page *sp; 2012 + LIST_HEAD(invalid_list); 2181 2013 2182 2014 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2183 2015 return; ··· 2188 2018 2189 2019 sp = page_header(root); 2190 2020 --sp->root_count; 2191 - if (!sp->root_count && sp->role.invalid) 2192 - kvm_mmu_zap_page(vcpu->kvm, sp); 2021 + if (!sp->root_count && sp->role.invalid) { 2022 + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); 2023 + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 2024 + } 2193 2025 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 2194 2026 spin_unlock(&vcpu->kvm->mmu_lock); 2195 2027 return; ··· 2204 2032 sp = page_header(root); 2205 2033 --sp->root_count; 2206 2034 if (!sp->root_count && sp->role.invalid) 2207 - kvm_mmu_zap_page(vcpu->kvm, sp); 2035 + kvm_mmu_prepare_zap_page(vcpu->kvm, sp, 2036 + &invalid_list); 2208 2037 } 2209 2038 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; 2210 2039 } 2040 + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 2211 2041 spin_unlock(&vcpu->kvm->mmu_lock); 2212 2042 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 2213 2043 } ··· 2219 2045 int ret = 0; 2220 2046 2221 2047 if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) { 2222 - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 2048 + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 2223 2049 ret = 1; 2224 2050 } 2225 2051 ··· 2247 2073 root_gfn = 0; 2248 2074 } 2249 2075 spin_lock(&vcpu->kvm->mmu_lock); 2076 + kvm_mmu_free_some_pages(vcpu); 2250 2077 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, 2251 2078 PT64_ROOT_LEVEL, direct, 2252 2079 ACC_ALL, NULL); ··· 2278 2103 root_gfn = i << 30; 2279 2104 } 2280 2105 
spin_lock(&vcpu->kvm->mmu_lock); 2106 + kvm_mmu_free_some_pages(vcpu); 2281 2107 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 2282 2108 PT32_ROOT_LEVEL, direct, 2283 2109 ACC_ALL, NULL); ··· 2374 2198 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2375 2199 smp_rmb(); 2376 2200 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2377 - if (is_error_pfn(pfn)) { 2378 - kvm_release_pfn_clean(pfn); 2379 - return 1; 2380 - } 2201 + if (is_error_pfn(pfn)) 2202 + return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); 2381 2203 spin_lock(&vcpu->kvm->mmu_lock); 2382 2204 if (mmu_notifier_retry(vcpu, mmu_seq)) 2383 2205 goto out_unlock; ··· 2417 2243 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu) 2418 2244 { 2419 2245 ++vcpu->stat.tlb_flush; 2420 - kvm_x86_ops->tlb_flush(vcpu); 2246 + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 2421 2247 } 2422 2248 2423 2249 static void paging_new_cr3(struct kvm_vcpu *vcpu) ··· 2631 2457 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu) 2632 2458 { 2633 2459 ASSERT(vcpu); 2634 - if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) { 2460 + if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2461 + /* mmu.free() should set root_hpa = INVALID_PAGE */ 2635 2462 vcpu->arch.mmu.free(vcpu); 2636 - vcpu->arch.mmu.root_hpa = INVALID_PAGE; 2637 - } 2638 2463 } 2639 2464 2640 2465 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu) ··· 2650 2477 r = mmu_topup_memory_caches(vcpu); 2651 2478 if (r) 2652 2479 goto out; 2653 - spin_lock(&vcpu->kvm->mmu_lock); 2654 - kvm_mmu_free_some_pages(vcpu); 2655 - spin_unlock(&vcpu->kvm->mmu_lock); 2656 2480 r = mmu_alloc_roots(vcpu); 2657 2481 spin_lock(&vcpu->kvm->mmu_lock); 2658 2482 mmu_sync_roots(vcpu); ··· 2678 2508 pte = *spte; 2679 2509 if (is_shadow_present_pte(pte)) { 2680 2510 if (is_last_spte(pte, sp->role.level)) 2681 - rmap_remove(vcpu->kvm, spte); 2511 + drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte); 2682 2512 else { 2683 2513 child = page_header(pte & PT64_BASE_ADDR_MASK); 2684 2514 mmu_page_remove_parent_pte(child, spte); ··· 2698 2528 ++vcpu->kvm->stat.mmu_pde_zapped; 2699 2529 return; 2700 2530 } 2531 + 2532 + if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL)) 2533 + return; 2701 2534 2702 2535 ++vcpu->kvm->stat.mmu_pte_updated; 2703 2536 if (!sp->role.cr4_pae) ··· 2722 2549 return (old & ~new & PT64_PERM_MASK) != 0; 2723 2550 } 2724 2551 2725 - static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new) 2552 + static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page, 2553 + bool remote_flush, bool local_flush) 2726 2554 { 2727 - if (need_remote_flush(old, new)) 2555 + if (zap_page) 2556 + return; 2557 + 2558 + if (remote_flush) 2728 2559 kvm_flush_remote_tlbs(vcpu->kvm); 2729 - else 2560 + else if (local_flush) 2730 2561 kvm_mmu_flush_tlb(vcpu); 2731 2562 } 2732 2563 ··· 2780 2603 bool guest_initiated) 2781 2604 { 2782 2605 gfn_t gfn = gpa >> PAGE_SHIFT; 2606 + union kvm_mmu_page_role mask = { .word = 0 }; 2783 2607 struct kvm_mmu_page *sp; 2784 - struct hlist_node *node, *n; 2785 - struct hlist_head *bucket; 2786 - unsigned index; 2608 + struct hlist_node *node; 2609 + LIST_HEAD(invalid_list); 2787 2610 u64 entry, gentry; 2788 2611 u64 *spte; 2789 2612 unsigned offset = offset_in_page(gpa); ··· 2796 2619 int npte; 2797 2620 int r; 2798 2621 int invlpg_counter; 2622 + bool remote_flush, local_flush, zap_page; 2623 + 2624 + zap_page = remote_flush = local_flush = false; 2799 2625 2800 2626 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 2801 2627 ··· 2854 2674 vcpu->arch.last_pte_updated = NULL; 2855 2675 } 2856 
2676 } 2857 - index = kvm_page_table_hashfn(gfn); 2858 - bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 2859 2677 2860 - restart: 2861 - hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { 2862 - if (sp->gfn != gfn || sp->role.direct || sp->role.invalid) 2863 - continue; 2678 + mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; 2679 + for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { 2864 2680 pte_size = sp->role.cr4_pae ? 8 : 4; 2865 2681 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); 2866 2682 misaligned |= bytes < 4; ··· 2873 2697 */ 2874 2698 pgprintk("misaligned: gpa %llx bytes %d role %x\n", 2875 2699 gpa, bytes, sp->role.word); 2876 - if (kvm_mmu_zap_page(vcpu->kvm, sp)) 2877 - goto restart; 2700 + zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, 2701 + &invalid_list); 2878 2702 ++vcpu->kvm->stat.mmu_flooded; 2879 2703 continue; 2880 2704 } ··· 2898 2722 if (quadrant != sp->role.quadrant) 2899 2723 continue; 2900 2724 } 2725 + local_flush = true; 2901 2726 spte = &sp->spt[page_offset / sizeof(*spte)]; 2902 2727 while (npte--) { 2903 2728 entry = *spte; 2904 2729 mmu_pte_write_zap_pte(vcpu, sp, spte); 2905 - if (gentry) 2730 + if (gentry && 2731 + !((sp->role.word ^ vcpu->arch.mmu.base_role.word) 2732 + & mask.word)) 2906 2733 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); 2907 - mmu_pte_write_flush_tlb(vcpu, entry, *spte); 2734 + if (!remote_flush && need_remote_flush(entry, *spte)) 2735 + remote_flush = true; 2908 2736 ++spte; 2909 2737 } 2910 2738 } 2739 + mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); 2740 + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 2911 2741 kvm_mmu_audit(vcpu, "post pte write"); 2912 2742 spin_unlock(&vcpu->kvm->mmu_lock); 2913 2743 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) { ··· 2941 2759 2942 2760 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 2943 2761 { 2944 - while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES && 2762 + int free_pages; 2763 + LIST_HEAD(invalid_list); 2764 + 2765 + free_pages = vcpu->kvm->arch.n_free_mmu_pages; 2766 + while (free_pages < KVM_REFILL_PAGES && 2945 2767 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { 2946 2768 struct kvm_mmu_page *sp; 2947 2769 2948 2770 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, 2949 2771 struct kvm_mmu_page, link); 2950 - kvm_mmu_zap_page(vcpu->kvm, sp); 2772 + free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp, 2773 + &invalid_list); 2951 2774 ++vcpu->kvm->stat.mmu_recycled; 2952 2775 } 2776 + kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 2953 2777 } 2954 2778 2955 2779 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) ··· 2983 2795 return 1; 2984 2796 case EMULATE_DO_MMIO: 2985 2797 ++vcpu->stat.mmio_exits; 2986 - return 0; 2798 + /* fall through */ 2987 2799 case EMULATE_FAIL: 2988 - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 2989 - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 2990 - vcpu->run->internal.ndata = 0; 2991 2800 return 0; 2992 2801 default: 2993 2802 BUG(); ··· 3081 2896 pt = sp->spt; 3082 2897 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 3083 2898 /* avoid RMW */ 3084 - if (pt[i] & PT_WRITABLE_MASK) 2899 + if (is_writable_pte(pt[i])) 3085 2900 pt[i] &= ~PT_WRITABLE_MASK; 3086 2901 } 3087 2902 kvm_flush_remote_tlbs(kvm); ··· 3090 2905 void kvm_mmu_zap_all(struct kvm *kvm) 3091 2906 { 3092 2907 struct kvm_mmu_page *sp, *node; 2908 + LIST_HEAD(invalid_list); 3093 2909 3094 2910 spin_lock(&kvm->mmu_lock); 3095 2911 restart: 3096 2912 
list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) 3097 - if (kvm_mmu_zap_page(kvm, sp)) 2913 + if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list)) 3098 2914 goto restart; 3099 2915 2916 + kvm_mmu_commit_zap_page(kvm, &invalid_list); 3100 2917 spin_unlock(&kvm->mmu_lock); 3101 - 3102 - kvm_flush_remote_tlbs(kvm); 3103 2918 } 3104 2919 3105 - static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm) 2920 + static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, 2921 + struct list_head *invalid_list) 3106 2922 { 3107 2923 struct kvm_mmu_page *page; 3108 2924 3109 2925 page = container_of(kvm->arch.active_mmu_pages.prev, 3110 2926 struct kvm_mmu_page, link); 3111 - return kvm_mmu_zap_page(kvm, page) + 1; 2927 + return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); 3112 2928 } 3113 2929 3114 2930 static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) ··· 3122 2936 3123 2937 list_for_each_entry(kvm, &vm_list, vm_list) { 3124 2938 int npages, idx, freed_pages; 2939 + LIST_HEAD(invalid_list); 3125 2940 3126 2941 idx = srcu_read_lock(&kvm->srcu); 3127 2942 spin_lock(&kvm->mmu_lock); ··· 3130 2943 kvm->arch.n_free_mmu_pages; 3131 2944 cache_count += npages; 3132 2945 if (!kvm_freed && nr_to_scan > 0 && npages > 0) { 3133 - freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm); 2946 + freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, 2947 + &invalid_list); 3134 2948 cache_count -= freed_pages; 3135 2949 kvm_freed = kvm; 3136 2950 } 3137 2951 nr_to_scan--; 3138 2952 2953 + kvm_mmu_commit_zap_page(kvm, &invalid_list); 3139 2954 spin_unlock(&kvm->mmu_lock); 3140 2955 srcu_read_unlock(&kvm->srcu, idx); 3141 2956 } ··· 3263 3074 3264 3075 static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) 3265 3076 { 3266 - kvm_set_cr3(vcpu, vcpu->arch.cr3); 3077 + (void)kvm_set_cr3(vcpu, vcpu->arch.cr3); 3267 3078 return 1; 3268 3079 } 3269 3080 ··· 3520 3331 struct kvm_mmu_page *rev_sp; 3521 3332 gfn_t gfn; 3522 3333 3523 - if (*sptep & PT_WRITABLE_MASK) { 3334 + if (is_writable_pte(*sptep)) { 3524 3335 rev_sp = page_header(__pa(sptep)); 3525 - gfn = rev_sp->gfns[sptep - rev_sp->spt]; 3336 + gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); 3526 3337 3527 3338 if (!gfn_to_memslot(kvm, gfn)) { 3528 3339 if (!printk_ratelimit()) ··· 3536 3347 return; 3537 3348 } 3538 3349 3539 - rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt], 3540 - rev_sp->role.level); 3350 + rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); 3541 3351 if (!*rmapp) { 3542 3352 if (!printk_ratelimit()) 3543 3353 return; ··· 3569 3381 3570 3382 if (!(ent & PT_PRESENT_MASK)) 3571 3383 continue; 3572 - if (!(ent & PT_WRITABLE_MASK)) 3384 + if (!is_writable_pte(ent)) 3573 3385 continue; 3574 3386 inspect_spte_has_rmap(vcpu->kvm, &pt[i]); 3575 3387 } ··· 3597 3409 if (sp->unsync) 3598 3410 continue; 3599 3411 3600 - gfn = unalias_gfn(vcpu->kvm, sp->gfn); 3601 - slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn); 3412 + slot = gfn_to_memslot(vcpu->kvm, sp->gfn); 3602 3413 rmapp = &slot->rmap[gfn - slot->base_gfn]; 3603 3414 3604 3415 spte = rmap_next(vcpu->kvm, rmapp, NULL); 3605 3416 while (spte) { 3606 - if (*spte & PT_WRITABLE_MASK) 3417 + if (is_writable_pte(*spte)) 3607 3418 printk(KERN_ERR "%s: (%s) shadow page has " 3608 3419 "writable mappings: gfn %lx role %x\n", 3609 3420 __func__, audit_msg, sp->gfn,
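
The mmu.c hunks above replace the one-shot kvm_mmu_zap_page() with a two-phase scheme: kvm_mmu_prepare_zap_page() only unlinks a shadow page onto a caller-provided invalid_list, and kvm_mmu_commit_zap_page() later performs a single remote TLB flush and frees everything on the list. A minimal caller sketch, modelled on the reworked mmu_unshadow(); the helper name zap_gfn_shadow_pages is illustrative only, not part of the patch:

	/*
	 * Batch-zap every indirect, valid shadow page for a gfn.  Pages are
	 * only unlinked inside the loop; the TLB flush and the actual free
	 * happen once, in kvm_mmu_commit_zap_page().
	 */
	static void zap_gfn_shadow_pages(struct kvm *kvm, gfn_t gfn)
	{
		struct kvm_mmu_page *sp;
		struct hlist_node *node;
		LIST_HEAD(invalid_list);

		spin_lock(&kvm->mmu_lock);
		for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node)
			kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
		kvm_mmu_commit_zap_page(kvm, &invalid_list);
		spin_unlock(&kvm->mmu_lock);
	}

Because the hash-list unlink is deferred to the commit phase, walkers no longer need the restart: labels or the _safe list iterators that the old code used.
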
+1 -1
arch/x86/kvm/mmutrace.h
··· 190 190 	TP_ARGS(sp)
191 191 );
192 192 
193 - DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page,
193 + DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
194 194 	TP_PROTO(struct kvm_mmu_page *sp),
195 195 
196 196 	TP_ARGS(sp)
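
Only the event name changes here, to stay in sync with the kvm_mmu_prepare_zap_page() rename in mmu.c; the event still reuses the kvm_mmu_page_class class, so the call site remains a one-liner:

	trace_kvm_mmu_prepare_zap_page(sp);	/* replaces trace_kvm_mmu_zap_page(sp) */

The renamed event appears under the kvmmmu trace system in debugfs once tracing is enabled.
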
+147 -109
arch/x86/kvm/paging_tmpl.h
··· 7 7 * MMU support 8 8 * 9 9 * Copyright (C) 2006 Qumranet, Inc. 10 + * Copyright 2010 Red Hat, Inc. and/or its affilates. 10 11 * 11 12 * Authors: 12 13 * Yaniv Kamay <yaniv@qumranet.com> ··· 119 118 { 120 119 pt_element_t pte; 121 120 gfn_t table_gfn; 122 - unsigned index, pt_access, pte_access; 121 + unsigned index, pt_access, uninitialized_var(pte_access); 123 122 gpa_t pte_gpa; 124 - int rsvd_fault = 0; 123 + bool eperm, present, rsvd_fault; 125 124 126 125 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, 127 126 fetch_fault); 128 127 walk: 128 + present = true; 129 + eperm = rsvd_fault = false; 129 130 walker->level = vcpu->arch.mmu.root_level; 130 131 pte = vcpu->arch.cr3; 131 132 #if PTTYPE == 64 132 133 if (!is_long_mode(vcpu)) { 133 134 pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); 134 135 trace_kvm_mmu_paging_element(pte, walker->level); 135 - if (!is_present_gpte(pte)) 136 - goto not_present; 136 + if (!is_present_gpte(pte)) { 137 + present = false; 138 + goto error; 139 + } 137 140 --walker->level; 138 141 } 139 142 #endif ··· 155 150 walker->table_gfn[walker->level - 1] = table_gfn; 156 151 walker->pte_gpa[walker->level - 1] = pte_gpa; 157 152 158 - if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) 159 - goto not_present; 153 + if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) { 154 + present = false; 155 + break; 156 + } 160 157 161 158 trace_kvm_mmu_paging_element(pte, walker->level); 162 159 163 - if (!is_present_gpte(pte)) 164 - goto not_present; 160 + if (!is_present_gpte(pte)) { 161 + present = false; 162 + break; 163 + } 165 164 166 - rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level); 167 - if (rsvd_fault) 168 - goto access_error; 165 + if (is_rsvd_bits_set(vcpu, pte, walker->level)) { 166 + rsvd_fault = true; 167 + break; 168 + } 169 169 170 170 if (write_fault && !is_writable_pte(pte)) 171 171 if (user_fault || is_write_protection(vcpu)) 172 - goto access_error; 172 + eperm = true; 173 173 174 174 if (user_fault && !(pte & PT_USER_MASK)) 175 - goto access_error; 175 + eperm = true; 176 176 177 177 #if PTTYPE == 64 178 178 if (fetch_fault && (pte & PT64_NX_MASK)) 179 - goto access_error; 179 + eperm = true; 180 180 #endif 181 181 182 - if (!(pte & PT_ACCESSED_MASK)) { 182 + if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) { 183 183 trace_kvm_mmu_set_accessed_bit(table_gfn, index, 184 184 sizeof(pte)); 185 - mark_page_dirty(vcpu->kvm, table_gfn); 186 185 if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, 187 186 index, pte, pte|PT_ACCESSED_MASK)) 188 187 goto walk; 188 + mark_page_dirty(vcpu->kvm, table_gfn); 189 189 pte |= PT_ACCESSED_MASK; 190 190 } 191 191 ··· 223 213 --walker->level; 224 214 } 225 215 216 + if (!present || eperm || rsvd_fault) 217 + goto error; 218 + 226 219 if (write_fault && !is_dirty_gpte(pte)) { 227 220 bool ret; 228 221 229 222 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); 230 - mark_page_dirty(vcpu->kvm, table_gfn); 231 223 ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, 232 224 pte|PT_DIRTY_MASK); 233 225 if (ret) 234 226 goto walk; 227 + mark_page_dirty(vcpu->kvm, table_gfn); 235 228 pte |= PT_DIRTY_MASK; 236 229 walker->ptes[walker->level - 1] = pte; 237 230 } ··· 242 229 walker->pt_access = pt_access; 243 230 walker->pte_access = pte_access; 244 231 pgprintk("%s: pte %llx pte_access %x pt_access %x\n", 245 - __func__, (u64)pte, pt_access, pte_access); 232 + __func__, (u64)pte, pte_access, pt_access); 246 233 return 1; 247 234 248 - not_present: 235 + error: 249 236 
walker->error_code = 0; 250 - goto err; 251 - 252 - access_error: 253 - walker->error_code = PFERR_PRESENT_MASK; 254 - 255 - err: 237 + if (present) 238 + walker->error_code |= PFERR_PRESENT_MASK; 256 239 if (write_fault) 257 240 walker->error_code |= PFERR_WRITE_MASK; 258 241 if (user_fault) 259 242 walker->error_code |= PFERR_USER_MASK; 260 - if (fetch_fault) 243 + if (fetch_fault && is_nx(vcpu)) 261 244 walker->error_code |= PFERR_FETCH_MASK; 262 245 if (rsvd_fault) 263 246 walker->error_code |= PFERR_RSVD_MASK; ··· 261 252 return 0; 262 253 } 263 254 264 - static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, 255 + static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 265 256 u64 *spte, const void *pte) 266 257 { 267 258 pt_element_t gpte; ··· 272 263 gpte = *(const pt_element_t *)pte; 273 264 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { 274 265 if (!is_present_gpte(gpte)) { 275 - if (page->unsync) 266 + if (sp->unsync) 276 267 new_spte = shadow_trap_nonpresent_pte; 277 268 else 278 269 new_spte = shadow_notrap_nonpresent_pte; ··· 281 272 return; 282 273 } 283 274 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 284 - pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte); 275 + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 285 276 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) 286 277 return; 287 278 pfn = vcpu->arch.update_pte.pfn; ··· 294 285 * we call mmu_set_spte() with reset_host_protection = true beacuse that 295 286 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). 296 287 */ 297 - mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, 298 - gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL, 288 + mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 289 + is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL, 299 290 gpte_to_gfn(gpte), pfn, true, true); 291 + } 292 + 293 + static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, 294 + struct guest_walker *gw, int level) 295 + { 296 + int r; 297 + pt_element_t curr_pte; 298 + 299 + r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1], 300 + &curr_pte, sizeof(curr_pte)); 301 + return r || curr_pte != gw->ptes[level - 1]; 300 302 } 301 303 302 304 /* ··· 319 299 int *ptwrite, pfn_t pfn) 320 300 { 321 301 unsigned access = gw->pt_access; 322 - struct kvm_mmu_page *shadow_page; 323 - u64 spte, *sptep = NULL; 324 - int direct; 325 - gfn_t table_gfn; 326 - int r; 327 - int level; 328 - pt_element_t curr_pte; 329 - struct kvm_shadow_walk_iterator iterator; 302 + struct kvm_mmu_page *sp = NULL; 303 + bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]); 304 + int top_level; 305 + unsigned direct_access; 306 + struct kvm_shadow_walk_iterator it; 330 307 331 308 if (!is_present_gpte(gw->ptes[gw->level - 1])) 332 309 return NULL; 333 310 334 - for_each_shadow_entry(vcpu, addr, iterator) { 335 - level = iterator.level; 336 - sptep = iterator.sptep; 337 - if (iterator.level == hlevel) { 338 - mmu_set_spte(vcpu, sptep, access, 339 - gw->pte_access & access, 340 - user_fault, write_fault, 341 - gw->ptes[gw->level-1] & PT_DIRTY_MASK, 342 - ptwrite, level, 343 - gw->gfn, pfn, false, true); 344 - break; 311 + direct_access = gw->pt_access & gw->pte_access; 312 + if (!dirty) 313 + direct_access &= ~ACC_WRITE_MASK; 314 + 315 + top_level = vcpu->arch.mmu.root_level; 316 + if (top_level == PT32E_ROOT_LEVEL) 317 + top_level = PT32_ROOT_LEVEL; 318 + /* 319 + * Verify that the top-level gpte is still there. 
Since the page 320 + * is a root page, it is either write protected (and cannot be 321 + * changed from now on) or it is invalid (in which case, we don't 322 + * really care if it changes underneath us after this point). 323 + */ 324 + if (FNAME(gpte_changed)(vcpu, gw, top_level)) 325 + goto out_gpte_changed; 326 + 327 + for (shadow_walk_init(&it, vcpu, addr); 328 + shadow_walk_okay(&it) && it.level > gw->level; 329 + shadow_walk_next(&it)) { 330 + gfn_t table_gfn; 331 + 332 + drop_large_spte(vcpu, it.sptep); 333 + 334 + sp = NULL; 335 + if (!is_shadow_present_pte(*it.sptep)) { 336 + table_gfn = gw->table_gfn[it.level - 2]; 337 + sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1, 338 + false, access, it.sptep); 345 339 } 346 340 347 - if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) 348 - continue; 341 + /* 342 + * Verify that the gpte in the page we've just write 343 + * protected is still there. 344 + */ 345 + if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) 346 + goto out_gpte_changed; 349 347 350 - if (is_large_pte(*sptep)) { 351 - rmap_remove(vcpu->kvm, sptep); 352 - __set_spte(sptep, shadow_trap_nonpresent_pte); 353 - kvm_flush_remote_tlbs(vcpu->kvm); 354 - } 355 - 356 - if (level <= gw->level) { 357 - int delta = level - gw->level + 1; 358 - direct = 1; 359 - if (!is_dirty_gpte(gw->ptes[level - delta])) 360 - access &= ~ACC_WRITE_MASK; 361 - table_gfn = gpte_to_gfn(gw->ptes[level - delta]); 362 - /* advance table_gfn when emulating 1gb pages with 4k */ 363 - if (delta == 0) 364 - table_gfn += PT_INDEX(addr, level); 365 - access &= gw->pte_access; 366 - } else { 367 - direct = 0; 368 - table_gfn = gw->table_gfn[level - 2]; 369 - } 370 - shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, 371 - direct, access, sptep); 372 - if (!direct) { 373 - r = kvm_read_guest_atomic(vcpu->kvm, 374 - gw->pte_gpa[level - 2], 375 - &curr_pte, sizeof(curr_pte)); 376 - if (r || curr_pte != gw->ptes[level - 2]) { 377 - kvm_mmu_put_page(shadow_page, sptep); 378 - kvm_release_pfn_clean(pfn); 379 - sptep = NULL; 380 - break; 381 - } 382 - } 383 - 384 - spte = __pa(shadow_page->spt) 385 - | PT_PRESENT_MASK | PT_ACCESSED_MASK 386 - | PT_WRITABLE_MASK | PT_USER_MASK; 387 - *sptep = spte; 348 + if (sp) 349 + link_shadow_page(it.sptep, sp); 388 350 } 389 351 390 - return sptep; 352 + for (; 353 + shadow_walk_okay(&it) && it.level > hlevel; 354 + shadow_walk_next(&it)) { 355 + gfn_t direct_gfn; 356 + 357 + validate_direct_spte(vcpu, it.sptep, direct_access); 358 + 359 + drop_large_spte(vcpu, it.sptep); 360 + 361 + if (is_shadow_present_pte(*it.sptep)) 362 + continue; 363 + 364 + direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1); 365 + 366 + sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1, 367 + true, direct_access, it.sptep); 368 + link_shadow_page(it.sptep, sp); 369 + } 370 + 371 + mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, 372 + user_fault, write_fault, dirty, ptwrite, it.level, 373 + gw->gfn, pfn, false, true); 374 + 375 + return it.sptep; 376 + 377 + out_gpte_changed: 378 + if (sp) 379 + kvm_mmu_put_page(sp, it.sptep); 380 + kvm_release_pfn_clean(pfn); 381 + return NULL; 391 382 } 392 383 393 384 /* ··· 462 431 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); 463 432 464 433 /* mmio */ 465 - if (is_error_pfn(pfn)) { 466 - pgprintk("gfn %lx is mmio\n", walker.gfn); 467 - kvm_release_pfn_clean(pfn); 468 - return 1; 469 - } 434 + if (is_error_pfn(pfn)) 435 + return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn); 470 436 471 437 
spin_lock(&vcpu->kvm->mmu_lock); 472 438 if (mmu_notifier_retry(vcpu, mmu_seq)) ··· 471 443 kvm_mmu_free_some_pages(vcpu); 472 444 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 473 445 level, &write_pt, pfn); 446 + (void)sptep; 474 447 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, 475 448 sptep, *sptep, write_pt); 476 449 ··· 493 464 static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) 494 465 { 495 466 struct kvm_shadow_walk_iterator iterator; 467 + struct kvm_mmu_page *sp; 496 468 gpa_t pte_gpa = -1; 497 469 int level; 498 470 u64 *sptep; ··· 505 475 level = iterator.level; 506 476 sptep = iterator.sptep; 507 477 478 + sp = page_header(__pa(sptep)); 508 479 if (is_last_spte(*sptep, level)) { 509 - struct kvm_mmu_page *sp = page_header(__pa(sptep)); 510 480 int offset, shift; 481 + 482 + if (!sp->unsync) 483 + break; 511 484 512 485 shift = PAGE_SHIFT - 513 486 (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level; ··· 520 487 pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); 521 488 522 489 if (is_shadow_present_pte(*sptep)) { 523 - rmap_remove(vcpu->kvm, sptep); 524 490 if (is_large_pte(*sptep)) 525 491 --vcpu->kvm->stat.lpages; 492 + drop_spte(vcpu->kvm, sptep, 493 + shadow_trap_nonpresent_pte); 526 494 need_flush = 1; 527 - } 528 - __set_spte(sptep, shadow_trap_nonpresent_pte); 495 + } else 496 + __set_spte(sptep, shadow_trap_nonpresent_pte); 529 497 break; 530 498 } 531 499 532 - if (!is_shadow_present_pte(*sptep)) 500 + if (!is_shadow_present_pte(*sptep) || !sp->unsync_children) 533 501 break; 534 502 } 535 503 ··· 604 570 * Using the cached information from sp->gfns is safe because: 605 571 * - The spte has a reference to the struct page, so the pfn for a given gfn 606 572 * can't change unless all sptes pointing to it are nuked first. 607 - * - Alias changes zap the entire shadow cache. 608 573 */ 609 - static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) 574 + static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 575 + bool clear_unsync) 610 576 { 611 577 int i, offset, nr_present; 612 578 bool reset_host_protection; 613 579 gpa_t first_pte_gpa; 614 580 615 581 offset = nr_present = 0; 582 + 583 + /* direct kvm_mmu_page can not be unsync. */ 584 + BUG_ON(sp->role.direct); 616 585 617 586 if (PTTYPE == 32) 618 587 offset = sp->role.quadrant << PT64_LEVEL_BITS; ··· 626 589 unsigned pte_access; 627 590 pt_element_t gpte; 628 591 gpa_t pte_gpa; 629 - gfn_t gfn = sp->gfns[i]; 592 + gfn_t gfn; 630 593 631 594 if (!is_shadow_present_pte(sp->spt[i])) 632 595 continue; ··· 637 600 sizeof(pt_element_t))) 638 601 return -EINVAL; 639 602 640 - if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) || 641 - !(gpte & PT_ACCESSED_MASK)) { 603 + gfn = gpte_to_gfn(gpte); 604 + if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL) 605 + || gfn != sp->gfns[i] || !is_present_gpte(gpte) 606 + || !(gpte & PT_ACCESSED_MASK)) { 642 607 u64 nonpresent; 643 608 644 - rmap_remove(vcpu->kvm, &sp->spt[i]); 645 - if (is_present_gpte(gpte)) 609 + if (is_present_gpte(gpte) || !clear_unsync) 646 610 nonpresent = shadow_trap_nonpresent_pte; 647 611 else 648 612 nonpresent = shadow_notrap_nonpresent_pte; 649 - __set_spte(&sp->spt[i], nonpresent); 613 + drop_spte(vcpu->kvm, &sp->spt[i], nonpresent); 650 614 continue; 651 615 } 652 616
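
The rewritten FNAME(fetch) above splits the shadow walk in two: an indirect phase down to the guest's own paging level, which re-validates every gpte it has just write-protected, and a direct phase below that level, which uses validate_direct_spte() and a single precomputed direct_access. The re-validation goes through the new FNAME(gpte_changed) helper; a stand-alone sketch of that check follows, with the name gpte_still_matches being illustrative and written outside the FNAME() macro machinery:

	/*
	 * Re-read the guest pte cached at 'level' of the walk and report
	 * whether it still matches what the walker saw.  A failed read is
	 * treated as a change, so the caller bails out and retries the fault.
	 */
	static bool gpte_still_matches(struct kvm_vcpu *vcpu,
				       struct guest_walker *gw, int level)
	{
		pt_element_t curr_pte;

		if (kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1],
					  &curr_pte, sizeof(curr_pte)))
			return false;
		return curr_pte == gw->ptes[level - 1];
	}

This is the logical inverse of FNAME(gpte_changed); fetch() jumps to out_gpte_changed, drops the pfn reference and returns NULL whenever the check fails.
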
+121 -17
arch/x86/kvm/svm.c
··· 4 4 * AMD SVM support 5 5 * 6 6 * Copyright (C) 2006 Qumranet, Inc. 7 + * Copyright 2010 Red Hat, Inc. and/or its affilates. 7 8 * 8 9 * Authors: 9 10 * Yaniv Kamay <yaniv@qumranet.com> ··· 286 285 287 286 static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) 288 287 { 288 + vcpu->arch.efer = efer; 289 289 if (!npt_enabled && !(efer & EFER_LMA)) 290 290 efer &= ~EFER_LME; 291 291 292 292 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; 293 - vcpu->arch.efer = efer; 294 293 } 295 294 296 295 static int is_external_interrupt(u32 info) ··· 641 640 642 641 if (nested) { 643 642 printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); 644 - kvm_enable_efer_bits(EFER_SVME); 643 + kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); 645 644 } 646 645 647 646 for_each_possible_cpu(cpu) { ··· 807 806 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. 808 807 */ 809 808 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 810 - kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); 809 + (void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); 811 810 812 811 save->cr4 = X86_CR4_PAE; 813 812 /* rdx = ?? */ ··· 904 903 svm->asid_generation = 0; 905 904 init_vmcb(svm); 906 905 907 - fx_init(&svm->vcpu); 906 + err = fx_init(&svm->vcpu); 907 + if (err) 908 + goto free_page4; 909 + 908 910 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 909 911 if (kvm_vcpu_is_bsp(&svm->vcpu)) 910 912 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; 911 913 912 914 return &svm->vcpu; 913 915 916 + free_page4: 917 + __free_page(hsave_page); 914 918 free_page3: 915 919 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); 916 920 free_page2: ··· 1494 1488 */ 1495 1489 pr_err("KVM: Guest triggered AMD Erratum 383\n"); 1496 1490 1497 - set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests); 1491 + kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu); 1498 1492 1499 1493 return; 1500 1494 } ··· 1541 1535 string = (io_info & SVM_IOIO_STR_MASK) != 0; 1542 1536 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 1543 1537 if (string || in) 1544 - return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO); 1538 + return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; 1545 1539 1546 1540 port = io_info >> 16; 1547 1541 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; ··· 1963 1957 svm->vmcb->save.cr3 = hsave->save.cr3; 1964 1958 svm->vcpu.arch.cr3 = hsave->save.cr3; 1965 1959 } else { 1966 - kvm_set_cr3(&svm->vcpu, hsave->save.cr3); 1960 + (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3); 1967 1961 } 1968 1962 kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax); 1969 1963 kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp); ··· 2086 2080 svm->vmcb->save.cr3 = nested_vmcb->save.cr3; 2087 2081 svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; 2088 2082 } else 2089 - kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); 2083 + (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); 2090 2084 2091 2085 /* Guest paging mode is active - reset mmu */ 2092 2086 kvm_mmu_reset_context(&svm->vcpu); ··· 2392 2386 2393 2387 static int invlpg_interception(struct vcpu_svm *svm) 2394 2388 { 2395 - if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) 2396 - pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); 2397 - return 1; 2389 + return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; 2398 2390 } 2399 2391 2400 2392 static int emulate_on_interception(struct vcpu_svm *svm) 2401 2393 { 2402 - if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE) 2403 - pr_unimpl(&svm->vcpu, "%s: 
failed\n", __func__); 2404 - return 1; 2394 + return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; 2405 2395 } 2406 2396 2407 2397 static int cr8_write_interception(struct vcpu_svm *svm) ··· 2728 2726 [SVM_EXIT_NPF] = pf_interception, 2729 2727 }; 2730 2728 2729 + void dump_vmcb(struct kvm_vcpu *vcpu) 2730 + { 2731 + struct vcpu_svm *svm = to_svm(vcpu); 2732 + struct vmcb_control_area *control = &svm->vmcb->control; 2733 + struct vmcb_save_area *save = &svm->vmcb->save; 2734 + 2735 + pr_err("VMCB Control Area:\n"); 2736 + pr_err("cr_read: %04x\n", control->intercept_cr_read); 2737 + pr_err("cr_write: %04x\n", control->intercept_cr_write); 2738 + pr_err("dr_read: %04x\n", control->intercept_dr_read); 2739 + pr_err("dr_write: %04x\n", control->intercept_dr_write); 2740 + pr_err("exceptions: %08x\n", control->intercept_exceptions); 2741 + pr_err("intercepts: %016llx\n", control->intercept); 2742 + pr_err("pause filter count: %d\n", control->pause_filter_count); 2743 + pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa); 2744 + pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa); 2745 + pr_err("tsc_offset: %016llx\n", control->tsc_offset); 2746 + pr_err("asid: %d\n", control->asid); 2747 + pr_err("tlb_ctl: %d\n", control->tlb_ctl); 2748 + pr_err("int_ctl: %08x\n", control->int_ctl); 2749 + pr_err("int_vector: %08x\n", control->int_vector); 2750 + pr_err("int_state: %08x\n", control->int_state); 2751 + pr_err("exit_code: %08x\n", control->exit_code); 2752 + pr_err("exit_info1: %016llx\n", control->exit_info_1); 2753 + pr_err("exit_info2: %016llx\n", control->exit_info_2); 2754 + pr_err("exit_int_info: %08x\n", control->exit_int_info); 2755 + pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err); 2756 + pr_err("nested_ctl: %lld\n", control->nested_ctl); 2757 + pr_err("nested_cr3: %016llx\n", control->nested_cr3); 2758 + pr_err("event_inj: %08x\n", control->event_inj); 2759 + pr_err("event_inj_err: %08x\n", control->event_inj_err); 2760 + pr_err("lbr_ctl: %lld\n", control->lbr_ctl); 2761 + pr_err("next_rip: %016llx\n", control->next_rip); 2762 + pr_err("VMCB State Save Area:\n"); 2763 + pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n", 2764 + save->es.selector, save->es.attrib, 2765 + save->es.limit, save->es.base); 2766 + pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n", 2767 + save->cs.selector, save->cs.attrib, 2768 + save->cs.limit, save->cs.base); 2769 + pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n", 2770 + save->ss.selector, save->ss.attrib, 2771 + save->ss.limit, save->ss.base); 2772 + pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n", 2773 + save->ds.selector, save->ds.attrib, 2774 + save->ds.limit, save->ds.base); 2775 + pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n", 2776 + save->fs.selector, save->fs.attrib, 2777 + save->fs.limit, save->fs.base); 2778 + pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n", 2779 + save->gs.selector, save->gs.attrib, 2780 + save->gs.limit, save->gs.base); 2781 + pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n", 2782 + save->gdtr.selector, save->gdtr.attrib, 2783 + save->gdtr.limit, save->gdtr.base); 2784 + pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n", 2785 + save->ldtr.selector, save->ldtr.attrib, 2786 + save->ldtr.limit, save->ldtr.base); 2787 + pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n", 2788 + save->idtr.selector, save->idtr.attrib, 2789 + save->idtr.limit, save->idtr.base); 2790 + pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n", 2791 + save->tr.selector, save->tr.attrib, 2792 + 
save->tr.limit, save->tr.base); 2793 + pr_err("cpl: %d efer: %016llx\n", 2794 + save->cpl, save->efer); 2795 + pr_err("cr0: %016llx cr2: %016llx\n", 2796 + save->cr0, save->cr2); 2797 + pr_err("cr3: %016llx cr4: %016llx\n", 2798 + save->cr3, save->cr4); 2799 + pr_err("dr6: %016llx dr7: %016llx\n", 2800 + save->dr6, save->dr7); 2801 + pr_err("rip: %016llx rflags: %016llx\n", 2802 + save->rip, save->rflags); 2803 + pr_err("rsp: %016llx rax: %016llx\n", 2804 + save->rsp, save->rax); 2805 + pr_err("star: %016llx lstar: %016llx\n", 2806 + save->star, save->lstar); 2807 + pr_err("cstar: %016llx sfmask: %016llx\n", 2808 + save->cstar, save->sfmask); 2809 + pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n", 2810 + save->kernel_gs_base, save->sysenter_cs); 2811 + pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n", 2812 + save->sysenter_esp, save->sysenter_eip); 2813 + pr_err("gpat: %016llx dbgctl: %016llx\n", 2814 + save->g_pat, save->dbgctl); 2815 + pr_err("br_from: %016llx br_to: %016llx\n", 2816 + save->br_from, save->br_to); 2817 + pr_err("excp_from: %016llx excp_to: %016llx\n", 2818 + save->last_excp_from, save->last_excp_to); 2819 + 2820 + } 2821 + 2731 2822 static int handle_exit(struct kvm_vcpu *vcpu) 2732 2823 { 2733 2824 struct vcpu_svm *svm = to_svm(vcpu); ··· 2865 2770 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2866 2771 kvm_run->fail_entry.hardware_entry_failure_reason 2867 2772 = svm->vmcb->control.exit_code; 2773 + pr_err("KVM: FAILED VMRUN WITH VMCB:\n"); 2774 + dump_vmcb(vcpu); 2868 2775 return 0; 2869 2776 } 2870 2777 ··· 2923 2826 { 2924 2827 struct vmcb_control_area *control; 2925 2828 2926 - trace_kvm_inj_virq(irq); 2927 - 2928 - ++svm->vcpu.stat.irq_injections; 2929 2829 control = &svm->vmcb->control; 2930 2830 control->int_vector = irq; 2931 2831 control->int_ctl &= ~V_INTR_PRIO_MASK; ··· 2935 2841 struct vcpu_svm *svm = to_svm(vcpu); 2936 2842 2937 2843 BUG_ON(!(gif_set(svm))); 2844 + 2845 + trace_kvm_inj_virq(vcpu->arch.interrupt.nr); 2846 + ++vcpu->stat.irq_injections; 2938 2847 2939 2848 svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | 2940 2849 SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; ··· 3424 3327 return false; 3425 3328 } 3426 3329 3330 + static bool svm_has_wbinvd_exit(void) 3331 + { 3332 + return true; 3333 + } 3334 + 3427 3335 static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) 3428 3336 { 3429 3337 struct vcpu_svm *svm = to_svm(vcpu); ··· 3513 3411 .rdtscp_supported = svm_rdtscp_supported, 3514 3412 3515 3413 .set_supported_cpuid = svm_set_supported_cpuid, 3414 + 3415 + .has_wbinvd_exit = svm_has_wbinvd_exit, 3516 3416 }; 3517 3417 3518 3418 static int __init svm_init(void)
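
Besides the switch to kvm_make_request() and the stricter handling of emulate_instruction() return values, the most visible svm.c addition is dump_vmcb(), wired into the VMRUN failure path so that SVM_EXIT_ERR reports carry the full control and save areas. A trimmed sketch of that path, following the handle_exit() hunk above (the enclosing SVM_EXIT_ERR check is recalled from context rather than shown in the hunk):

	if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
		kvm_run->fail_entry.hardware_entry_failure_reason
			= svm->vmcb->control.exit_code;
		pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
		dump_vmcb(vcpu);	/* new: prints the whole VMCB */
		return 0;
	}
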
+15 -1
arch/x86/kvm/timer.c
··· 1 + /*
2 +  * Kernel-based Virtual Machine driver for Linux
3 +  *
4 +  * This module enables machines with Intel VT-x extensions to run virtual
5 +  * machines without emulation or binary translation.
6 +  *
7 +  * timer support
8 +  *
9 +  * Copyright 2010 Red Hat, Inc. and/or its affilates.
10 +  *
11 +  * This work is licensed under the terms of the GNU GPL, version 2. See
12 +  * the COPYING file in the top-level directory.
13 +  */
14 + 
1 15 #include <linux/kvm_host.h>
2 16 #include <linux/kvm.h>
3 17 #include <linux/hrtimer.h>
··· 32 18 	if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
33 19 		atomic_inc(&ktimer->pending);
34 20 		/* FIXME: this code should not know anything about vcpus */
35 - 		set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
21 + 		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
36 22 	}
37 23 
38 24 	if (waitqueue_active(q))
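
Like the rest of this series, the timer tick handler stops open-coding set_bit() on vcpu->requests and goes through the request mini-API. The helper is expected to be a thin wrapper along these lines (a sketch, not part of this diff):

	static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
	{
		set_bit(req, &vcpu->requests);
	}

so the change is behaviourally neutral here; it simply routes every request site through one interface, paired with a kvm_check_request()-style consumer in the vcpu run loop.
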
+176 -79
arch/x86/kvm/vmx.c
··· 5 5 * machines without emulation or binary translation. 6 6 * 7 7 * Copyright (C) 2006 Qumranet, Inc. 8 + * Copyright 2010 Red Hat, Inc. and/or its affilates. 8 9 * 9 10 * Authors: 10 11 * Avi Kivity <avi@qumranet.com> ··· 37 36 #include <asm/vmx.h> 38 37 #include <asm/virtext.h> 39 38 #include <asm/mce.h> 39 + #include <asm/i387.h> 40 + #include <asm/xcr.h> 40 41 41 42 #include "trace.h" 42 43 ··· 65 62 66 63 static int __read_mostly emulate_invalid_guest_state = 0; 67 64 module_param(emulate_invalid_guest_state, bool, S_IRUGO); 65 + 66 + static int __read_mostly vmm_exclusive = 1; 67 + module_param(vmm_exclusive, bool, S_IRUGO); 68 68 69 69 #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ 70 70 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) ··· 179 173 180 174 static int init_rmode(struct kvm *kvm); 181 175 static u64 construct_eptp(unsigned long root_hpa); 176 + static void kvm_cpu_vmxon(u64 addr); 177 + static void kvm_cpu_vmxoff(void); 182 178 183 179 static DEFINE_PER_CPU(struct vmcs *, vmxarea); 184 180 static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 185 181 static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); 182 + static DEFINE_PER_CPU(struct desc_ptr, host_gdt); 186 183 187 184 static unsigned long *vmx_io_bitmap_a; 188 185 static unsigned long *vmx_io_bitmap_b; ··· 343 334 return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT; 344 335 } 345 336 337 + static inline bool cpu_has_vmx_ept_4levels(void) 338 + { 339 + return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT; 340 + } 341 + 346 342 static inline bool cpu_has_vmx_invept_individual_addr(void) 347 343 { 348 344 return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT; ··· 361 347 static inline bool cpu_has_vmx_invept_global(void) 362 348 { 363 349 return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT; 350 + } 351 + 352 + static inline bool cpu_has_vmx_invvpid_single(void) 353 + { 354 + return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT; 355 + } 356 + 357 + static inline bool cpu_has_vmx_invvpid_global(void) 358 + { 359 + return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; 364 360 } 365 361 366 362 static inline bool cpu_has_vmx_ept(void) ··· 411 387 static inline bool cpu_has_virtual_nmis(void) 412 388 { 413 389 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; 390 + } 391 + 392 + static inline bool cpu_has_vmx_wbinvd_exit(void) 393 + { 394 + return vmcs_config.cpu_based_2nd_exec_ctrl & 395 + SECONDARY_EXEC_WBINVD_EXITING; 414 396 } 415 397 416 398 static inline bool report_flexpriority(void) ··· 483 453 vmcs, phys_addr); 484 454 } 485 455 456 + static void vmcs_load(struct vmcs *vmcs) 457 + { 458 + u64 phys_addr = __pa(vmcs); 459 + u8 error; 460 + 461 + asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" 462 + : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 463 + : "cc", "memory"); 464 + if (error) 465 + printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", 466 + vmcs, phys_addr); 467 + } 468 + 486 469 static void __vcpu_clear(void *arg) 487 470 { 488 471 struct vcpu_vmx *vmx = arg; ··· 518 475 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); 519 476 } 520 477 521 - static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) 478 + static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx) 522 479 { 523 480 if (vmx->vpid == 0) 524 481 return; 525 482 526 - __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); 483 + if (cpu_has_vmx_invvpid_single()) 484 + __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0); 485 + } 486 + 487 + static inline void 
vpid_sync_vcpu_global(void) 488 + { 489 + if (cpu_has_vmx_invvpid_global()) 490 + __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0); 491 + } 492 + 493 + static inline void vpid_sync_context(struct vcpu_vmx *vmx) 494 + { 495 + if (cpu_has_vmx_invvpid_single()) 496 + vpid_sync_vcpu_single(vmx); 497 + else 498 + vpid_sync_vcpu_global(); 527 499 } 528 500 529 501 static inline void ept_sync_global(void) ··· 870 812 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); 871 813 } 872 814 #endif 815 + if (current_thread_info()->status & TS_USEDFPU) 816 + clts(); 817 + load_gdt(&__get_cpu_var(host_gdt)); 873 818 } 874 819 875 820 static void vmx_load_host_state(struct vcpu_vmx *vmx) ··· 889 828 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 890 829 { 891 830 struct vcpu_vmx *vmx = to_vmx(vcpu); 892 - u64 phys_addr = __pa(vmx->vmcs); 893 831 u64 tsc_this, delta, new_offset; 832 + u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 894 833 895 - if (vcpu->cpu != cpu) { 834 + if (!vmm_exclusive) 835 + kvm_cpu_vmxon(phys_addr); 836 + else if (vcpu->cpu != cpu) 896 837 vcpu_clear(vmx); 897 - kvm_migrate_timers(vcpu); 898 - set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); 899 - local_irq_disable(); 900 - list_add(&vmx->local_vcpus_link, 901 - &per_cpu(vcpus_on_cpu, cpu)); 902 - local_irq_enable(); 903 - } 904 838 905 839 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { 906 - u8 error; 907 - 908 840 per_cpu(current_vmcs, cpu) = vmx->vmcs; 909 - asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" 910 - : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 911 - : "cc"); 912 - if (error) 913 - printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", 914 - vmx->vmcs, phys_addr); 841 + vmcs_load(vmx->vmcs); 915 842 } 916 843 917 844 if (vcpu->cpu != cpu) { 918 845 struct desc_ptr dt; 919 846 unsigned long sysenter_esp; 847 + 848 + kvm_migrate_timers(vcpu); 849 + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 850 + local_irq_disable(); 851 + list_add(&vmx->local_vcpus_link, 852 + &per_cpu(vcpus_on_cpu, cpu)); 853 + local_irq_enable(); 920 854 921 855 vcpu->cpu = cpu; 922 856 /* ··· 940 884 static void vmx_vcpu_put(struct kvm_vcpu *vcpu) 941 885 { 942 886 __vmx_load_host_state(to_vmx(vcpu)); 887 + if (!vmm_exclusive) { 888 + __vcpu_clear(to_vmx(vcpu)); 889 + kvm_cpu_vmxoff(); 890 + } 943 891 } 944 892 945 893 static void vmx_fpu_activate(struct kvm_vcpu *vcpu) ··· 1346 1286 /* locked but not enabled */ 1347 1287 } 1348 1288 1289 + static void kvm_cpu_vmxon(u64 addr) 1290 + { 1291 + asm volatile (ASM_VMX_VMXON_RAX 1292 + : : "a"(&addr), "m"(addr) 1293 + : "memory", "cc"); 1294 + } 1295 + 1349 1296 static int hardware_enable(void *garbage) 1350 1297 { 1351 1298 int cpu = raw_smp_processor_id(); ··· 1375 1308 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); 1376 1309 } 1377 1310 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ 1378 - asm volatile (ASM_VMX_VMXON_RAX 1379 - : : "a"(&phys_addr), "m"(phys_addr) 1380 - : "memory", "cc"); 1381 1311 1382 - ept_sync_global(); 1312 + if (vmm_exclusive) { 1313 + kvm_cpu_vmxon(phys_addr); 1314 + ept_sync_global(); 1315 + } 1316 + 1317 + store_gdt(&__get_cpu_var(host_gdt)); 1383 1318 1384 1319 return 0; 1385 1320 } ··· 1403 1334 static void kvm_cpu_vmxoff(void) 1404 1335 { 1405 1336 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); 1406 - write_cr4(read_cr4() & ~X86_CR4_VMXE); 1407 1337 } 1408 1338 1409 1339 static void hardware_disable(void *garbage) 1410 1340 { 1411 - vmclear_local_vcpus(); 1412 - kvm_cpu_vmxoff(); 1341 + if (vmm_exclusive) { 1342 + vmclear_local_vcpus(); 
1343 + kvm_cpu_vmxoff(); 1344 + } 1345 + write_cr4(read_cr4() & ~X86_CR4_VMXE); 1413 1346 } 1414 1347 1415 1348 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, ··· 1610 1539 if (!cpu_has_vmx_vpid()) 1611 1540 enable_vpid = 0; 1612 1541 1613 - if (!cpu_has_vmx_ept()) { 1542 + if (!cpu_has_vmx_ept() || 1543 + !cpu_has_vmx_ept_4levels()) { 1614 1544 enable_ept = 0; 1615 1545 enable_unrestricted_guest = 0; 1616 1546 } ··· 1700 1628 gfn_t base_gfn; 1701 1629 1702 1630 slots = kvm_memslots(kvm); 1703 - base_gfn = kvm->memslots->memslots[0].base_gfn + 1631 + base_gfn = slots->memslots[0].base_gfn + 1704 1632 kvm->memslots->memslots[0].npages - 3; 1705 1633 return base_gfn << PAGE_SHIFT; 1706 1634 } ··· 1831 1759 1832 1760 static void vmx_flush_tlb(struct kvm_vcpu *vcpu) 1833 1761 { 1834 - vpid_sync_vcpu_all(to_vmx(vcpu)); 1835 - if (enable_ept) 1762 + vpid_sync_context(to_vmx(vcpu)); 1763 + if (enable_ept) { 1764 + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 1765 + return; 1836 1766 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); 1767 + } 1837 1768 } 1838 1769 1839 1770 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) ··· 2582 2507 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); 2583 2508 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 2584 2509 2585 - vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ 2510 + vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS); /* 22.2.3 */ 2586 2511 vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ 2587 2512 vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ 2588 2513 ··· 2674 2599 2675 2600 static int init_rmode(struct kvm *kvm) 2676 2601 { 2602 + int idx, ret = 0; 2603 + 2604 + idx = srcu_read_lock(&kvm->srcu); 2677 2605 if (!init_rmode_tss(kvm)) 2678 - return 0; 2606 + goto exit; 2679 2607 if (!init_rmode_identity_map(kvm)) 2680 - return 0; 2681 - return 1; 2608 + goto exit; 2609 + 2610 + ret = 1; 2611 + exit: 2612 + srcu_read_unlock(&kvm->srcu, idx); 2613 + return ret; 2682 2614 } 2683 2615 2684 2616 static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) 2685 2617 { 2686 2618 struct vcpu_vmx *vmx = to_vmx(vcpu); 2687 2619 u64 msr; 2688 - int ret, idx; 2620 + int ret; 2689 2621 2690 2622 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); 2691 - idx = srcu_read_lock(&vcpu->kvm->srcu); 2692 2623 if (!init_rmode(vmx->vcpu.kvm)) { 2693 2624 ret = -ENOMEM; 2694 2625 goto out; ··· 2711 2630 msr |= MSR_IA32_APICBASE_BSP; 2712 2631 kvm_set_apic_base(&vmx->vcpu, msr); 2713 2632 2714 - fx_init(&vmx->vcpu); 2633 + ret = fx_init(&vmx->vcpu); 2634 + if (ret != 0) 2635 + goto out; 2715 2636 2716 2637 seg_setup(VCPU_SREG_CS); 2717 2638 /* ··· 2796 2713 vmx_fpu_activate(&vmx->vcpu); 2797 2714 update_exception_bitmap(&vmx->vcpu); 2798 2715 2799 - vpid_sync_vcpu_all(vmx); 2716 + vpid_sync_context(vmx); 2800 2717 2801 2718 ret = 0; 2802 2719 ··· 2804 2721 vmx->emulation_required = 0; 2805 2722 2806 2723 out: 2807 - srcu_read_unlock(&vcpu->kvm->srcu, idx); 2808 2724 return ret; 2809 2725 } 2810 2726 ··· 2908 2826 { 2909 2827 if (!cpu_has_virtual_nmis()) 2910 2828 return to_vmx(vcpu)->soft_vnmi_blocked; 2911 - else 2912 - return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 2913 - GUEST_INTR_STATE_NMI); 2829 + return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; 2914 2830 } 2915 2831 2916 2832 static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) ··· 3150 3070 ++vcpu->stat.io_exits; 3151 3071 3152 3072 if (string || in) 3153 - return !(emulate_instruction(vcpu, 0, 0, 
0) == EMULATE_DO_MMIO); 3073 + return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; 3154 3074 3155 3075 port = exit_qualification >> 16; 3156 3076 size = (exit_qualification & 7) + 1; ··· 3170 3090 hypercall[2] = 0xc1; 3171 3091 } 3172 3092 3093 + static void complete_insn_gp(struct kvm_vcpu *vcpu, int err) 3094 + { 3095 + if (err) 3096 + kvm_inject_gp(vcpu, 0); 3097 + else 3098 + skip_emulated_instruction(vcpu); 3099 + } 3100 + 3173 3101 static int handle_cr(struct kvm_vcpu *vcpu) 3174 3102 { 3175 3103 unsigned long exit_qualification, val; 3176 3104 int cr; 3177 3105 int reg; 3106 + int err; 3178 3107 3179 3108 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3180 3109 cr = exit_qualification & 15; ··· 3194 3105 trace_kvm_cr_write(cr, val); 3195 3106 switch (cr) { 3196 3107 case 0: 3197 - kvm_set_cr0(vcpu, val); 3198 - skip_emulated_instruction(vcpu); 3108 + err = kvm_set_cr0(vcpu, val); 3109 + complete_insn_gp(vcpu, err); 3199 3110 return 1; 3200 3111 case 3: 3201 - kvm_set_cr3(vcpu, val); 3202 - skip_emulated_instruction(vcpu); 3112 + err = kvm_set_cr3(vcpu, val); 3113 + complete_insn_gp(vcpu, err); 3203 3114 return 1; 3204 3115 case 4: 3205 - kvm_set_cr4(vcpu, val); 3206 - skip_emulated_instruction(vcpu); 3116 + err = kvm_set_cr4(vcpu, val); 3117 + complete_insn_gp(vcpu, err); 3207 3118 return 1; 3208 3119 case 8: { 3209 3120 u8 cr8_prev = kvm_get_cr8(vcpu); ··· 3410 3321 static int handle_wbinvd(struct kvm_vcpu *vcpu) 3411 3322 { 3412 3323 skip_emulated_instruction(vcpu); 3413 - /* TODO: Add support for VT-d/pass-through device */ 3324 + kvm_emulate_wbinvd(vcpu); 3325 + return 1; 3326 + } 3327 + 3328 + static int handle_xsetbv(struct kvm_vcpu *vcpu) 3329 + { 3330 + u64 new_bv = kvm_read_edx_eax(vcpu); 3331 + u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); 3332 + 3333 + if (kvm_set_xcr(vcpu, index, new_bv) == 0) 3334 + skip_emulated_instruction(vcpu); 3414 3335 return 1; 3415 3336 } 3416 3337 3417 3338 static int handle_apic_access(struct kvm_vcpu *vcpu) 3418 3339 { 3419 - unsigned long exit_qualification; 3420 - enum emulation_result er; 3421 - unsigned long offset; 3422 - 3423 - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3424 - offset = exit_qualification & 0xffful; 3425 - 3426 - er = emulate_instruction(vcpu, 0, 0, 0); 3427 - 3428 - if (er != EMULATE_DONE) { 3429 - printk(KERN_ERR 3430 - "Fail to handle apic access vmexit! Offset is 0x%lx\n", 3431 - offset); 3432 - return -ENOEXEC; 3433 - } 3434 - return 1; 3340 + return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; 3435 3341 } 3436 3342 3437 3343 static int handle_task_switch(struct kvm_vcpu *vcpu) ··· 3638 3554 goto out; 3639 3555 } 3640 3556 3641 - if (err != EMULATE_DONE) { 3642 - vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3643 - vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 3644 - vcpu->run->internal.ndata = 0; 3645 - ret = 0; 3646 - goto out; 3647 - } 3557 + if (err != EMULATE_DONE) 3558 + return 0; 3648 3559 3649 3560 if (signal_pending(current)) 3650 3561 goto out; ··· 3702 3623 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 3703 3624 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 3704 3625 [EXIT_REASON_WBINVD] = handle_wbinvd, 3626 + [EXIT_REASON_XSETBV] = handle_xsetbv, 3705 3627 [EXIT_REASON_TASK_SWITCH] = handle_task_switch, 3706 3628 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 3707 3629 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, ··· 3735 3655 * to sync with guest real CR3. 
*/ 3736 3656 if (enable_ept && is_paging(vcpu)) 3737 3657 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3658 + 3659 + if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { 3660 + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3661 + vcpu->run->fail_entry.hardware_entry_failure_reason 3662 + = exit_reason; 3663 + return 0; 3664 + } 3738 3665 3739 3666 if (unlikely(vmx->fail)) { 3740 3667 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; ··· 3948 3861 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 3949 3862 vmx_set_interrupt_shadow(vcpu, 0); 3950 3863 3951 - /* 3952 - * Loading guest fpu may have cleared host cr0.ts 3953 - */ 3954 - vmcs_writel(HOST_CR0, read_cr0()); 3955 - 3956 3864 asm( 3957 3865 /* Store host registers */ 3958 3866 "push %%"R"dx; push %%"R"bp;" ··· 4083 4001 kmem_cache_free(kvm_vcpu_cache, vmx); 4084 4002 } 4085 4003 4004 + static inline void vmcs_init(struct vmcs *vmcs) 4005 + { 4006 + u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id())); 4007 + 4008 + if (!vmm_exclusive) 4009 + kvm_cpu_vmxon(phys_addr); 4010 + 4011 + vmcs_clear(vmcs); 4012 + 4013 + if (!vmm_exclusive) 4014 + kvm_cpu_vmxoff(); 4015 + } 4016 + 4086 4017 static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) 4087 4018 { 4088 4019 int err; ··· 4121 4026 if (!vmx->vmcs) 4122 4027 goto free_msrs; 4123 4028 4124 - vmcs_clear(vmx->vmcs); 4029 + vmcs_init(vmx->vmcs); 4125 4030 4126 4031 cpu = get_cpu(); 4127 4032 vmx_vcpu_load(&vmx->vcpu, cpu); ··· 4360 4265 .rdtscp_supported = vmx_rdtscp_supported, 4361 4266 4362 4267 .set_supported_cpuid = vmx_set_supported_cpuid, 4268 + 4269 + .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, 4363 4270 }; 4364 4271 4365 4272 static int __init vmx_init(void)
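
A central behavioural change in the vmx.c hunks above is the new vmm_exclusive switch. With the default vmm_exclusive=1 the CPU enters VMX root mode in hardware_enable() and stays there until hardware_disable(), as before; with vmm_exclusive=0, vmx_vcpu_load() issues VMXON through kvm_cpu_vmxon() and vmx_vcpu_put() issues VMCLEAR plus VMXOFF, so the CPU is only held in VMX operation while a vcpu is actually loaded, and CR4.VMXE is now cleared only in hardware_disable(). Assuming the standard kvm-intel module name, the non-exclusive mode is selected at load time with:

    modprobe kvm-intel vmm_exclusive=0

The other vmx.c themes visible above are the INVVPID fallback (vpid_sync_context() prefers single-context invalidation and falls back to a global flush when cpu_has_vmx_invvpid_single() is not available), refusing to enable EPT without 4-level page-walk support, saving the host GDT per cpu in hardware_enable() and restoring it when host state is reloaded, and the new XSETBV exit handler plus real WBINVD emulation.
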
+652 -528
arch/x86/kvm/x86.c
··· 6 6 * Copyright (C) 2006 Qumranet, Inc. 7 7 * Copyright (C) 2008 Qumranet, Inc. 8 8 * Copyright IBM Corporation, 2008 9 + * Copyright 2010 Red Hat, Inc. and/or its affilates. 9 10 * 10 11 * Authors: 11 12 * Avi Kivity <avi@qumranet.com> ··· 42 41 #include <linux/srcu.h> 43 42 #include <linux/slab.h> 44 43 #include <linux/perf_event.h> 44 + #include <linux/uaccess.h> 45 45 #include <trace/events/kvm.h> 46 46 47 47 #define CREATE_TRACE_POINTS 48 48 #include "trace.h" 49 49 50 50 #include <asm/debugreg.h> 51 - #include <asm/uaccess.h> 52 51 #include <asm/msr.h> 53 52 #include <asm/desc.h> 54 53 #include <asm/mtrr.h> 55 54 #include <asm/mce.h> 55 + #include <asm/i387.h> 56 + #include <asm/xcr.h> 56 57 57 58 #define MAX_IO_MSRS 256 58 59 #define CR0_RESERVED_BITS \ ··· 65 62 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ 66 63 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ 67 64 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ 65 + | X86_CR4_OSXSAVE \ 68 66 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) 69 67 70 68 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) ··· 150 146 { "largepages", VM_STAT(lpages) }, 151 147 { NULL } 152 148 }; 149 + 150 + u64 __read_mostly host_xcr0; 151 + 152 + static inline u32 bit(int bitno) 153 + { 154 + return 1 << (bitno & 31); 155 + } 153 156 154 157 static void kvm_on_user_return(struct user_return_notifier *urn) 155 158 { ··· 296 285 prev_nr = vcpu->arch.exception.nr; 297 286 if (prev_nr == DF_VECTOR) { 298 287 /* triple fault -> shutdown */ 299 - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 288 + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 300 289 return; 301 290 } 302 291 class1 = exception_class(prev_nr); ··· 425 414 return changed; 426 415 } 427 416 428 - void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 417 + int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 429 418 { 419 + unsigned long old_cr0 = kvm_read_cr0(vcpu); 420 + unsigned long update_bits = X86_CR0_PG | X86_CR0_WP | 421 + X86_CR0_CD | X86_CR0_NW; 422 + 430 423 cr0 |= X86_CR0_ET; 431 424 432 425 #ifdef CONFIG_X86_64 433 - if (cr0 & 0xffffffff00000000UL) { 434 - kvm_inject_gp(vcpu, 0); 435 - return; 436 - } 426 + if (cr0 & 0xffffffff00000000UL) 427 + return 1; 437 428 #endif 438 429 439 430 cr0 &= ~CR0_RESERVED_BITS; 440 431 441 - if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { 442 - kvm_inject_gp(vcpu, 0); 443 - return; 444 - } 432 + if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) 433 + return 1; 445 434 446 - if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { 447 - kvm_inject_gp(vcpu, 0); 448 - return; 449 - } 435 + if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) 436 + return 1; 450 437 451 438 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 452 439 #ifdef CONFIG_X86_64 453 440 if ((vcpu->arch.efer & EFER_LME)) { 454 441 int cs_db, cs_l; 455 442 456 - if (!is_pae(vcpu)) { 457 - kvm_inject_gp(vcpu, 0); 458 - return; 459 - } 443 + if (!is_pae(vcpu)) 444 + return 1; 460 445 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 461 - if (cs_l) { 462 - kvm_inject_gp(vcpu, 0); 463 - return; 464 - 465 - } 446 + if (cs_l) 447 + return 1; 466 448 } else 467 449 #endif 468 - if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 469 - kvm_inject_gp(vcpu, 0); 470 - return; 471 - } 472 - 450 + if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) 451 + return 1; 473 452 } 474 453 475 454 kvm_x86_ops->set_cr0(vcpu, cr0); 476 455 477 - kvm_mmu_reset_context(vcpu); 478 - return; 456 + if ((cr0 ^ old_cr0) & update_bits) 457 + kvm_mmu_reset_context(vcpu); 458 + return 
0; 479 459 } 480 460 EXPORT_SYMBOL_GPL(kvm_set_cr0); 481 461 482 462 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 483 463 { 484 - kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); 464 + (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); 485 465 } 486 466 EXPORT_SYMBOL_GPL(kvm_lmsw); 487 467 488 - void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 468 + int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) 469 + { 470 + u64 xcr0; 471 + 472 + /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ 473 + if (index != XCR_XFEATURE_ENABLED_MASK) 474 + return 1; 475 + xcr0 = xcr; 476 + if (kvm_x86_ops->get_cpl(vcpu) != 0) 477 + return 1; 478 + if (!(xcr0 & XSTATE_FP)) 479 + return 1; 480 + if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) 481 + return 1; 482 + if (xcr0 & ~host_xcr0) 483 + return 1; 484 + vcpu->arch.xcr0 = xcr0; 485 + vcpu->guest_xcr0_loaded = 0; 486 + return 0; 487 + } 488 + 489 + int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) 490 + { 491 + if (__kvm_set_xcr(vcpu, index, xcr)) { 492 + kvm_inject_gp(vcpu, 0); 493 + return 1; 494 + } 495 + return 0; 496 + } 497 + EXPORT_SYMBOL_GPL(kvm_set_xcr); 498 + 499 + static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) 500 + { 501 + struct kvm_cpuid_entry2 *best; 502 + 503 + best = kvm_find_cpuid_entry(vcpu, 1, 0); 504 + return best && (best->ecx & bit(X86_FEATURE_XSAVE)); 505 + } 506 + 507 + static void update_cpuid(struct kvm_vcpu *vcpu) 508 + { 509 + struct kvm_cpuid_entry2 *best; 510 + 511 + best = kvm_find_cpuid_entry(vcpu, 1, 0); 512 + if (!best) 513 + return; 514 + 515 + /* Update OSXSAVE bit */ 516 + if (cpu_has_xsave && best->function == 0x1) { 517 + best->ecx &= ~(bit(X86_FEATURE_OSXSAVE)); 518 + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) 519 + best->ecx |= bit(X86_FEATURE_OSXSAVE); 520 + } 521 + } 522 + 523 + int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 489 524 { 490 525 unsigned long old_cr4 = kvm_read_cr4(vcpu); 491 526 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; 492 527 493 - if (cr4 & CR4_RESERVED_BITS) { 494 - kvm_inject_gp(vcpu, 0); 495 - return; 496 - } 528 + if (cr4 & CR4_RESERVED_BITS) 529 + return 1; 530 + 531 + if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) 532 + return 1; 497 533 498 534 if (is_long_mode(vcpu)) { 499 - if (!(cr4 & X86_CR4_PAE)) { 500 - kvm_inject_gp(vcpu, 0); 501 - return; 502 - } 535 + if (!(cr4 & X86_CR4_PAE)) 536 + return 1; 503 537 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 504 538 && ((cr4 ^ old_cr4) & pdptr_bits) 505 - && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 506 - kvm_inject_gp(vcpu, 0); 507 - return; 508 - } 539 + && !load_pdptrs(vcpu, vcpu->arch.cr3)) 540 + return 1; 509 541 510 - if (cr4 & X86_CR4_VMXE) { 511 - kvm_inject_gp(vcpu, 0); 512 - return; 513 - } 542 + if (cr4 & X86_CR4_VMXE) 543 + return 1; 544 + 514 545 kvm_x86_ops->set_cr4(vcpu, cr4); 515 - vcpu->arch.cr4 = cr4; 516 - kvm_mmu_reset_context(vcpu); 546 + 547 + if ((cr4 ^ old_cr4) & pdptr_bits) 548 + kvm_mmu_reset_context(vcpu); 549 + 550 + if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) 551 + update_cpuid(vcpu); 552 + 553 + return 0; 517 554 } 518 555 EXPORT_SYMBOL_GPL(kvm_set_cr4); 519 556 520 - void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 557 + int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 521 558 { 522 559 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 523 560 kvm_mmu_sync_roots(vcpu); 524 561 kvm_mmu_flush_tlb(vcpu); 525 - return; 562 + return 0; 526 563 } 527 564 
528 565 if (is_long_mode(vcpu)) { 529 - if (cr3 & CR3_L_MODE_RESERVED_BITS) { 530 - kvm_inject_gp(vcpu, 0); 531 - return; 532 - } 566 + if (cr3 & CR3_L_MODE_RESERVED_BITS) 567 + return 1; 533 568 } else { 534 569 if (is_pae(vcpu)) { 535 - if (cr3 & CR3_PAE_RESERVED_BITS) { 536 - kvm_inject_gp(vcpu, 0); 537 - return; 538 - } 539 - if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { 540 - kvm_inject_gp(vcpu, 0); 541 - return; 542 - } 570 + if (cr3 & CR3_PAE_RESERVED_BITS) 571 + return 1; 572 + if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) 573 + return 1; 543 574 } 544 575 /* 545 576 * We don't check reserved bits in nonpae mode, because ··· 599 546 * to debug) behavior on the guest side. 600 547 */ 601 548 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 602 - kvm_inject_gp(vcpu, 0); 603 - else { 604 - vcpu->arch.cr3 = cr3; 605 - vcpu->arch.mmu.new_cr3(vcpu); 606 - } 549 + return 1; 550 + vcpu->arch.cr3 = cr3; 551 + vcpu->arch.mmu.new_cr3(vcpu); 552 + return 0; 607 553 } 608 554 EXPORT_SYMBOL_GPL(kvm_set_cr3); 609 555 610 - void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 556 + int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 611 557 { 612 - if (cr8 & CR8_RESERVED_BITS) { 613 - kvm_inject_gp(vcpu, 0); 614 - return; 615 - } 558 + if (cr8 & CR8_RESERVED_BITS) 559 + return 1; 616 560 if (irqchip_in_kernel(vcpu->kvm)) 617 561 kvm_lapic_set_tpr(vcpu, cr8); 618 562 else 619 563 vcpu->arch.cr8 = cr8; 564 + return 0; 565 + } 566 + 567 + void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 568 + { 569 + if (__kvm_set_cr8(vcpu, cr8)) 570 + kvm_inject_gp(vcpu, 0); 620 571 } 621 572 EXPORT_SYMBOL_GPL(kvm_set_cr8); 622 573 ··· 633 576 } 634 577 EXPORT_SYMBOL_GPL(kvm_get_cr8); 635 578 636 - int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) 579 + static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) 637 580 { 638 581 switch (dr) { 639 582 case 0 ... 
3: ··· 642 585 vcpu->arch.eff_db[dr] = val; 643 586 break; 644 587 case 4: 645 - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { 646 - kvm_queue_exception(vcpu, UD_VECTOR); 647 - return 1; 648 - } 588 + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) 589 + return 1; /* #UD */ 649 590 /* fall through */ 650 591 case 6: 651 - if (val & 0xffffffff00000000ULL) { 652 - kvm_inject_gp(vcpu, 0); 653 - return 1; 654 - } 592 + if (val & 0xffffffff00000000ULL) 593 + return -1; /* #GP */ 655 594 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; 656 595 break; 657 596 case 5: 658 - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { 659 - kvm_queue_exception(vcpu, UD_VECTOR); 660 - return 1; 661 - } 597 + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) 598 + return 1; /* #UD */ 662 599 /* fall through */ 663 600 default: /* 7 */ 664 - if (val & 0xffffffff00000000ULL) { 665 - kvm_inject_gp(vcpu, 0); 666 - return 1; 667 - } 601 + if (val & 0xffffffff00000000ULL) 602 + return -1; /* #GP */ 668 603 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; 669 604 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { 670 605 kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7); ··· 667 618 668 619 return 0; 669 620 } 621 + 622 + int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) 623 + { 624 + int res; 625 + 626 + res = __kvm_set_dr(vcpu, dr, val); 627 + if (res > 0) 628 + kvm_queue_exception(vcpu, UD_VECTOR); 629 + else if (res < 0) 630 + kvm_inject_gp(vcpu, 0); 631 + 632 + return res; 633 + } 670 634 EXPORT_SYMBOL_GPL(kvm_set_dr); 671 635 672 - int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) 636 + static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) 673 637 { 674 638 switch (dr) { 675 639 case 0 ... 3: 676 640 *val = vcpu->arch.db[dr]; 677 641 break; 678 642 case 4: 679 - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { 680 - kvm_queue_exception(vcpu, UD_VECTOR); 643 + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) 681 644 return 1; 682 - } 683 645 /* fall through */ 684 646 case 6: 685 647 *val = vcpu->arch.dr6; 686 648 break; 687 649 case 5: 688 - if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) { 689 - kvm_queue_exception(vcpu, UD_VECTOR); 650 + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) 690 651 return 1; 691 - } 692 652 /* fall through */ 693 653 default: /* 7 */ 694 654 *val = vcpu->arch.dr7; ··· 706 648 707 649 return 0; 708 650 } 709 - EXPORT_SYMBOL_GPL(kvm_get_dr); 710 651 711 - static inline u32 bit(int bitno) 652 + int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) 712 653 { 713 - return 1 << (bitno & 31); 654 + if (_kvm_get_dr(vcpu, dr, val)) { 655 + kvm_queue_exception(vcpu, UD_VECTOR); 656 + return 1; 657 + } 658 + return 0; 714 659 } 660 + EXPORT_SYMBOL_GPL(kvm_get_dr); 715 661 716 662 /* 717 663 * List of msr numbers which we expose to userspace through KVM_GET_MSRS ··· 744 682 745 683 static u32 emulated_msrs[] = { 746 684 MSR_IA32_MISC_ENABLE, 685 + MSR_IA32_MCG_STATUS, 686 + MSR_IA32_MCG_CTL, 747 687 }; 748 688 749 689 static int set_efer(struct kvm_vcpu *vcpu, u64 efer) 750 690 { 691 + u64 old_efer = vcpu->arch.efer; 692 + 751 693 if (efer & efer_reserved_bits) 752 694 return 1; 753 695 ··· 780 714 781 715 kvm_x86_ops->set_efer(vcpu, efer); 782 716 783 - vcpu->arch.efer = efer; 784 - 785 717 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 786 718 kvm_mmu_reset_context(vcpu); 719 + 720 + /* Update reserved bits */ 721 + if ((efer ^ old_efer) & EFER_NX) 722 + kvm_mmu_reset_context(vcpu); 787 723 788 724 return 0; 789 725 } ··· 950 882 951 883 if (!vcpu->time_page) 952 
884 return 0; 953 - set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); 885 + kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v); 954 886 return 1; 955 887 } 956 888 ··· 1592 1524 { 1593 1525 int i, idx; 1594 1526 1595 - vcpu_load(vcpu); 1596 - 1597 1527 idx = srcu_read_lock(&vcpu->kvm->srcu); 1598 1528 for (i = 0; i < msrs->nmsrs; ++i) 1599 1529 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 1600 1530 break; 1601 1531 srcu_read_unlock(&vcpu->kvm->srcu, idx); 1602 - 1603 - vcpu_put(vcpu); 1604 1532 1605 1533 return i; 1606 1534 } ··· 1682 1618 case KVM_CAP_PCI_SEGMENT: 1683 1619 case KVM_CAP_DEBUGREGS: 1684 1620 case KVM_CAP_X86_ROBUST_SINGLESTEP: 1621 + case KVM_CAP_XSAVE: 1685 1622 r = 1; 1686 1623 break; 1687 1624 case KVM_CAP_COALESCED_MMIO: ··· 1705 1640 break; 1706 1641 case KVM_CAP_MCE: 1707 1642 r = KVM_MAX_MCE_BANKS; 1643 + break; 1644 + case KVM_CAP_XCRS: 1645 + r = cpu_has_xsave; 1708 1646 break; 1709 1647 default: 1710 1648 r = 0; ··· 1785 1717 return r; 1786 1718 } 1787 1719 1720 + static void wbinvd_ipi(void *garbage) 1721 + { 1722 + wbinvd(); 1723 + } 1724 + 1725 + static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) 1726 + { 1727 + return vcpu->kvm->arch.iommu_domain && 1728 + !(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY); 1729 + } 1730 + 1788 1731 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1789 1732 { 1733 + /* Address WBINVD may be executed by guest */ 1734 + if (need_emulate_wbinvd(vcpu)) { 1735 + if (kvm_x86_ops->has_wbinvd_exit()) 1736 + cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask); 1737 + else if (vcpu->cpu != -1 && vcpu->cpu != cpu) 1738 + smp_call_function_single(vcpu->cpu, 1739 + wbinvd_ipi, NULL, 1); 1740 + } 1741 + 1790 1742 kvm_x86_ops->vcpu_load(vcpu, cpu); 1791 1743 if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { 1792 1744 unsigned long khz = cpufreq_quick_get(cpu); ··· 1819 1731 1820 1732 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 1821 1733 { 1822 - kvm_put_guest_fpu(vcpu); 1823 1734 kvm_x86_ops->vcpu_put(vcpu); 1735 + kvm_put_guest_fpu(vcpu); 1824 1736 } 1825 1737 1826 1738 static int is_efer_nx(void) ··· 1869 1781 if (copy_from_user(cpuid_entries, entries, 1870 1782 cpuid->nent * sizeof(struct kvm_cpuid_entry))) 1871 1783 goto out_free; 1872 - vcpu_load(vcpu); 1873 1784 for (i = 0; i < cpuid->nent; i++) { 1874 1785 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; 1875 1786 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; ··· 1886 1799 r = 0; 1887 1800 kvm_apic_set_version(vcpu); 1888 1801 kvm_x86_ops->cpuid_update(vcpu); 1889 - vcpu_put(vcpu); 1802 + update_cpuid(vcpu); 1890 1803 1891 1804 out_free: 1892 1805 vfree(cpuid_entries); ··· 1907 1820 if (copy_from_user(&vcpu->arch.cpuid_entries, entries, 1908 1821 cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 1909 1822 goto out; 1910 - vcpu_load(vcpu); 1911 1823 vcpu->arch.cpuid_nent = cpuid->nent; 1912 1824 kvm_apic_set_version(vcpu); 1913 1825 kvm_x86_ops->cpuid_update(vcpu); 1914 - vcpu_put(vcpu); 1826 + update_cpuid(vcpu); 1915 1827 return 0; 1916 1828 1917 1829 out: ··· 1923 1837 { 1924 1838 int r; 1925 1839 1926 - vcpu_load(vcpu); 1927 1840 r = -E2BIG; 1928 1841 if (cpuid->nent < vcpu->arch.cpuid_nent) 1929 1842 goto out; ··· 1934 1849 1935 1850 out: 1936 1851 cpuid->nent = vcpu->arch.cpuid_nent; 1937 - vcpu_put(vcpu); 1938 1852 return r; 1939 1853 } 1940 1854 ··· 1985 1901 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); 1986 1902 /* cpuid 1.ecx */ 1987 1903 const u32 kvm_supported_word4_x86_features = 1988 - F(XMM3) | 0 /* Reserved, DTES64, 
MONITOR */ | 1904 + F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | 1989 1905 0 /* DS-CPL, VMX, SMX, EST */ | 1990 1906 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | 1991 1907 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 1992 1908 0 /* Reserved, DCA */ | F(XMM4_1) | 1993 1909 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 1994 - 0 /* Reserved, XSAVE, OSXSAVE */; 1910 + 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX); 1995 1911 /* cpuid 0x80000001.ecx */ 1996 1912 const u32 kvm_supported_word6_x86_features = 1997 1913 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | ··· 2006 1922 2007 1923 switch (function) { 2008 1924 case 0: 2009 - entry->eax = min(entry->eax, (u32)0xb); 1925 + entry->eax = min(entry->eax, (u32)0xd); 2010 1926 break; 2011 1927 case 1: 2012 1928 entry->edx &= kvm_supported_word0_x86_features; ··· 2056 1972 for (i = 1; *nent < maxnent; ++i) { 2057 1973 level_type = entry[i - 1].ecx & 0xff00; 2058 1974 if (!level_type) 1975 + break; 1976 + do_cpuid_1_ent(&entry[i], function, i); 1977 + entry[i].flags |= 1978 + KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1979 + ++*nent; 1980 + } 1981 + break; 1982 + } 1983 + case 0xd: { 1984 + int i; 1985 + 1986 + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 1987 + for (i = 1; *nent < maxnent; ++i) { 1988 + if (entry[i - 1].eax == 0 && i != 2) 2059 1989 break; 2060 1990 do_cpuid_1_ent(&entry[i], function, i); 2061 1991 entry[i].flags |= ··· 2179 2081 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, 2180 2082 struct kvm_lapic_state *s) 2181 2083 { 2182 - vcpu_load(vcpu); 2183 2084 memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s); 2184 - vcpu_put(vcpu); 2185 2085 2186 2086 return 0; 2187 2087 } ··· 2187 2091 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, 2188 2092 struct kvm_lapic_state *s) 2189 2093 { 2190 - vcpu_load(vcpu); 2191 2094 memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s); 2192 2095 kvm_apic_post_state_restore(vcpu); 2193 2096 update_cr8_intercept(vcpu); 2194 - vcpu_put(vcpu); 2195 2097 2196 2098 return 0; 2197 2099 } ··· 2201 2107 return -EINVAL; 2202 2108 if (irqchip_in_kernel(vcpu->kvm)) 2203 2109 return -ENXIO; 2204 - vcpu_load(vcpu); 2205 2110 2206 2111 kvm_queue_interrupt(vcpu, irq->irq, false); 2207 - 2208 - vcpu_put(vcpu); 2209 2112 2210 2113 return 0; 2211 2114 } 2212 2115 2213 2116 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) 2214 2117 { 2215 - vcpu_load(vcpu); 2216 2118 kvm_inject_nmi(vcpu); 2217 - vcpu_put(vcpu); 2218 2119 2219 2120 return 0; 2220 2121 } ··· 2229 2140 int r; 2230 2141 unsigned bank_num = mcg_cap & 0xff, bank; 2231 2142 2232 - vcpu_load(vcpu); 2233 2143 r = -EINVAL; 2234 2144 if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) 2235 2145 goto out; ··· 2243 2155 for (bank = 0; bank < bank_num; bank++) 2244 2156 vcpu->arch.mce_banks[bank*4] = ~(u64)0; 2245 2157 out: 2246 - vcpu_put(vcpu); 2247 2158 return r; 2248 2159 } 2249 2160 ··· 2275 2188 printk(KERN_DEBUG "kvm: set_mce: " 2276 2189 "injects mce exception while " 2277 2190 "previous one is in progress!\n"); 2278 - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); 2191 + kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 2279 2192 return 0; 2280 2193 } 2281 2194 if (banks[1] & MCI_STATUS_VAL) ··· 2300 2213 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, 2301 2214 struct kvm_vcpu_events *events) 2302 2215 { 2303 - vcpu_load(vcpu); 2304 - 2305 2216 events->exception.injected = 2306 2217 vcpu->arch.exception.pending && 2307 2218 
!kvm_exception_is_soft(vcpu->arch.exception.nr); ··· 2324 2239 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING 2325 2240 | KVM_VCPUEVENT_VALID_SIPI_VECTOR 2326 2241 | KVM_VCPUEVENT_VALID_SHADOW); 2327 - 2328 - vcpu_put(vcpu); 2329 2242 } 2330 2243 2331 2244 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, ··· 2333 2250 | KVM_VCPUEVENT_VALID_SIPI_VECTOR 2334 2251 | KVM_VCPUEVENT_VALID_SHADOW)) 2335 2252 return -EINVAL; 2336 - 2337 - vcpu_load(vcpu); 2338 2253 2339 2254 vcpu->arch.exception.pending = events->exception.injected; 2340 2255 vcpu->arch.exception.nr = events->exception.nr; ··· 2356 2275 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) 2357 2276 vcpu->arch.sipi_vector = events->sipi_vector; 2358 2277 2359 - vcpu_put(vcpu); 2360 - 2361 2278 return 0; 2362 2279 } 2363 2280 2364 2281 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, 2365 2282 struct kvm_debugregs *dbgregs) 2366 2283 { 2367 - vcpu_load(vcpu); 2368 - 2369 2284 memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db)); 2370 2285 dbgregs->dr6 = vcpu->arch.dr6; 2371 2286 dbgregs->dr7 = vcpu->arch.dr7; 2372 2287 dbgregs->flags = 0; 2373 - 2374 - vcpu_put(vcpu); 2375 2288 } 2376 2289 2377 2290 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, ··· 2374 2299 if (dbgregs->flags) 2375 2300 return -EINVAL; 2376 2301 2377 - vcpu_load(vcpu); 2378 - 2379 2302 memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); 2380 2303 vcpu->arch.dr6 = dbgregs->dr6; 2381 2304 vcpu->arch.dr7 = dbgregs->dr7; 2382 2305 2383 - vcpu_put(vcpu); 2384 - 2385 2306 return 0; 2307 + } 2308 + 2309 + static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, 2310 + struct kvm_xsave *guest_xsave) 2311 + { 2312 + if (cpu_has_xsave) 2313 + memcpy(guest_xsave->region, 2314 + &vcpu->arch.guest_fpu.state->xsave, 2315 + sizeof(struct xsave_struct)); 2316 + else { 2317 + memcpy(guest_xsave->region, 2318 + &vcpu->arch.guest_fpu.state->fxsave, 2319 + sizeof(struct i387_fxsave_struct)); 2320 + *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = 2321 + XSTATE_FPSSE; 2322 + } 2323 + } 2324 + 2325 + static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, 2326 + struct kvm_xsave *guest_xsave) 2327 + { 2328 + u64 xstate_bv = 2329 + *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; 2330 + 2331 + if (cpu_has_xsave) 2332 + memcpy(&vcpu->arch.guest_fpu.state->xsave, 2333 + guest_xsave->region, sizeof(struct xsave_struct)); 2334 + else { 2335 + if (xstate_bv & ~XSTATE_FPSSE) 2336 + return -EINVAL; 2337 + memcpy(&vcpu->arch.guest_fpu.state->fxsave, 2338 + guest_xsave->region, sizeof(struct i387_fxsave_struct)); 2339 + } 2340 + return 0; 2341 + } 2342 + 2343 + static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, 2344 + struct kvm_xcrs *guest_xcrs) 2345 + { 2346 + if (!cpu_has_xsave) { 2347 + guest_xcrs->nr_xcrs = 0; 2348 + return; 2349 + } 2350 + 2351 + guest_xcrs->nr_xcrs = 1; 2352 + guest_xcrs->flags = 0; 2353 + guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK; 2354 + guest_xcrs->xcrs[0].value = vcpu->arch.xcr0; 2355 + } 2356 + 2357 + static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, 2358 + struct kvm_xcrs *guest_xcrs) 2359 + { 2360 + int i, r = 0; 2361 + 2362 + if (!cpu_has_xsave) 2363 + return -EINVAL; 2364 + 2365 + if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags) 2366 + return -EINVAL; 2367 + 2368 + for (i = 0; i < guest_xcrs->nr_xcrs; i++) 2369 + /* Only support XCR0 currently */ 2370 + if (guest_xcrs->xcrs[0].xcr == 
XCR_XFEATURE_ENABLED_MASK) { 2371 + r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, 2372 + guest_xcrs->xcrs[0].value); 2373 + break; 2374 + } 2375 + if (r) 2376 + r = -EINVAL; 2377 + return r; 2386 2378 } 2387 2379 2388 2380 long kvm_arch_vcpu_ioctl(struct file *filp, ··· 2458 2316 struct kvm_vcpu *vcpu = filp->private_data; 2459 2317 void __user *argp = (void __user *)arg; 2460 2318 int r; 2461 - struct kvm_lapic_state *lapic = NULL; 2319 + union { 2320 + struct kvm_lapic_state *lapic; 2321 + struct kvm_xsave *xsave; 2322 + struct kvm_xcrs *xcrs; 2323 + void *buffer; 2324 + } u; 2462 2325 2326 + u.buffer = NULL; 2463 2327 switch (ioctl) { 2464 2328 case KVM_GET_LAPIC: { 2465 2329 r = -EINVAL; 2466 2330 if (!vcpu->arch.apic) 2467 2331 goto out; 2468 - lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 2332 + u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 2469 2333 2470 2334 r = -ENOMEM; 2471 - if (!lapic) 2335 + if (!u.lapic) 2472 2336 goto out; 2473 - r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); 2337 + r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic); 2474 2338 if (r) 2475 2339 goto out; 2476 2340 r = -EFAULT; 2477 - if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) 2341 + if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state))) 2478 2342 goto out; 2479 2343 r = 0; 2480 2344 break; ··· 2489 2341 r = -EINVAL; 2490 2342 if (!vcpu->arch.apic) 2491 2343 goto out; 2492 - lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 2344 + u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 2493 2345 r = -ENOMEM; 2494 - if (!lapic) 2346 + if (!u.lapic) 2495 2347 goto out; 2496 2348 r = -EFAULT; 2497 - if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) 2349 + if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state))) 2498 2350 goto out; 2499 - r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); 2351 + r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); 2500 2352 if (r) 2501 2353 goto out; 2502 2354 r = 0; ··· 2612 2464 r = -EFAULT; 2613 2465 if (copy_from_user(&mce, argp, sizeof mce)) 2614 2466 goto out; 2615 - vcpu_load(vcpu); 2616 2467 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); 2617 - vcpu_put(vcpu); 2618 2468 break; 2619 2469 } 2620 2470 case KVM_GET_VCPU_EVENTS: { ··· 2659 2513 r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs); 2660 2514 break; 2661 2515 } 2516 + case KVM_GET_XSAVE: { 2517 + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); 2518 + r = -ENOMEM; 2519 + if (!u.xsave) 2520 + break; 2521 + 2522 + kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave); 2523 + 2524 + r = -EFAULT; 2525 + if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave))) 2526 + break; 2527 + r = 0; 2528 + break; 2529 + } 2530 + case KVM_SET_XSAVE: { 2531 + u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); 2532 + r = -ENOMEM; 2533 + if (!u.xsave) 2534 + break; 2535 + 2536 + r = -EFAULT; 2537 + if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave))) 2538 + break; 2539 + 2540 + r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); 2541 + break; 2542 + } 2543 + case KVM_GET_XCRS: { 2544 + u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); 2545 + r = -ENOMEM; 2546 + if (!u.xcrs) 2547 + break; 2548 + 2549 + kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs); 2550 + 2551 + r = -EFAULT; 2552 + if (copy_to_user(argp, u.xcrs, 2553 + sizeof(struct kvm_xcrs))) 2554 + break; 2555 + r = 0; 2556 + break; 2557 + } 2558 + case KVM_SET_XCRS: { 2559 + u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); 2560 + r = -ENOMEM; 2561 + if (!u.xcrs) 2562 + break; 2563 + 
2564 + r = -EFAULT; 2565 + if (copy_from_user(u.xcrs, argp, 2566 + sizeof(struct kvm_xcrs))) 2567 + break; 2568 + 2569 + r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); 2570 + break; 2571 + } 2662 2572 default: 2663 2573 r = -EINVAL; 2664 2574 } 2665 2575 out: 2666 - kfree(lapic); 2576 + kfree(u.buffer); 2667 2577 return r; 2668 2578 } 2669 2579 ··· 2760 2558 static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) 2761 2559 { 2762 2560 return kvm->arch.n_alloc_mmu_pages; 2763 - } 2764 - 2765 - gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn) 2766 - { 2767 - int i; 2768 - struct kvm_mem_alias *alias; 2769 - struct kvm_mem_aliases *aliases; 2770 - 2771 - aliases = kvm_aliases(kvm); 2772 - 2773 - for (i = 0; i < aliases->naliases; ++i) { 2774 - alias = &aliases->aliases[i]; 2775 - if (alias->flags & KVM_ALIAS_INVALID) 2776 - continue; 2777 - if (gfn >= alias->base_gfn 2778 - && gfn < alias->base_gfn + alias->npages) 2779 - return alias->target_gfn + gfn - alias->base_gfn; 2780 - } 2781 - return gfn; 2782 - } 2783 - 2784 - gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 2785 - { 2786 - int i; 2787 - struct kvm_mem_alias *alias; 2788 - struct kvm_mem_aliases *aliases; 2789 - 2790 - aliases = kvm_aliases(kvm); 2791 - 2792 - for (i = 0; i < aliases->naliases; ++i) { 2793 - alias = &aliases->aliases[i]; 2794 - if (gfn >= alias->base_gfn 2795 - && gfn < alias->base_gfn + alias->npages) 2796 - return alias->target_gfn + gfn - alias->base_gfn; 2797 - } 2798 - return gfn; 2799 - } 2800 - 2801 - /* 2802 - * Set a new alias region. Aliases map a portion of physical memory into 2803 - * another portion. This is useful for memory windows, for example the PC 2804 - * VGA region. 2805 - */ 2806 - static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, 2807 - struct kvm_memory_alias *alias) 2808 - { 2809 - int r, n; 2810 - struct kvm_mem_alias *p; 2811 - struct kvm_mem_aliases *aliases, *old_aliases; 2812 - 2813 - r = -EINVAL; 2814 - /* General sanity checks */ 2815 - if (alias->memory_size & (PAGE_SIZE - 1)) 2816 - goto out; 2817 - if (alias->guest_phys_addr & (PAGE_SIZE - 1)) 2818 - goto out; 2819 - if (alias->slot >= KVM_ALIAS_SLOTS) 2820 - goto out; 2821 - if (alias->guest_phys_addr + alias->memory_size 2822 - < alias->guest_phys_addr) 2823 - goto out; 2824 - if (alias->target_phys_addr + alias->memory_size 2825 - < alias->target_phys_addr) 2826 - goto out; 2827 - 2828 - r = -ENOMEM; 2829 - aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); 2830 - if (!aliases) 2831 - goto out; 2832 - 2833 - mutex_lock(&kvm->slots_lock); 2834 - 2835 - /* invalidate any gfn reference in case of deletion/shrinking */ 2836 - memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); 2837 - aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID; 2838 - old_aliases = kvm->arch.aliases; 2839 - rcu_assign_pointer(kvm->arch.aliases, aliases); 2840 - synchronize_srcu_expedited(&kvm->srcu); 2841 - kvm_mmu_zap_all(kvm); 2842 - kfree(old_aliases); 2843 - 2844 - r = -ENOMEM; 2845 - aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); 2846 - if (!aliases) 2847 - goto out_unlock; 2848 - 2849 - memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); 2850 - 2851 - p = &aliases->aliases[alias->slot]; 2852 - p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 2853 - p->npages = alias->memory_size >> PAGE_SHIFT; 2854 - p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; 2855 - p->flags &= ~(KVM_ALIAS_INVALID); 2856 - 2857 - for (n = KVM_ALIAS_SLOTS; n > 0; --n) 2858 - if 
(aliases->aliases[n - 1].npages) 2859 - break; 2860 - aliases->naliases = n; 2861 - 2862 - old_aliases = kvm->arch.aliases; 2863 - rcu_assign_pointer(kvm->arch.aliases, aliases); 2864 - synchronize_srcu_expedited(&kvm->srcu); 2865 - kfree(old_aliases); 2866 - r = 0; 2867 - 2868 - out_unlock: 2869 - mutex_unlock(&kvm->slots_lock); 2870 - out: 2871 - return r; 2872 2561 } 2873 2562 2874 2563 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) ··· 2890 2797 struct kvm_memory_slot *memslot; 2891 2798 unsigned long n; 2892 2799 unsigned long is_dirty = 0; 2893 - unsigned long *dirty_bitmap = NULL; 2894 2800 2895 2801 mutex_lock(&kvm->slots_lock); 2896 2802 ··· 2904 2812 2905 2813 n = kvm_dirty_bitmap_bytes(memslot); 2906 2814 2907 - r = -ENOMEM; 2908 - dirty_bitmap = vmalloc(n); 2909 - if (!dirty_bitmap) 2910 - goto out; 2911 - memset(dirty_bitmap, 0, n); 2912 - 2913 2815 for (i = 0; !is_dirty && i < n/sizeof(long); i++) 2914 2816 is_dirty = memslot->dirty_bitmap[i]; 2915 2817 2916 2818 /* If nothing is dirty, don't bother messing with page tables. */ 2917 2819 if (is_dirty) { 2918 2820 struct kvm_memslots *slots, *old_slots; 2821 + unsigned long *dirty_bitmap; 2919 2822 2920 2823 spin_lock(&kvm->mmu_lock); 2921 2824 kvm_mmu_slot_remove_write_access(kvm, log->slot); 2922 2825 spin_unlock(&kvm->mmu_lock); 2923 2826 2924 - slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 2925 - if (!slots) 2926 - goto out_free; 2827 + r = -ENOMEM; 2828 + dirty_bitmap = vmalloc(n); 2829 + if (!dirty_bitmap) 2830 + goto out; 2831 + memset(dirty_bitmap, 0, n); 2927 2832 2833 + r = -ENOMEM; 2834 + slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 2835 + if (!slots) { 2836 + vfree(dirty_bitmap); 2837 + goto out; 2838 + } 2928 2839 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 2929 2840 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; 2930 2841 ··· 2936 2841 synchronize_srcu_expedited(&kvm->srcu); 2937 2842 dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; 2938 2843 kfree(old_slots); 2844 + 2845 + r = -EFAULT; 2846 + if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) { 2847 + vfree(dirty_bitmap); 2848 + goto out; 2849 + } 2850 + vfree(dirty_bitmap); 2851 + } else { 2852 + r = -EFAULT; 2853 + if (clear_user(log->dirty_bitmap, n)) 2854 + goto out; 2939 2855 } 2940 2856 2941 2857 r = 0; 2942 - if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) 2943 - r = -EFAULT; 2944 - out_free: 2945 - vfree(dirty_bitmap); 2946 2858 out: 2947 2859 mutex_unlock(&kvm->slots_lock); 2948 2860 return r; ··· 2969 2867 union { 2970 2868 struct kvm_pit_state ps; 2971 2869 struct kvm_pit_state2 ps2; 2972 - struct kvm_memory_alias alias; 2973 2870 struct kvm_pit_config pit_config; 2974 2871 } u; 2975 2872 ··· 2989 2888 goto out; 2990 2889 break; 2991 2890 } 2992 - case KVM_SET_MEMORY_REGION: { 2993 - struct kvm_memory_region kvm_mem; 2994 - struct kvm_userspace_memory_region kvm_userspace_mem; 2995 - 2996 - r = -EFAULT; 2997 - if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) 2998 - goto out; 2999 - kvm_userspace_mem.slot = kvm_mem.slot; 3000 - kvm_userspace_mem.flags = kvm_mem.flags; 3001 - kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr; 3002 - kvm_userspace_mem.memory_size = kvm_mem.memory_size; 3003 - r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0); 3004 - if (r) 3005 - goto out; 3006 - break; 3007 - } 3008 2891 case KVM_SET_NR_MMU_PAGES: 3009 2892 r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg); 3010 2893 if (r) ··· 2996 2911 break; 2997 
2912 case KVM_GET_NR_MMU_PAGES: 2998 2913 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 2999 - break; 3000 - case KVM_SET_MEMORY_ALIAS: 3001 - r = -EFAULT; 3002 - if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) 3003 - goto out; 3004 - r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); 3005 - if (r) 3006 - goto out; 3007 2914 break; 3008 2915 case KVM_CREATE_IRQCHIP: { 3009 2916 struct kvm_pic *vpic; ··· 3336 3259 } 3337 3260 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); 3338 3261 if (ret < 0) { 3339 - r = X86EMUL_UNHANDLEABLE; 3262 + r = X86EMUL_IO_NEEDED; 3340 3263 goto out; 3341 3264 } 3342 3265 ··· 3392 3315 } 3393 3316 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); 3394 3317 if (ret < 0) { 3395 - r = X86EMUL_UNHANDLEABLE; 3318 + r = X86EMUL_IO_NEEDED; 3396 3319 goto out; 3397 3320 } 3398 3321 ··· 3407 3330 static int emulator_read_emulated(unsigned long addr, 3408 3331 void *val, 3409 3332 unsigned int bytes, 3333 + unsigned int *error_code, 3410 3334 struct kvm_vcpu *vcpu) 3411 3335 { 3412 3336 gpa_t gpa; 3413 - u32 error_code; 3414 3337 3415 3338 if (vcpu->mmio_read_completed) { 3416 3339 memcpy(val, vcpu->mmio_data, bytes); ··· 3420 3343 return X86EMUL_CONTINUE; 3421 3344 } 3422 3345 3423 - gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); 3346 + gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code); 3424 3347 3425 - if (gpa == UNMAPPED_GVA) { 3426 - kvm_inject_page_fault(vcpu, addr, error_code); 3348 + if (gpa == UNMAPPED_GVA) 3427 3349 return X86EMUL_PROPAGATE_FAULT; 3428 - } 3429 3350 3430 3351 /* For APIC access vmexit */ 3431 3352 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) ··· 3445 3370 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); 3446 3371 3447 3372 vcpu->mmio_needed = 1; 3448 - vcpu->mmio_phys_addr = gpa; 3449 - vcpu->mmio_size = bytes; 3450 - vcpu->mmio_is_write = 0; 3373 + vcpu->run->exit_reason = KVM_EXIT_MMIO; 3374 + vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; 3375 + vcpu->run->mmio.len = vcpu->mmio_size = bytes; 3376 + vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; 3451 3377 3452 - return X86EMUL_UNHANDLEABLE; 3378 + return X86EMUL_IO_NEEDED; 3453 3379 } 3454 3380 3455 3381 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, ··· 3468 3392 static int emulator_write_emulated_onepage(unsigned long addr, 3469 3393 const void *val, 3470 3394 unsigned int bytes, 3395 + unsigned int *error_code, 3471 3396 struct kvm_vcpu *vcpu) 3472 3397 { 3473 3398 gpa_t gpa; 3474 - u32 error_code; 3475 3399 3476 - gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); 3400 + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code); 3477 3401 3478 - if (gpa == UNMAPPED_GVA) { 3479 - kvm_inject_page_fault(vcpu, addr, error_code); 3402 + if (gpa == UNMAPPED_GVA) 3480 3403 return X86EMUL_PROPAGATE_FAULT; 3481 - } 3482 3404 3483 3405 /* For APIC access vmexit */ 3484 3406 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) ··· 3494 3420 return X86EMUL_CONTINUE; 3495 3421 3496 3422 vcpu->mmio_needed = 1; 3497 - vcpu->mmio_phys_addr = gpa; 3498 - vcpu->mmio_size = bytes; 3499 - vcpu->mmio_is_write = 1; 3500 - memcpy(vcpu->mmio_data, val, bytes); 3423 + vcpu->run->exit_reason = KVM_EXIT_MMIO; 3424 + vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; 3425 + vcpu->run->mmio.len = vcpu->mmio_size = bytes; 3426 + vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; 3427 + memcpy(vcpu->run->mmio.data, val, bytes); 3501 3428 3502 3429 return X86EMUL_CONTINUE; 3503 3430 } ··· 3506 3431 int emulator_write_emulated(unsigned long 
addr, 3507 3432 const void *val, 3508 3433 unsigned int bytes, 3434 + unsigned int *error_code, 3509 3435 struct kvm_vcpu *vcpu) 3510 3436 { 3511 3437 /* Crossing a page boundary? */ ··· 3514 3438 int rc, now; 3515 3439 3516 3440 now = -addr & ~PAGE_MASK; 3517 - rc = emulator_write_emulated_onepage(addr, val, now, vcpu); 3441 + rc = emulator_write_emulated_onepage(addr, val, now, error_code, 3442 + vcpu); 3518 3443 if (rc != X86EMUL_CONTINUE) 3519 3444 return rc; 3520 3445 addr += now; 3521 3446 val += now; 3522 3447 bytes -= now; 3523 3448 } 3524 - return emulator_write_emulated_onepage(addr, val, bytes, vcpu); 3449 + return emulator_write_emulated_onepage(addr, val, bytes, error_code, 3450 + vcpu); 3525 3451 } 3526 - EXPORT_SYMBOL_GPL(emulator_write_emulated); 3527 3452 3528 3453 #define CMPXCHG_TYPE(t, ptr, old, new) \ 3529 3454 (cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old)) ··· 3540 3463 const void *old, 3541 3464 const void *new, 3542 3465 unsigned int bytes, 3466 + unsigned int *error_code, 3543 3467 struct kvm_vcpu *vcpu) 3544 3468 { 3545 3469 gpa_t gpa; ··· 3562 3484 goto emul_write; 3563 3485 3564 3486 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 3487 + if (is_error_page(page)) { 3488 + kvm_release_page_clean(page); 3489 + goto emul_write; 3490 + } 3565 3491 3566 3492 kaddr = kmap_atomic(page, KM_USER0); 3567 3493 kaddr += offset_in_page(gpa); ··· 3598 3516 emul_write: 3599 3517 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 3600 3518 3601 - return emulator_write_emulated(addr, new, bytes, vcpu); 3519 + return emulator_write_emulated(addr, new, bytes, error_code, vcpu); 3602 3520 } 3603 3521 3604 3522 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) ··· 3686 3604 return X86EMUL_CONTINUE; 3687 3605 } 3688 3606 3607 + int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) 3608 + { 3609 + if (!need_emulate_wbinvd(vcpu)) 3610 + return X86EMUL_CONTINUE; 3611 + 3612 + if (kvm_x86_ops->has_wbinvd_exit()) { 3613 + smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, 3614 + wbinvd_ipi, NULL, 1); 3615 + cpumask_clear(vcpu->arch.wbinvd_dirty_mask); 3616 + } 3617 + wbinvd(); 3618 + return X86EMUL_CONTINUE; 3619 + } 3620 + EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); 3621 + 3689 3622 int emulate_clts(struct kvm_vcpu *vcpu) 3690 3623 { 3691 3624 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); ··· 3708 3611 return X86EMUL_CONTINUE; 3709 3612 } 3710 3613 3711 - int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 3614 + int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu) 3712 3615 { 3713 - return kvm_get_dr(ctxt->vcpu, dr, dest); 3616 + return _kvm_get_dr(vcpu, dr, dest); 3714 3617 } 3715 3618 3716 - int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 3619 + int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) 3717 3620 { 3718 - unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? 
~0ULL : ~0U; 3719 3621 3720 - return kvm_set_dr(ctxt->vcpu, dr, value & mask); 3622 + return __kvm_set_dr(vcpu, dr, value); 3721 3623 } 3722 - 3723 - void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 3724 - { 3725 - u8 opcodes[4]; 3726 - unsigned long rip = kvm_rip_read(vcpu); 3727 - unsigned long rip_linear; 3728 - 3729 - if (!printk_ratelimit()) 3730 - return; 3731 - 3732 - rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); 3733 - 3734 - kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL); 3735 - 3736 - printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", 3737 - context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); 3738 - } 3739 - EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); 3740 3624 3741 3625 static u64 mk_cr_64(u64 curr_cr, u32 new_val) 3742 3626 { ··· 3752 3674 return value; 3753 3675 } 3754 3676 3755 - static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) 3677 + static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) 3756 3678 { 3679 + int res = 0; 3680 + 3757 3681 switch (cr) { 3758 3682 case 0: 3759 - kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); 3683 + res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); 3760 3684 break; 3761 3685 case 2: 3762 3686 vcpu->arch.cr2 = val; 3763 3687 break; 3764 3688 case 3: 3765 - kvm_set_cr3(vcpu, val); 3689 + res = kvm_set_cr3(vcpu, val); 3766 3690 break; 3767 3691 case 4: 3768 - kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); 3692 + res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); 3769 3693 break; 3770 3694 case 8: 3771 - kvm_set_cr8(vcpu, val & 0xfUL); 3695 + res = __kvm_set_cr8(vcpu, val & 0xfUL); 3772 3696 break; 3773 3697 default: 3774 3698 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 3699 + res = -1; 3775 3700 } 3701 + 3702 + return res; 3776 3703 } 3777 3704 3778 3705 static int emulator_get_cpl(struct kvm_vcpu *vcpu) ··· 3788 3705 static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) 3789 3706 { 3790 3707 kvm_x86_ops->get_gdt(vcpu, dt); 3708 + } 3709 + 3710 + static unsigned long emulator_get_cached_segment_base(int seg, 3711 + struct kvm_vcpu *vcpu) 3712 + { 3713 + return get_segment_base(vcpu, seg); 3791 3714 } 3792 3715 3793 3716 static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, ··· 3868 3779 kvm_set_segment(vcpu, &kvm_seg, seg); 3869 3780 } 3870 3781 3871 - static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 3872 - { 3873 - kvm_x86_ops->set_rflags(vcpu, rflags); 3874 - } 3875 - 3876 3782 static struct x86_emulate_ops emulate_ops = { 3877 3783 .read_std = kvm_read_guest_virt_system, 3878 3784 .write_std = kvm_write_guest_virt_system, ··· 3881 3797 .set_cached_descriptor = emulator_set_cached_descriptor, 3882 3798 .get_segment_selector = emulator_get_segment_selector, 3883 3799 .set_segment_selector = emulator_set_segment_selector, 3800 + .get_cached_segment_base = emulator_get_cached_segment_base, 3884 3801 .get_gdt = emulator_get_gdt, 3885 3802 .get_cr = emulator_get_cr, 3886 3803 .set_cr = emulator_set_cr, 3887 3804 .cpl = emulator_get_cpl, 3888 - .set_rflags = emulator_set_rflags, 3805 + .get_dr = emulator_get_dr, 3806 + .set_dr = emulator_set_dr, 3807 + .set_msr = kvm_set_msr, 3808 + .get_msr = kvm_get_msr, 3889 3809 }; 3890 3810 3891 3811 static void cache_all_regs(struct kvm_vcpu *vcpu) ··· 3900 3812 vcpu->arch.regs_dirty = ~0; 3901 3813 } 3902 3814 3815 + static void 
toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) 3816 + { 3817 + u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); 3818 + /* 3819 + * an sti; sti; sequence only disable interrupts for the first 3820 + * instruction. So, if the last instruction, be it emulated or 3821 + * not, left the system with the INT_STI flag enabled, it 3822 + * means that the last instruction is an sti. We should not 3823 + * leave the flag on in this case. The same goes for mov ss 3824 + */ 3825 + if (!(int_shadow & mask)) 3826 + kvm_x86_ops->set_interrupt_shadow(vcpu, mask); 3827 + } 3828 + 3829 + static void inject_emulated_exception(struct kvm_vcpu *vcpu) 3830 + { 3831 + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 3832 + if (ctxt->exception == PF_VECTOR) 3833 + kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code); 3834 + else if (ctxt->error_code_valid) 3835 + kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); 3836 + else 3837 + kvm_queue_exception(vcpu, ctxt->exception); 3838 + } 3839 + 3840 + static int handle_emulation_failure(struct kvm_vcpu *vcpu) 3841 + { 3842 + ++vcpu->stat.insn_emulation_fail; 3843 + trace_kvm_emulate_insn_failed(vcpu); 3844 + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 3845 + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 3846 + vcpu->run->internal.ndata = 0; 3847 + kvm_queue_exception(vcpu, UD_VECTOR); 3848 + return EMULATE_FAIL; 3849 + } 3850 + 3851 + static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) 3852 + { 3853 + gpa_t gpa; 3854 + 3855 + if (tdp_enabled) 3856 + return false; 3857 + 3858 + /* 3859 + * if emulation was due to access to shadowed page table 3860 + * and it failed try to unshadow page and re-entetr the 3861 + * guest to let CPU execute the instruction. 3862 + */ 3863 + if (kvm_mmu_unprotect_page_virt(vcpu, gva)) 3864 + return true; 3865 + 3866 + gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL); 3867 + 3868 + if (gpa == UNMAPPED_GVA) 3869 + return true; /* let cpu generate fault */ 3870 + 3871 + if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT))) 3872 + return true; 3873 + 3874 + return false; 3875 + } 3876 + 3903 3877 int emulate_instruction(struct kvm_vcpu *vcpu, 3904 3878 unsigned long cr2, 3905 3879 u16 error_code, 3906 3880 int emulation_type) 3907 3881 { 3908 - int r, shadow_mask; 3909 - struct decode_cache *c; 3910 - struct kvm_run *run = vcpu->run; 3882 + int r; 3883 + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 3911 3884 3912 3885 kvm_clear_exception_queue(vcpu); 3913 3886 vcpu->arch.mmio_fault_cr2 = cr2; ··· 3979 3830 * for example. 3980 3831 */ 3981 3832 cache_all_regs(vcpu); 3982 - 3983 - vcpu->mmio_is_write = 0; 3984 3833 3985 3834 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 3986 3835 int cs_db, cs_l; ··· 3993 3846 ? X86EMUL_MODE_VM86 : cs_l 3994 3847 ? X86EMUL_MODE_PROT64 : cs_db 3995 3848 ? 
X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 3849 + memset(c, 0, sizeof(struct decode_cache)); 3850 + memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); 3851 + vcpu->arch.emulate_ctxt.interruptibility = 0; 3852 + vcpu->arch.emulate_ctxt.exception = -1; 3996 3853 3997 3854 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 3998 3855 trace_kvm_emulate_insn_start(vcpu); 3999 3856 4000 3857 /* Only allow emulation of specific instructions on #UD 4001 3858 * (namely VMMCALL, sysenter, sysexit, syscall)*/ 4002 - c = &vcpu->arch.emulate_ctxt.decode; 4003 3859 if (emulation_type & EMULTYPE_TRAP_UD) { 4004 3860 if (!c->twobyte) 4005 3861 return EMULATE_FAIL; ··· 4030 3880 4031 3881 ++vcpu->stat.insn_emulation; 4032 3882 if (r) { 4033 - ++vcpu->stat.insn_emulation_fail; 4034 - trace_kvm_emulate_insn_failed(vcpu); 4035 - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 3883 + if (reexecute_instruction(vcpu, cr2)) 4036 3884 return EMULATE_DONE; 4037 - return EMULATE_FAIL; 3885 + if (emulation_type & EMULTYPE_SKIP) 3886 + return EMULATE_FAIL; 3887 + return handle_emulation_failure(vcpu); 4038 3888 } 4039 3889 } 4040 3890 ··· 4043 3893 return EMULATE_DONE; 4044 3894 } 4045 3895 3896 + /* this is needed for vmware backdor interface to work since it 3897 + changes registers values during IO operation */ 3898 + memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); 3899 + 4046 3900 restart: 4047 3901 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 4048 - shadow_mask = vcpu->arch.emulate_ctxt.interruptibility; 4049 3902 4050 - if (r == 0) 4051 - kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask); 3903 + if (r) { /* emulation failed */ 3904 + if (reexecute_instruction(vcpu, cr2)) 3905 + return EMULATE_DONE; 3906 + 3907 + return handle_emulation_failure(vcpu); 3908 + } 3909 + 3910 + toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); 3911 + kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 3912 + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 3913 + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); 3914 + 3915 + if (vcpu->arch.emulate_ctxt.exception >= 0) { 3916 + inject_emulated_exception(vcpu); 3917 + return EMULATE_DONE; 3918 + } 4052 3919 4053 3920 if (vcpu->arch.pio.count) { 4054 3921 if (!vcpu->arch.pio.in) ··· 4073 3906 return EMULATE_DO_MMIO; 4074 3907 } 4075 3908 4076 - if (r || vcpu->mmio_is_write) { 4077 - run->exit_reason = KVM_EXIT_MMIO; 4078 - run->mmio.phys_addr = vcpu->mmio_phys_addr; 4079 - memcpy(run->mmio.data, vcpu->mmio_data, 8); 4080 - run->mmio.len = vcpu->mmio_size; 4081 - run->mmio.is_write = vcpu->mmio_is_write; 4082 - } 4083 - 4084 - if (r) { 4085 - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) 4086 - goto done; 4087 - if (!vcpu->mmio_needed) { 4088 - ++vcpu->stat.insn_emulation_fail; 4089 - trace_kvm_emulate_insn_failed(vcpu); 4090 - kvm_report_emulation_failure(vcpu, "mmio"); 4091 - return EMULATE_FAIL; 4092 - } 3909 + if (vcpu->mmio_needed) { 3910 + if (vcpu->mmio_is_write) 3911 + vcpu->mmio_needed = 0; 4093 3912 return EMULATE_DO_MMIO; 4094 3913 } 4095 - 4096 - if (vcpu->mmio_is_write) { 4097 - vcpu->mmio_needed = 0; 4098 - return EMULATE_DO_MMIO; 4099 - } 4100 - 4101 - done: 4102 - if (vcpu->arch.exception.pending) 4103 - vcpu->arch.emulate_ctxt.restart = false; 4104 3914 4105 3915 if (vcpu->arch.emulate_ctxt.restart) 4106 3916 goto restart; ··· 4252 4108 4253 4109 perf_register_guest_info_callbacks(&kvm_guest_cbs); 4254 4110 4111 + if (cpu_has_xsave) 4112 + host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); 4113 + 4255 4114 return 0; 
4256 4115 4257 4116 out: ··· 4417 4270 4418 4271 kvm_x86_ops->patch_hypercall(vcpu, instruction); 4419 4272 4420 - return emulator_write_emulated(rip, instruction, 3, vcpu); 4273 + return emulator_write_emulated(rip, instruction, 3, NULL, vcpu); 4421 4274 } 4422 4275 4423 4276 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) ··· 4653 4506 } 4654 4507 } 4655 4508 4509 + static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) 4510 + { 4511 + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && 4512 + !vcpu->guest_xcr0_loaded) { 4513 + /* kvm_set_xcr() also depends on this */ 4514 + xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); 4515 + vcpu->guest_xcr0_loaded = 1; 4516 + } 4517 + } 4518 + 4519 + static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) 4520 + { 4521 + if (vcpu->guest_xcr0_loaded) { 4522 + if (vcpu->arch.xcr0 != host_xcr0) 4523 + xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); 4524 + vcpu->guest_xcr0_loaded = 0; 4525 + } 4526 + } 4527 + 4656 4528 static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 4657 4529 { 4658 4530 int r; 4659 4531 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 4660 4532 vcpu->run->request_interrupt_window; 4661 4533 4662 - if (vcpu->requests) 4663 - if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 4664 - kvm_mmu_unload(vcpu); 4665 - 4666 - r = kvm_mmu_reload(vcpu); 4667 - if (unlikely(r)) 4668 - goto out; 4669 - 4670 4534 if (vcpu->requests) { 4671 - if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 4535 + if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) 4536 + kvm_mmu_unload(vcpu); 4537 + if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) 4672 4538 __kvm_migrate_timers(vcpu); 4673 - if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) 4539 + if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) 4674 4540 kvm_write_guest_time(vcpu); 4675 - if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) 4541 + if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) 4676 4542 kvm_mmu_sync_roots(vcpu); 4677 - if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) 4543 + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) 4678 4544 kvm_x86_ops->tlb_flush(vcpu); 4679 - if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 4680 - &vcpu->requests)) { 4545 + if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { 4681 4546 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; 4682 4547 r = 0; 4683 4548 goto out; 4684 4549 } 4685 - if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { 4550 + if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { 4686 4551 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; 4687 4552 r = 0; 4688 4553 goto out; 4689 4554 } 4690 - if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) { 4555 + if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) { 4691 4556 vcpu->fpu_active = 0; 4692 4557 kvm_x86_ops->fpu_deactivate(vcpu); 4693 4558 } 4694 4559 } 4560 + 4561 + r = kvm_mmu_reload(vcpu); 4562 + if (unlikely(r)) 4563 + goto out; 4695 4564 4696 4565 preempt_disable(); 4697 4566 4698 4567 kvm_x86_ops->prepare_guest_switch(vcpu); 4699 4568 if (vcpu->fpu_active) 4700 4569 kvm_load_guest_fpu(vcpu); 4570 + kvm_load_guest_xcr0(vcpu); 4571 + 4572 + atomic_set(&vcpu->guest_mode, 1); 4573 + smp_wmb(); 4701 4574 4702 4575 local_irq_disable(); 4703 4576 4704 - clear_bit(KVM_REQ_KICK, &vcpu->requests); 4705 - smp_mb__after_clear_bit(); 4706 - 4707 - if (vcpu->requests || need_resched() || signal_pending(current)) { 4708 - set_bit(KVM_REQ_KICK, &vcpu->requests); 4577 + if 
(!atomic_read(&vcpu->guest_mode) || vcpu->requests 4578 + || need_resched() || signal_pending(current)) { 4579 + atomic_set(&vcpu->guest_mode, 0); 4580 + smp_wmb(); 4709 4581 local_irq_enable(); 4710 4582 preempt_enable(); 4711 4583 r = 1; ··· 4769 4603 if (hw_breakpoint_active()) 4770 4604 hw_breakpoint_restore(); 4771 4605 4772 - set_bit(KVM_REQ_KICK, &vcpu->requests); 4606 + atomic_set(&vcpu->guest_mode, 0); 4607 + smp_wmb(); 4773 4608 local_irq_enable(); 4774 4609 4775 4610 ++vcpu->stat.exits; ··· 4832 4665 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 4833 4666 kvm_vcpu_block(vcpu); 4834 4667 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); 4835 - if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) 4668 + if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) 4836 4669 { 4837 4670 switch(vcpu->arch.mp_state) { 4838 4671 case KVM_MP_STATE_HALTED: ··· 4884 4717 int r; 4885 4718 sigset_t sigsaved; 4886 4719 4887 - vcpu_load(vcpu); 4888 - 4889 4720 if (vcpu->sigset_active) 4890 4721 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 4891 4722 ··· 4908 4743 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 4909 4744 r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE); 4910 4745 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 4911 - if (r == EMULATE_DO_MMIO) { 4746 + if (r != EMULATE_DONE) { 4912 4747 r = 0; 4913 4748 goto out; 4914 4749 } ··· 4924 4759 if (vcpu->sigset_active) 4925 4760 sigprocmask(SIG_SETMASK, &sigsaved, NULL); 4926 4761 4927 - vcpu_put(vcpu); 4928 4762 return r; 4929 4763 } 4930 4764 4931 4765 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 4932 4766 { 4933 - vcpu_load(vcpu); 4934 - 4935 4767 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4936 4768 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); 4937 4769 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); ··· 4951 4789 regs->rip = kvm_rip_read(vcpu); 4952 4790 regs->rflags = kvm_get_rflags(vcpu); 4953 4791 4954 - vcpu_put(vcpu); 4955 - 4956 4792 return 0; 4957 4793 } 4958 4794 4959 4795 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 4960 4796 { 4961 - vcpu_load(vcpu); 4962 - 4963 4797 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); 4964 4798 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); 4965 4799 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); ··· 4980 4822 4981 4823 vcpu->arch.exception.pending = false; 4982 4824 4983 - vcpu_put(vcpu); 4984 - 4985 4825 return 0; 4986 4826 } 4987 4827 ··· 4997 4841 struct kvm_sregs *sregs) 4998 4842 { 4999 4843 struct desc_ptr dt; 5000 - 5001 - vcpu_load(vcpu); 5002 4844 5003 4845 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 5004 4846 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); ··· 5029 4875 set_bit(vcpu->arch.interrupt.nr, 5030 4876 (unsigned long *)sregs->interrupt_bitmap); 5031 4877 5032 - vcpu_put(vcpu); 5033 - 5034 4878 return 0; 5035 4879 } 5036 4880 5037 4881 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, 5038 4882 struct kvm_mp_state *mp_state) 5039 4883 { 5040 - vcpu_load(vcpu); 5041 4884 mp_state->mp_state = vcpu->arch.mp_state; 5042 - vcpu_put(vcpu); 5043 4885 return 0; 5044 4886 } 5045 4887 5046 4888 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, 5047 4889 struct kvm_mp_state *mp_state) 5048 4890 { 5049 - vcpu_load(vcpu); 5050 4891 vcpu->arch.mp_state = mp_state->mp_state; 5051 - vcpu_put(vcpu); 5052 4892 return 0; 5053 4893 } 5054 4894 5055 4895 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, 5056 4896 bool has_error_code, u32 
error_code) 5057 4897 { 4898 + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 5058 4899 int cs_db, cs_l, ret; 5059 4900 cache_all_regs(vcpu); 5060 4901 ··· 5064 4915 ? X86EMUL_MODE_VM86 : cs_l 5065 4916 ? X86EMUL_MODE_PROT64 : cs_db 5066 4917 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 4918 + memset(c, 0, sizeof(struct decode_cache)); 4919 + memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); 5067 4920 5068 4921 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, 5069 4922 tss_selector, reason, has_error_code, ··· 5074 4923 if (ret) 5075 4924 return EMULATE_FAIL; 5076 4925 4926 + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 4927 + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); 5077 4928 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 5078 4929 return EMULATE_DONE; 5079 4930 } ··· 5087 4934 int mmu_reset_needed = 0; 5088 4935 int pending_vec, max_bits; 5089 4936 struct desc_ptr dt; 5090 - 5091 - vcpu_load(vcpu); 5092 4937 5093 4938 dt.size = sregs->idt.limit; 5094 4939 dt.address = sregs->idt.base; ··· 5147 4996 !is_protmode(vcpu)) 5148 4997 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5149 4998 5150 - vcpu_put(vcpu); 5151 - 5152 4999 return 0; 5153 5000 } 5154 5001 ··· 5156 5007 unsigned long rflags; 5157 5008 int i, r; 5158 5009 5159 - vcpu_load(vcpu); 5160 - 5161 5010 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { 5162 5011 r = -EBUSY; 5163 5012 if (vcpu->arch.exception.pending) 5164 - goto unlock_out; 5013 + goto out; 5165 5014 if (dbg->control & KVM_GUESTDBG_INJECT_DB) 5166 5015 kvm_queue_exception(vcpu, DB_VECTOR); 5167 5016 else ··· 5201 5054 5202 5055 r = 0; 5203 5056 5204 - unlock_out: 5205 - vcpu_put(vcpu); 5057 + out: 5206 5058 5207 5059 return r; 5208 5060 } 5209 - 5210 - /* 5211 - * fxsave fpu state. Taken from x86_64/processor.h. To be killed when 5212 - * we have asm/x86/processor.h 5213 - */ 5214 - struct fxsave { 5215 - u16 cwd; 5216 - u16 swd; 5217 - u16 twd; 5218 - u16 fop; 5219 - u64 rip; 5220 - u64 rdp; 5221 - u32 mxcsr; 5222 - u32 mxcsr_mask; 5223 - u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ 5224 - #ifdef CONFIG_X86_64 5225 - u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ 5226 - #else 5227 - u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ 5228 - #endif 5229 - }; 5230 5061 5231 5062 /* 5232 5063 * Translate a guest virtual address to a guest physical address. 
··· 5216 5091 gpa_t gpa; 5217 5092 int idx; 5218 5093 5219 - vcpu_load(vcpu); 5220 5094 idx = srcu_read_lock(&vcpu->kvm->srcu); 5221 5095 gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); 5222 5096 srcu_read_unlock(&vcpu->kvm->srcu, idx); ··· 5223 5099 tr->valid = gpa != UNMAPPED_GVA; 5224 5100 tr->writeable = 1; 5225 5101 tr->usermode = 0; 5226 - vcpu_put(vcpu); 5227 5102 5228 5103 return 0; 5229 5104 } 5230 5105 5231 5106 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 5232 5107 { 5233 - struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; 5234 - 5235 - vcpu_load(vcpu); 5108 + struct i387_fxsave_struct *fxsave = 5109 + &vcpu->arch.guest_fpu.state->fxsave; 5236 5110 5237 5111 memcpy(fpu->fpr, fxsave->st_space, 128); 5238 5112 fpu->fcw = fxsave->cwd; ··· 5241 5119 fpu->last_dp = fxsave->rdp; 5242 5120 memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); 5243 5121 5244 - vcpu_put(vcpu); 5245 - 5246 5122 return 0; 5247 5123 } 5248 5124 5249 5125 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) 5250 5126 { 5251 - struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; 5252 - 5253 - vcpu_load(vcpu); 5127 + struct i387_fxsave_struct *fxsave = 5128 + &vcpu->arch.guest_fpu.state->fxsave; 5254 5129 5255 5130 memcpy(fxsave->st_space, fpu->fpr, 128); 5256 5131 fxsave->cwd = fpu->fcw; ··· 5258 5139 fxsave->rdp = fpu->last_dp; 5259 5140 memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); 5260 5141 5261 - vcpu_put(vcpu); 5262 - 5263 5142 return 0; 5264 5143 } 5265 5144 5266 - void fx_init(struct kvm_vcpu *vcpu) 5145 + int fx_init(struct kvm_vcpu *vcpu) 5267 5146 { 5268 - unsigned after_mxcsr_mask; 5147 + int err; 5148 + 5149 + err = fpu_alloc(&vcpu->arch.guest_fpu); 5150 + if (err) 5151 + return err; 5152 + 5153 + fpu_finit(&vcpu->arch.guest_fpu); 5269 5154 5270 5155 /* 5271 - * Touch the fpu the first time in non atomic context as if 5272 - * this is the first fpu instruction the exception handler 5273 - * will fire before the instruction returns and it'll have to 5274 - * allocate ram with GFP_KERNEL. 5156 + * Ensure guest xcr0 is valid for loading 5275 5157 */ 5276 - if (!used_math()) 5277 - kvm_fx_save(&vcpu->arch.host_fx_image); 5278 - 5279 - /* Initialize guest FPU by resetting ours and saving into guest's */ 5280 - preempt_disable(); 5281 - kvm_fx_save(&vcpu->arch.host_fx_image); 5282 - kvm_fx_finit(); 5283 - kvm_fx_save(&vcpu->arch.guest_fx_image); 5284 - kvm_fx_restore(&vcpu->arch.host_fx_image); 5285 - preempt_enable(); 5158 + vcpu->arch.xcr0 = XSTATE_FP; 5286 5159 5287 5160 vcpu->arch.cr0 |= X86_CR0_ET; 5288 - after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); 5289 - vcpu->arch.guest_fx_image.mxcsr = 0x1f80; 5290 - memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, 5291 - 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); 5161 + 5162 + return 0; 5292 5163 } 5293 5164 EXPORT_SYMBOL_GPL(fx_init); 5165 + 5166 + static void fx_free(struct kvm_vcpu *vcpu) 5167 + { 5168 + fpu_free(&vcpu->arch.guest_fpu); 5169 + } 5294 5170 5295 5171 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 5296 5172 { 5297 5173 if (vcpu->guest_fpu_loaded) 5298 5174 return; 5299 5175 5176 + /* 5177 + * Restore all possible states in the guest, 5178 + * and assume host would use all available bits. 5179 + * Guest xcr0 would be loaded later. 
5180 + */ 5181 + kvm_put_guest_xcr0(vcpu); 5300 5182 vcpu->guest_fpu_loaded = 1; 5301 - kvm_fx_save(&vcpu->arch.host_fx_image); 5302 - kvm_fx_restore(&vcpu->arch.guest_fx_image); 5183 + unlazy_fpu(current); 5184 + fpu_restore_checking(&vcpu->arch.guest_fpu); 5303 5185 trace_kvm_fpu(1); 5304 5186 } 5305 5187 5306 5188 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 5307 5189 { 5190 + kvm_put_guest_xcr0(vcpu); 5191 + 5308 5192 if (!vcpu->guest_fpu_loaded) 5309 5193 return; 5310 5194 5311 5195 vcpu->guest_fpu_loaded = 0; 5312 - kvm_fx_save(&vcpu->arch.guest_fx_image); 5313 - kvm_fx_restore(&vcpu->arch.host_fx_image); 5196 + fpu_save_init(&vcpu->arch.guest_fpu); 5314 5197 ++vcpu->stat.fpu_reload; 5315 - set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); 5198 + kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); 5316 5199 trace_kvm_fpu(0); 5317 5200 } 5318 5201 ··· 5325 5204 vcpu->arch.time_page = NULL; 5326 5205 } 5327 5206 5207 + free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); 5208 + fx_free(vcpu); 5328 5209 kvm_x86_ops->vcpu_free(vcpu); 5329 5210 } 5330 5211 ··· 5339 5216 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) 5340 5217 { 5341 5218 int r; 5342 - 5343 - /* We do fxsave: this must be aligned. */ 5344 - BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); 5345 5219 5346 5220 vcpu->arch.mtrr_state.have_fixed = 1; 5347 5221 vcpu_load(vcpu); ··· 5361 5241 kvm_mmu_unload(vcpu); 5362 5242 vcpu_put(vcpu); 5363 5243 5244 + fx_free(vcpu); 5364 5245 kvm_x86_ops->vcpu_free(vcpu); 5365 5246 } 5366 5247 ··· 5455 5334 } 5456 5335 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; 5457 5336 5337 + if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) 5338 + goto fail_free_mce_banks; 5339 + 5458 5340 return 0; 5341 + fail_free_mce_banks: 5342 + kfree(vcpu->arch.mce_banks); 5459 5343 fail_free_lapic: 5460 5344 kvm_free_lapic(vcpu); 5461 5345 fail_mmu_destroy: ··· 5489 5363 5490 5364 if (!kvm) 5491 5365 return ERR_PTR(-ENOMEM); 5492 - 5493 - kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); 5494 - if (!kvm->arch.aliases) { 5495 - kfree(kvm); 5496 - return ERR_PTR(-ENOMEM); 5497 - } 5498 5366 5499 5367 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 5500 5368 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); ··· 5532 5412 void kvm_arch_sync_events(struct kvm *kvm) 5533 5413 { 5534 5414 kvm_free_all_assigned_devices(kvm); 5415 + kvm_free_pit(kvm); 5535 5416 } 5536 5417 5537 5418 void kvm_arch_destroy_vm(struct kvm *kvm) 5538 5419 { 5539 5420 kvm_iommu_unmap_guest(kvm); 5540 - kvm_free_pit(kvm); 5541 5421 kfree(kvm->arch.vpic); 5542 5422 kfree(kvm->arch.vioapic); 5543 5423 kvm_free_vcpus(kvm); ··· 5547 5427 if (kvm->arch.ept_identity_pagetable) 5548 5428 put_page(kvm->arch.ept_identity_pagetable); 5549 5429 cleanup_srcu_struct(&kvm->srcu); 5550 - kfree(kvm->arch.aliases); 5551 5430 kfree(kvm); 5552 5431 } 5553 5432 ··· 5557 5438 int user_alloc) 5558 5439 { 5559 5440 int npages = memslot->npages; 5441 + int map_flags = MAP_PRIVATE | MAP_ANONYMOUS; 5442 + 5443 + /* Prevent internal slot pages from being moved by fork()/COW. */ 5444 + if (memslot->id >= KVM_MEMORY_SLOTS) 5445 + map_flags = MAP_SHARED | MAP_ANONYMOUS; 5560 5446 5561 5447 /*To keep backward compatibility with older userspace, 5562 5448 *x86 needs to hanlde !user_alloc case. 
··· 5574 5450 userspace_addr = do_mmap(NULL, 0, 5575 5451 npages * PAGE_SIZE, 5576 5452 PROT_READ | PROT_WRITE, 5577 - MAP_PRIVATE | MAP_ANONYMOUS, 5453 + map_flags, 5578 5454 0); 5579 5455 up_write(&current->mm->mmap_sem); 5580 5456 ··· 5647 5523 5648 5524 me = get_cpu(); 5649 5525 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 5650 - if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) 5526 + if (atomic_xchg(&vcpu->guest_mode, 0)) 5651 5527 smp_send_reschedule(cpu); 5652 5528 put_cpu(); 5653 5529 }
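Note on the x86.c hunks above: they replace the old KVM_REQ_KICK bit with an atomic vcpu->guest_mode flag. vcpu_enter_guest() publishes the flag (atomic_set + smp_wmb) before re-checking vcpu->requests, and kvm_vcpu_kick() only sends the reschedule IPI if it atomically observed and cleared the flag. The snippet below is a stand-alone user-space model of that handshake, not kernel code: struct vcpu_model, vcpu_kick() and the printf calls are stand-ins I made up, and seq_cst C11 atomics stand in for smp_wmb() plus disabled interrupts.

/*
 * Minimal user-space model of the guest_mode handshake; illustrative only.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct vcpu_model {
	atomic_int guest_mode;   /* mirrors vcpu->guest_mode          */
	atomic_ulong requests;   /* mirrors vcpu->requests (bit mask) */
};

/* Kicker side: only "send the IPI" if the target was in guest mode. */
static bool vcpu_kick(struct vcpu_model *v)
{
	if (atomic_exchange(&v->guest_mode, 0)) {
		printf("IPI sent\n");   /* stands in for smp_send_reschedule() */
		return true;
	}
	return false;                   /* target will notice the request itself */
}

/* Entry side: publish guest_mode, then re-check before "entering". */
static bool vcpu_try_enter(struct vcpu_model *v)
{
	atomic_store(&v->guest_mode, 1);
	/* guest_mode may already have been cleared by a concurrent kicker */
	if (!atomic_load(&v->guest_mode) || atomic_load(&v->requests)) {
		atomic_store(&v->guest_mode, 0);   /* bail out, service requests */
		return false;
	}
	/* ... guest would run here ... */
	atomic_store(&v->guest_mode, 0);           /* vmexit path */
	return true;
}

int main(void)
{
	struct vcpu_model v = { 0 };

	atomic_fetch_or(&v.requests, 1UL);   /* e.g. a pending TLB-flush request */
	printf("enter with pending request: %s\n",
	       vcpu_try_enter(&v) ? "entered" : "bailed out");

	atomic_store(&v.requests, 0);
	atomic_store(&v.guest_mode, 1);      /* pretend the vcpu is in guest mode */
	printf("kick while in guest mode: %s\n",
	       vcpu_kick(&v) ? "IPI" : "no IPI");
	return 0;
}

The point of the exchange in the kicker is that at most one of the two sides wins: either the kicker sees guest_mode set and interrupts the vcpu, or the vcpu sees the pending request before entering and handles it without an IPI.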
-7
arch/x86/kvm/x86.h
··· 65 65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG); 66 66 } 67 67 68 - static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm) 69 - { 70 - return rcu_dereference_check(kvm->arch.aliases, 71 - srcu_read_lock_held(&kvm->srcu) 72 - || lockdep_is_held(&kvm->slots_lock)); 73 - } 74 - 75 68 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); 76 69 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 77 70
+13
include/linux/kvm.h
··· 524 524 #define KVM_CAP_PPC_OSI 52 525 525 #define KVM_CAP_PPC_UNSET_IRQ 53 526 526 #define KVM_CAP_ENABLE_CAP 54 527 + #ifdef __KVM_HAVE_XSAVE 528 + #define KVM_CAP_XSAVE 55 529 + #endif 530 + #ifdef __KVM_HAVE_XCRS 531 + #define KVM_CAP_XCRS 56 532 + #endif 527 533 528 534 #ifdef KVM_CAP_IRQ_ROUTING 529 535 ··· 619 613 */ 620 614 #define KVM_CREATE_VCPU _IO(KVMIO, 0x41) 621 615 #define KVM_GET_DIRTY_LOG _IOW(KVMIO, 0x42, struct kvm_dirty_log) 616 + /* KVM_SET_MEMORY_ALIAS is obsolete: */ 622 617 #define KVM_SET_MEMORY_ALIAS _IOW(KVMIO, 0x43, struct kvm_memory_alias) 623 618 #define KVM_SET_NR_MMU_PAGES _IO(KVMIO, 0x44) 624 619 #define KVM_GET_NR_MMU_PAGES _IO(KVMIO, 0x45) ··· 721 714 #define KVM_GET_DEBUGREGS _IOR(KVMIO, 0xa1, struct kvm_debugregs) 722 715 #define KVM_SET_DEBUGREGS _IOW(KVMIO, 0xa2, struct kvm_debugregs) 723 716 #define KVM_ENABLE_CAP _IOW(KVMIO, 0xa3, struct kvm_enable_cap) 717 + /* Available with KVM_CAP_XSAVE */ 718 + #define KVM_GET_XSAVE _IOR(KVMIO, 0xa4, struct kvm_xsave) 719 + #define KVM_SET_XSAVE _IOW(KVMIO, 0xa5, struct kvm_xsave) 720 + /* Available with KVM_CAP_XCRS */ 721 + #define KVM_GET_XCRS _IOR(KVMIO, 0xa6, struct kvm_xcrs) 722 + #define KVM_SET_XCRS _IOW(KVMIO, 0xa7, struct kvm_xcrs) 724 723 725 724 #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) 726 725
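The kvm.h hunk above adds KVM_CAP_XSAVE/KVM_CAP_XCRS and the matching KVM_GET_XSAVE/KVM_SET_XSAVE/KVM_GET_XCRS/KVM_SET_XCRS vcpu ioctls. A minimal userspace sketch of probing and reading the extended state is shown below; it assumes kvm_fd is an open /dev/kvm descriptor and vcpu_fd was obtained through KVM_CREATE_VM plus KVM_CREATE_VCPU (not shown), and that the headers define the capability (x86 builds with __KVM_HAVE_XSAVE).

/*
 * Sketch: fetch a vcpu's XSAVE area if the host kernel advertises it.
 * Error handling is reduced to a boolean result.
 */
#include <stdbool.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static bool save_vcpu_xsave(int kvm_fd, int vcpu_fd, struct kvm_xsave *out)
{
	/* KVM_CHECK_EXTENSION returns > 0 when the capability is present. */
	if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_XSAVE) <= 0)
		return false;               /* fall back to KVM_GET_FPU */

	memset(out, 0, sizeof(*out));
	return ioctl(vcpu_fd, KVM_GET_XSAVE, out) == 0;
}

KVM_SET_XSAVE and the XCRS pair follow the same shape on the restore side.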
+27 -8
include/linux/kvm_host.h
··· 81 81 int vcpu_id; 82 82 struct mutex mutex; 83 83 int cpu; 84 + atomic_t guest_mode; 84 85 struct kvm_run *run; 85 86 unsigned long requests; 86 87 unsigned long guest_debug; 87 88 int srcu_idx; 88 89 89 90 int fpu_active; 90 - int guest_fpu_loaded; 91 + int guest_fpu_loaded, guest_xcr0_loaded; 91 92 wait_queue_head_t wq; 92 93 int sigset_active; 93 94 sigset_t sigset; ··· 124 123 } *lpage_info[KVM_NR_PAGE_SIZES - 1]; 125 124 unsigned long userspace_addr; 126 125 int user_alloc; 126 + int id; 127 127 }; 128 128 129 129 static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot) ··· 268 266 269 267 int is_error_page(struct page *page); 270 268 int is_error_pfn(pfn_t pfn); 269 + int is_hwpoison_pfn(pfn_t pfn); 270 + int is_fault_pfn(pfn_t pfn); 271 271 int kvm_is_error_hva(unsigned long addr); 272 272 int kvm_set_memory_region(struct kvm *kvm, 273 273 struct kvm_userspace_memory_region *mem, ··· 288 284 int user_alloc); 289 285 void kvm_disable_largepages(void); 290 286 void kvm_arch_flush_shadow(struct kvm *kvm); 291 - gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn); 292 - gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn); 293 287 294 288 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); 295 289 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn); ··· 447 445 struct kvm_irq_mask_notifier *kimn); 448 446 void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, 449 447 struct kvm_irq_mask_notifier *kimn); 450 - void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask); 448 + void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, 449 + bool mask); 451 450 452 451 #ifdef __KVM_HAVE_IOAPIC 453 452 void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic, ··· 565 562 } 566 563 #endif 567 564 568 - #ifndef KVM_ARCH_HAS_UNALIAS_INSTANTIATION 569 - #define unalias_gfn_instantiation unalias_gfn 570 - #endif 571 - 572 565 #ifdef CONFIG_HAVE_KVM_IRQCHIP 573 566 574 567 #define KVM_MAX_IRQ_ROUTES 1024 ··· 626 627 } 627 628 628 629 #endif 630 + 631 + static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) 632 + { 633 + set_bit(req, &vcpu->requests); 634 + } 635 + 636 + static inline bool kvm_make_check_request(int req, struct kvm_vcpu *vcpu) 637 + { 638 + return test_and_set_bit(req, &vcpu->requests); 639 + } 640 + 641 + static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) 642 + { 643 + if (test_bit(req, &vcpu->requests)) { 644 + clear_bit(req, &vcpu->requests); 645 + return true; 646 + } else { 647 + return false; 648 + } 649 + } 629 650 630 651 #endif 631 652
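The kvm_host.h hunk above introduces kvm_make_request(), kvm_make_check_request() and kvm_check_request(), which the x86.c changes earlier in this diff use to replace open-coded set_bit()/test_and_clear_bit() on vcpu->requests. The toy below mirrors their semantics in plain user-space C so the check-and-clear pattern can be seen in isolation; toy_vcpu and the request number are made-up stand-ins, not kernel definitions.

/*
 * User-space model of the new request helpers.  The kernel versions wrap
 * set_bit()/test_and_set_bit()/clear_bit(); here the vcpu is reduced to a
 * single unsigned long so the sketch runs anywhere.
 */
#include <stdbool.h>
#include <stdio.h>

#define TOY_REQ_TLB_FLUSH 0   /* request numbering chosen only for the example */

struct toy_vcpu { unsigned long requests; };

static void toy_make_request(int req, struct toy_vcpu *vcpu)
{
	vcpu->requests |= 1UL << req;
}

static bool toy_check_request(int req, struct toy_vcpu *vcpu)
{
	if (vcpu->requests & (1UL << req)) {
		vcpu->requests &= ~(1UL << req);
		return true;
	}
	return false;
}

int main(void)
{
	struct toy_vcpu vcpu = { 0 };

	toy_make_request(TOY_REQ_TLB_FLUSH, &vcpu);
	/* vcpu_enter_guest() style consumption: check and clear in one call. */
	if (toy_check_request(TOY_REQ_TLB_FLUSH, &vcpu))
		printf("flush TLB\n");
	if (!toy_check_request(TOY_REQ_TLB_FLUSH, &vcpu))
		printf("request already consumed\n");
	return 0;
}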
+2 -2
include/linux/kvm_types.h
··· 32 32 33 33 typedef unsigned long gva_t; 34 34 typedef u64 gpa_t; 35 - typedef unsigned long gfn_t; 35 + typedef u64 gfn_t; 36 36 37 37 typedef unsigned long hva_t; 38 38 typedef u64 hpa_t; 39 - typedef unsigned long hfn_t; 39 + typedef u64 hfn_t; 40 40 41 41 typedef hfn_t pfn_t; 42 42
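Widening gfn_t and hfn_t/pfn_t from unsigned long to u64 means guest and host frame numbers keep 64 bits even on 32-bit hosts, and any format string that prints them must use %llx (the iommu.c hunk later in this diff makes exactly that switch). A tiny illustration, using a stand-in typedef rather than the kernel one:

#include <inttypes.h>
#include <stdio.h>

typedef uint64_t gfn_t;   /* stand-in for the kernel typedef */

int main(void)
{
	gfn_t gfn = 0x123456789ULL;   /* wider than 32 bits, now representable */

	printf("gfn=%" PRIx64 "\n", gfn);
	return 0;
}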
+8
include/linux/mm.h
··· 1465 1465 extern void shake_page(struct page *p, int access); 1466 1466 extern atomic_long_t mce_bad_pages; 1467 1467 extern int soft_offline_page(struct page *page, int flags); 1468 + #ifdef CONFIG_MEMORY_FAILURE 1469 + int is_hwpoison_address(unsigned long addr); 1470 + #else 1471 + static inline int is_hwpoison_address(unsigned long addr) 1472 + { 1473 + return 0; 1474 + } 1475 + #endif 1468 1476 1469 1477 extern void dump_page(struct page *page); 1470 1478
+33
mm/memory-failure.c
··· 45 45 #include <linux/page-isolation.h> 46 46 #include <linux/suspend.h> 47 47 #include <linux/slab.h> 48 + #include <linux/swapops.h> 48 49 #include "internal.h" 49 50 50 51 int sysctl_memory_failure_early_kill __read_mostly = 0; ··· 1297 1296 /* keep elevated page count for bad page */ 1298 1297 return ret; 1299 1298 } 1299 + 1300 + /* 1301 + * The caller must hold current->mm->mmap_sem in read mode. 1302 + */ 1303 + int is_hwpoison_address(unsigned long addr) 1304 + { 1305 + pgd_t *pgdp; 1306 + pud_t pud, *pudp; 1307 + pmd_t pmd, *pmdp; 1308 + pte_t pte, *ptep; 1309 + swp_entry_t entry; 1310 + 1311 + pgdp = pgd_offset(current->mm, addr); 1312 + if (!pgd_present(*pgdp)) 1313 + return 0; 1314 + pudp = pud_offset(pgdp, addr); 1315 + pud = *pudp; 1316 + if (!pud_present(pud) || pud_large(pud)) 1317 + return 0; 1318 + pmdp = pmd_offset(pudp, addr); 1319 + pmd = *pmdp; 1320 + if (!pmd_present(pmd) || pmd_large(pmd)) 1321 + return 0; 1322 + ptep = pte_offset_map(pmdp, addr); 1323 + pte = *ptep; 1324 + pte_unmap(ptep); 1325 + if (!is_swap_pte(pte)) 1326 + return 0; 1327 + entry = pte_to_swp_entry(pte); 1328 + return is_hwpoison_entry(entry); 1329 + } 1330 + EXPORT_SYMBOL_GPL(is_hwpoison_address);
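As the comment in the hunk above states, is_hwpoison_address() walks the calling task's page tables and therefore requires current->mm->mmap_sem held for read. The kernel-context sketch below condenses that contract; it is not standalone-buildable, and the wrapper name is mine, but it matches the way the hva_to_pfn() hunk in kvm_main.c later in this diff consumes the helper before substituting its hwpoison sentinel page.

/* Kernel-context sketch of the calling convention (illustrative only). */
static int addr_is_poisoned(unsigned long addr)
{
	int poisoned;

	down_read(&current->mm->mmap_sem);
	poisoned = is_hwpoison_address(addr);
	up_read(&current->mm->mmap_sem);

	return poisoned;
}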
+1 -6
virt/kvm/assigned-dev.c
··· 1 1 /* 2 2 * Kernel-based Virtual Machine - device assignment support 3 3 * 4 - * Copyright (C) 2006-9 Red Hat, Inc 4 + * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates. 5 5 * 6 6 * This work is licensed under the terms of the GNU GPL, version 2. See 7 7 * the COPYING file in the top-level directory. ··· 58 58 static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) 59 59 { 60 60 struct kvm_assigned_dev_kernel *assigned_dev; 61 - struct kvm *kvm; 62 61 int i; 63 62 64 63 assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, 65 64 interrupt_work); 66 - kvm = assigned_dev->kvm; 67 65 68 66 spin_lock_irq(&assigned_dev->assigned_dev_lock); 69 67 if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { ··· 445 447 int r = -EINVAL; 446 448 struct kvm_assigned_dev_kernel *match; 447 449 unsigned long host_irq_type, guest_irq_type; 448 - 449 - if (!capable(CAP_SYS_RAWIO)) 450 - return -EPERM; 451 450 452 451 if (!irqchip_in_kernel(kvm)) 453 452 return r;
+1
virt/kvm/coalesced_mmio.c
··· 2 2 * KVM coalesced MMIO 3 3 * 4 4 * Copyright (c) 2008 Bull S.A.S. 5 + * Copyright 2009 Red Hat, Inc. and/or its affiliates. 5 6 * 6 7 * Author: Laurent Vivier <Laurent.Vivier@bull.net> 7 8 *
+1
virt/kvm/eventfd.c
··· 2 2 * kvm eventfd support - use eventfd objects to signal various KVM events 3 3 * 4 4 * Copyright 2009 Novell. All Rights Reserved. 5 + * Copyright 2010 Red Hat, Inc. and/or its affiliates. 5 6 * 6 7 * Author: 7 8 * Gregory Haskins <ghaskins@novell.com>
+2 -1
virt/kvm/ioapic.c
··· 1 1 /* 2 2 * Copyright (C) 2001 MandrakeSoft S.A. 3 + * Copyright 2010 Red Hat, Inc. and/or its affiliates. 3 4 * 4 5 * MandrakeSoft S.A. 5 6 * 43, rue d'Aboukir ··· 152 151 update_handled_vectors(ioapic); 153 152 mask_after = e->fields.mask; 154 153 if (mask_before != mask_after) 155 - kvm_fire_mask_notifiers(ioapic->kvm, index, mask_after); 154 + kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); 156 155 if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG 157 156 && ioapic->irr & (1 << index)) 158 157 ioapic_service(ioapic, index);
+9 -3
virt/kvm/iommu.c
··· 16 16 * 17 17 * Copyright (C) 2006-2008 Intel Corporation 18 18 * Copyright IBM Corporation, 2008 19 + * Copyright 2010 Red Hat, Inc. and/or its affiliates. 20 + * 19 21 * Author: Allen M. Kay <allen.m.kay@intel.com> 20 22 * Author: Weidong Han <weidong.han@intel.com> 21 23 * Author: Ben-Ami Yassour <benami@il.ibm.com> ··· 108 106 get_order(page_size), flags); 109 107 if (r) { 110 108 printk(KERN_ERR "kvm_iommu_map_address:" 111 - "iommu failed to map pfn=%lx\n", pfn); 109 + "iommu failed to map pfn=%llx\n", pfn); 112 110 goto unmap_pages; 113 111 } 114 112 ··· 126 124 127 125 static int kvm_iommu_map_memslots(struct kvm *kvm) 128 126 { 129 - int i, r = 0; 127 + int i, idx, r = 0; 130 128 struct kvm_memslots *slots; 131 129 130 + idx = srcu_read_lock(&kvm->srcu); 132 131 slots = kvm_memslots(kvm); 133 132 134 133 for (i = 0; i < slots->nmemslots; i++) { ··· 137 134 if (r) 138 135 break; 139 136 } 137 + srcu_read_unlock(&kvm->srcu, idx); 140 138 141 139 return r; 142 140 } ··· 287 283 288 284 static int kvm_iommu_unmap_memslots(struct kvm *kvm) 289 285 { 290 - int i; 286 + int i, idx; 291 287 struct kvm_memslots *slots; 292 288 289 + idx = srcu_read_lock(&kvm->srcu); 293 290 slots = kvm_memslots(kvm); 294 291 295 292 for (i = 0; i < slots->nmemslots; i++) { 296 293 kvm_iommu_put_pages(kvm, slots->memslots[i].base_gfn, 297 294 slots->memslots[i].npages); 298 295 } 296 + srcu_read_unlock(&kvm->srcu, idx); 299 297 300 298 return 0; 301 299 }
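The iommu.c hunks above bracket the memslot walks with srcu_read_lock()/srcu_read_unlock() because kvm_memslots() is SRCU-protected. The kernel-context sketch below restates that locking pattern on its own; symbol names come from the hunk, but the function itself is illustrative and not standalone-buildable.

/* Kernel-context sketch: any walk over kvm_memslots() sits inside SRCU. */
static void for_each_memslot_example(struct kvm *kvm)
{
	struct kvm_memslots *slots;
	int i, idx;

	idx = srcu_read_lock(&kvm->srcu);
	slots = kvm_memslots(kvm);
	for (i = 0; i < slots->nmemslots; i++)
		pr_debug("slot %d: %lu pages\n", i,
			 slots->memslots[i].npages);
	srcu_read_unlock(&kvm->srcu, idx);
}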
+10 -5
virt/kvm/irq_comm.c
··· 17 17 * Authors: 18 18 * Yaozu (Eddie) Dong <Eddie.dong@intel.com> 19 19 * 20 + * Copyright 2010 Red Hat, Inc. and/or its affilates. 20 21 */ 21 22 22 23 #include <linux/kvm_host.h> ··· 100 99 if (r < 0) 101 100 r = 0; 102 101 r += kvm_apic_set_irq(vcpu, irq); 103 - } else { 102 + } else if (kvm_lapic_enabled(vcpu)) { 104 103 if (!lowest) 105 104 lowest = vcpu; 106 105 else if (kvm_apic_compare_prio(vcpu, lowest) < 0) ··· 279 278 synchronize_rcu(); 280 279 } 281 280 282 - void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask) 281 + void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, 282 + bool mask) 283 283 { 284 284 struct kvm_irq_mask_notifier *kimn; 285 285 struct hlist_node *n; 286 + int gsi; 286 287 287 288 rcu_read_lock(); 288 - hlist_for_each_entry_rcu(kimn, n, &kvm->mask_notifier_list, link) 289 - if (kimn->irq == irq) 290 - kimn->func(kimn, mask); 289 + gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin]; 290 + if (gsi != -1) 291 + hlist_for_each_entry_rcu(kimn, n, &kvm->mask_notifier_list, link) 292 + if (kimn->irq == gsi) 293 + kimn->func(kimn, mask); 291 294 rcu_read_unlock(); 292 295 } 293 296
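With the irq_comm.c change above, mask notifiers keep registering against a GSI, while the PIC and IOAPIC now report an (irqchip, pin) pair and kvm_fire_mask_notifiers() resolves it to a GSI through the routing table before matching kimn->irq. A kernel-context sketch of the consumer side follows; my_mask_notify, my_kimn and watch_gsi are made-up names, and the snippet assumes the kvm_register_irq_mask_notifier() declaration shown in the kvm_host.h hunk earlier in this diff.

/* Kernel-context sketch: register a notifier for mask changes on one GSI. */
static void my_mask_notify(struct kvm_irq_mask_notifier *kimn, bool masked)
{
	pr_info("gsi %d is now %s\n", kimn->irq, masked ? "masked" : "unmasked");
}

static struct kvm_irq_mask_notifier my_kimn = {
	.func = my_mask_notify,
};

static void watch_gsi(struct kvm *kvm, int gsi)
{
	kvm_register_irq_mask_notifier(kvm, gsi, &my_kimn);
}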
+79 -27
virt/kvm/kvm_main.c
··· 5 5 * machines without emulation or binary translation. 6 6 * 7 7 * Copyright (C) 2006 Qumranet, Inc. 8 + * Copyright 2010 Red Hat, Inc. and/or its affilates. 8 9 * 9 10 * Authors: 10 11 * Avi Kivity <avi@qumranet.com> ··· 93 92 94 93 static bool largepages_enabled = true; 95 94 95 + static struct page *hwpoison_page; 96 + static pfn_t hwpoison_pfn; 97 + 98 + static struct page *fault_page; 99 + static pfn_t fault_pfn; 100 + 96 101 inline int kvm_is_mmio_pfn(pfn_t pfn) 97 102 { 98 103 if (pfn_valid(pfn)) { ··· 148 141 raw_spin_lock(&kvm->requests_lock); 149 142 me = smp_processor_id(); 150 143 kvm_for_each_vcpu(i, vcpu, kvm) { 151 - if (test_and_set_bit(req, &vcpu->requests)) 144 + if (kvm_make_check_request(req, vcpu)) 152 145 continue; 153 146 cpu = vcpu->cpu; 154 147 if (cpus != NULL && cpu != -1 && cpu != me) ··· 573 566 574 567 new = old = *memslot; 575 568 569 + new.id = mem->slot; 576 570 new.base_gfn = base_gfn; 577 571 new.npages = npages; 578 572 new.flags = mem->flags; ··· 604 596 /* Allocate if a slot is being created */ 605 597 #ifndef CONFIG_S390 606 598 if (npages && !new.rmap) { 607 - new.rmap = vmalloc(npages * sizeof(struct page *)); 599 + new.rmap = vmalloc(npages * sizeof(*new.rmap)); 608 600 609 601 if (!new.rmap) 610 602 goto out_free; ··· 629 621 if (new.lpage_info[i]) 630 622 continue; 631 623 632 - lpages = 1 + (base_gfn + npages - 1) / 633 - KVM_PAGES_PER_HPAGE(level); 634 - lpages -= base_gfn / KVM_PAGES_PER_HPAGE(level); 624 + lpages = 1 + ((base_gfn + npages - 1) 625 + >> KVM_HPAGE_GFN_SHIFT(level)); 626 + lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level); 635 627 636 628 new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i])); 637 629 ··· 641 633 memset(new.lpage_info[i], 0, 642 634 lpages * sizeof(*new.lpage_info[i])); 643 635 644 - if (base_gfn % KVM_PAGES_PER_HPAGE(level)) 636 + if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) 645 637 new.lpage_info[i][0].write_count = 1; 646 - if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE(level)) 638 + if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) 647 639 new.lpage_info[i][lpages - 1].write_count = 1; 648 640 ugfn = new.userspace_addr >> PAGE_SHIFT; 649 641 /* ··· 818 810 819 811 int is_error_page(struct page *page) 820 812 { 821 - return page == bad_page; 813 + return page == bad_page || page == hwpoison_page || page == fault_page; 822 814 } 823 815 EXPORT_SYMBOL_GPL(is_error_page); 824 816 825 817 int is_error_pfn(pfn_t pfn) 826 818 { 827 - return pfn == bad_pfn; 819 + return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn; 828 820 } 829 821 EXPORT_SYMBOL_GPL(is_error_pfn); 822 + 823 + int is_hwpoison_pfn(pfn_t pfn) 824 + { 825 + return pfn == hwpoison_pfn; 826 + } 827 + EXPORT_SYMBOL_GPL(is_hwpoison_pfn); 828 + 829 + int is_fault_pfn(pfn_t pfn) 830 + { 831 + return pfn == fault_pfn; 832 + } 833 + EXPORT_SYMBOL_GPL(is_fault_pfn); 830 834 831 835 static inline unsigned long bad_hva(void) 832 836 { ··· 851 831 } 852 832 EXPORT_SYMBOL_GPL(kvm_is_error_hva); 853 833 854 - struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn) 834 + struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 855 835 { 856 836 int i; 857 837 struct kvm_memslots *slots = kvm_memslots(kvm); ··· 865 845 } 866 846 return NULL; 867 847 } 868 - EXPORT_SYMBOL_GPL(gfn_to_memslot_unaliased); 869 - 870 - struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn) 871 - { 872 - gfn = unalias_gfn(kvm, gfn); 873 - return gfn_to_memslot_unaliased(kvm, gfn); 874 - } 848 + 
EXPORT_SYMBOL_GPL(gfn_to_memslot); 875 849 876 850 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn) 877 851 { 878 852 int i; 879 853 struct kvm_memslots *slots = kvm_memslots(kvm); 880 854 881 - gfn = unalias_gfn_instantiation(kvm, gfn); 882 855 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 883 856 struct kvm_memory_slot *memslot = &slots->memslots[i]; 884 857 ··· 916 903 struct kvm_memslots *slots = kvm_memslots(kvm); 917 904 struct kvm_memory_slot *memslot = NULL; 918 905 919 - gfn = unalias_gfn(kvm, gfn); 920 906 for (i = 0; i < slots->nmemslots; ++i) { 921 907 memslot = &slots->memslots[i]; 922 908 ··· 936 924 { 937 925 struct kvm_memory_slot *slot; 938 926 939 - gfn = unalias_gfn_instantiation(kvm, gfn); 940 - slot = gfn_to_memslot_unaliased(kvm, gfn); 927 + slot = gfn_to_memslot(kvm, gfn); 941 928 if (!slot || slot->flags & KVM_MEMSLOT_INVALID) 942 929 return bad_hva(); 943 930 return gfn_to_hva_memslot(slot, gfn); ··· 957 946 struct vm_area_struct *vma; 958 947 959 948 down_read(&current->mm->mmap_sem); 949 + if (is_hwpoison_address(addr)) { 950 + up_read(&current->mm->mmap_sem); 951 + get_page(hwpoison_page); 952 + return page_to_pfn(hwpoison_page); 953 + } 954 + 960 955 vma = find_vma(current->mm, addr); 961 956 962 957 if (vma == NULL || addr < vma->vm_start || 963 958 !(vma->vm_flags & VM_PFNMAP)) { 964 959 up_read(&current->mm->mmap_sem); 965 - get_page(bad_page); 966 - return page_to_pfn(bad_page); 960 + get_page(fault_page); 961 + return page_to_pfn(fault_page); 967 962 } 968 963 969 964 pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; ··· 1204 1187 { 1205 1188 struct kvm_memory_slot *memslot; 1206 1189 1207 - gfn = unalias_gfn(kvm, gfn); 1208 - memslot = gfn_to_memslot_unaliased(kvm, gfn); 1190 + memslot = gfn_to_memslot(kvm, gfn); 1209 1191 if (memslot && memslot->dirty_bitmap) { 1210 1192 unsigned long rel_gfn = gfn - memslot->base_gfn; 1211 1193 ··· 1223 1207 prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 1224 1208 1225 1209 if (kvm_arch_vcpu_runnable(vcpu)) { 1226 - set_bit(KVM_REQ_UNHALT, &vcpu->requests); 1210 + kvm_make_request(KVM_REQ_UNHALT, vcpu); 1227 1211 break; 1228 1212 } 1229 1213 if (kvm_cpu_has_pending_timer(vcpu)) ··· 1394 1378 1395 1379 if (vcpu->kvm->mm != current->mm) 1396 1380 return -EIO; 1381 + 1382 + #if defined(CONFIG_S390) || defined(CONFIG_PPC) 1383 + /* 1384 + * Special cases: vcpu ioctls that are asynchronous to vcpu execution, 1385 + * so vcpu_load() would break it. 
1386 + */ 1387 + if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT) 1388 + return kvm_arch_vcpu_ioctl(filp, ioctl, arg); 1389 + #endif 1390 + 1391 + 1392 + vcpu_load(vcpu); 1397 1393 switch (ioctl) { 1398 1394 case KVM_RUN: 1399 1395 r = -EINVAL; ··· 1548 1520 goto out; 1549 1521 p = &sigset; 1550 1522 } 1551 - r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset); 1523 + r = kvm_vcpu_ioctl_set_sigmask(vcpu, p); 1552 1524 break; 1553 1525 } 1554 1526 case KVM_GET_FPU: { ··· 1583 1555 r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); 1584 1556 } 1585 1557 out: 1558 + vcpu_put(vcpu); 1586 1559 kfree(fpu); 1587 1560 kfree(kvm_sregs); 1588 1561 return r; ··· 2226 2197 2227 2198 bad_pfn = page_to_pfn(bad_page); 2228 2199 2200 + hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO); 2201 + 2202 + if (hwpoison_page == NULL) { 2203 + r = -ENOMEM; 2204 + goto out_free_0; 2205 + } 2206 + 2207 + hwpoison_pfn = page_to_pfn(hwpoison_page); 2208 + 2209 + fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO); 2210 + 2211 + if (fault_page == NULL) { 2212 + r = -ENOMEM; 2213 + goto out_free_0; 2214 + } 2215 + 2216 + fault_pfn = page_to_pfn(fault_page); 2217 + 2229 2218 if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) { 2230 2219 r = -ENOMEM; 2231 2220 goto out_free_0; ··· 2316 2269 out_free_0a: 2317 2270 free_cpumask_var(cpus_hardware_enabled); 2318 2271 out_free_0: 2272 + if (fault_page) 2273 + __free_page(fault_page); 2274 + if (hwpoison_page) 2275 + __free_page(hwpoison_page); 2319 2276 __free_page(bad_page); 2320 2277 out: 2321 2278 kvm_arch_exit(); ··· 2341 2290 kvm_arch_hardware_unsetup(); 2342 2291 kvm_arch_exit(); 2343 2292 free_cpumask_var(cpus_hardware_enabled); 2293 + __free_page(hwpoison_page); 2344 2294 __free_page(bad_page); 2345 2295 } 2346 2296 EXPORT_SYMBOL_GPL(kvm_exit);
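Finally, the kvm_main.c hunks split the old single bad_page sentinel into three: bad_page, a hwpoison_page returned when is_hwpoison_address() fires, and a fault_page returned for unmappable VM_PFNMAP holes, with is_error_pfn()/is_hwpoison_pfn()/is_fault_pfn() exported so callers can tell them apart. The kernel-context sketch below shows that distinction as a classifier; the enum, the function and the suggested reactions are illustrative, not a quote of the arch code that actually consumes these helpers.

/* Kernel-context sketch: distinguish the pfn sentinels a caller may see. */
enum bad_pfn_kind { PFN_OK, PFN_HWPOISON, PFN_FAULT, PFN_ERROR };

static enum bad_pfn_kind classify_pfn(pfn_t pfn)
{
	if (is_hwpoison_pfn(pfn))
		return PFN_HWPOISON;   /* backing page is hardware-poisoned   */
	if (is_fault_pfn(pfn))
		return PFN_FAULT;      /* VM_PFNMAP hole / unmappable address */
	if (is_error_pfn(pfn))
		return PFN_ERROR;      /* generic bad_page fallback           */
	return PFN_OK;                 /* ordinary, usable pfn                */
}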