Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'for-linus-4.12b-rc0b-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip

Pull xen updates from Juergen Gross:
"Xen fixes and features for 4.12. The main changes are:

- enable building the kernel with Xen support but without enabling
paravirtualized mode (Vitaly Kuznetsov)

- add a new 9pfs xen frontend driver (Stefano Stabellini)

- simplify Xen's cpuid handling by making use of cpu capabilities
(Juergen Gross)

- add/modify some headers for new Xen paravirtualized devices
(Oleksandr Andrushchenko)

- EFI reset_system support under Xen (Julien Grall)

- and the usual cleanups and corrections"

* tag 'for-linus-4.12b-rc0b-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip: (57 commits)
xen: Move xen_have_vector_callback definition to enlighten.c
xen: Implement EFI reset_system callback
arm/xen: Consolidate calls to shutdown hypercall in a single helper
xen: Export xen_reboot
xen/x86: Call xen_smp_intr_init_pv() on BSP
xen: Revert commits da72ff5bfcb0 and 72a9b186292d
xen/pvh: Do not fill kernel's e820 map in init_pvh_bootparams()
xen/scsifront: use offset_in_page() macro
xen/arm,arm64: rename __generic_dma_ops to xen_get_dma_ops
xen/arm,arm64: fix xen_dma_ops after 815dd18 "Consolidate get_dma_ops..."
xen/9pfs: select CONFIG_XEN_XENBUS_FRONTEND
x86/cpu: remove hypervisor specific set_cpu_features
vmware: set cpu capabilities during platform initialization
x86/xen: use capabilities instead of fake cpuid values for xsave
x86/xen: use capabilities instead of fake cpuid values for x2apic
x86/xen: use capabilities instead of fake cpuid values for mwait
x86/xen: use capabilities instead of fake cpuid values for acpi
x86/xen: use capabilities instead of fake cpuid values for acc
x86/xen: use capabilities instead of fake cpuid values for mtrr
x86/xen: use capabilities instead of fake cpuid values for aperf
...

+8412 -5303
+3
arch/arm/include/asm/device.h
··· 16 16 #ifdef CONFIG_ARM_DMA_USE_IOMMU 17 17 struct dma_iommu_mapping *mapping; 18 18 #endif 19 + #ifdef CONFIG_XEN 20 + const struct dma_map_ops *dev_dma_ops; 21 + #endif 19 22 bool dma_coherent; 20 23 }; 21 24
+1 -11
arch/arm/include/asm/dma-mapping.h
··· 16 16 extern const struct dma_map_ops arm_dma_ops; 17 17 extern const struct dma_map_ops arm_coherent_dma_ops; 18 18 19 - static inline const struct dma_map_ops *__generic_dma_ops(struct device *dev) 20 - { 21 - if (dev && dev->dma_ops) 22 - return dev->dma_ops; 23 - return &arm_dma_ops; 24 - } 25 - 26 19 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) 27 20 { 28 - if (xen_initial_domain()) 29 - return xen_dma_ops; 30 - else 31 - return __generic_dma_ops(NULL); 21 + return &arm_dma_ops; 32 22 } 33 23 34 24 #define HAVE_ARCH_DMA_SUPPORTED 1
+7
arch/arm/mm/dma-mapping.c
··· 2414 2414 dma_ops = arm_get_dma_map_ops(coherent); 2415 2415 2416 2416 set_dma_ops(dev, dma_ops); 2417 + 2418 + #ifdef CONFIG_XEN 2419 + if (xen_initial_domain()) { 2420 + dev->archdata.dev_dma_ops = dev->dma_ops; 2421 + dev->dma_ops = xen_dma_ops; 2422 + } 2423 + #endif 2417 2424 } 2418 2425 2419 2426 void arch_teardown_dma_ops(struct device *dev)
+1 -1
arch/arm/xen/efi.c
··· 35 35 efi.update_capsule = xen_efi_update_capsule; 36 36 efi.query_capsule_caps = xen_efi_query_capsule_caps; 37 37 efi.get_next_high_mono_count = xen_efi_get_next_high_mono_count; 38 - efi.reset_system = NULL; /* Functionality provided by Xen. */ 38 + efi.reset_system = xen_efi_reset_system; 39 39 } 40 40 EXPORT_SYMBOL_GPL(xen_efi_runtime_setup);
+10 -6
arch/arm/xen/enlighten.c
··· 191 191 return 0; 192 192 } 193 193 194 - static void xen_restart(enum reboot_mode reboot_mode, const char *cmd) 194 + void xen_reboot(int reason) 195 195 { 196 - struct sched_shutdown r = { .reason = SHUTDOWN_reboot }; 196 + struct sched_shutdown r = { .reason = reason }; 197 197 int rc; 198 + 198 199 rc = HYPERVISOR_sched_op(SCHEDOP_shutdown, &r); 199 200 BUG_ON(rc); 200 201 } 201 202 203 + static void xen_restart(enum reboot_mode reboot_mode, const char *cmd) 204 + { 205 + xen_reboot(SHUTDOWN_reboot); 206 + } 207 + 208 + 202 209 static void xen_power_off(void) 203 210 { 204 - struct sched_shutdown r = { .reason = SHUTDOWN_poweroff }; 205 - int rc; 206 - rc = HYPERVISOR_sched_op(SCHEDOP_shutdown, &r); 207 - BUG_ON(rc); 211 + xen_reboot(SHUTDOWN_poweroff); 208 212 } 209 213 210 214 static irqreturn_t xen_arm_callback(int irq, void *arg)
+3
arch/arm64/include/asm/device.h
··· 20 20 #ifdef CONFIG_IOMMU_API 21 21 void *iommu; /* private IOMMU data */ 22 22 #endif 23 + #ifdef CONFIG_XEN 24 + const struct dma_map_ops *dev_dma_ops; 25 + #endif 23 26 bool dma_coherent; 24 27 }; 25 28
+1 -12
arch/arm64/include/asm/dma-mapping.h
··· 27 27 #define DMA_ERROR_CODE (~(dma_addr_t)0) 28 28 extern const struct dma_map_ops dummy_dma_ops; 29 29 30 - static inline const struct dma_map_ops *__generic_dma_ops(struct device *dev) 30 + static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) 31 31 { 32 - if (dev && dev->dma_ops) 33 - return dev->dma_ops; 34 - 35 32 /* 36 33 * We expect no ISA devices, and all other DMA masters are expected to 37 34 * have someone call arch_setup_dma_ops at device creation time. 38 35 */ 39 36 return &dummy_dma_ops; 40 - } 41 - 42 - static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) 43 - { 44 - if (xen_initial_domain()) 45 - return xen_dma_ops; 46 - else 47 - return __generic_dma_ops(NULL); 48 37 } 49 38 50 39 void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
+7
arch/arm64/mm/dma-mapping.c
··· 977 977 978 978 dev->archdata.dma_coherent = coherent; 979 979 __iommu_setup_dma_ops(dev, dma_base, size, iommu); 980 + 981 + #ifdef CONFIG_XEN 982 + if (xen_initial_domain()) { 983 + dev->archdata.dev_dma_ops = dev->dma_ops; 984 + dev->dma_ops = xen_dma_ops; 985 + } 986 + #endif 980 987 }
+2 -6
arch/x86/include/asm/hypervisor.h
··· 35 35 /* Detection routine */ 36 36 uint32_t (*detect)(void); 37 37 38 - /* Adjust CPU feature bits (run once per CPU) */ 39 - void (*set_cpu_features)(struct cpuinfo_x86 *); 40 - 41 38 /* Platform setup (run once per boot) */ 42 39 void (*init_platform)(void); 43 40 ··· 50 53 /* Recognized hypervisors */ 51 54 extern const struct hypervisor_x86 x86_hyper_vmware; 52 55 extern const struct hypervisor_x86 x86_hyper_ms_hyperv; 53 - extern const struct hypervisor_x86 x86_hyper_xen; 56 + extern const struct hypervisor_x86 x86_hyper_xen_pv; 57 + extern const struct hypervisor_x86 x86_hyper_xen_hvm; 54 58 extern const struct hypervisor_x86 x86_hyper_kvm; 55 59 56 - extern void init_hypervisor(struct cpuinfo_x86 *c); 57 60 extern void init_hypervisor_platform(void); 58 61 extern bool hypervisor_x2apic_available(void); 59 62 extern void hypervisor_pin_vcpu(int cpu); 60 63 #else 61 - static inline void init_hypervisor(struct cpuinfo_x86 *c) { } 62 64 static inline void init_hypervisor_platform(void) { } 63 65 static inline bool hypervisor_x2apic_available(void) { return false; } 64 66 #endif /* CONFIG_HYPERVISOR_GUEST */
+11
arch/x86/include/asm/xen/events.h
··· 20 20 /* No need for a barrier -- XCHG is a barrier on x86. */ 21 21 #define xchg_xen_ulong(ptr, val) xchg((ptr), (val)) 22 22 23 + extern int xen_have_vector_callback; 24 + 25 + /* 26 + * Events delivered via platform PCI interrupts are always 27 + * routed to vcpu 0 and hence cannot be rebound. 28 + */ 29 + static inline bool xen_support_evtchn_rebind(void) 30 + { 31 + return (!xen_hvm_domain() || xen_have_vector_callback); 32 + } 33 + 23 34 #endif /* _ASM_X86_XEN_EVENTS_H */
+25
arch/x86/include/asm/xen/page.h
··· 52 52 extern unsigned long __init set_phys_range_identity(unsigned long pfn_s, 53 53 unsigned long pfn_e); 54 54 55 + #ifdef CONFIG_XEN_PV 55 56 extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, 56 57 struct gnttab_map_grant_ref *kmap_ops, 57 58 struct page **pages, unsigned int count); 58 59 extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, 59 60 struct gnttab_unmap_grant_ref *kunmap_ops, 60 61 struct page **pages, unsigned int count); 62 + #else 63 + static inline int 64 + set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, 65 + struct gnttab_map_grant_ref *kmap_ops, 66 + struct page **pages, unsigned int count) 67 + { 68 + return 0; 69 + } 70 + 71 + static inline int 72 + clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, 73 + struct gnttab_unmap_grant_ref *kunmap_ops, 74 + struct page **pages, unsigned int count) 75 + { 76 + return 0; 77 + } 78 + #endif 61 79 62 80 /* 63 81 * Helper functions to write or read unsigned long values to/from ··· 91 73 return __get_user(*val, (unsigned long __user *)addr); 92 74 } 93 75 76 + #ifdef CONFIG_XEN_PV 94 77 /* 95 78 * When to use pfn_to_mfn(), __pfn_to_mfn() or get_phys_to_machine(): 96 79 * - pfn_to_mfn() returns either INVALID_P2M_ENTRY or the mfn. No indicator ··· 118 99 119 100 return mfn; 120 101 } 102 + #else 103 + static inline unsigned long __pfn_to_mfn(unsigned long pfn) 104 + { 105 + return pfn; 106 + } 107 + #endif 121 108 122 109 static inline unsigned long pfn_to_mfn(unsigned long pfn) 123 110 {
-1
arch/x86/kernel/cpu/common.c
··· 1149 1149 detect_ht(c); 1150 1150 #endif 1151 1151 1152 - init_hypervisor(c); 1153 1152 x86_init_rdrand(c); 1154 1153 x86_init_cache_qos(c); 1155 1154 setup_pku(c);
+5 -10
arch/x86/kernel/cpu/hypervisor.c
··· 28 28 29 29 static const __initconst struct hypervisor_x86 * const hypervisors[] = 30 30 { 31 - #ifdef CONFIG_XEN 32 - &x86_hyper_xen, 31 + #ifdef CONFIG_XEN_PV 32 + &x86_hyper_xen_pv, 33 + #endif 34 + #ifdef CONFIG_XEN_PVHVM 35 + &x86_hyper_xen_hvm, 33 36 #endif 34 37 &x86_hyper_vmware, 35 38 &x86_hyper_ms_hyperv, ··· 63 60 pr_info("Hypervisor detected: %s\n", x86_hyper->name); 64 61 } 65 62 66 - void init_hypervisor(struct cpuinfo_x86 *c) 67 - { 68 - if (x86_hyper && x86_hyper->set_cpu_features) 69 - x86_hyper->set_cpu_features(c); 70 - } 71 - 72 63 void __init init_hypervisor_platform(void) 73 64 { 74 65 ··· 70 73 71 74 if (!x86_hyper) 72 75 return; 73 - 74 - init_hypervisor(&boot_cpu_data); 75 76 76 77 if (x86_hyper->init_platform) 77 78 x86_hyper->init_platform();
+20 -19
arch/x86/kernel/cpu/vmware.c
··· 113 113 #define vmware_paravirt_ops_setup() do {} while (0) 114 114 #endif 115 115 116 + /* 117 + * VMware hypervisor takes care of exporting a reliable TSC to the guest. 118 + * Still, due to timing difference when running on virtual cpus, the TSC can 119 + * be marked as unstable in some cases. For example, the TSC sync check at 120 + * bootup can fail due to a marginal offset between vcpus' TSCs (though the 121 + * TSCs do not drift from each other). Also, the ACPI PM timer clocksource 122 + * is not suitable as a watchdog when running on a hypervisor because the 123 + * kernel may miss a wrap of the counter if the vcpu is descheduled for a 124 + * long time. To skip these checks at runtime we set these capability bits, 125 + * so that the kernel could just trust the hypervisor with providing a 126 + * reliable virtual TSC that is suitable for timekeeping. 127 + */ 128 + static void __init vmware_set_capabilities(void) 129 + { 130 + setup_force_cpu_cap(X86_FEATURE_CONSTANT_TSC); 131 + setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); 132 + } 133 + 116 134 static void __init vmware_platform_setup(void) 117 135 { 118 136 uint32_t eax, ebx, ecx, edx; ··· 170 152 #ifdef CONFIG_X86_IO_APIC 171 153 no_timer_check = 1; 172 154 #endif 155 + 156 + vmware_set_capabilities(); 173 157 } 174 158 175 159 /* ··· 196 176 return 0; 197 177 } 198 178 199 - /* 200 - * VMware hypervisor takes care of exporting a reliable TSC to the guest. 201 - * Still, due to timing difference when running on virtual cpus, the TSC can 202 - * be marked as unstable in some cases. For example, the TSC sync check at 203 - * bootup can fail due to a marginal offset between vcpus' TSCs (though the 204 - * TSCs do not drift from each other). Also, the ACPI PM timer clocksource 205 - * is not suitable as a watchdog when running on a hypervisor because the 206 - * kernel may miss a wrap of the counter if the vcpu is descheduled for a 207 - * long time. 
To skip these checks at runtime we set these capability bits, 208 - * so that the kernel could just trust the hypervisor with providing a 209 - * reliable virtual TSC that is suitable for timekeeping. 210 - */ 211 - static void vmware_set_cpu_features(struct cpuinfo_x86 *c) 212 - { 213 - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 214 - set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); 215 - } 216 - 217 179 /* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */ 218 180 static bool __init vmware_legacy_x2apic_available(void) 219 181 { ··· 208 206 const __refconst struct hypervisor_x86 x86_hyper_vmware = { 209 207 .name = "VMware", 210 208 .detect = vmware_platform, 211 - .set_cpu_features = vmware_set_cpu_features, 212 209 .init_platform = vmware_platform_setup, 213 210 .x2apic_available = vmware_legacy_x2apic_available, 214 211 };
+1 -1
arch/x86/kernel/process_64.c
··· 446 446 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV)) 447 447 __switch_to_xtra(prev_p, next_p, tss); 448 448 449 - #ifdef CONFIG_XEN 449 + #ifdef CONFIG_XEN_PV 450 450 /* 451 451 * On Xen PV, IOPL bits in pt_regs->flags have no effect, and 452 452 * current_pt_regs()->flags may not match the current task's
+1 -1
arch/x86/pci/xen.c
··· 447 447 448 448 int __init pci_xen_hvm_init(void) 449 449 { 450 - if (!xen_feature(XENFEAT_hvm_pirqs)) 450 + if (!xen_have_vector_callback || !xen_feature(XENFEAT_hvm_pirqs)) 451 451 return 0; 452 452 453 453 #ifdef CONFIG_ACPI
+27 -6
arch/x86/xen/Kconfig
··· 6 6 bool "Xen guest support" 7 7 depends on PARAVIRT 8 8 select PARAVIRT_CLOCK 9 - select XEN_HAVE_PVMMU 10 - select XEN_HAVE_VPMU 11 9 depends on X86_64 || (X86_32 && X86_PAE) 12 10 depends on X86_LOCAL_APIC && X86_TSC 13 11 help ··· 13 15 kernel to boot in a paravirtualized environment under the 14 16 Xen hypervisor. 15 17 16 - config XEN_DOM0 18 + config XEN_PV 19 + bool "Xen PV guest support" 20 + default y 21 + depends on XEN 22 + select XEN_HAVE_PVMMU 23 + select XEN_HAVE_VPMU 24 + help 25 + Support running as a Xen PV guest. 26 + 27 + config XEN_PV_SMP 17 28 def_bool y 18 - depends on XEN && PCI_XEN && SWIOTLB_XEN 29 + depends on XEN_PV && SMP 30 + 31 + config XEN_DOM0 32 + bool "Xen PV Dom0 support" 33 + default y 34 + depends on XEN_PV && PCI_XEN && SWIOTLB_XEN 19 35 depends on X86_IO_APIC && ACPI && PCI 36 + help 37 + Support running as a Xen PV Dom0 guest. 20 38 21 39 config XEN_PVHVM 22 - def_bool y 40 + bool "Xen PVHVM guest support" 41 + default y 23 42 depends on XEN && PCI && X86_LOCAL_APIC 43 + help 44 + Support running as a Xen PVHVM guest. 45 + 46 + config XEN_PVHVM_SMP 47 + def_bool y 48 + depends on XEN_PVHVM && SMP 24 49 25 50 config XEN_512GB 26 51 bool "Limit Xen pv-domain memory to 512GB" 27 - depends on XEN && X86_64 52 + depends on XEN_PV && X86_64 28 53 default y 29 54 help 30 55 Limit paravirtualized user domains to 512GB of RAM.
+11 -5
arch/x86/xen/Makefile
··· 7 7 8 8 # Make sure early boot has no stackprotector 9 9 nostackp := $(call cc-option, -fno-stack-protector) 10 - CFLAGS_enlighten.o := $(nostackp) 11 - CFLAGS_mmu.o := $(nostackp) 10 + CFLAGS_enlighten_pv.o := $(nostackp) 11 + CFLAGS_mmu_pv.o := $(nostackp) 12 12 13 - obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ 13 + obj-y := enlighten.o multicalls.o mmu.o irq.o \ 14 14 time.o xen-asm.o xen-asm_$(BITS).o \ 15 - grant-table.o suspend.o platform-pci-unplug.o \ 16 - p2m.o apic.o pmu.o 15 + grant-table.o suspend.o platform-pci-unplug.o 16 + 17 + obj-$(CONFIG_XEN_PVHVM) += enlighten_hvm.o mmu_hvm.o suspend_hvm.o 18 + obj-$(CONFIG_XEN_PV) += setup.o apic.o pmu.o suspend_pv.o \ 19 + p2m.o enlighten_pv.o mmu_pv.o 20 + obj-$(CONFIG_XEN_PVH) += enlighten_pvh.o 17 21 18 22 obj-$(CONFIG_EVENT_TRACING) += trace.o 19 23 20 24 obj-$(CONFIG_SMP) += smp.o 25 + obj-$(CONFIG_XEN_PV_SMP) += smp_pv.o 26 + obj-$(CONFIG_XEN_PVHVM_SMP) += smp_hvm.o 21 27 obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o 22 28 obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o 23 29 obj-$(CONFIG_XEN_DOM0) += vga.o
+1 -1
arch/x86/xen/efi.c
··· 81 81 .update_capsule = xen_efi_update_capsule, 82 82 .query_capsule_caps = xen_efi_query_capsule_caps, 83 83 .get_next_high_mono_count = xen_efi_get_next_high_mono_count, 84 - .reset_system = NULL, /* Functionality provided by Xen. */ 84 + .reset_system = xen_efi_reset_system, 85 85 .set_virtual_address_map = NULL, /* Not used under Xen. */ 86 86 .flags = 0 /* Initialized later. */ 87 87 };
+30 -1803
arch/x86/xen/enlighten.c
··· 1 - /* 2 - * Core of Xen paravirt_ops implementation. 3 - * 4 - * This file contains the xen_paravirt_ops structure itself, and the 5 - * implementations for: 6 - * - privileged instructions 7 - * - interrupt flags 8 - * - segment operations 9 - * - booting and setup 10 - * 11 - * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 12 - */ 13 - 14 1 #include <linux/cpu.h> 15 - #include <linux/kernel.h> 16 - #include <linux/init.h> 17 - #include <linux/smp.h> 18 - #include <linux/preempt.h> 19 - #include <linux/hardirq.h> 20 - #include <linux/percpu.h> 21 - #include <linux/delay.h> 22 - #include <linux/start_kernel.h> 23 - #include <linux/sched.h> 24 - #include <linux/kprobes.h> 25 - #include <linux/bootmem.h> 26 - #include <linux/export.h> 27 - #include <linux/mm.h> 28 - #include <linux/page-flags.h> 29 - #include <linux/highmem.h> 30 - #include <linux/console.h> 31 - #include <linux/pci.h> 32 - #include <linux/gfp.h> 33 - #include <linux/memblock.h> 34 - #include <linux/edd.h> 35 - #include <linux/frame.h> 36 - 37 2 #include <linux/kexec.h> 38 3 39 - #include <xen/xen.h> 40 - #include <xen/events.h> 41 - #include <xen/interface/xen.h> 42 - #include <xen/interface/version.h> 43 - #include <xen/interface/physdev.h> 44 - #include <xen/interface/vcpu.h> 45 - #include <xen/interface/memory.h> 46 - #include <xen/interface/nmi.h> 47 - #include <xen/interface/xen-mca.h> 48 - #include <xen/interface/hvm/start_info.h> 49 4 #include <xen/features.h> 50 5 #include <xen/page.h> 51 - #include <xen/hvm.h> 52 - #include <xen/hvc-console.h> 53 - #include <xen/acpi.h> 54 6 55 - #include <asm/paravirt.h> 56 - #include <asm/apic.h> 57 - #include <asm/page.h> 58 - #include <asm/xen/pci.h> 59 7 #include <asm/xen/hypercall.h> 60 8 #include <asm/xen/hypervisor.h> 61 - #include <asm/xen/cpuid.h> 62 - #include <asm/fixmap.h> 63 - #include <asm/processor.h> 64 - #include <asm/proto.h> 65 - #include <asm/msr-index.h> 66 - #include <asm/traps.h> 67 - #include <asm/setup.h> 68 
- #include <asm/desc.h> 69 - #include <asm/pgalloc.h> 70 - #include <asm/pgtable.h> 71 - #include <asm/tlbflush.h> 72 - #include <asm/reboot.h> 73 - #include <asm/stackprotector.h> 74 - #include <asm/hypervisor.h> 75 - #include <asm/mach_traps.h> 76 - #include <asm/mwait.h> 77 - #include <asm/pci_x86.h> 78 9 #include <asm/cpu.h> 79 10 #include <asm/e820/api.h> 80 11 81 - #ifdef CONFIG_ACPI 82 - #include <linux/acpi.h> 83 - #include <asm/acpi.h> 84 - #include <acpi/pdc_intel.h> 85 - #include <acpi/processor.h> 86 - #include <xen/interface/platform.h> 87 - #endif 88 - 89 12 #include "xen-ops.h" 90 - #include "mmu.h" 91 13 #include "smp.h" 92 - #include "multicalls.h" 93 14 #include "pmu.h" 94 15 95 16 EXPORT_SYMBOL_GPL(hypercall_page); ··· 57 136 58 137 struct shared_info xen_dummy_shared_info; 59 138 60 - void *xen_initial_gdt; 61 - 62 - RESERVE_BRK(shared_info_page_brk, PAGE_SIZE); 63 - 64 - static int xen_cpu_up_prepare(unsigned int cpu); 65 - static int xen_cpu_up_online(unsigned int cpu); 66 - static int xen_cpu_dead(unsigned int cpu); 139 + __read_mostly int xen_have_vector_callback; 140 + EXPORT_SYMBOL_GPL(xen_have_vector_callback); 67 141 68 142 /* 69 143 * Point at some empty memory to start with. We map the real shared_info ··· 79 163 * 80 164 * 0: not available, 1: available 81 165 */ 82 - static int have_vcpu_info_placement = 1; 166 + int xen_have_vcpu_info_placement = 1; 83 167 84 - struct tls_descs { 85 - struct desc_struct desc[3]; 86 - }; 168 + static int xen_cpu_up_online(unsigned int cpu) 169 + { 170 + xen_init_lock_cpu(cpu); 171 + return 0; 172 + } 87 173 88 - /* 89 - * Updating the 3 TLS descriptors in the GDT on every task switch is 90 - * surprisingly expensive so we avoid updating them if they haven't 91 - * changed. Since Xen writes different descriptors than the one 92 - * passed in the update_descriptor hypercall we keep shadow copies to 93 - * compare against. 
94 - */ 95 - static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); 174 + int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int), 175 + int (*cpu_dead_cb)(unsigned int)) 176 + { 177 + int rc; 96 178 97 - #ifdef CONFIG_XEN_PVH 98 - /* 99 - * PVH variables. 100 - * 101 - * xen_pvh and pvh_bootparams need to live in data segment since they 102 - * are used after startup_{32|64}, which clear .bss, are invoked. 103 - */ 104 - bool xen_pvh __attribute__((section(".data"))) = 0; 105 - struct boot_params pvh_bootparams __attribute__((section(".data"))); 179 + rc = cpuhp_setup_state_nocalls(CPUHP_XEN_PREPARE, 180 + "x86/xen/hvm_guest:prepare", 181 + cpu_up_prepare_cb, cpu_dead_cb); 182 + if (rc >= 0) { 183 + rc = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, 184 + "x86/xen/hvm_guest:online", 185 + xen_cpu_up_online, NULL); 186 + if (rc < 0) 187 + cpuhp_remove_state_nocalls(CPUHP_XEN_PREPARE); 188 + } 106 189 107 - struct hvm_start_info pvh_start_info; 108 - unsigned int pvh_start_info_sz = sizeof(pvh_start_info); 109 - #endif 190 + return rc >= 0 ? 0 : rc; 191 + } 110 192 111 193 static void clamp_max_cpus(void) 112 194 { ··· 141 227 per_cpu(xen_vcpu, cpu) = 142 228 &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)]; 143 229 144 - if (!have_vcpu_info_placement) { 230 + if (!xen_have_vcpu_info_placement) { 145 231 if (cpu >= MAX_VIRT_CPUS) 146 232 clamp_max_cpus(); 147 233 return; ··· 164 250 165 251 if (err) { 166 252 printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); 167 - have_vcpu_info_placement = 0; 253 + xen_have_vcpu_info_placement = 0; 168 254 clamp_max_cpus(); 169 255 } else { 170 256 /* This cpu is using the registered vcpu info, even if ··· 173 259 } 174 260 } 175 261 176 - /* 177 - * On restore, set the vcpu placement up again. 178 - * If it fails, then we're in a bad state, since 179 - * we can't back out from using it... 
180 - */ 181 - void xen_vcpu_restore(void) 182 - { 183 - int cpu; 184 - 185 - for_each_possible_cpu(cpu) { 186 - bool other_cpu = (cpu != smp_processor_id()); 187 - bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu), 188 - NULL); 189 - 190 - if (other_cpu && is_up && 191 - HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL)) 192 - BUG(); 193 - 194 - xen_setup_runstate_info(cpu); 195 - 196 - if (have_vcpu_info_placement) 197 - xen_vcpu_setup(cpu); 198 - 199 - if (other_cpu && is_up && 200 - HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL)) 201 - BUG(); 202 - } 203 - } 204 - 205 - static void __init xen_banner(void) 206 - { 207 - unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL); 208 - struct xen_extraversion extra; 209 - HYPERVISOR_xen_version(XENVER_extraversion, &extra); 210 - 211 - pr_info("Booting paravirtualized kernel %son %s\n", 212 - xen_feature(XENFEAT_auto_translated_physmap) ? 213 - "with PVH extensions " : "", pv_info.name); 214 - printk(KERN_INFO "Xen version: %d.%d%s%s\n", 215 - version >> 16, version & 0xffff, extra.extraversion, 216 - xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? 
" (preserve-AD)" : ""); 217 - } 218 - /* Check if running on Xen version (major, minor) or later */ 219 - bool 220 - xen_running_on_version_or_later(unsigned int major, unsigned int minor) 221 - { 222 - unsigned int version; 223 - 224 - if (!xen_domain()) 225 - return false; 226 - 227 - version = HYPERVISOR_xen_version(XENVER_version, NULL); 228 - if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) || 229 - ((version >> 16) > major)) 230 - return true; 231 - return false; 232 - } 233 - 234 - #define CPUID_THERM_POWER_LEAF 6 235 - #define APERFMPERF_PRESENT 0 236 - 237 - static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0; 238 - static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0; 239 - 240 - static __read_mostly unsigned int cpuid_leaf1_ecx_set_mask; 241 - static __read_mostly unsigned int cpuid_leaf5_ecx_val; 242 - static __read_mostly unsigned int cpuid_leaf5_edx_val; 243 - 244 - static void xen_cpuid(unsigned int *ax, unsigned int *bx, 245 - unsigned int *cx, unsigned int *dx) 246 - { 247 - unsigned maskebx = ~0; 248 - unsigned maskecx = ~0; 249 - unsigned maskedx = ~0; 250 - unsigned setecx = 0; 251 - /* 252 - * Mask out inconvenient features, to try and disable as many 253 - * unsupported kernel subsystems as possible. 254 - */ 255 - switch (*ax) { 256 - case 1: 257 - maskecx = cpuid_leaf1_ecx_mask; 258 - setecx = cpuid_leaf1_ecx_set_mask; 259 - maskedx = cpuid_leaf1_edx_mask; 260 - break; 261 - 262 - case CPUID_MWAIT_LEAF: 263 - /* Synthesize the values.. 
*/ 264 - *ax = 0; 265 - *bx = 0; 266 - *cx = cpuid_leaf5_ecx_val; 267 - *dx = cpuid_leaf5_edx_val; 268 - return; 269 - 270 - case CPUID_THERM_POWER_LEAF: 271 - /* Disabling APERFMPERF for kernel usage */ 272 - maskecx = ~(1 << APERFMPERF_PRESENT); 273 - break; 274 - 275 - case 0xb: 276 - /* Suppress extended topology stuff */ 277 - maskebx = 0; 278 - break; 279 - } 280 - 281 - asm(XEN_EMULATE_PREFIX "cpuid" 282 - : "=a" (*ax), 283 - "=b" (*bx), 284 - "=c" (*cx), 285 - "=d" (*dx) 286 - : "0" (*ax), "2" (*cx)); 287 - 288 - *bx &= maskebx; 289 - *cx &= maskecx; 290 - *cx |= setecx; 291 - *dx &= maskedx; 292 - } 293 - STACK_FRAME_NON_STANDARD(xen_cpuid); /* XEN_EMULATE_PREFIX */ 294 - 295 - static bool __init xen_check_mwait(void) 296 - { 297 - #ifdef CONFIG_ACPI 298 - struct xen_platform_op op = { 299 - .cmd = XENPF_set_processor_pminfo, 300 - .u.set_pminfo.id = -1, 301 - .u.set_pminfo.type = XEN_PM_PDC, 302 - }; 303 - uint32_t buf[3]; 304 - unsigned int ax, bx, cx, dx; 305 - unsigned int mwait_mask; 306 - 307 - /* We need to determine whether it is OK to expose the MWAIT 308 - * capability to the kernel to harvest deeper than C3 states from ACPI 309 - * _CST using the processor_harvest_xen.c module. For this to work, we 310 - * need to gather the MWAIT_LEAF values (which the cstate.c code 311 - * checks against). The hypervisor won't expose the MWAIT flag because 312 - * it would break backwards compatibility; so we will find out directly 313 - * from the hardware and hypercall. 
314 - */ 315 - if (!xen_initial_domain()) 316 - return false; 317 - 318 - /* 319 - * When running under platform earlier than Xen4.2, do not expose 320 - * mwait, to avoid the risk of loading native acpi pad driver 321 - */ 322 - if (!xen_running_on_version_or_later(4, 2)) 323 - return false; 324 - 325 - ax = 1; 326 - cx = 0; 327 - 328 - native_cpuid(&ax, &bx, &cx, &dx); 329 - 330 - mwait_mask = (1 << (X86_FEATURE_EST % 32)) | 331 - (1 << (X86_FEATURE_MWAIT % 32)); 332 - 333 - if ((cx & mwait_mask) != mwait_mask) 334 - return false; 335 - 336 - /* We need to emulate the MWAIT_LEAF and for that we need both 337 - * ecx and edx. The hypercall provides only partial information. 338 - */ 339 - 340 - ax = CPUID_MWAIT_LEAF; 341 - bx = 0; 342 - cx = 0; 343 - dx = 0; 344 - 345 - native_cpuid(&ax, &bx, &cx, &dx); 346 - 347 - /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so, 348 - * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3. 349 - */ 350 - buf[0] = ACPI_PDC_REVISION_ID; 351 - buf[1] = 1; 352 - buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP); 353 - 354 - set_xen_guest_handle(op.u.set_pminfo.pdc, buf); 355 - 356 - if ((HYPERVISOR_platform_op(&op) == 0) && 357 - (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) { 358 - cpuid_leaf5_ecx_val = cx; 359 - cpuid_leaf5_edx_val = dx; 360 - } 361 - return true; 362 - #else 363 - return false; 364 - #endif 365 - } 366 - static void __init xen_init_cpuid_mask(void) 367 - { 368 - unsigned int ax, bx, cx, dx; 369 - unsigned int xsave_mask; 370 - 371 - cpuid_leaf1_edx_mask = 372 - ~((1 << X86_FEATURE_MTRR) | /* disable MTRR */ 373 - (1 << X86_FEATURE_ACC)); /* thermal monitoring */ 374 - 375 - if (!xen_initial_domain()) 376 - cpuid_leaf1_edx_mask &= 377 - ~((1 << X86_FEATURE_ACPI)); /* disable ACPI */ 378 - 379 - cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_X2APIC % 32)); 380 - 381 - ax = 1; 382 - cx = 0; 383 - cpuid(1, &ax, &bx, &cx, &dx); 384 - 385 - xsave_mask = 386 - (1 << 
(X86_FEATURE_XSAVE % 32)) | 387 - (1 << (X86_FEATURE_OSXSAVE % 32)); 388 - 389 - /* Xen will set CR4.OSXSAVE if supported and not disabled by force */ 390 - if ((cx & xsave_mask) != xsave_mask) 391 - cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */ 392 - if (xen_check_mwait()) 393 - cpuid_leaf1_ecx_set_mask = (1 << (X86_FEATURE_MWAIT % 32)); 394 - } 395 - 396 - static void xen_set_debugreg(int reg, unsigned long val) 397 - { 398 - HYPERVISOR_set_debugreg(reg, val); 399 - } 400 - 401 - static unsigned long xen_get_debugreg(int reg) 402 - { 403 - return HYPERVISOR_get_debugreg(reg); 404 - } 405 - 406 - static void xen_end_context_switch(struct task_struct *next) 407 - { 408 - xen_mc_flush(); 409 - paravirt_end_context_switch(next); 410 - } 411 - 412 - static unsigned long xen_store_tr(void) 413 - { 414 - return 0; 415 - } 416 - 417 - /* 418 - * Set the page permissions for a particular virtual address. If the 419 - * address is a vmalloc mapping (or other non-linear mapping), then 420 - * find the linear mapping of the page and also set its protections to 421 - * match. 422 - */ 423 - static void set_aliased_prot(void *v, pgprot_t prot) 424 - { 425 - int level; 426 - pte_t *ptep; 427 - pte_t pte; 428 - unsigned long pfn; 429 - struct page *page; 430 - unsigned char dummy; 431 - 432 - ptep = lookup_address((unsigned long)v, &level); 433 - BUG_ON(ptep == NULL); 434 - 435 - pfn = pte_pfn(*ptep); 436 - page = pfn_to_page(pfn); 437 - 438 - pte = pfn_pte(pfn, prot); 439 - 440 - /* 441 - * Careful: update_va_mapping() will fail if the virtual address 442 - * we're poking isn't populated in the page tables. We don't 443 - * need to worry about the direct map (that's always in the page 444 - * tables), but we need to be careful about vmap space. 
In 445 - * particular, the top level page table can lazily propagate 446 - * entries between processes, so if we've switched mms since we 447 - * vmapped the target in the first place, we might not have the 448 - * top-level page table entry populated. 449 - * 450 - * We disable preemption because we want the same mm active when 451 - * we probe the target and when we issue the hypercall. We'll 452 - * have the same nominal mm, but if we're a kernel thread, lazy 453 - * mm dropping could change our pgd. 454 - * 455 - * Out of an abundance of caution, this uses __get_user() to fault 456 - * in the target address just in case there's some obscure case 457 - * in which the target address isn't readable. 458 - */ 459 - 460 - preempt_disable(); 461 - 462 - probe_kernel_read(&dummy, v, 1); 463 - 464 - if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0)) 465 - BUG(); 466 - 467 - if (!PageHighMem(page)) { 468 - void *av = __va(PFN_PHYS(pfn)); 469 - 470 - if (av != v) 471 - if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0)) 472 - BUG(); 473 - } else 474 - kmap_flush_unused(); 475 - 476 - preempt_enable(); 477 - } 478 - 479 - static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries) 480 - { 481 - const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; 482 - int i; 483 - 484 - /* 485 - * We need to mark the all aliases of the LDT pages RO. We 486 - * don't need to call vm_flush_aliases(), though, since that's 487 - * only responsible for flushing aliases out the TLBs, not the 488 - * page tables, and Xen will flush the TLB for us if needed. 489 - * 490 - * To avoid confusing future readers: none of this is necessary 491 - * to load the LDT. The hypervisor only checks this when the 492 - * LDT is faulted in due to subsequent descriptor access. 
493 - */ 494 - 495 - for(i = 0; i < entries; i += entries_per_page) 496 - set_aliased_prot(ldt + i, PAGE_KERNEL_RO); 497 - } 498 - 499 - static void xen_free_ldt(struct desc_struct *ldt, unsigned entries) 500 - { 501 - const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; 502 - int i; 503 - 504 - for(i = 0; i < entries; i += entries_per_page) 505 - set_aliased_prot(ldt + i, PAGE_KERNEL); 506 - } 507 - 508 - static void xen_set_ldt(const void *addr, unsigned entries) 509 - { 510 - struct mmuext_op *op; 511 - struct multicall_space mcs = xen_mc_entry(sizeof(*op)); 512 - 513 - trace_xen_cpu_set_ldt(addr, entries); 514 - 515 - op = mcs.args; 516 - op->cmd = MMUEXT_SET_LDT; 517 - op->arg1.linear_addr = (unsigned long)addr; 518 - op->arg2.nr_ents = entries; 519 - 520 - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 521 - 522 - xen_mc_issue(PARAVIRT_LAZY_CPU); 523 - } 524 - 525 - static void xen_load_gdt(const struct desc_ptr *dtr) 526 - { 527 - unsigned long va = dtr->address; 528 - unsigned int size = dtr->size + 1; 529 - unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE); 530 - unsigned long frames[pages]; 531 - int f; 532 - 533 - /* 534 - * A GDT can be up to 64k in size, which corresponds to 8192 535 - * 8-byte entries, or 16 4k pages.. 536 - */ 537 - 538 - BUG_ON(size > 65536); 539 - BUG_ON(va & ~PAGE_MASK); 540 - 541 - for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { 542 - int level; 543 - pte_t *ptep; 544 - unsigned long pfn, mfn; 545 - void *virt; 546 - 547 - /* 548 - * The GDT is per-cpu and is in the percpu data area. 549 - * That can be virtually mapped, so we need to do a 550 - * page-walk to get the underlying MFN for the 551 - * hypercall. The page can also be in the kernel's 552 - * linear range, so we need to RO that mapping too. 
553 - */ 554 - ptep = lookup_address(va, &level); 555 - BUG_ON(ptep == NULL); 556 - 557 - pfn = pte_pfn(*ptep); 558 - mfn = pfn_to_mfn(pfn); 559 - virt = __va(PFN_PHYS(pfn)); 560 - 561 - frames[f] = mfn; 562 - 563 - make_lowmem_page_readonly((void *)va); 564 - make_lowmem_page_readonly(virt); 565 - } 566 - 567 - if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct))) 568 - BUG(); 569 - } 570 - 571 - /* 572 - * load_gdt for early boot, when the gdt is only mapped once 573 - */ 574 - static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) 575 - { 576 - unsigned long va = dtr->address; 577 - unsigned int size = dtr->size + 1; 578 - unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE); 579 - unsigned long frames[pages]; 580 - int f; 581 - 582 - /* 583 - * A GDT can be up to 64k in size, which corresponds to 8192 584 - * 8-byte entries, or 16 4k pages.. 585 - */ 586 - 587 - BUG_ON(size > 65536); 588 - BUG_ON(va & ~PAGE_MASK); 589 - 590 - for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { 591 - pte_t pte; 592 - unsigned long pfn, mfn; 593 - 594 - pfn = virt_to_pfn(va); 595 - mfn = pfn_to_mfn(pfn); 596 - 597 - pte = pfn_pte(pfn, PAGE_KERNEL_RO); 598 - 599 - if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0)) 600 - BUG(); 601 - 602 - frames[f] = mfn; 603 - } 604 - 605 - if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct))) 606 - BUG(); 607 - } 608 - 609 - static inline bool desc_equal(const struct desc_struct *d1, 610 - const struct desc_struct *d2) 611 - { 612 - return d1->a == d2->a && d1->b == d2->b; 613 - } 614 - 615 - static void load_TLS_descriptor(struct thread_struct *t, 616 - unsigned int cpu, unsigned int i) 617 - { 618 - struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i]; 619 - struct desc_struct *gdt; 620 - xmaddr_t maddr; 621 - struct multicall_space mc; 622 - 623 - if (desc_equal(shadow, &t->tls_array[i])) 624 - return; 625 - 626 - *shadow = t->tls_array[i]; 627 - 628 - gdt = get_cpu_gdt_rw(cpu); 
629 - maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); 630 - mc = __xen_mc_entry(0); 631 - 632 - MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); 633 - } 634 - 635 - static void xen_load_tls(struct thread_struct *t, unsigned int cpu) 636 - { 637 - /* 638 - * XXX sleazy hack: If we're being called in a lazy-cpu zone 639 - * and lazy gs handling is enabled, it means we're in a 640 - * context switch, and %gs has just been saved. This means we 641 - * can zero it out to prevent faults on exit from the 642 - * hypervisor if the next process has no %gs. Either way, it 643 - * has been saved, and the new value will get loaded properly. 644 - * This will go away as soon as Xen has been modified to not 645 - * save/restore %gs for normal hypercalls. 646 - * 647 - * On x86_64, this hack is not used for %gs, because gs points 648 - * to KERNEL_GS_BASE (and uses it for PDA references), so we 649 - * must not zero %gs on x86_64 650 - * 651 - * For x86_64, we need to zero %fs, otherwise we may get an 652 - * exception between the new %fs descriptor being loaded and 653 - * %fs being effectively cleared at __switch_to(). 
654 - */ 655 - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { 656 - #ifdef CONFIG_X86_32 657 - lazy_load_gs(0); 658 - #else 659 - loadsegment(fs, 0); 660 - #endif 661 - } 662 - 663 - xen_mc_batch(); 664 - 665 - load_TLS_descriptor(t, cpu, 0); 666 - load_TLS_descriptor(t, cpu, 1); 667 - load_TLS_descriptor(t, cpu, 2); 668 - 669 - xen_mc_issue(PARAVIRT_LAZY_CPU); 670 - } 671 - 672 - #ifdef CONFIG_X86_64 673 - static void xen_load_gs_index(unsigned int idx) 674 - { 675 - if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx)) 676 - BUG(); 677 - } 678 - #endif 679 - 680 - static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, 681 - const void *ptr) 682 - { 683 - xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]); 684 - u64 entry = *(u64 *)ptr; 685 - 686 - trace_xen_cpu_write_ldt_entry(dt, entrynum, entry); 687 - 688 - preempt_disable(); 689 - 690 - xen_mc_flush(); 691 - if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry)) 692 - BUG(); 693 - 694 - preempt_enable(); 695 - } 696 - 697 - static int cvt_gate_to_trap(int vector, const gate_desc *val, 698 - struct trap_info *info) 699 - { 700 - unsigned long addr; 701 - 702 - if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT) 703 - return 0; 704 - 705 - info->vector = vector; 706 - 707 - addr = gate_offset(*val); 708 - #ifdef CONFIG_X86_64 709 - /* 710 - * Look for known traps using IST, and substitute them 711 - * appropriately. The debugger ones are the only ones we care 712 - * about. Xen will handle faults like double_fault, 713 - * so we should never see them. Warn if 714 - * there's an unexpected IST-using fault handler. 
715 - */ 716 - if (addr == (unsigned long)debug) 717 - addr = (unsigned long)xen_debug; 718 - else if (addr == (unsigned long)int3) 719 - addr = (unsigned long)xen_int3; 720 - else if (addr == (unsigned long)stack_segment) 721 - addr = (unsigned long)xen_stack_segment; 722 - else if (addr == (unsigned long)double_fault) { 723 - /* Don't need to handle these */ 724 - return 0; 725 - #ifdef CONFIG_X86_MCE 726 - } else if (addr == (unsigned long)machine_check) { 727 - /* 728 - * when xen hypervisor inject vMCE to guest, 729 - * use native mce handler to handle it 730 - */ 731 - ; 732 - #endif 733 - } else if (addr == (unsigned long)nmi) 734 - /* 735 - * Use the native version as well. 736 - */ 737 - ; 738 - else { 739 - /* Some other trap using IST? */ 740 - if (WARN_ON(val->ist != 0)) 741 - return 0; 742 - } 743 - #endif /* CONFIG_X86_64 */ 744 - info->address = addr; 745 - 746 - info->cs = gate_segment(*val); 747 - info->flags = val->dpl; 748 - /* interrupt gates clear IF */ 749 - if (val->type == GATE_INTERRUPT) 750 - info->flags |= 1 << 2; 751 - 752 - return 1; 753 - } 754 - 755 - /* Locations of each CPU's IDT */ 756 - static DEFINE_PER_CPU(struct desc_ptr, idt_desc); 757 - 758 - /* Set an IDT entry. If the entry is part of the current IDT, then 759 - also update Xen. 
*/ 760 - static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) 761 - { 762 - unsigned long p = (unsigned long)&dt[entrynum]; 763 - unsigned long start, end; 764 - 765 - trace_xen_cpu_write_idt_entry(dt, entrynum, g); 766 - 767 - preempt_disable(); 768 - 769 - start = __this_cpu_read(idt_desc.address); 770 - end = start + __this_cpu_read(idt_desc.size) + 1; 771 - 772 - xen_mc_flush(); 773 - 774 - native_write_idt_entry(dt, entrynum, g); 775 - 776 - if (p >= start && (p + 8) <= end) { 777 - struct trap_info info[2]; 778 - 779 - info[1].address = 0; 780 - 781 - if (cvt_gate_to_trap(entrynum, g, &info[0])) 782 - if (HYPERVISOR_set_trap_table(info)) 783 - BUG(); 784 - } 785 - 786 - preempt_enable(); 787 - } 788 - 789 - static void xen_convert_trap_info(const struct desc_ptr *desc, 790 - struct trap_info *traps) 791 - { 792 - unsigned in, out, count; 793 - 794 - count = (desc->size+1) / sizeof(gate_desc); 795 - BUG_ON(count > 256); 796 - 797 - for (in = out = 0; in < count; in++) { 798 - gate_desc *entry = (gate_desc*)(desc->address) + in; 799 - 800 - if (cvt_gate_to_trap(in, entry, &traps[out])) 801 - out++; 802 - } 803 - traps[out].address = 0; 804 - } 805 - 806 - void xen_copy_trap_info(struct trap_info *traps) 807 - { 808 - const struct desc_ptr *desc = this_cpu_ptr(&idt_desc); 809 - 810 - xen_convert_trap_info(desc, traps); 811 - } 812 - 813 - /* Load a new IDT into Xen. In principle this can be per-CPU, so we 814 - hold a spinlock to protect the static traps[] array (static because 815 - it avoids allocation, and saves stack space). 
*/ 816 - static void xen_load_idt(const struct desc_ptr *desc) 817 - { 818 - static DEFINE_SPINLOCK(lock); 819 - static struct trap_info traps[257]; 820 - 821 - trace_xen_cpu_load_idt(desc); 822 - 823 - spin_lock(&lock); 824 - 825 - memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc)); 826 - 827 - xen_convert_trap_info(desc, traps); 828 - 829 - xen_mc_flush(); 830 - if (HYPERVISOR_set_trap_table(traps)) 831 - BUG(); 832 - 833 - spin_unlock(&lock); 834 - } 835 - 836 - /* Write a GDT descriptor entry. Ignore LDT descriptors, since 837 - they're handled differently. */ 838 - static void xen_write_gdt_entry(struct desc_struct *dt, int entry, 839 - const void *desc, int type) 840 - { 841 - trace_xen_cpu_write_gdt_entry(dt, entry, desc, type); 842 - 843 - preempt_disable(); 844 - 845 - switch (type) { 846 - case DESC_LDT: 847 - case DESC_TSS: 848 - /* ignore */ 849 - break; 850 - 851 - default: { 852 - xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]); 853 - 854 - xen_mc_flush(); 855 - if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) 856 - BUG(); 857 - } 858 - 859 - } 860 - 861 - preempt_enable(); 862 - } 863 - 864 - /* 865 - * Version of write_gdt_entry for use at early boot-time needed to 866 - * update an entry as simply as possible. 
867 - */ 868 - static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, 869 - const void *desc, int type) 870 - { 871 - trace_xen_cpu_write_gdt_entry(dt, entry, desc, type); 872 - 873 - switch (type) { 874 - case DESC_LDT: 875 - case DESC_TSS: 876 - /* ignore */ 877 - break; 878 - 879 - default: { 880 - xmaddr_t maddr = virt_to_machine(&dt[entry]); 881 - 882 - if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc)) 883 - dt[entry] = *(struct desc_struct *)desc; 884 - } 885 - 886 - } 887 - } 888 - 889 - static void xen_load_sp0(struct tss_struct *tss, 890 - struct thread_struct *thread) 891 - { 892 - struct multicall_space mcs; 893 - 894 - mcs = xen_mc_entry(0); 895 - MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); 896 - xen_mc_issue(PARAVIRT_LAZY_CPU); 897 - tss->x86_tss.sp0 = thread->sp0; 898 - } 899 - 900 - void xen_set_iopl_mask(unsigned mask) 901 - { 902 - struct physdev_set_iopl set_iopl; 903 - 904 - /* Force the change at ring 0. */ 905 - set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3; 906 - HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 907 - } 908 - 909 - static void xen_io_delay(void) 910 - { 911 - } 912 - 913 - static DEFINE_PER_CPU(unsigned long, xen_cr0_value); 914 - 915 - static unsigned long xen_read_cr0(void) 916 - { 917 - unsigned long cr0 = this_cpu_read(xen_cr0_value); 918 - 919 - if (unlikely(cr0 == 0)) { 920 - cr0 = native_read_cr0(); 921 - this_cpu_write(xen_cr0_value, cr0); 922 - } 923 - 924 - return cr0; 925 - } 926 - 927 - static void xen_write_cr0(unsigned long cr0) 928 - { 929 - struct multicall_space mcs; 930 - 931 - this_cpu_write(xen_cr0_value, cr0); 932 - 933 - /* Only pay attention to cr0.TS; everything else is 934 - ignored. 
*/ 935 - mcs = xen_mc_entry(0); 936 - 937 - MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0); 938 - 939 - xen_mc_issue(PARAVIRT_LAZY_CPU); 940 - } 941 - 942 - static void xen_write_cr4(unsigned long cr4) 943 - { 944 - cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE); 945 - 946 - native_write_cr4(cr4); 947 - } 948 - #ifdef CONFIG_X86_64 949 - static inline unsigned long xen_read_cr8(void) 950 - { 951 - return 0; 952 - } 953 - static inline void xen_write_cr8(unsigned long val) 954 - { 955 - BUG_ON(val); 956 - } 957 - #endif 958 - 959 - static u64 xen_read_msr_safe(unsigned int msr, int *err) 960 - { 961 - u64 val; 962 - 963 - if (pmu_msr_read(msr, &val, err)) 964 - return val; 965 - 966 - val = native_read_msr_safe(msr, err); 967 - switch (msr) { 968 - case MSR_IA32_APICBASE: 969 - #ifdef CONFIG_X86_X2APIC 970 - if (!(cpuid_ecx(1) & (1 << (X86_FEATURE_X2APIC & 31)))) 971 - #endif 972 - val &= ~X2APIC_ENABLE; 973 - break; 974 - } 975 - return val; 976 - } 977 - 978 - static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) 979 - { 980 - int ret; 981 - 982 - ret = 0; 983 - 984 - switch (msr) { 985 - #ifdef CONFIG_X86_64 986 - unsigned which; 987 - u64 base; 988 - 989 - case MSR_FS_BASE: which = SEGBASE_FS; goto set; 990 - case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set; 991 - case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set; 992 - 993 - set: 994 - base = ((u64)high << 32) | low; 995 - if (HYPERVISOR_set_segment_base(which, base) != 0) 996 - ret = -EIO; 997 - break; 998 - #endif 999 - 1000 - case MSR_STAR: 1001 - case MSR_CSTAR: 1002 - case MSR_LSTAR: 1003 - case MSR_SYSCALL_MASK: 1004 - case MSR_IA32_SYSENTER_CS: 1005 - case MSR_IA32_SYSENTER_ESP: 1006 - case MSR_IA32_SYSENTER_EIP: 1007 - /* Fast syscall setup is all done in hypercalls, so 1008 - these are all ignored. Stub them out here to stop 1009 - Xen console noise. 
*/ 1010 - break; 1011 - 1012 - default: 1013 - if (!pmu_msr_write(msr, low, high, &ret)) 1014 - ret = native_write_msr_safe(msr, low, high); 1015 - } 1016 - 1017 - return ret; 1018 - } 1019 - 1020 - static u64 xen_read_msr(unsigned int msr) 1021 - { 1022 - /* 1023 - * This will silently swallow a #GP from RDMSR. It may be worth 1024 - * changing that. 1025 - */ 1026 - int err; 1027 - 1028 - return xen_read_msr_safe(msr, &err); 1029 - } 1030 - 1031 - static void xen_write_msr(unsigned int msr, unsigned low, unsigned high) 1032 - { 1033 - /* 1034 - * This will silently swallow a #GP from WRMSR. It may be worth 1035 - * changing that. 1036 - */ 1037 - xen_write_msr_safe(msr, low, high); 1038 - } 1039 - 1040 - void xen_setup_shared_info(void) 1041 - { 1042 - if (!xen_feature(XENFEAT_auto_translated_physmap)) { 1043 - set_fixmap(FIX_PARAVIRT_BOOTMAP, 1044 - xen_start_info->shared_info); 1045 - 1046 - HYPERVISOR_shared_info = 1047 - (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); 1048 - } else 1049 - HYPERVISOR_shared_info = 1050 - (struct shared_info *)__va(xen_start_info->shared_info); 1051 - 1052 - #ifndef CONFIG_SMP 1053 - /* In UP this is as good a place as any to set up shared info */ 1054 - xen_setup_vcpu_info_placement(); 1055 - #endif 1056 - 1057 - xen_setup_mfn_list_list(); 1058 - } 1059 - 1060 - /* This is called once we have the cpu_possible_mask */ 1061 - void xen_setup_vcpu_info_placement(void) 1062 - { 1063 - int cpu; 1064 - 1065 - for_each_possible_cpu(cpu) { 1066 - /* Set up direct vCPU id mapping for PV guests. */ 1067 - per_cpu(xen_vcpu_id, cpu) = cpu; 1068 - xen_vcpu_setup(cpu); 1069 - } 1070 - 1071 - /* 1072 - * xen_vcpu_setup managed to place the vcpu_info within the 1073 - * percpu area for all cpus, so make use of it. 
1074 - */ 1075 - if (have_vcpu_info_placement) { 1076 - pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); 1077 - pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); 1078 - pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); 1079 - pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct); 1080 - pv_mmu_ops.read_cr2 = xen_read_cr2_direct; 1081 - } 1082 - } 1083 - 1084 - static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, 1085 - unsigned long addr, unsigned len) 1086 - { 1087 - char *start, *end, *reloc; 1088 - unsigned ret; 1089 - 1090 - start = end = reloc = NULL; 1091 - 1092 - #define SITE(op, x) \ 1093 - case PARAVIRT_PATCH(op.x): \ 1094 - if (have_vcpu_info_placement) { \ 1095 - start = (char *)xen_##x##_direct; \ 1096 - end = xen_##x##_direct_end; \ 1097 - reloc = xen_##x##_direct_reloc; \ 1098 - } \ 1099 - goto patch_site 1100 - 1101 - switch (type) { 1102 - SITE(pv_irq_ops, irq_enable); 1103 - SITE(pv_irq_ops, irq_disable); 1104 - SITE(pv_irq_ops, save_fl); 1105 - SITE(pv_irq_ops, restore_fl); 1106 - #undef SITE 1107 - 1108 - patch_site: 1109 - if (start == NULL || (end-start) > len) 1110 - goto default_patch; 1111 - 1112 - ret = paravirt_patch_insns(insnbuf, len, start, end); 1113 - 1114 - /* Note: because reloc is assigned from something that 1115 - appears to be an array, gcc assumes it's non-null, 1116 - but doesn't know its relationship with start and 1117 - end. 
*/ 1118 - if (reloc > start && reloc < end) { 1119 - int reloc_off = reloc - start; 1120 - long *relocp = (long *)(insnbuf + reloc_off); 1121 - long delta = start - (char *)addr; 1122 - 1123 - *relocp += delta; 1124 - } 1125 - break; 1126 - 1127 - default_patch: 1128 - default: 1129 - ret = paravirt_patch_default(type, clobbers, insnbuf, 1130 - addr, len); 1131 - break; 1132 - } 1133 - 1134 - return ret; 1135 - } 1136 - 1137 - static const struct pv_info xen_info __initconst = { 1138 - .shared_kernel_pmd = 0, 1139 - 1140 - #ifdef CONFIG_X86_64 1141 - .extra_user_64bit_cs = FLAT_USER_CS64, 1142 - #endif 1143 - .name = "Xen", 1144 - }; 1145 - 1146 - static const struct pv_init_ops xen_init_ops __initconst = { 1147 - .patch = xen_patch, 1148 - }; 1149 - 1150 - static const struct pv_cpu_ops xen_cpu_ops __initconst = { 1151 - .cpuid = xen_cpuid, 1152 - 1153 - .set_debugreg = xen_set_debugreg, 1154 - .get_debugreg = xen_get_debugreg, 1155 - 1156 - .read_cr0 = xen_read_cr0, 1157 - .write_cr0 = xen_write_cr0, 1158 - 1159 - .read_cr4 = native_read_cr4, 1160 - .write_cr4 = xen_write_cr4, 1161 - 1162 - #ifdef CONFIG_X86_64 1163 - .read_cr8 = xen_read_cr8, 1164 - .write_cr8 = xen_write_cr8, 1165 - #endif 1166 - 1167 - .wbinvd = native_wbinvd, 1168 - 1169 - .read_msr = xen_read_msr, 1170 - .write_msr = xen_write_msr, 1171 - 1172 - .read_msr_safe = xen_read_msr_safe, 1173 - .write_msr_safe = xen_write_msr_safe, 1174 - 1175 - .read_pmc = xen_read_pmc, 1176 - 1177 - .iret = xen_iret, 1178 - #ifdef CONFIG_X86_64 1179 - .usergs_sysret64 = xen_sysret64, 1180 - #endif 1181 - 1182 - .load_tr_desc = paravirt_nop, 1183 - .set_ldt = xen_set_ldt, 1184 - .load_gdt = xen_load_gdt, 1185 - .load_idt = xen_load_idt, 1186 - .load_tls = xen_load_tls, 1187 - #ifdef CONFIG_X86_64 1188 - .load_gs_index = xen_load_gs_index, 1189 - #endif 1190 - 1191 - .alloc_ldt = xen_alloc_ldt, 1192 - .free_ldt = xen_free_ldt, 1193 - 1194 - .store_idt = native_store_idt, 1195 - .store_tr = xen_store_tr, 1196 - 1197 
- .write_ldt_entry = xen_write_ldt_entry, 1198 - .write_gdt_entry = xen_write_gdt_entry, 1199 - .write_idt_entry = xen_write_idt_entry, 1200 - .load_sp0 = xen_load_sp0, 1201 - 1202 - .set_iopl_mask = xen_set_iopl_mask, 1203 - .io_delay = xen_io_delay, 1204 - 1205 - /* Xen takes care of %gs when switching to usermode for us */ 1206 - .swapgs = paravirt_nop, 1207 - 1208 - .start_context_switch = paravirt_start_context_switch, 1209 - .end_context_switch = xen_end_context_switch, 1210 - }; 1211 - 1212 - static void xen_reboot(int reason) 262 + void xen_reboot(int reason) 1213 263 { 1214 264 struct sched_shutdown r = { .reason = reason }; 1215 265 int cpu; ··· 185 1307 BUG(); 186 1308 } 187 1309 188 - static void xen_restart(char *msg) 1310 + void xen_emergency_restart(void) 189 1311 { 190 1312 xen_reboot(SHUTDOWN_reboot); 191 - } 192 - 193 - static void xen_emergency_restart(void) 194 - { 195 - xen_reboot(SHUTDOWN_reboot); 196 - } 197 - 198 - static void xen_machine_halt(void) 199 - { 200 - xen_reboot(SHUTDOWN_poweroff); 201 - } 202 - 203 - static void xen_machine_power_off(void) 204 - { 205 - if (pm_power_off) 206 - pm_power_off(); 207 - xen_reboot(SHUTDOWN_poweroff); 208 - } 209 - 210 - static void xen_crash_shutdown(struct pt_regs *regs) 211 - { 212 - xen_reboot(SHUTDOWN_crash); 213 1313 } 214 1314 215 1315 static int ··· 199 1343 } 200 1344 201 1345 static struct notifier_block xen_panic_block = { 202 - .notifier_call= xen_panic_event, 1346 + .notifier_call = xen_panic_event, 203 1347 .priority = INT_MIN 204 1348 }; 205 1349 ··· 209 1353 return 0; 210 1354 } 211 1355 212 - static const struct machine_ops xen_machine_ops __initconst = { 213 - .restart = xen_restart, 214 - .halt = xen_machine_halt, 215 - .power_off = xen_machine_power_off, 216 - .shutdown = xen_machine_halt, 217 - .crash_shutdown = xen_crash_shutdown, 218 - .emergency_restart = xen_emergency_restart, 219 - }; 220 - 221 - static unsigned char xen_get_nmi_reason(void) 222 - { 223 - unsigned char reason 
= 0; 224 - 225 - /* Construct a value which looks like it came from port 0x61. */ 226 - if (test_bit(_XEN_NMIREASON_io_error, 227 - &HYPERVISOR_shared_info->arch.nmi_reason)) 228 - reason |= NMI_REASON_IOCHK; 229 - if (test_bit(_XEN_NMIREASON_pci_serr, 230 - &HYPERVISOR_shared_info->arch.nmi_reason)) 231 - reason |= NMI_REASON_SERR; 232 - 233 - return reason; 234 - } 235 - 236 - static void __init xen_boot_params_init_edd(void) 237 - { 238 - #if IS_ENABLED(CONFIG_EDD) 239 - struct xen_platform_op op; 240 - struct edd_info *edd_info; 241 - u32 *mbr_signature; 242 - unsigned nr; 243 - int ret; 244 - 245 - edd_info = boot_params.eddbuf; 246 - mbr_signature = boot_params.edd_mbr_sig_buffer; 247 - 248 - op.cmd = XENPF_firmware_info; 249 - 250 - op.u.firmware_info.type = XEN_FW_DISK_INFO; 251 - for (nr = 0; nr < EDDMAXNR; nr++) { 252 - struct edd_info *info = edd_info + nr; 253 - 254 - op.u.firmware_info.index = nr; 255 - info->params.length = sizeof(info->params); 256 - set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params, 257 - &info->params); 258 - ret = HYPERVISOR_platform_op(&op); 259 - if (ret) 260 - break; 261 - 262 - #define C(x) info->x = op.u.firmware_info.u.disk_info.x 263 - C(device); 264 - C(version); 265 - C(interface_support); 266 - C(legacy_max_cylinder); 267 - C(legacy_max_head); 268 - C(legacy_sectors_per_track); 269 - #undef C 270 - } 271 - boot_params.eddbuf_entries = nr; 272 - 273 - op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE; 274 - for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) { 275 - op.u.firmware_info.index = nr; 276 - ret = HYPERVISOR_platform_op(&op); 277 - if (ret) 278 - break; 279 - mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature; 280 - } 281 - boot_params.edd_mbr_sig_buf_entries = nr; 282 - #endif 283 - } 284 - 285 - /* 286 - * Set up the GDT and segment registers for -fstack-protector. 
Until 287 - * we do this, we have to be careful not to call any stack-protected 288 - * function, which is most of the kernel. 289 - */ 290 - static void xen_setup_gdt(int cpu) 291 - { 292 - pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot; 293 - pv_cpu_ops.load_gdt = xen_load_gdt_boot; 294 - 295 - setup_stack_canary_segment(0); 296 - switch_to_new_gdt(0); 297 - 298 - pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry; 299 - pv_cpu_ops.load_gdt = xen_load_gdt; 300 - } 301 - 302 - static void __init xen_dom0_set_legacy_features(void) 303 - { 304 - x86_platform.legacy.rtc = 1; 305 - } 306 - 307 - static int xen_cpuhp_setup(void) 308 - { 309 - int rc; 310 - 311 - rc = cpuhp_setup_state_nocalls(CPUHP_XEN_PREPARE, 312 - "x86/xen/hvm_guest:prepare", 313 - xen_cpu_up_prepare, xen_cpu_dead); 314 - if (rc >= 0) { 315 - rc = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, 316 - "x86/xen/hvm_guest:online", 317 - xen_cpu_up_online, NULL); 318 - if (rc < 0) 319 - cpuhp_remove_state_nocalls(CPUHP_XEN_PREPARE); 320 - } 321 - 322 - return rc >= 0 ? 0 : rc; 323 - } 324 - 325 - /* First C function to be called on Xen boot */ 326 - asmlinkage __visible void __init xen_start_kernel(void) 327 - { 328 - struct physdev_set_iopl set_iopl; 329 - unsigned long initrd_start = 0; 330 - int rc; 331 - 332 - if (!xen_start_info) 333 - return; 334 - 335 - xen_domain_type = XEN_PV_DOMAIN; 336 - 337 - xen_setup_features(); 338 - 339 - xen_setup_machphys_mapping(); 340 - 341 - /* Install Xen paravirt ops */ 342 - pv_info = xen_info; 343 - pv_init_ops = xen_init_ops; 344 - pv_cpu_ops = xen_cpu_ops; 345 - 346 - x86_platform.get_nmi_reason = xen_get_nmi_reason; 347 - 348 - x86_init.resources.memory_setup = xen_memory_setup; 349 - x86_init.oem.arch_setup = xen_arch_setup; 350 - x86_init.oem.banner = xen_banner; 351 - 352 - xen_init_time_ops(); 353 - 354 - /* 355 - * Set up some pagetable state before starting to set any ptes. 
356 - */ 357 - 358 - xen_init_mmu_ops(); 359 - 360 - /* Prevent unwanted bits from being set in PTEs. */ 361 - __supported_pte_mask &= ~_PAGE_GLOBAL; 362 - 363 - /* 364 - * Prevent page tables from being allocated in highmem, even 365 - * if CONFIG_HIGHPTE is enabled. 366 - */ 367 - __userpte_alloc_gfp &= ~__GFP_HIGHMEM; 368 - 369 - /* Work out if we support NX */ 370 - x86_configure_nx(); 371 - 372 - /* Get mfn list */ 373 - xen_build_dynamic_phys_to_machine(); 374 - 375 - /* 376 - * Set up kernel GDT and segment registers, mainly so that 377 - * -fstack-protector code can be executed. 378 - */ 379 - xen_setup_gdt(0); 380 - 381 - xen_init_irq_ops(); 382 - xen_init_cpuid_mask(); 383 - 384 - #ifdef CONFIG_X86_LOCAL_APIC 385 - /* 386 - * set up the basic apic ops. 387 - */ 388 - xen_init_apic(); 389 - #endif 390 - 391 - if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { 392 - pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; 393 - pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; 394 - } 395 - 396 - machine_ops = xen_machine_ops; 397 - 398 - /* 399 - * The only reliable way to retain the initial address of the 400 - * percpu gdt_page is to remember it here, so we can go and 401 - * mark it RW later, when the initial percpu area is freed. 402 - */ 403 - xen_initial_gdt = &per_cpu(gdt_page, 0); 404 - 405 - xen_smp_init(); 406 - 407 - #ifdef CONFIG_ACPI_NUMA 408 - /* 409 - * The pages we from Xen are not related to machine pages, so 410 - * any NUMA information the kernel tries to get from ACPI will 411 - * be meaningless. Prevent it from trying. 412 - */ 413 - acpi_numa = -1; 414 - #endif 415 - /* Don't do the full vcpu_info placement stuff until we have a 416 - possible map and a non-dummy shared_info. 
*/ 417 - per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; 418 - 419 - WARN_ON(xen_cpuhp_setup()); 420 - 421 - local_irq_disable(); 422 - early_boot_irqs_disabled = true; 423 - 424 - xen_raw_console_write("mapping kernel into physical memory\n"); 425 - xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base, 426 - xen_start_info->nr_pages); 427 - xen_reserve_special_pages(); 428 - 429 - /* keep using Xen gdt for now; no urgent need to change it */ 430 - 431 - #ifdef CONFIG_X86_32 432 - pv_info.kernel_rpl = 1; 433 - if (xen_feature(XENFEAT_supervisor_mode_kernel)) 434 - pv_info.kernel_rpl = 0; 435 - #else 436 - pv_info.kernel_rpl = 0; 437 - #endif 438 - /* set the limit of our address space */ 439 - xen_reserve_top(); 440 - 441 - /* 442 - * We used to do this in xen_arch_setup, but that is too late 443 - * on AMD were early_cpu_init (run before ->arch_setup()) calls 444 - * early_amd_init which pokes 0xcf8 port. 445 - */ 446 - set_iopl.iopl = 1; 447 - rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 448 - if (rc != 0) 449 - xen_raw_printk("physdev_op failed %d\n", rc); 450 - 451 - #ifdef CONFIG_X86_32 452 - /* set up basic CPUID stuff */ 453 - cpu_detect(&new_cpu_data); 454 - set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU); 455 - new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1); 456 - #endif 457 - 458 - if (xen_start_info->mod_start) { 459 - if (xen_start_info->flags & SIF_MOD_START_PFN) 460 - initrd_start = PFN_PHYS(xen_start_info->mod_start); 461 - else 462 - initrd_start = __pa(xen_start_info->mod_start); 463 - } 464 - 465 - /* Poke various useful things into boot_params */ 466 - boot_params.hdr.type_of_loader = (9 << 4) | 0; 467 - boot_params.hdr.ramdisk_image = initrd_start; 468 - boot_params.hdr.ramdisk_size = xen_start_info->mod_len; 469 - boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); 470 - boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN; 471 - 472 - if (!xen_initial_domain()) { 473 - 
add_preferred_console("xenboot", 0, NULL); 474 - add_preferred_console("tty", 0, NULL); 475 - add_preferred_console("hvc", 0, NULL); 476 - if (pci_xen) 477 - x86_init.pci.arch_init = pci_xen_init; 478 - } else { 479 - const struct dom0_vga_console_info *info = 480 - (void *)((char *)xen_start_info + 481 - xen_start_info->console.dom0.info_off); 482 - struct xen_platform_op op = { 483 - .cmd = XENPF_firmware_info, 484 - .interface_version = XENPF_INTERFACE_VERSION, 485 - .u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS, 486 - }; 487 - 488 - x86_platform.set_legacy_features = 489 - xen_dom0_set_legacy_features; 490 - xen_init_vga(info, xen_start_info->console.dom0.info_size); 491 - xen_start_info->console.domU.mfn = 0; 492 - xen_start_info->console.domU.evtchn = 0; 493 - 494 - if (HYPERVISOR_platform_op(&op) == 0) 495 - boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags; 496 - 497 - /* Make sure ACS will be enabled */ 498 - pci_request_acs(); 499 - 500 - xen_acpi_sleep_register(); 501 - 502 - /* Avoid searching for BIOS MP tables */ 503 - x86_init.mpparse.find_smp_config = x86_init_noop; 504 - x86_init.mpparse.get_smp_config = x86_init_uint_noop; 505 - 506 - xen_boot_params_init_edd(); 507 - } 508 - #ifdef CONFIG_PCI 509 - /* PCI BIOS service won't work from a PV guest. */ 510 - pci_probe &= ~PCI_PROBE_BIOS; 511 - #endif 512 - xen_raw_console_write("about to get started...\n"); 513 - 514 - /* Let's presume PV guests always boot on vCPU with id 0. 
*/ 515 - per_cpu(xen_vcpu_id, 0) = 0; 516 - 517 - xen_setup_runstate_info(0); 518 - 519 - xen_efi_init(); 520 - 521 - /* Start the world */ 522 - #ifdef CONFIG_X86_32 523 - i386_start_kernel(); 524 - #else 525 - cr4_init_shadow(); /* 32b kernel does this in i386_start_kernel() */ 526 - x86_64_start_reservations((char *)__pa_symbol(&boot_params)); 527 - #endif 528 - } 529 - 530 - #ifdef CONFIG_XEN_PVH 531 - 532 - static void xen_pvh_arch_setup(void) 533 - { 534 - #ifdef CONFIG_ACPI 535 - /* Make sure we don't fall back to (default) ACPI_IRQ_MODEL_PIC. */ 536 - if (nr_ioapics == 0) 537 - acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM; 538 - #endif 539 - } 540 - 541 - static void __init init_pvh_bootparams(void) 542 - { 543 - struct xen_memory_map memmap; 544 - unsigned int i; 545 - int rc; 546 - 547 - memset(&pvh_bootparams, 0, sizeof(pvh_bootparams)); 548 - 549 - memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table); 550 - set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table); 551 - rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); 552 - if (rc) { 553 - xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc); 554 - BUG(); 555 - } 556 - 557 - if (memmap.nr_entries < E820_MAX_ENTRIES_ZEROPAGE - 1) { 558 - pvh_bootparams.e820_table[memmap.nr_entries].addr = 559 - ISA_START_ADDRESS; 560 - pvh_bootparams.e820_table[memmap.nr_entries].size = 561 - ISA_END_ADDRESS - ISA_START_ADDRESS; 562 - pvh_bootparams.e820_table[memmap.nr_entries].type = 563 - E820_TYPE_RESERVED; 564 - memmap.nr_entries++; 565 - } else 566 - xen_raw_printk("Warning: Can fit ISA range into e820\n"); 567 - 568 - pvh_bootparams.e820_entries = memmap.nr_entries; 569 - for (i = 0; i < pvh_bootparams.e820_entries; i++) 570 - e820__range_add(pvh_bootparams.e820_table[i].addr, 571 - pvh_bootparams.e820_table[i].size, 572 - pvh_bootparams.e820_table[i].type); 573 - 574 - e820__update_table(e820_table); 575 - 576 - pvh_bootparams.hdr.cmd_line_ptr = 577 - pvh_start_info.cmdline_paddr; 578 - 579 - /* The 
first module is always ramdisk. */ 580 - if (pvh_start_info.nr_modules) { 581 - struct hvm_modlist_entry *modaddr = 582 - __va(pvh_start_info.modlist_paddr); 583 - pvh_bootparams.hdr.ramdisk_image = modaddr->paddr; 584 - pvh_bootparams.hdr.ramdisk_size = modaddr->size; 585 - } 586 - 587 - /* 588 - * See Documentation/x86/boot.txt. 589 - * 590 - * Version 2.12 supports Xen entry point but we will use default x86/PC 591 - * environment (i.e. hardware_subarch 0). 592 - */ 593 - pvh_bootparams.hdr.version = 0x212; 594 - pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */ 595 - } 596 - 597 - /* 598 - * This routine (and those that it might call) should not use 599 - * anything that lives in .bss since that segment will be cleared later. 600 - */ 601 - void __init xen_prepare_pvh(void) 602 - { 603 - u32 msr; 604 - u64 pfn; 605 - 606 - if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) { 607 - xen_raw_printk("Error: Unexpected magic value (0x%08x)\n", 608 - pvh_start_info.magic); 609 - BUG(); 610 - } 611 - 612 - xen_pvh = 1; 613 - 614 - msr = cpuid_ebx(xen_cpuid_base() + 2); 615 - pfn = __pa(hypercall_page); 616 - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); 617 - 618 - init_pvh_bootparams(); 619 - 620 - x86_init.oem.arch_setup = xen_pvh_arch_setup; 621 - } 622 - #endif 623 - 624 - void __ref xen_hvm_init_shared_info(void) 625 - { 626 - int cpu; 627 - struct xen_add_to_physmap xatp; 628 - static struct shared_info *shared_info_page = 0; 629 - 630 - if (!shared_info_page) 631 - shared_info_page = (struct shared_info *) 632 - extend_brk(PAGE_SIZE, PAGE_SIZE); 633 - xatp.domid = DOMID_SELF; 634 - xatp.idx = 0; 635 - xatp.space = XENMAPSPACE_shared_info; 636 - xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; 637 - if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) 638 - BUG(); 639 - 640 - HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; 641 - 642 - /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info 643 - * page, we use 
it in the event channel upcall and in some pvclock 644 - * related functions. We don't need the vcpu_info placement 645 - * optimizations because we don't use any pv_mmu or pv_irq op on 646 - * HVM. 647 - * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is 648 - * online but xen_hvm_init_shared_info is run at resume time too and 649 - * in that case multiple vcpus might be online. */ 650 - for_each_online_cpu(cpu) { 651 - /* Leave it to be NULL. */ 652 - if (xen_vcpu_nr(cpu) >= MAX_VIRT_CPUS) 653 - continue; 654 - per_cpu(xen_vcpu, cpu) = 655 - &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)]; 656 - } 657 - } 658 - 659 - #ifdef CONFIG_XEN_PVHVM 660 - static void __init init_hvm_pv_info(void) 661 - { 662 - int major, minor; 663 - uint32_t eax, ebx, ecx, edx, base; 664 - 665 - base = xen_cpuid_base(); 666 - eax = cpuid_eax(base + 1); 667 - 668 - major = eax >> 16; 669 - minor = eax & 0xffff; 670 - printk(KERN_INFO "Xen version %d.%d.\n", major, minor); 671 - 672 - xen_domain_type = XEN_HVM_DOMAIN; 673 - 674 - /* PVH set up hypercall page in xen_prepare_pvh(). */ 675 - if (xen_pvh_domain()) 676 - pv_info.name = "Xen PVH"; 677 - else { 678 - u64 pfn; 679 - uint32_t msr; 680 - 681 - pv_info.name = "Xen HVM"; 682 - msr = cpuid_ebx(base + 2); 683 - pfn = __pa(hypercall_page); 684 - wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); 685 - } 686 - 687 - xen_setup_features(); 688 - 689 - cpuid(base + 4, &eax, &ebx, &ecx, &edx); 690 - if (eax & XEN_HVM_CPUID_VCPU_ID_PRESENT) 691 - this_cpu_write(xen_vcpu_id, ebx); 692 - else 693 - this_cpu_write(xen_vcpu_id, smp_processor_id()); 694 - } 695 - #endif 696 - 697 - static int xen_cpu_up_prepare(unsigned int cpu) 698 - { 699 - int rc; 700 - 701 - if (xen_hvm_domain()) { 702 - /* 703 - * This can happen if CPU was offlined earlier and 704 - * offlining timed out in common_cpu_die(). 
705 - */ 706 - if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) { 707 - xen_smp_intr_free(cpu); 708 - xen_uninit_lock_cpu(cpu); 709 - } 710 - 711 - if (cpu_acpi_id(cpu) != U32_MAX) 712 - per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu); 713 - else 714 - per_cpu(xen_vcpu_id, cpu) = cpu; 715 - xen_vcpu_setup(cpu); 716 - } 717 - 718 - if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock)) 719 - xen_setup_timer(cpu); 720 - 721 - rc = xen_smp_intr_init(cpu); 722 - if (rc) { 723 - WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n", 724 - cpu, rc); 725 - return rc; 726 - } 727 - return 0; 728 - } 729 - 730 - static int xen_cpu_dead(unsigned int cpu) 731 - { 732 - xen_smp_intr_free(cpu); 733 - 734 - if (xen_pv_domain() || xen_feature(XENFEAT_hvm_safe_pvclock)) 735 - xen_teardown_timer(cpu); 736 - 737 - return 0; 738 - } 739 - 740 - static int xen_cpu_up_online(unsigned int cpu) 741 - { 742 - xen_init_lock_cpu(cpu); 743 - return 0; 744 - } 745 - 746 - #ifdef CONFIG_XEN_PVHVM 747 - #ifdef CONFIG_KEXEC_CORE 748 - static void xen_hvm_shutdown(void) 749 - { 750 - native_machine_shutdown(); 751 - if (kexec_in_progress) 752 - xen_reboot(SHUTDOWN_soft_reset); 753 - } 754 - 755 - static void xen_hvm_crash_shutdown(struct pt_regs *regs) 756 - { 757 - native_machine_crash_shutdown(regs); 758 - xen_reboot(SHUTDOWN_soft_reset); 759 - } 760 - #endif 761 - 762 - static void __init xen_hvm_guest_init(void) 763 - { 764 - if (xen_pv_domain()) 765 - return; 766 - 767 - init_hvm_pv_info(); 768 - 769 - xen_hvm_init_shared_info(); 770 - 771 - xen_panic_handler_init(); 772 - 773 - BUG_ON(!xen_feature(XENFEAT_hvm_callback_vector)); 774 - 775 - xen_hvm_smp_init(); 776 - WARN_ON(xen_cpuhp_setup()); 777 - xen_unplug_emulated_devices(); 778 - x86_init.irqs.intr_init = xen_init_IRQ; 779 - xen_hvm_init_time_ops(); 780 - xen_hvm_init_mmu_ops(); 781 - 782 - if (xen_pvh_domain()) 783 - machine_ops.emergency_restart = xen_emergency_restart; 784 - #ifdef CONFIG_KEXEC_CORE 785 - machine_ops.shutdown = 
xen_hvm_shutdown; 786 - machine_ops.crash_shutdown = xen_hvm_crash_shutdown; 787 - #endif 788 - } 789 - #endif 790 - 791 - static bool xen_nopv = false; 792 - static __init int xen_parse_nopv(char *arg) 793 - { 794 - xen_nopv = true; 795 - return 0; 796 - } 797 - early_param("xen_nopv", xen_parse_nopv); 798 - 799 - static uint32_t __init xen_platform(void) 800 - { 801 - if (xen_nopv) 802 - return 0; 803 - 804 - return xen_cpuid_base(); 805 - } 806 - 807 - bool xen_hvm_need_lapic(void) 808 - { 809 - if (xen_nopv) 810 - return false; 811 - if (xen_pv_domain()) 812 - return false; 813 - if (!xen_hvm_domain()) 814 - return false; 815 - if (xen_feature(XENFEAT_hvm_pirqs)) 816 - return false; 817 - return true; 818 - } 819 - EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); 820 - 821 - static void xen_set_cpu_features(struct cpuinfo_x86 *c) 822 - { 823 - if (xen_pv_domain()) { 824 - clear_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); 825 - set_cpu_cap(c, X86_FEATURE_XENPV); 826 - } 827 - } 828 - 829 - static void xen_pin_vcpu(int cpu) 1356 + void xen_pin_vcpu(int cpu) 830 1357 { 831 1358 static bool disable_pinning; 832 1359 struct sched_pin_override pin_override; ··· 247 2008 disable_pinning = true; 248 2009 } 249 2010 } 250 - 251 - const struct hypervisor_x86 x86_hyper_xen = { 252 - .name = "Xen", 253 - .detect = xen_platform, 254 - #ifdef CONFIG_XEN_PVHVM 255 - .init_platform = xen_hvm_guest_init, 256 - #endif 257 - .x2apic_available = xen_x2apic_para_available, 258 - .set_cpu_features = xen_set_cpu_features, 259 - .pin_vcpu = xen_pin_vcpu, 260 - }; 261 - EXPORT_SYMBOL(x86_hyper_xen); 262 2011 263 2012 #ifdef CONFIG_HOTPLUG_CPU 264 2013 void xen_arch_register_cpu(int num)
+214
arch/x86/xen/enlighten_hvm.c
··· 1 + #include <linux/cpu.h> 2 + #include <linux/kexec.h> 3 + 4 + #include <xen/features.h> 5 + #include <xen/events.h> 6 + #include <xen/interface/memory.h> 7 + 8 + #include <asm/cpu.h> 9 + #include <asm/smp.h> 10 + #include <asm/reboot.h> 11 + #include <asm/setup.h> 12 + #include <asm/hypervisor.h> 13 + 14 + #include <asm/xen/cpuid.h> 15 + #include <asm/xen/hypervisor.h> 16 + 17 + #include "xen-ops.h" 18 + #include "mmu.h" 19 + #include "smp.h" 20 + 21 + void __ref xen_hvm_init_shared_info(void) 22 + { 23 + int cpu; 24 + struct xen_add_to_physmap xatp; 25 + static struct shared_info *shared_info_page; 26 + 27 + if (!shared_info_page) 28 + shared_info_page = (struct shared_info *) 29 + extend_brk(PAGE_SIZE, PAGE_SIZE); 30 + xatp.domid = DOMID_SELF; 31 + xatp.idx = 0; 32 + xatp.space = XENMAPSPACE_shared_info; 33 + xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; 34 + if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) 35 + BUG(); 36 + 37 + HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; 38 + 39 + /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info 40 + * page, we use it in the event channel upcall and in some pvclock 41 + * related functions. We don't need the vcpu_info placement 42 + * optimizations because we don't use any pv_mmu or pv_irq op on 43 + * HVM. 44 + * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is 45 + * online but xen_hvm_init_shared_info is run at resume time too and 46 + * in that case multiple vcpus might be online. */ 47 + for_each_online_cpu(cpu) { 48 + /* Leave it to be NULL. 
*/ 49 + if (xen_vcpu_nr(cpu) >= MAX_VIRT_CPUS) 50 + continue; 51 + per_cpu(xen_vcpu, cpu) = 52 + &HYPERVISOR_shared_info->vcpu_info[xen_vcpu_nr(cpu)]; 53 + } 54 + } 55 + 56 + static void __init init_hvm_pv_info(void) 57 + { 58 + int major, minor; 59 + uint32_t eax, ebx, ecx, edx, base; 60 + 61 + base = xen_cpuid_base(); 62 + eax = cpuid_eax(base + 1); 63 + 64 + major = eax >> 16; 65 + minor = eax & 0xffff; 66 + printk(KERN_INFO "Xen version %d.%d.\n", major, minor); 67 + 68 + xen_domain_type = XEN_HVM_DOMAIN; 69 + 70 + /* PVH set up hypercall page in xen_prepare_pvh(). */ 71 + if (xen_pvh_domain()) 72 + pv_info.name = "Xen PVH"; 73 + else { 74 + u64 pfn; 75 + uint32_t msr; 76 + 77 + pv_info.name = "Xen HVM"; 78 + msr = cpuid_ebx(base + 2); 79 + pfn = __pa(hypercall_page); 80 + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); 81 + } 82 + 83 + xen_setup_features(); 84 + 85 + cpuid(base + 4, &eax, &ebx, &ecx, &edx); 86 + if (eax & XEN_HVM_CPUID_VCPU_ID_PRESENT) 87 + this_cpu_write(xen_vcpu_id, ebx); 88 + else 89 + this_cpu_write(xen_vcpu_id, smp_processor_id()); 90 + } 91 + 92 + #ifdef CONFIG_KEXEC_CORE 93 + static void xen_hvm_shutdown(void) 94 + { 95 + native_machine_shutdown(); 96 + if (kexec_in_progress) 97 + xen_reboot(SHUTDOWN_soft_reset); 98 + } 99 + 100 + static void xen_hvm_crash_shutdown(struct pt_regs *regs) 101 + { 102 + native_machine_crash_shutdown(regs); 103 + xen_reboot(SHUTDOWN_soft_reset); 104 + } 105 + #endif 106 + 107 + static int xen_cpu_up_prepare_hvm(unsigned int cpu) 108 + { 109 + int rc; 110 + 111 + /* 112 + * This can happen if CPU was offlined earlier and 113 + * offlining timed out in common_cpu_die(). 
114 + */ 115 + if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) { 116 + xen_smp_intr_free(cpu); 117 + xen_uninit_lock_cpu(cpu); 118 + } 119 + 120 + if (cpu_acpi_id(cpu) != U32_MAX) 121 + per_cpu(xen_vcpu_id, cpu) = cpu_acpi_id(cpu); 122 + else 123 + per_cpu(xen_vcpu_id, cpu) = cpu; 124 + xen_vcpu_setup(cpu); 125 + 126 + if (xen_have_vector_callback && xen_feature(XENFEAT_hvm_safe_pvclock)) 127 + xen_setup_timer(cpu); 128 + 129 + rc = xen_smp_intr_init(cpu); 130 + if (rc) { 131 + WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n", 132 + cpu, rc); 133 + return rc; 134 + } 135 + return 0; 136 + } 137 + 138 + static int xen_cpu_dead_hvm(unsigned int cpu) 139 + { 140 + xen_smp_intr_free(cpu); 141 + 142 + if (xen_have_vector_callback && xen_feature(XENFEAT_hvm_safe_pvclock)) 143 + xen_teardown_timer(cpu); 144 + 145 + return 0; 146 + } 147 + 148 + static void __init xen_hvm_guest_init(void) 149 + { 150 + if (xen_pv_domain()) 151 + return; 152 + 153 + init_hvm_pv_info(); 154 + 155 + xen_hvm_init_shared_info(); 156 + 157 + xen_panic_handler_init(); 158 + 159 + if (xen_feature(XENFEAT_hvm_callback_vector)) 160 + xen_have_vector_callback = 1; 161 + 162 + xen_hvm_smp_init(); 163 + WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_hvm, xen_cpu_dead_hvm)); 164 + xen_unplug_emulated_devices(); 165 + x86_init.irqs.intr_init = xen_init_IRQ; 166 + xen_hvm_init_time_ops(); 167 + xen_hvm_init_mmu_ops(); 168 + 169 + if (xen_pvh_domain()) 170 + machine_ops.emergency_restart = xen_emergency_restart; 171 + #ifdef CONFIG_KEXEC_CORE 172 + machine_ops.shutdown = xen_hvm_shutdown; 173 + machine_ops.crash_shutdown = xen_hvm_crash_shutdown; 174 + #endif 175 + } 176 + 177 + static bool xen_nopv; 178 + static __init int xen_parse_nopv(char *arg) 179 + { 180 + xen_nopv = true; 181 + return 0; 182 + } 183 + early_param("xen_nopv", xen_parse_nopv); 184 + 185 + bool xen_hvm_need_lapic(void) 186 + { 187 + if (xen_nopv) 188 + return false; 189 + if (xen_pv_domain()) 190 + return false; 191 + if 
(!xen_hvm_domain()) 192 + return false; 193 + if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback) 194 + return false; 195 + return true; 196 + } 197 + EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); 198 + 199 + static uint32_t __init xen_platform_hvm(void) 200 + { 201 + if (xen_pv_domain() || xen_nopv) 202 + return 0; 203 + 204 + return xen_cpuid_base(); 205 + } 206 + 207 + const struct hypervisor_x86 x86_hyper_xen_hvm = { 208 + .name = "Xen HVM", 209 + .detect = xen_platform_hvm, 210 + .init_platform = xen_hvm_guest_init, 211 + .pin_vcpu = xen_pin_vcpu, 212 + .x2apic_available = xen_x2apic_para_available, 213 + }; 214 + EXPORT_SYMBOL(x86_hyper_xen_hvm);
+1513
arch/x86/xen/enlighten_pv.c
··· 1 + /* 2 + * Core of Xen paravirt_ops implementation. 3 + * 4 + * This file contains the xen_paravirt_ops structure itself, and the 5 + * implementations for: 6 + * - privileged instructions 7 + * - interrupt flags 8 + * - segment operations 9 + * - booting and setup 10 + * 11 + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 12 + */ 13 + 14 + #include <linux/cpu.h> 15 + #include <linux/kernel.h> 16 + #include <linux/init.h> 17 + #include <linux/smp.h> 18 + #include <linux/preempt.h> 19 + #include <linux/hardirq.h> 20 + #include <linux/percpu.h> 21 + #include <linux/delay.h> 22 + #include <linux/start_kernel.h> 23 + #include <linux/sched.h> 24 + #include <linux/kprobes.h> 25 + #include <linux/bootmem.h> 26 + #include <linux/export.h> 27 + #include <linux/mm.h> 28 + #include <linux/page-flags.h> 29 + #include <linux/highmem.h> 30 + #include <linux/console.h> 31 + #include <linux/pci.h> 32 + #include <linux/gfp.h> 33 + #include <linux/memblock.h> 34 + #include <linux/edd.h> 35 + #include <linux/frame.h> 36 + 37 + #include <xen/xen.h> 38 + #include <xen/events.h> 39 + #include <xen/interface/xen.h> 40 + #include <xen/interface/version.h> 41 + #include <xen/interface/physdev.h> 42 + #include <xen/interface/vcpu.h> 43 + #include <xen/interface/memory.h> 44 + #include <xen/interface/nmi.h> 45 + #include <xen/interface/xen-mca.h> 46 + #include <xen/features.h> 47 + #include <xen/page.h> 48 + #include <xen/hvc-console.h> 49 + #include <xen/acpi.h> 50 + 51 + #include <asm/paravirt.h> 52 + #include <asm/apic.h> 53 + #include <asm/page.h> 54 + #include <asm/xen/pci.h> 55 + #include <asm/xen/hypercall.h> 56 + #include <asm/xen/hypervisor.h> 57 + #include <asm/xen/cpuid.h> 58 + #include <asm/fixmap.h> 59 + #include <asm/processor.h> 60 + #include <asm/proto.h> 61 + #include <asm/msr-index.h> 62 + #include <asm/traps.h> 63 + #include <asm/setup.h> 64 + #include <asm/desc.h> 65 + #include <asm/pgalloc.h> 66 + #include <asm/pgtable.h> 67 + #include 
<asm/tlbflush.h> 68 + #include <asm/reboot.h> 69 + #include <asm/stackprotector.h> 70 + #include <asm/hypervisor.h> 71 + #include <asm/mach_traps.h> 72 + #include <asm/mwait.h> 73 + #include <asm/pci_x86.h> 74 + #include <asm/cpu.h> 75 + 76 + #ifdef CONFIG_ACPI 77 + #include <linux/acpi.h> 78 + #include <asm/acpi.h> 79 + #include <acpi/pdc_intel.h> 80 + #include <acpi/processor.h> 81 + #include <xen/interface/platform.h> 82 + #endif 83 + 84 + #include "xen-ops.h" 85 + #include "mmu.h" 86 + #include "smp.h" 87 + #include "multicalls.h" 88 + #include "pmu.h" 89 + 90 + void *xen_initial_gdt; 91 + 92 + RESERVE_BRK(shared_info_page_brk, PAGE_SIZE); 93 + 94 + static int xen_cpu_up_prepare_pv(unsigned int cpu); 95 + static int xen_cpu_dead_pv(unsigned int cpu); 96 + 97 + struct tls_descs { 98 + struct desc_struct desc[3]; 99 + }; 100 + 101 + /* 102 + * Updating the 3 TLS descriptors in the GDT on every task switch is 103 + * surprisingly expensive so we avoid updating them if they haven't 104 + * changed. Since Xen writes different descriptors than the one 105 + * passed in the update_descriptor hypercall we keep shadow copies to 106 + * compare against. 107 + */ 108 + static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc); 109 + 110 + /* 111 + * On restore, set the vcpu placement up again. 112 + * If it fails, then we're in a bad state, since 113 + * we can't back out from using it... 
114 + */ 115 + void xen_vcpu_restore(void) 116 + { 117 + int cpu; 118 + 119 + for_each_possible_cpu(cpu) { 120 + bool other_cpu = (cpu != smp_processor_id()); 121 + bool is_up = HYPERVISOR_vcpu_op(VCPUOP_is_up, xen_vcpu_nr(cpu), 122 + NULL); 123 + 124 + if (other_cpu && is_up && 125 + HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL)) 126 + BUG(); 127 + 128 + xen_setup_runstate_info(cpu); 129 + 130 + if (xen_have_vcpu_info_placement) 131 + xen_vcpu_setup(cpu); 132 + 133 + if (other_cpu && is_up && 134 + HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL)) 135 + BUG(); 136 + } 137 + } 138 + 139 + static void __init xen_banner(void) 140 + { 141 + unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL); 142 + struct xen_extraversion extra; 143 + HYPERVISOR_xen_version(XENVER_extraversion, &extra); 144 + 145 + pr_info("Booting paravirtualized kernel %son %s\n", 146 + xen_feature(XENFEAT_auto_translated_physmap) ? 147 + "with PVH extensions " : "", pv_info.name); 148 + printk(KERN_INFO "Xen version: %d.%d%s%s\n", 149 + version >> 16, version & 0xffff, extra.extraversion, 150 + xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? 
" (preserve-AD)" : ""); 151 + } 152 + /* Check if running on Xen version (major, minor) or later */ 153 + bool 154 + xen_running_on_version_or_later(unsigned int major, unsigned int minor) 155 + { 156 + unsigned int version; 157 + 158 + if (!xen_domain()) 159 + return false; 160 + 161 + version = HYPERVISOR_xen_version(XENVER_version, NULL); 162 + if ((((version >> 16) == major) && ((version & 0xffff) >= minor)) || 163 + ((version >> 16) > major)) 164 + return true; 165 + return false; 166 + } 167 + 168 + static __read_mostly unsigned int cpuid_leaf5_ecx_val; 169 + static __read_mostly unsigned int cpuid_leaf5_edx_val; 170 + 171 + static void xen_cpuid(unsigned int *ax, unsigned int *bx, 172 + unsigned int *cx, unsigned int *dx) 173 + { 174 + unsigned maskebx = ~0; 175 + 176 + /* 177 + * Mask out inconvenient features, to try and disable as many 178 + * unsupported kernel subsystems as possible. 179 + */ 180 + switch (*ax) { 181 + case CPUID_MWAIT_LEAF: 182 + /* Synthesize the values.. */ 183 + *ax = 0; 184 + *bx = 0; 185 + *cx = cpuid_leaf5_ecx_val; 186 + *dx = cpuid_leaf5_edx_val; 187 + return; 188 + 189 + case 0xb: 190 + /* Suppress extended topology stuff */ 191 + maskebx = 0; 192 + break; 193 + } 194 + 195 + asm(XEN_EMULATE_PREFIX "cpuid" 196 + : "=a" (*ax), 197 + "=b" (*bx), 198 + "=c" (*cx), 199 + "=d" (*dx) 200 + : "0" (*ax), "2" (*cx)); 201 + 202 + *bx &= maskebx; 203 + } 204 + STACK_FRAME_NON_STANDARD(xen_cpuid); /* XEN_EMULATE_PREFIX */ 205 + 206 + static bool __init xen_check_mwait(void) 207 + { 208 + #ifdef CONFIG_ACPI 209 + struct xen_platform_op op = { 210 + .cmd = XENPF_set_processor_pminfo, 211 + .u.set_pminfo.id = -1, 212 + .u.set_pminfo.type = XEN_PM_PDC, 213 + }; 214 + uint32_t buf[3]; 215 + unsigned int ax, bx, cx, dx; 216 + unsigned int mwait_mask; 217 + 218 + /* We need to determine whether it is OK to expose the MWAIT 219 + * capability to the kernel to harvest deeper than C3 states from ACPI 220 + * _CST using the processor_harvest_xen.c 
module. For this to work, we 221 + * need to gather the MWAIT_LEAF values (which the cstate.c code 222 + * checks against). The hypervisor won't expose the MWAIT flag because 223 + * it would break backwards compatibility; so we will find out directly 224 + * from the hardware and hypercall. 225 + */ 226 + if (!xen_initial_domain()) 227 + return false; 228 + 229 + /* 230 + * When running under platform earlier than Xen4.2, do not expose 231 + * mwait, to avoid the risk of loading native acpi pad driver 232 + */ 233 + if (!xen_running_on_version_or_later(4, 2)) 234 + return false; 235 + 236 + ax = 1; 237 + cx = 0; 238 + 239 + native_cpuid(&ax, &bx, &cx, &dx); 240 + 241 + mwait_mask = (1 << (X86_FEATURE_EST % 32)) | 242 + (1 << (X86_FEATURE_MWAIT % 32)); 243 + 244 + if ((cx & mwait_mask) != mwait_mask) 245 + return false; 246 + 247 + /* We need to emulate the MWAIT_LEAF and for that we need both 248 + * ecx and edx. The hypercall provides only partial information. 249 + */ 250 + 251 + ax = CPUID_MWAIT_LEAF; 252 + bx = 0; 253 + cx = 0; 254 + dx = 0; 255 + 256 + native_cpuid(&ax, &bx, &cx, &dx); 257 + 258 + /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so, 259 + * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3. 260 + */ 261 + buf[0] = ACPI_PDC_REVISION_ID; 262 + buf[1] = 1; 263 + buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP); 264 + 265 + set_xen_guest_handle(op.u.set_pminfo.pdc, buf); 266 + 267 + if ((HYPERVISOR_platform_op(&op) == 0) && 268 + (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) { 269 + cpuid_leaf5_ecx_val = cx; 270 + cpuid_leaf5_edx_val = dx; 271 + } 272 + return true; 273 + #else 274 + return false; 275 + #endif 276 + } 277 + 278 + static bool __init xen_check_xsave(void) 279 + { 280 + unsigned int err, eax, edx; 281 + 282 + /* 283 + * Xen 4.0 and older accidentally leaked the host XSAVE flag into guest 284 + * view, despite not being able to support guests using the 285 + * functionality. 
Probe for the actual availability of XSAVE by seeing 286 + * whether xgetbv executes successfully or raises #UD. 287 + */ 288 + asm volatile("1: .byte 0x0f,0x01,0xd0\n\t" /* xgetbv */ 289 + "xor %[err], %[err]\n" 290 + "2:\n\t" 291 + ".pushsection .fixup,\"ax\"\n\t" 292 + "3: movl $1,%[err]\n\t" 293 + "jmp 2b\n\t" 294 + ".popsection\n\t" 295 + _ASM_EXTABLE(1b, 3b) 296 + : [err] "=r" (err), "=a" (eax), "=d" (edx) 297 + : "c" (0)); 298 + 299 + return err == 0; 300 + } 301 + 302 + static void __init xen_init_capabilities(void) 303 + { 304 + setup_clear_cpu_cap(X86_BUG_SYSRET_SS_ATTRS); 305 + setup_force_cpu_cap(X86_FEATURE_XENPV); 306 + setup_clear_cpu_cap(X86_FEATURE_DCA); 307 + setup_clear_cpu_cap(X86_FEATURE_APERFMPERF); 308 + setup_clear_cpu_cap(X86_FEATURE_MTRR); 309 + setup_clear_cpu_cap(X86_FEATURE_ACC); 310 + setup_clear_cpu_cap(X86_FEATURE_X2APIC); 311 + 312 + if (!xen_initial_domain()) 313 + setup_clear_cpu_cap(X86_FEATURE_ACPI); 314 + 315 + if (xen_check_mwait()) 316 + setup_force_cpu_cap(X86_FEATURE_MWAIT); 317 + else 318 + setup_clear_cpu_cap(X86_FEATURE_MWAIT); 319 + 320 + if (xen_check_xsave()) { 321 + setup_force_cpu_cap(X86_FEATURE_XSAVE); 322 + setup_force_cpu_cap(X86_FEATURE_OSXSAVE); 323 + } else { 324 + setup_clear_cpu_cap(X86_FEATURE_XSAVE); 325 + setup_clear_cpu_cap(X86_FEATURE_OSXSAVE); 326 + } 327 + } 328 + 329 + static void xen_set_debugreg(int reg, unsigned long val) 330 + { 331 + HYPERVISOR_set_debugreg(reg, val); 332 + } 333 + 334 + static unsigned long xen_get_debugreg(int reg) 335 + { 336 + return HYPERVISOR_get_debugreg(reg); 337 + } 338 + 339 + static void xen_end_context_switch(struct task_struct *next) 340 + { 341 + xen_mc_flush(); 342 + paravirt_end_context_switch(next); 343 + } 344 + 345 + static unsigned long xen_store_tr(void) 346 + { 347 + return 0; 348 + } 349 + 350 + /* 351 + * Set the page permissions for a particular virtual address. 
If the 352 + * address is a vmalloc mapping (or other non-linear mapping), then 353 + * find the linear mapping of the page and also set its protections to 354 + * match. 355 + */ 356 + static void set_aliased_prot(void *v, pgprot_t prot) 357 + { 358 + int level; 359 + pte_t *ptep; 360 + pte_t pte; 361 + unsigned long pfn; 362 + struct page *page; 363 + unsigned char dummy; 364 + 365 + ptep = lookup_address((unsigned long)v, &level); 366 + BUG_ON(ptep == NULL); 367 + 368 + pfn = pte_pfn(*ptep); 369 + page = pfn_to_page(pfn); 370 + 371 + pte = pfn_pte(pfn, prot); 372 + 373 + /* 374 + * Careful: update_va_mapping() will fail if the virtual address 375 + * we're poking isn't populated in the page tables. We don't 376 + * need to worry about the direct map (that's always in the page 377 + * tables), but we need to be careful about vmap space. In 378 + * particular, the top level page table can lazily propagate 379 + * entries between processes, so if we've switched mms since we 380 + * vmapped the target in the first place, we might not have the 381 + * top-level page table entry populated. 382 + * 383 + * We disable preemption because we want the same mm active when 384 + * we probe the target and when we issue the hypercall. We'll 385 + * have the same nominal mm, but if we're a kernel thread, lazy 386 + * mm dropping could change our pgd. 387 + * 388 + * Out of an abundance of caution, this uses __get_user() to fault 389 + * in the target address just in case there's some obscure case 390 + * in which the target address isn't readable. 
391 + */ 392 + 393 + preempt_disable(); 394 + 395 + probe_kernel_read(&dummy, v, 1); 396 + 397 + if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0)) 398 + BUG(); 399 + 400 + if (!PageHighMem(page)) { 401 + void *av = __va(PFN_PHYS(pfn)); 402 + 403 + if (av != v) 404 + if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0)) 405 + BUG(); 406 + } else 407 + kmap_flush_unused(); 408 + 409 + preempt_enable(); 410 + } 411 + 412 + static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries) 413 + { 414 + const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; 415 + int i; 416 + 417 + /* 418 + * We need to mark the all aliases of the LDT pages RO. We 419 + * don't need to call vm_flush_aliases(), though, since that's 420 + * only responsible for flushing aliases out the TLBs, not the 421 + * page tables, and Xen will flush the TLB for us if needed. 422 + * 423 + * To avoid confusing future readers: none of this is necessary 424 + * to load the LDT. The hypervisor only checks this when the 425 + * LDT is faulted in due to subsequent descriptor access. 
426 + */ 427 + 428 + for (i = 0; i < entries; i += entries_per_page) 429 + set_aliased_prot(ldt + i, PAGE_KERNEL_RO); 430 + } 431 + 432 + static void xen_free_ldt(struct desc_struct *ldt, unsigned entries) 433 + { 434 + const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; 435 + int i; 436 + 437 + for (i = 0; i < entries; i += entries_per_page) 438 + set_aliased_prot(ldt + i, PAGE_KERNEL); 439 + } 440 + 441 + static void xen_set_ldt(const void *addr, unsigned entries) 442 + { 443 + struct mmuext_op *op; 444 + struct multicall_space mcs = xen_mc_entry(sizeof(*op)); 445 + 446 + trace_xen_cpu_set_ldt(addr, entries); 447 + 448 + op = mcs.args; 449 + op->cmd = MMUEXT_SET_LDT; 450 + op->arg1.linear_addr = (unsigned long)addr; 451 + op->arg2.nr_ents = entries; 452 + 453 + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 454 + 455 + xen_mc_issue(PARAVIRT_LAZY_CPU); 456 + } 457 + 458 + static void xen_load_gdt(const struct desc_ptr *dtr) 459 + { 460 + unsigned long va = dtr->address; 461 + unsigned int size = dtr->size + 1; 462 + unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE); 463 + unsigned long frames[pages]; 464 + int f; 465 + 466 + /* 467 + * A GDT can be up to 64k in size, which corresponds to 8192 468 + * 8-byte entries, or 16 4k pages.. 469 + */ 470 + 471 + BUG_ON(size > 65536); 472 + BUG_ON(va & ~PAGE_MASK); 473 + 474 + for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { 475 + int level; 476 + pte_t *ptep; 477 + unsigned long pfn, mfn; 478 + void *virt; 479 + 480 + /* 481 + * The GDT is per-cpu and is in the percpu data area. 482 + * That can be virtually mapped, so we need to do a 483 + * page-walk to get the underlying MFN for the 484 + * hypercall. The page can also be in the kernel's 485 + * linear range, so we need to RO that mapping too. 
486 + */ 487 + ptep = lookup_address(va, &level); 488 + BUG_ON(ptep == NULL); 489 + 490 + pfn = pte_pfn(*ptep); 491 + mfn = pfn_to_mfn(pfn); 492 + virt = __va(PFN_PHYS(pfn)); 493 + 494 + frames[f] = mfn; 495 + 496 + make_lowmem_page_readonly((void *)va); 497 + make_lowmem_page_readonly(virt); 498 + } 499 + 500 + if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct))) 501 + BUG(); 502 + } 503 + 504 + /* 505 + * load_gdt for early boot, when the gdt is only mapped once 506 + */ 507 + static void __init xen_load_gdt_boot(const struct desc_ptr *dtr) 508 + { 509 + unsigned long va = dtr->address; 510 + unsigned int size = dtr->size + 1; 511 + unsigned pages = DIV_ROUND_UP(size, PAGE_SIZE); 512 + unsigned long frames[pages]; 513 + int f; 514 + 515 + /* 516 + * A GDT can be up to 64k in size, which corresponds to 8192 517 + * 8-byte entries, or 16 4k pages.. 518 + */ 519 + 520 + BUG_ON(size > 65536); 521 + BUG_ON(va & ~PAGE_MASK); 522 + 523 + for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { 524 + pte_t pte; 525 + unsigned long pfn, mfn; 526 + 527 + pfn = virt_to_pfn(va); 528 + mfn = pfn_to_mfn(pfn); 529 + 530 + pte = pfn_pte(pfn, PAGE_KERNEL_RO); 531 + 532 + if (HYPERVISOR_update_va_mapping((unsigned long)va, pte, 0)) 533 + BUG(); 534 + 535 + frames[f] = mfn; 536 + } 537 + 538 + if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct))) 539 + BUG(); 540 + } 541 + 542 + static inline bool desc_equal(const struct desc_struct *d1, 543 + const struct desc_struct *d2) 544 + { 545 + return d1->a == d2->a && d1->b == d2->b; 546 + } 547 + 548 + static void load_TLS_descriptor(struct thread_struct *t, 549 + unsigned int cpu, unsigned int i) 550 + { 551 + struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i]; 552 + struct desc_struct *gdt; 553 + xmaddr_t maddr; 554 + struct multicall_space mc; 555 + 556 + if (desc_equal(shadow, &t->tls_array[i])) 557 + return; 558 + 559 + *shadow = t->tls_array[i]; 560 + 561 + gdt = get_cpu_gdt_rw(cpu); 
/* Tail of load_TLS_descriptor() -- the function's head lies before this chunk. */
	maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
	mc = __xen_mc_entry(0);

	MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
}

/* Install the three TLS descriptors for @cpu as one batched multicall. */
static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
{
	/*
	 * XXX sleazy hack: If we're being called in a lazy-cpu zone
	 * and lazy gs handling is enabled, it means we're in a
	 * context switch, and %gs has just been saved.  This means we
	 * can zero it out to prevent faults on exit from the
	 * hypervisor if the next process has no %gs.  Either way, it
	 * has been saved, and the new value will get loaded properly.
	 * This will go away as soon as Xen has been modified to not
	 * save/restore %gs for normal hypercalls.
	 *
	 * On x86_64, this hack is not used for %gs, because gs points
	 * to KERNEL_GS_BASE (and uses it for PDA references), so we
	 * must not zero %gs on x86_64
	 *
	 * For x86_64, we need to zero %fs, otherwise we may get an
	 * exception between the new %fs descriptor being loaded and
	 * %fs being effectively cleared at __switch_to().
	 */
	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
#ifdef CONFIG_X86_32
		lazy_load_gs(0);
#else
		loadsegment(fs, 0);
#endif
	}

	xen_mc_batch();

	load_TLS_descriptor(t, cpu, 0);
	load_TLS_descriptor(t, cpu, 1);
	load_TLS_descriptor(t, cpu, 2);

	xen_mc_issue(PARAVIRT_LAZY_CPU);
}

#ifdef CONFIG_X86_64
/* Load the user %gs selector through the hypervisor instead of lgs. */
static void xen_load_gs_index(unsigned int idx)
{
	if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
		BUG();
}
#endif

/* Write one LDT descriptor via a synchronous hypercall. */
static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
				const void *ptr)
{
	xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
	u64 entry = *(u64 *)ptr;

	trace_xen_cpu_write_ldt_entry(dt, entrynum, entry);

	preempt_disable();

	/* Flush any queued multicalls before the direct hypercall below. */
	xen_mc_flush();
	if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
		BUG();

	preempt_enable();
}

/*
 * Convert an IDT gate into Xen's trap_info format.  Returns 1 when the
 * entry should be handed to Xen, 0 when it should be skipped.
 */
static int cvt_gate_to_trap(int vector, const gate_desc *val,
			    struct trap_info *info)
{
	unsigned long addr;

	if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
		return 0;

	info->vector = vector;

	addr = gate_offset(*val);
#ifdef CONFIG_X86_64
	/*
	 * Look for known traps using IST, and substitute them
	 * appropriately.  The debugger ones are the only ones we care
	 * about.  Xen will handle faults like double_fault,
	 * so we should never see them.  Warn if
	 * there's an unexpected IST-using fault handler.
	 */
	if (addr == (unsigned long)debug)
		addr = (unsigned long)xen_debug;
	else if (addr == (unsigned long)int3)
		addr = (unsigned long)xen_int3;
	else if (addr == (unsigned long)stack_segment)
		addr = (unsigned long)xen_stack_segment;
	else if (addr == (unsigned long)double_fault) {
		/* Don't need to handle these */
		return 0;
#ifdef CONFIG_X86_MCE
	} else if (addr == (unsigned long)machine_check) {
		/*
		 * when xen hypervisor inject vMCE to guest,
		 * use native mce handler to handle it
		 */
		;
#endif
	} else if (addr == (unsigned long)nmi)
		/*
		 * Use the native version as well.
		 */
		;
	else {
		/* Some other trap using IST? */
		if (WARN_ON(val->ist != 0))
			return 0;
	}
#endif	/* CONFIG_X86_64 */
	info->address = addr;

	info->cs = gate_segment(*val);
	info->flags = val->dpl;
	/* interrupt gates clear IF */
	if (val->type == GATE_INTERRUPT)
		info->flags |= 1 << 2;

	return 1;
}

/* Locations of each CPU's IDT */
static DEFINE_PER_CPU(struct desc_ptr, idt_desc);

/* Set an IDT entry.  If the entry is part of the current IDT, then
   also update Xen.
*/
static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
{
	unsigned long p = (unsigned long)&dt[entrynum];
	unsigned long start, end;

	trace_xen_cpu_write_idt_entry(dt, entrynum, g);

	preempt_disable();

	/* Bounds of the IDT currently loaded on this CPU. */
	start = __this_cpu_read(idt_desc.address);
	end = start + __this_cpu_read(idt_desc.size) + 1;

	xen_mc_flush();

	native_write_idt_entry(dt, entrynum, g);

	/* Only tell Xen when the entry lies inside the live IDT. */
	if (p >= start && (p + 8) <= end) {
		struct trap_info info[2];

		info[1].address = 0;	/* zero address terminates the table */

		if (cvt_gate_to_trap(entrynum, g, &info[0]))
			if (HYPERVISOR_set_trap_table(info))
				BUG();
	}

	preempt_enable();
}

/* Translate a whole IDT into a zero-terminated trap_info array. */
static void xen_convert_trap_info(const struct desc_ptr *desc,
				  struct trap_info *traps)
{
	unsigned in, out, count;

	count = (desc->size+1) / sizeof(gate_desc);
	BUG_ON(count > 256);

	for (in = out = 0; in < count; in++) {
		gate_desc *entry = (gate_desc *)(desc->address) + in;

		if (cvt_gate_to_trap(in, entry, &traps[out]))
			out++;
	}
	traps[out].address = 0;	/* terminator */
}

/* Export this CPU's IDT, converted to trap_info, for callers elsewhere. */
void xen_copy_trap_info(struct trap_info *traps)
{
	const struct desc_ptr *desc = this_cpu_ptr(&idt_desc);

	xen_convert_trap_info(desc, traps);
}

/* Load a new IDT into Xen.  In principle this can be per-CPU, so we
   hold a spinlock to protect the static traps[] array (static because
   it avoids allocation, and saves stack space). */
static void xen_load_idt(const struct desc_ptr *desc)
{
	static DEFINE_SPINLOCK(lock);
	static struct trap_info traps[257];

	trace_xen_cpu_load_idt(desc);

	spin_lock(&lock);

	/* Remember where this CPU's IDT lives for xen_write_idt_entry(). */
	memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc));

	xen_convert_trap_info(desc, traps);

	xen_mc_flush();
	if (HYPERVISOR_set_trap_table(traps))
		BUG();

	spin_unlock(&lock);
}

/* Write a GDT descriptor entry.  Ignore LDT descriptors, since
   they're handled differently. */
static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
				const void *desc, int type)
{
	trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);

	preempt_disable();

	switch (type) {
	case DESC_LDT:
	case DESC_TSS:
		/* ignore */
		break;

	default: {
		xmaddr_t maddr = arbitrary_virt_to_machine(&dt[entry]);

		xen_mc_flush();
		if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
			BUG();
	}

	}

	preempt_enable();
}

/*
 * Version of write_gdt_entry for use at early boot-time needed to
 * update an entry as simply as possible.
*/
static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
					    const void *desc, int type)
{
	trace_xen_cpu_write_gdt_entry(dt, entry, desc, type);

	switch (type) {
	case DESC_LDT:
	case DESC_TSS:
		/* ignore */
		break;

	default: {
		xmaddr_t maddr = virt_to_machine(&dt[entry]);

		/* If the hypercall is refused, fall back to a plain write. */
		if (HYPERVISOR_update_descriptor(maddr.maddr, *(u64 *)desc))
			dt[entry] = *(struct desc_struct *)desc;
	}

	}
}

/* Point the kernel stack for ring transitions at thread->sp0. */
static void xen_load_sp0(struct tss_struct *tss,
			 struct thread_struct *thread)
{
	struct multicall_space mcs;

	mcs = xen_mc_entry(0);
	MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
	xen_mc_issue(PARAVIRT_LAZY_CPU);
	tss->x86_tss.sp0 = thread->sp0;
}

void xen_set_iopl_mask(unsigned mask)
{
	struct physdev_set_iopl set_iopl;

	/* Force the change at ring 0. */
	set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3;
	HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
}

/* Port-I/O delay is meaningless under Xen; deliberately a no-op. */
static void xen_io_delay(void)
{
}

/* Cached cr0 per CPU; 0 means "not yet read from hardware". */
static DEFINE_PER_CPU(unsigned long, xen_cr0_value);

static unsigned long xen_read_cr0(void)
{
	unsigned long cr0 = this_cpu_read(xen_cr0_value);

	if (unlikely(cr0 == 0)) {
		cr0 = native_read_cr0();
		this_cpu_write(xen_cr0_value, cr0);
	}

	return cr0;
}

static void xen_write_cr0(unsigned long cr0)
{
	struct multicall_space mcs;

	this_cpu_write(xen_cr0_value, cr0);

	/* Only pay attention to cr0.TS; everything else is
	   ignored. */
	mcs = xen_mc_entry(0);

	MULTI_fpu_taskswitch(mcs.mc, (cr0 & X86_CR0_TS) != 0);

	xen_mc_issue(PARAVIRT_LAZY_CPU);
}

static void xen_write_cr4(unsigned long cr4)
{
	/* Mask bits Xen does not let a PV guest control. */
	cr4 &= ~(X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PCE);

	native_write_cr4(cr4);
}
#ifdef CONFIG_X86_64
/* cr8 (TPR) is not usable under Xen PV; reads return 0, writes must be 0. */
static inline unsigned long xen_read_cr8(void)
{
	return 0;
}
static inline void xen_write_cr8(unsigned long val)
{
	BUG_ON(val);
}
#endif

static u64 xen_read_msr_safe(unsigned int msr, int *err)
{
	u64 val;

	/* PMU MSRs are emulated by the Xen PMU code. */
	if (pmu_msr_read(msr, &val, err))
		return val;

	val = native_read_msr_safe(msr, err);
	switch (msr) {
	case MSR_IA32_APICBASE:
#ifdef CONFIG_X86_X2APIC
		if (!(cpuid_ecx(1) & (1 << (X86_FEATURE_X2APIC & 31))))
#endif
			val &= ~X2APIC_ENABLE;
		break;
	}
	return val;
}

static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
{
	int ret;

	ret = 0;

	switch (msr) {
#ifdef CONFIG_X86_64
	unsigned which;
	u64 base;

	case MSR_FS_BASE:		which = SEGBASE_FS; goto set;
	case MSR_KERNEL_GS_BASE:	which = SEGBASE_GS_USER; goto set;
	case MSR_GS_BASE:		which = SEGBASE_GS_KERNEL; goto set;

	set:
		/* Segment bases are set via hypercall, not wrmsr. */
		base = ((u64)high << 32) | low;
		if (HYPERVISOR_set_segment_base(which, base) != 0)
			ret = -EIO;
		break;
#endif

	case MSR_STAR:
	case MSR_CSTAR:
	case MSR_LSTAR:
	case MSR_SYSCALL_MASK:
	case MSR_IA32_SYSENTER_CS:
	case MSR_IA32_SYSENTER_ESP:
	case MSR_IA32_SYSENTER_EIP:
		/* Fast syscall setup is all done in hypercalls, so
		   these are all ignored.  Stub them out here to stop
		   Xen console noise. */
		break;

	default:
		if (!pmu_msr_write(msr, low, high, &ret))
			ret = native_write_msr_safe(msr, low, high);
	}

	return ret;
}

static u64 xen_read_msr(unsigned int msr)
{
	/*
	 * This will silently swallow a #GP from RDMSR.  It may be worth
	 * changing that.
	 */
	int err;

	return xen_read_msr_safe(msr, &err);
}

static void xen_write_msr(unsigned int msr, unsigned low, unsigned high)
{
	/*
	 * This will silently swallow a #GP from WRMSR.  It may be worth
	 * changing that.
	 */
	xen_write_msr_safe(msr, low, high);
}

/* Map the hypervisor's shared_info page and publish its address. */
void xen_setup_shared_info(void)
{
	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
		set_fixmap(FIX_PARAVIRT_BOOTMAP,
			   xen_start_info->shared_info);

		HYPERVISOR_shared_info =
			(struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
	} else
		HYPERVISOR_shared_info =
			(struct shared_info *)__va(xen_start_info->shared_info);

#ifndef CONFIG_SMP
	/* In UP this is as good a place as any to set up shared info */
	xen_setup_vcpu_info_placement();
#endif

	xen_setup_mfn_list_list();
}

/* This is called once we have the cpu_possible_mask */
void xen_setup_vcpu_info_placement(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		/* Set up direct vCPU id mapping for PV guests. */
		per_cpu(xen_vcpu_id, cpu) = cpu;
		xen_vcpu_setup(cpu);
	}

	/*
	 * xen_vcpu_setup managed to place the vcpu_info within the
	 * percpu area for all cpus, so make use of it.
	 */
	if (xen_have_vcpu_info_placement) {
		pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
		pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
		pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
		pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
		pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
	}
}

/*
 * Patch call sites for the paravirt irq ops with the "direct"
 * (vcpu_info based) variants when placement succeeded; otherwise fall
 * through to the default paravirt patching.
 */
static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
			  unsigned long addr, unsigned len)
{
	char *start, *end, *reloc;
	unsigned ret;

	start = end = reloc = NULL;

#define SITE(op, x)							\
	case PARAVIRT_PATCH(op.x):					\
	if (xen_have_vcpu_info_placement) {				\
		start = (char *)xen_##x##_direct;			\
		end = xen_##x##_direct_end;				\
		reloc = xen_##x##_direct_reloc;				\
	}								\
	goto patch_site

	switch (type) {
		SITE(pv_irq_ops, irq_enable);
		SITE(pv_irq_ops, irq_disable);
		SITE(pv_irq_ops, save_fl);
		SITE(pv_irq_ops, restore_fl);
#undef SITE

	patch_site:
		if (start == NULL || (end-start) > len)
			goto default_patch;

		ret = paravirt_patch_insns(insnbuf, len, start, end);

		/* Note: because reloc is assigned from something that
		   appears to be an array, gcc assumes it's non-null,
		   but doesn't know its relationship with start and
		   end. */
		if (reloc > start && reloc < end) {
			int reloc_off = reloc - start;
			long *relocp = (long *)(insnbuf + reloc_off);
			long delta = start - (char *)addr;

			*relocp += delta;
		}
		break;

	default_patch:
	default:
		ret = paravirt_patch_default(type, clobbers, insnbuf,
					     addr, len);
		break;
	}

	return ret;
}

static const struct pv_info xen_info __initconst = {
	.shared_kernel_pmd = 0,

#ifdef CONFIG_X86_64
	.extra_user_64bit_cs = FLAT_USER_CS64,
#endif
	.name = "Xen",
};

static const struct pv_init_ops xen_init_ops __initconst = {
	.patch = xen_patch,
};

static const struct pv_cpu_ops xen_cpu_ops __initconst = {
	.cpuid = xen_cpuid,

	.set_debugreg = xen_set_debugreg,
	.get_debugreg = xen_get_debugreg,

	.read_cr0 = xen_read_cr0,
	.write_cr0 = xen_write_cr0,

	.read_cr4 = native_read_cr4,
	.write_cr4 = xen_write_cr4,

#ifdef CONFIG_X86_64
	.read_cr8 = xen_read_cr8,
	.write_cr8 = xen_write_cr8,
#endif

	.wbinvd = native_wbinvd,

	.read_msr = xen_read_msr,
	.write_msr = xen_write_msr,

	.read_msr_safe = xen_read_msr_safe,
	.write_msr_safe = xen_write_msr_safe,

	.read_pmc = xen_read_pmc,

	.iret = xen_iret,
#ifdef CONFIG_X86_64
	.usergs_sysret64 = xen_sysret64,
#endif

	.load_tr_desc = paravirt_nop,
	.set_ldt = xen_set_ldt,
	.load_gdt = xen_load_gdt,
	.load_idt = xen_load_idt,
	.load_tls = xen_load_tls,
#ifdef CONFIG_X86_64
	.load_gs_index = xen_load_gs_index,
#endif

	.alloc_ldt = xen_alloc_ldt,
	.free_ldt = xen_free_ldt,

	.store_idt = native_store_idt,
	.store_tr = xen_store_tr,

	.write_ldt_entry = xen_write_ldt_entry,
	.write_gdt_entry = xen_write_gdt_entry,
	.write_idt_entry = xen_write_idt_entry,
	.load_sp0 = xen_load_sp0,

	.set_iopl_mask = xen_set_iopl_mask,
	.io_delay = xen_io_delay,

	/* Xen takes care of %gs when switching to usermode for us */
	.swapgs = paravirt_nop,

	.start_context_switch = paravirt_start_context_switch,
	.end_context_switch = xen_end_context_switch,
};

static void xen_restart(char *msg)
{
	xen_reboot(SHUTDOWN_reboot);
}

static void xen_machine_halt(void)
{
	xen_reboot(SHUTDOWN_poweroff);
}

static void xen_machine_power_off(void)
{
	if (pm_power_off)
		pm_power_off();
	xen_reboot(SHUTDOWN_poweroff);
}

static void xen_crash_shutdown(struct pt_regs *regs)
{
	xen_reboot(SHUTDOWN_crash);
}

static const struct machine_ops xen_machine_ops __initconst = {
	.restart = xen_restart,
	.halt = xen_machine_halt,
	.power_off = xen_machine_power_off,
	.shutdown = xen_machine_halt,
	.crash_shutdown = xen_crash_shutdown,
	.emergency_restart = xen_emergency_restart,
};

static unsigned char xen_get_nmi_reason(void)
{
	unsigned char reason = 0;

	/* Construct a value which looks like it came from port 0x61. */
	if (test_bit(_XEN_NMIREASON_io_error,
		     &HYPERVISOR_shared_info->arch.nmi_reason))
		reason |= NMI_REASON_IOCHK;
	if (test_bit(_XEN_NMIREASON_pci_serr,
		     &HYPERVISOR_shared_info->arch.nmi_reason))
		reason |= NMI_REASON_SERR;

	return reason;
}

/* Fill boot_params EDD info from the hypervisor's firmware info. */
static void __init xen_boot_params_init_edd(void)
{
#if IS_ENABLED(CONFIG_EDD)
	struct xen_platform_op op;
	struct edd_info *edd_info;
	u32 *mbr_signature;
	unsigned nr;
	int ret;

	edd_info = boot_params.eddbuf;
	mbr_signature = boot_params.edd_mbr_sig_buffer;

	op.cmd = XENPF_firmware_info;

	op.u.firmware_info.type = XEN_FW_DISK_INFO;
	for (nr = 0; nr < EDDMAXNR; nr++) {
		struct edd_info *info = edd_info + nr;

		op.u.firmware_info.index = nr;
		info->params.length = sizeof(info->params);
		set_xen_guest_handle(op.u.firmware_info.u.disk_info.edd_params,
				     &info->params);
		ret = HYPERVISOR_platform_op(&op);
		if (ret)
			break;

#define C(x) info->x = op.u.firmware_info.u.disk_info.x
		C(device);
		C(version);
		C(interface_support);
		C(legacy_max_cylinder);
		C(legacy_max_head);
		C(legacy_sectors_per_track);
#undef C
	}
	boot_params.eddbuf_entries = nr;

	op.u.firmware_info.type = XEN_FW_DISK_MBR_SIGNATURE;
	for (nr = 0; nr < EDD_MBR_SIG_MAX; nr++) {
		op.u.firmware_info.index = nr;
		ret = HYPERVISOR_platform_op(&op);
		if (ret)
			break;
		mbr_signature[nr] = op.u.firmware_info.u.disk_mbr_signature.mbr_signature;
	}
	boot_params.edd_mbr_sig_buf_entries = nr;
#endif
}

/*
 * Set up the GDT and segment registers for -fstack-protector.  Until
 * we do this, we have to be careful not to call any stack-protected
 * function, which is most of the kernel.
*/
static void xen_setup_gdt(int cpu)
{
	/* Use the simple boot-time GDT ops while switching GDTs. */
	pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
	pv_cpu_ops.load_gdt = xen_load_gdt_boot;

	setup_stack_canary_segment(0);
	switch_to_new_gdt(0);

	pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry;
	pv_cpu_ops.load_gdt = xen_load_gdt;
}

static void __init xen_dom0_set_legacy_features(void)
{
	x86_platform.legacy.rtc = 1;
}

/* First C function to be called on Xen boot */
asmlinkage __visible void __init xen_start_kernel(void)
{
	struct physdev_set_iopl set_iopl;
	unsigned long initrd_start = 0;
	int rc;

	if (!xen_start_info)
		return;

	xen_domain_type = XEN_PV_DOMAIN;

	xen_setup_features();

	xen_setup_machphys_mapping();

	/* Install Xen paravirt ops */
	pv_info = xen_info;
	pv_init_ops = xen_init_ops;
	pv_cpu_ops = xen_cpu_ops;

	x86_platform.get_nmi_reason = xen_get_nmi_reason;

	x86_init.resources.memory_setup = xen_memory_setup;
	x86_init.oem.arch_setup = xen_arch_setup;
	x86_init.oem.banner = xen_banner;

	xen_init_time_ops();

	/*
	 * Set up some pagetable state before starting to set any ptes.
	 */

	xen_init_mmu_ops();

	/* Prevent unwanted bits from being set in PTEs. */
	__supported_pte_mask &= ~_PAGE_GLOBAL;

	/*
	 * Prevent page tables from being allocated in highmem, even
	 * if CONFIG_HIGHPTE is enabled.
	 */
	__userpte_alloc_gfp &= ~__GFP_HIGHMEM;

	/* Work out if we support NX */
	x86_configure_nx();

	/* Get mfn list */
	xen_build_dynamic_phys_to_machine();

	/*
	 * Set up kernel GDT and segment registers, mainly so that
	 * -fstack-protector code can be executed.
	 */
	xen_setup_gdt(0);

	xen_init_irq_ops();
	xen_init_capabilities();

#ifdef CONFIG_X86_LOCAL_APIC
	/*
	 * set up the basic apic ops.
	 */
	xen_init_apic();
#endif

	if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
		pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
		pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
	}

	machine_ops = xen_machine_ops;

	/*
	 * The only reliable way to retain the initial address of the
	 * percpu gdt_page is to remember it here, so we can go and
	 * mark it RW later, when the initial percpu area is freed.
	 */
	xen_initial_gdt = &per_cpu(gdt_page, 0);

	xen_smp_init();

#ifdef CONFIG_ACPI_NUMA
	/*
	 * The pages we get from Xen are not related to machine pages, so
	 * any NUMA information the kernel tries to get from ACPI will
	 * be meaningless.  Prevent it from trying.
	 */
	acpi_numa = -1;
#endif
	/* Don't do the full vcpu_info placement stuff until we have a
	   possible map and a non-dummy shared_info. */
	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];

	WARN_ON(xen_cpuhp_setup(xen_cpu_up_prepare_pv, xen_cpu_dead_pv));

	local_irq_disable();
	early_boot_irqs_disabled = true;

	xen_raw_console_write("mapping kernel into physical memory\n");
	xen_setup_kernel_pagetable((pgd_t *)xen_start_info->pt_base,
				   xen_start_info->nr_pages);
	xen_reserve_special_pages();

	/* keep using Xen gdt for now; no urgent need to change it */

#ifdef CONFIG_X86_32
	pv_info.kernel_rpl = 1;
	if (xen_feature(XENFEAT_supervisor_mode_kernel))
		pv_info.kernel_rpl = 0;
#else
	pv_info.kernel_rpl = 0;
#endif
	/* set the limit of our address space */
	xen_reserve_top();

	/*
	 * We used to do this in xen_arch_setup, but that is too late
	 * on AMD, where early_cpu_init (run before ->arch_setup()) calls
	 * early_amd_init which pokes 0xcf8 port.
	 */
	set_iopl.iopl = 1;
	rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
	if (rc != 0)
		xen_raw_printk("physdev_op failed %d\n", rc);

#ifdef CONFIG_X86_32
	/* set up basic CPUID stuff */
	cpu_detect(&new_cpu_data);
	set_cpu_cap(&new_cpu_data, X86_FEATURE_FPU);
	new_cpu_data.x86_capability[CPUID_1_EDX] = cpuid_edx(1);
#endif

	if (xen_start_info->mod_start) {
		if (xen_start_info->flags & SIF_MOD_START_PFN)
			initrd_start = PFN_PHYS(xen_start_info->mod_start);
		else
			initrd_start = __pa(xen_start_info->mod_start);
	}

	/* Poke various useful things into boot_params */
	boot_params.hdr.type_of_loader = (9 << 4) | 0;
	boot_params.hdr.ramdisk_image = initrd_start;
	boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
	boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
	boot_params.hdr.hardware_subarch = X86_SUBARCH_XEN;

	if (!xen_initial_domain()) {
		add_preferred_console("xenboot", 0, NULL);
		add_preferred_console("tty", 0, NULL);
		add_preferred_console("hvc", 0, NULL);
		if (pci_xen)
			x86_init.pci.arch_init = pci_xen_init;
	} else {
		const struct dom0_vga_console_info *info =
			(void *)((char *)xen_start_info +
				 xen_start_info->console.dom0.info_off);
		struct xen_platform_op op = {
			.cmd = XENPF_firmware_info,
			.interface_version = XENPF_INTERFACE_VERSION,
			.u.firmware_info.type = XEN_FW_KBD_SHIFT_FLAGS,
		};

		x86_platform.set_legacy_features =
				xen_dom0_set_legacy_features;
		xen_init_vga(info, xen_start_info->console.dom0.info_size);
		xen_start_info->console.domU.mfn = 0;
		xen_start_info->console.domU.evtchn = 0;

		if (HYPERVISOR_platform_op(&op) == 0)
			boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags;

		/* Make sure ACS will be enabled */
		pci_request_acs();

		xen_acpi_sleep_register();

		/* Avoid searching for BIOS MP tables */
		x86_init.mpparse.find_smp_config = x86_init_noop;
		x86_init.mpparse.get_smp_config = x86_init_uint_noop;

		xen_boot_params_init_edd();
	}
#ifdef CONFIG_PCI
	/* PCI BIOS service won't work from a PV guest. */
	pci_probe &= ~PCI_PROBE_BIOS;
#endif
	xen_raw_console_write("about to get started...\n");

	/* Let's presume PV guests always boot on vCPU with id 0. */
	per_cpu(xen_vcpu_id, 0) = 0;

	xen_setup_runstate_info(0);

	xen_efi_init();

	/* Start the world */
#ifdef CONFIG_X86_32
	i386_start_kernel();
#else
	cr4_init_shadow(); /* 32b kernel does this in i386_start_kernel() */
	x86_64_start_reservations((char *)__pa_symbol(&boot_params));
#endif
}

/* CPU hotplug "prepare" callback for PV guests: timer + IPI interrupts. */
static int xen_cpu_up_prepare_pv(unsigned int cpu)
{
	int rc;

	xen_setup_timer(cpu);

	rc = xen_smp_intr_init(cpu);
	if (rc) {
		WARN(1, "xen_smp_intr_init() for CPU %d failed: %d\n",
		     cpu, rc);
		return rc;
	}

	rc = xen_smp_intr_init_pv(cpu);
	if (rc) {
		WARN(1, "xen_smp_intr_init_pv() for CPU %d failed: %d\n",
		     cpu, rc);
		return rc;
	}

	return 0;
}

/* Hotplug "dead" callback: undo xen_cpu_up_prepare_pv(). */
static int xen_cpu_dead_pv(unsigned int cpu)
{
	xen_smp_intr_free(cpu);
	xen_smp_intr_free_pv(cpu);

	xen_teardown_timer(cpu);

	return 0;
}

/* Hypervisor detection hook: nonzero CPUID base only in PV domains. */
static uint32_t __init xen_platform_pv(void)
{
	if (xen_pv_domain())
		return xen_cpuid_base();

	return 0;
}

const struct hypervisor_x86 x86_hyper_xen_pv = {
	.name                   = "Xen PV",
	.detect                 = xen_platform_pv,
	.pin_vcpu               = xen_pin_vcpu,
};
EXPORT_SYMBOL(x86_hyper_xen_pv);
+106
arch/x86/xen/enlighten_pvh.c
··· 1 + #include <linux/acpi.h> 2 + 3 + #include <xen/hvc-console.h> 4 + 5 + #include <asm/io_apic.h> 6 + #include <asm/hypervisor.h> 7 + #include <asm/e820/api.h> 8 + 9 + #include <asm/xen/interface.h> 10 + #include <asm/xen/hypercall.h> 11 + 12 + #include <xen/interface/memory.h> 13 + #include <xen/interface/hvm/start_info.h> 14 + 15 + /* 16 + * PVH variables. 17 + * 18 + * xen_pvh and pvh_bootparams need to live in data segment since they 19 + * are used after startup_{32|64}, which clear .bss, are invoked. 20 + */ 21 + bool xen_pvh __attribute__((section(".data"))) = 0; 22 + struct boot_params pvh_bootparams __attribute__((section(".data"))); 23 + 24 + struct hvm_start_info pvh_start_info; 25 + unsigned int pvh_start_info_sz = sizeof(pvh_start_info); 26 + 27 + static void xen_pvh_arch_setup(void) 28 + { 29 + /* Make sure we don't fall back to (default) ACPI_IRQ_MODEL_PIC. */ 30 + if (nr_ioapics == 0) 31 + acpi_irq_model = ACPI_IRQ_MODEL_PLATFORM; 32 + } 33 + 34 + static void __init init_pvh_bootparams(void) 35 + { 36 + struct xen_memory_map memmap; 37 + int rc; 38 + 39 + memset(&pvh_bootparams, 0, sizeof(pvh_bootparams)); 40 + 41 + memmap.nr_entries = ARRAY_SIZE(pvh_bootparams.e820_table); 42 + set_xen_guest_handle(memmap.buffer, pvh_bootparams.e820_table); 43 + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); 44 + if (rc) { 45 + xen_raw_printk("XENMEM_memory_map failed (%d)\n", rc); 46 + BUG(); 47 + } 48 + pvh_bootparams.e820_entries = memmap.nr_entries; 49 + 50 + if (pvh_bootparams.e820_entries < E820_MAX_ENTRIES_ZEROPAGE - 1) { 51 + pvh_bootparams.e820_table[pvh_bootparams.e820_entries].addr = 52 + ISA_START_ADDRESS; 53 + pvh_bootparams.e820_table[pvh_bootparams.e820_entries].size = 54 + ISA_END_ADDRESS - ISA_START_ADDRESS; 55 + pvh_bootparams.e820_table[pvh_bootparams.e820_entries].type = 56 + E820_TYPE_RESERVED; 57 + pvh_bootparams.e820_entries++; 58 + } else 59 + xen_raw_printk("Warning: Can fit ISA range into e820\n"); 60 + 61 + 
pvh_bootparams.hdr.cmd_line_ptr = 62 + pvh_start_info.cmdline_paddr; 63 + 64 + /* The first module is always ramdisk. */ 65 + if (pvh_start_info.nr_modules) { 66 + struct hvm_modlist_entry *modaddr = 67 + __va(pvh_start_info.modlist_paddr); 68 + pvh_bootparams.hdr.ramdisk_image = modaddr->paddr; 69 + pvh_bootparams.hdr.ramdisk_size = modaddr->size; 70 + } 71 + 72 + /* 73 + * See Documentation/x86/boot.txt. 74 + * 75 + * Version 2.12 supports Xen entry point but we will use default x86/PC 76 + * environment (i.e. hardware_subarch 0). 77 + */ 78 + pvh_bootparams.hdr.version = 0x212; 79 + pvh_bootparams.hdr.type_of_loader = (9 << 4) | 0; /* Xen loader */ 80 + } 81 + 82 + /* 83 + * This routine (and those that it might call) should not use 84 + * anything that lives in .bss since that segment will be cleared later. 85 + */ 86 + void __init xen_prepare_pvh(void) 87 + { 88 + u32 msr; 89 + u64 pfn; 90 + 91 + if (pvh_start_info.magic != XEN_HVM_START_MAGIC_VALUE) { 92 + xen_raw_printk("Error: Unexpected magic value (0x%08x)\n", 93 + pvh_start_info.magic); 94 + BUG(); 95 + } 96 + 97 + xen_pvh = 1; 98 + 99 + msr = cpuid_ebx(xen_cpuid_base() + 2); 100 + pfn = __pa(hypercall_page); 101 + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); 102 + 103 + init_pvh_bootparams(); 104 + 105 + x86_init.oem.arch_setup = xen_pvh_arch_setup; 106 + }
+2 -2786
arch/x86/xen/mmu.c
··· 1 - /* 2 - * Xen mmu operations 3 - * 4 - * This file contains the various mmu fetch and update operations. 5 - * The most important job they must perform is the mapping between the 6 - * domain's pfn and the overall machine mfns. 7 - * 8 - * Xen allows guests to directly update the pagetable, in a controlled 9 - * fashion. In other words, the guest modifies the same pagetable 10 - * that the CPU actually uses, which eliminates the overhead of having 11 - * a separate shadow pagetable. 12 - * 13 - * In order to allow this, it falls on the guest domain to map its 14 - * notion of a "physical" pfn - which is just a domain-local linear 15 - * address - into a real "machine address" which the CPU's MMU can 16 - * use. 17 - * 18 - * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be 19 - * inserted directly into the pagetable. When creating a new 20 - * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely, 21 - * when reading the content back with __(pgd|pmd|pte)_val, it converts 22 - * the mfn back into a pfn. 23 - * 24 - * The other constraint is that all pages which make up a pagetable 25 - * must be mapped read-only in the guest. This prevents uncontrolled 26 - * guest updates to the pagetable. Xen strictly enforces this, and 27 - * will disallow any pagetable update which will end up mapping a 28 - * pagetable page RW, and will disallow using any writable page as a 29 - * pagetable. 30 - * 31 - * Naively, when loading %cr3 with the base of a new pagetable, Xen 32 - * would need to validate the whole pagetable before going on. 33 - * Naturally, this is quite slow. The solution is to "pin" a 34 - * pagetable, which enforces all the constraints on the pagetable even 35 - * when it is not actively in use. This menas that Xen can be assured 36 - * that it is still valid when you do load it into %cr3, and doesn't 37 - * need to revalidate it. 
38 - * 39 - * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 40 - */ 41 - #include <linux/sched/mm.h> 42 - #include <linux/highmem.h> 43 - #include <linux/debugfs.h> 44 - #include <linux/bug.h> 45 - #include <linux/vmalloc.h> 46 - #include <linux/export.h> 47 - #include <linux/init.h> 48 - #include <linux/gfp.h> 49 - #include <linux/memblock.h> 50 - #include <linux/seq_file.h> 51 - #include <linux/crash_dump.h> 52 - 53 - #include <trace/events/xen.h> 54 - 55 - #include <asm/pgtable.h> 56 - #include <asm/tlbflush.h> 57 - #include <asm/fixmap.h> 58 - #include <asm/mmu_context.h> 59 - #include <asm/setup.h> 60 - #include <asm/paravirt.h> 61 - #include <asm/e820/api.h> 62 - #include <asm/linkage.h> 63 - #include <asm/page.h> 64 - #include <asm/init.h> 65 - #include <asm/pat.h> 66 - #include <asm/smp.h> 67 - 1 + #include <linux/pfn.h> 2 + #include <asm/xen/page.h> 68 3 #include <asm/xen/hypercall.h> 69 - #include <asm/xen/hypervisor.h> 70 - 71 - #include <xen/xen.h> 72 - #include <xen/page.h> 73 - #include <xen/interface/xen.h> 74 - #include <xen/interface/hvm/hvm_op.h> 75 - #include <xen/interface/version.h> 76 4 #include <xen/interface/memory.h> 77 - #include <xen/hvc-console.h> 78 5 79 6 #include "multicalls.h" 80 7 #include "mmu.h" 81 - #include "debugfs.h" 82 8 83 9 /* 84 10 * Protects atomic reservation decrease/increase against concurrent increases. 85 11 * Also protects non-atomic updates of current_pages and balloon lists. 86 12 */ 87 13 DEFINE_SPINLOCK(xen_reservation_lock); 88 - 89 - #ifdef CONFIG_X86_32 90 - /* 91 - * Identity map, in addition to plain kernel map. This needs to be 92 - * large enough to allocate page table pages to allocate the rest. 93 - * Each page can map 2MB. 
94 - */ 95 - #define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4) 96 - static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); 97 - #endif 98 - #ifdef CONFIG_X86_64 99 - /* l3 pud for userspace vsyscall mapping */ 100 - static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; 101 - #endif /* CONFIG_X86_64 */ 102 - 103 - /* 104 - * Note about cr3 (pagetable base) values: 105 - * 106 - * xen_cr3 contains the current logical cr3 value; it contains the 107 - * last set cr3. This may not be the current effective cr3, because 108 - * its update may be being lazily deferred. However, a vcpu looking 109 - * at its own cr3 can use this value knowing that it everything will 110 - * be self-consistent. 111 - * 112 - * xen_current_cr3 contains the actual vcpu cr3; it is set once the 113 - * hypercall to set the vcpu cr3 is complete (so it may be a little 114 - * out of date, but it will never be set early). If one vcpu is 115 - * looking at another vcpu's cr3 value, it should use this variable. 116 - */ 117 - DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ 118 - DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ 119 - 120 - static phys_addr_t xen_pt_base, xen_pt_size __initdata; 121 - 122 - /* 123 - * Just beyond the highest usermode address. STACK_TOP_MAX has a 124 - * redzone above it, so round it up to a PGD boundary. 
125 - */ 126 - #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) 127 14 128 15 unsigned long arbitrary_virt_to_mfn(void *vaddr) 129 16 { ··· 42 155 } 43 156 EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine); 44 157 45 - void make_lowmem_page_readonly(void *vaddr) 46 - { 47 - pte_t *pte, ptev; 48 - unsigned long address = (unsigned long)vaddr; 49 - unsigned int level; 50 - 51 - pte = lookup_address(address, &level); 52 - if (pte == NULL) 53 - return; /* vaddr missing */ 54 - 55 - ptev = pte_wrprotect(*pte); 56 - 57 - if (HYPERVISOR_update_va_mapping(address, ptev, 0)) 58 - BUG(); 59 - } 60 - 61 - void make_lowmem_page_readwrite(void *vaddr) 62 - { 63 - pte_t *pte, ptev; 64 - unsigned long address = (unsigned long)vaddr; 65 - unsigned int level; 66 - 67 - pte = lookup_address(address, &level); 68 - if (pte == NULL) 69 - return; /* vaddr missing */ 70 - 71 - ptev = pte_mkwrite(*pte); 72 - 73 - if (HYPERVISOR_update_va_mapping(address, ptev, 0)) 74 - BUG(); 75 - } 76 - 77 - 78 - static bool xen_page_pinned(void *ptr) 79 - { 80 - struct page *page = virt_to_page(ptr); 81 - 82 - return PagePinned(page); 83 - } 84 - 85 - void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) 86 - { 87 - struct multicall_space mcs; 88 - struct mmu_update *u; 89 - 90 - trace_xen_mmu_set_domain_pte(ptep, pteval, domid); 91 - 92 - mcs = xen_mc_entry(sizeof(*u)); 93 - u = mcs.args; 94 - 95 - /* ptep might be kmapped when using 32-bit HIGHPTE */ 96 - u->ptr = virt_to_machine(ptep).maddr; 97 - u->val = pte_val_ma(pteval); 98 - 99 - MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); 100 - 101 - xen_mc_issue(PARAVIRT_LAZY_MMU); 102 - } 103 - EXPORT_SYMBOL_GPL(xen_set_domain_pte); 104 - 105 - static void xen_extend_mmu_update(const struct mmu_update *update) 106 - { 107 - struct multicall_space mcs; 108 - struct mmu_update *u; 109 - 110 - mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); 111 - 112 - if (mcs.mc != NULL) { 113 - mcs.mc->args[1]++; 114 - } else { 
115 - mcs = __xen_mc_entry(sizeof(*u)); 116 - MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); 117 - } 118 - 119 - u = mcs.args; 120 - *u = *update; 121 - } 122 - 123 - static void xen_extend_mmuext_op(const struct mmuext_op *op) 124 - { 125 - struct multicall_space mcs; 126 - struct mmuext_op *u; 127 - 128 - mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u)); 129 - 130 - if (mcs.mc != NULL) { 131 - mcs.mc->args[1]++; 132 - } else { 133 - mcs = __xen_mc_entry(sizeof(*u)); 134 - MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); 135 - } 136 - 137 - u = mcs.args; 138 - *u = *op; 139 - } 140 - 141 - static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) 142 - { 143 - struct mmu_update u; 144 - 145 - preempt_disable(); 146 - 147 - xen_mc_batch(); 148 - 149 - /* ptr may be ioremapped for 64-bit pagetable setup */ 150 - u.ptr = arbitrary_virt_to_machine(ptr).maddr; 151 - u.val = pmd_val_ma(val); 152 - xen_extend_mmu_update(&u); 153 - 154 - xen_mc_issue(PARAVIRT_LAZY_MMU); 155 - 156 - preempt_enable(); 157 - } 158 - 159 - static void xen_set_pmd(pmd_t *ptr, pmd_t val) 160 - { 161 - trace_xen_mmu_set_pmd(ptr, val); 162 - 163 - /* If page is not pinned, we can just update the entry 164 - directly */ 165 - if (!xen_page_pinned(ptr)) { 166 - *ptr = val; 167 - return; 168 - } 169 - 170 - xen_set_pmd_hyper(ptr, val); 171 - } 172 - 173 - /* 174 - * Associate a virtual page frame with a given physical page frame 175 - * and protection flags for that frame. 
176 - */ 177 - void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) 178 - { 179 - set_pte_vaddr(vaddr, mfn_pte(mfn, flags)); 180 - } 181 - 182 - static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) 183 - { 184 - struct mmu_update u; 185 - 186 - if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) 187 - return false; 188 - 189 - xen_mc_batch(); 190 - 191 - u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; 192 - u.val = pte_val_ma(pteval); 193 - xen_extend_mmu_update(&u); 194 - 195 - xen_mc_issue(PARAVIRT_LAZY_MMU); 196 - 197 - return true; 198 - } 199 - 200 - static inline void __xen_set_pte(pte_t *ptep, pte_t pteval) 201 - { 202 - if (!xen_batched_set_pte(ptep, pteval)) { 203 - /* 204 - * Could call native_set_pte() here and trap and 205 - * emulate the PTE write but with 32-bit guests this 206 - * needs two traps (one for each of the two 32-bit 207 - * words in the PTE) so do one hypercall directly 208 - * instead. 209 - */ 210 - struct mmu_update u; 211 - 212 - u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; 213 - u.val = pte_val_ma(pteval); 214 - HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF); 215 - } 216 - } 217 - 218 - static void xen_set_pte(pte_t *ptep, pte_t pteval) 219 - { 220 - trace_xen_mmu_set_pte(ptep, pteval); 221 - __xen_set_pte(ptep, pteval); 222 - } 223 - 224 - static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 225 - pte_t *ptep, pte_t pteval) 226 - { 227 - trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval); 228 - __xen_set_pte(ptep, pteval); 229 - } 230 - 231 - pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, 232 - unsigned long addr, pte_t *ptep) 233 - { 234 - /* Just return the pte as-is. 
We preserve the bits on commit */ 235 - trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep); 236 - return *ptep; 237 - } 238 - 239 - void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 240 - pte_t *ptep, pte_t pte) 241 - { 242 - struct mmu_update u; 243 - 244 - trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte); 245 - xen_mc_batch(); 246 - 247 - u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; 248 - u.val = pte_val_ma(pte); 249 - xen_extend_mmu_update(&u); 250 - 251 - xen_mc_issue(PARAVIRT_LAZY_MMU); 252 - } 253 - 254 - /* Assume pteval_t is equivalent to all the other *val_t types. */ 255 - static pteval_t pte_mfn_to_pfn(pteval_t val) 256 - { 257 - if (val & _PAGE_PRESENT) { 258 - unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; 259 - unsigned long pfn = mfn_to_pfn(mfn); 260 - 261 - pteval_t flags = val & PTE_FLAGS_MASK; 262 - if (unlikely(pfn == ~0)) 263 - val = flags & ~_PAGE_PRESENT; 264 - else 265 - val = ((pteval_t)pfn << PAGE_SHIFT) | flags; 266 - } 267 - 268 - return val; 269 - } 270 - 271 - static pteval_t pte_pfn_to_mfn(pteval_t val) 272 - { 273 - if (val & _PAGE_PRESENT) { 274 - unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; 275 - pteval_t flags = val & PTE_FLAGS_MASK; 276 - unsigned long mfn; 277 - 278 - if (!xen_feature(XENFEAT_auto_translated_physmap)) 279 - mfn = __pfn_to_mfn(pfn); 280 - else 281 - mfn = pfn; 282 - /* 283 - * If there's no mfn for the pfn, then just create an 284 - * empty non-present pte. Unfortunately this loses 285 - * information about the original pfn, so 286 - * pte_mfn_to_pfn is asymmetric. 
287 - */ 288 - if (unlikely(mfn == INVALID_P2M_ENTRY)) { 289 - mfn = 0; 290 - flags = 0; 291 - } else 292 - mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT); 293 - val = ((pteval_t)mfn << PAGE_SHIFT) | flags; 294 - } 295 - 296 - return val; 297 - } 298 - 299 - __visible pteval_t xen_pte_val(pte_t pte) 300 - { 301 - pteval_t pteval = pte.pte; 302 - 303 - return pte_mfn_to_pfn(pteval); 304 - } 305 - PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); 306 - 307 - __visible pgdval_t xen_pgd_val(pgd_t pgd) 308 - { 309 - return pte_mfn_to_pfn(pgd.pgd); 310 - } 311 - PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); 312 - 313 - __visible pte_t xen_make_pte(pteval_t pte) 314 - { 315 - pte = pte_pfn_to_mfn(pte); 316 - 317 - return native_make_pte(pte); 318 - } 319 - PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); 320 - 321 - __visible pgd_t xen_make_pgd(pgdval_t pgd) 322 - { 323 - pgd = pte_pfn_to_mfn(pgd); 324 - return native_make_pgd(pgd); 325 - } 326 - PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); 327 - 328 - __visible pmdval_t xen_pmd_val(pmd_t pmd) 329 - { 330 - return pte_mfn_to_pfn(pmd.pmd); 331 - } 332 - PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val); 333 - 334 - static void xen_set_pud_hyper(pud_t *ptr, pud_t val) 335 - { 336 - struct mmu_update u; 337 - 338 - preempt_disable(); 339 - 340 - xen_mc_batch(); 341 - 342 - /* ptr may be ioremapped for 64-bit pagetable setup */ 343 - u.ptr = arbitrary_virt_to_machine(ptr).maddr; 344 - u.val = pud_val_ma(val); 345 - xen_extend_mmu_update(&u); 346 - 347 - xen_mc_issue(PARAVIRT_LAZY_MMU); 348 - 349 - preempt_enable(); 350 - } 351 - 352 - static void xen_set_pud(pud_t *ptr, pud_t val) 353 - { 354 - trace_xen_mmu_set_pud(ptr, val); 355 - 356 - /* If page is not pinned, we can just update the entry 357 - directly */ 358 - if (!xen_page_pinned(ptr)) { 359 - *ptr = val; 360 - return; 361 - } 362 - 363 - xen_set_pud_hyper(ptr, val); 364 - } 365 - 366 - #ifdef CONFIG_X86_PAE 367 - static void xen_set_pte_atomic(pte_t *ptep, pte_t pte) 368 - { 369 - 
trace_xen_mmu_set_pte_atomic(ptep, pte); 370 - set_64bit((u64 *)ptep, native_pte_val(pte)); 371 - } 372 - 373 - static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 374 - { 375 - trace_xen_mmu_pte_clear(mm, addr, ptep); 376 - if (!xen_batched_set_pte(ptep, native_make_pte(0))) 377 - native_pte_clear(mm, addr, ptep); 378 - } 379 - 380 - static void xen_pmd_clear(pmd_t *pmdp) 381 - { 382 - trace_xen_mmu_pmd_clear(pmdp); 383 - set_pmd(pmdp, __pmd(0)); 384 - } 385 - #endif /* CONFIG_X86_PAE */ 386 - 387 - __visible pmd_t xen_make_pmd(pmdval_t pmd) 388 - { 389 - pmd = pte_pfn_to_mfn(pmd); 390 - return native_make_pmd(pmd); 391 - } 392 - PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); 393 - 394 - #if CONFIG_PGTABLE_LEVELS == 4 395 - __visible pudval_t xen_pud_val(pud_t pud) 396 - { 397 - return pte_mfn_to_pfn(pud.pud); 398 - } 399 - PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); 400 - 401 - __visible pud_t xen_make_pud(pudval_t pud) 402 - { 403 - pud = pte_pfn_to_mfn(pud); 404 - 405 - return native_make_pud(pud); 406 - } 407 - PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud); 408 - 409 - static pgd_t *xen_get_user_pgd(pgd_t *pgd) 410 - { 411 - pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK); 412 - unsigned offset = pgd - pgd_page; 413 - pgd_t *user_ptr = NULL; 414 - 415 - if (offset < pgd_index(USER_LIMIT)) { 416 - struct page *page = virt_to_page(pgd_page); 417 - user_ptr = (pgd_t *)page->private; 418 - if (user_ptr) 419 - user_ptr += offset; 420 - } 421 - 422 - return user_ptr; 423 - } 424 - 425 - static void __xen_set_p4d_hyper(p4d_t *ptr, p4d_t val) 426 - { 427 - struct mmu_update u; 428 - 429 - u.ptr = virt_to_machine(ptr).maddr; 430 - u.val = p4d_val_ma(val); 431 - xen_extend_mmu_update(&u); 432 - } 433 - 434 - /* 435 - * Raw hypercall-based set_p4d, intended for in early boot before 436 - * there's a page structure. This implies: 437 - * 1. The only existing pagetable is the kernel's 438 - * 2. It is always pinned 439 - * 3. 
It has no user pagetable attached to it 440 - */ 441 - static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val) 442 - { 443 - preempt_disable(); 444 - 445 - xen_mc_batch(); 446 - 447 - __xen_set_p4d_hyper(ptr, val); 448 - 449 - xen_mc_issue(PARAVIRT_LAZY_MMU); 450 - 451 - preempt_enable(); 452 - } 453 - 454 - static void xen_set_p4d(p4d_t *ptr, p4d_t val) 455 - { 456 - pgd_t *user_ptr = xen_get_user_pgd((pgd_t *)ptr); 457 - pgd_t pgd_val; 458 - 459 - trace_xen_mmu_set_p4d(ptr, (p4d_t *)user_ptr, val); 460 - 461 - /* If page is not pinned, we can just update the entry 462 - directly */ 463 - if (!xen_page_pinned(ptr)) { 464 - *ptr = val; 465 - if (user_ptr) { 466 - WARN_ON(xen_page_pinned(user_ptr)); 467 - pgd_val.pgd = p4d_val_ma(val); 468 - *user_ptr = pgd_val; 469 - } 470 - return; 471 - } 472 - 473 - /* If it's pinned, then we can at least batch the kernel and 474 - user updates together. */ 475 - xen_mc_batch(); 476 - 477 - __xen_set_p4d_hyper(ptr, val); 478 - if (user_ptr) 479 - __xen_set_p4d_hyper((p4d_t *)user_ptr, val); 480 - 481 - xen_mc_issue(PARAVIRT_LAZY_MMU); 482 - } 483 - #endif /* CONFIG_PGTABLE_LEVELS == 4 */ 484 - 485 - static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd, 486 - int (*func)(struct mm_struct *mm, struct page *, enum pt_level), 487 - bool last, unsigned long limit) 488 - { 489 - int i, nr, flush = 0; 490 - 491 - nr = last ? pmd_index(limit) + 1 : PTRS_PER_PMD; 492 - for (i = 0; i < nr; i++) { 493 - if (!pmd_none(pmd[i])) 494 - flush |= (*func)(mm, pmd_page(pmd[i]), PT_PTE); 495 - } 496 - return flush; 497 - } 498 - 499 - static int xen_pud_walk(struct mm_struct *mm, pud_t *pud, 500 - int (*func)(struct mm_struct *mm, struct page *, enum pt_level), 501 - bool last, unsigned long limit) 502 - { 503 - int i, nr, flush = 0; 504 - 505 - nr = last ? 
pud_index(limit) + 1 : PTRS_PER_PUD; 506 - for (i = 0; i < nr; i++) { 507 - pmd_t *pmd; 508 - 509 - if (pud_none(pud[i])) 510 - continue; 511 - 512 - pmd = pmd_offset(&pud[i], 0); 513 - if (PTRS_PER_PMD > 1) 514 - flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); 515 - flush |= xen_pmd_walk(mm, pmd, func, 516 - last && i == nr - 1, limit); 517 - } 518 - return flush; 519 - } 520 - 521 - static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d, 522 - int (*func)(struct mm_struct *mm, struct page *, enum pt_level), 523 - bool last, unsigned long limit) 524 - { 525 - int i, nr, flush = 0; 526 - 527 - nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D; 528 - for (i = 0; i < nr; i++) { 529 - pud_t *pud; 530 - 531 - if (p4d_none(p4d[i])) 532 - continue; 533 - 534 - pud = pud_offset(&p4d[i], 0); 535 - if (PTRS_PER_PUD > 1) 536 - flush |= (*func)(mm, virt_to_page(pud), PT_PUD); 537 - flush |= xen_pud_walk(mm, pud, func, 538 - last && i == nr - 1, limit); 539 - } 540 - return flush; 541 - } 542 - 543 - /* 544 - * (Yet another) pagetable walker. This one is intended for pinning a 545 - * pagetable. This means that it walks a pagetable and calls the 546 - * callback function on each page it finds making up the page table, 547 - * at every level. It walks the entire pagetable, but it only bothers 548 - * pinning pte pages which are below limit. In the normal case this 549 - * will be STACK_TOP_MAX, but at boot we need to pin up to 550 - * FIXADDR_TOP. 551 - * 552 - * For 32-bit the important bit is that we don't pin beyond there, 553 - * because then we start getting into Xen's ptes. 554 - * 555 - * For 64-bit, we must skip the Xen hole in the middle of the address 556 - * space, just after the big x86-64 virtual hole. 
557 - */ 558 - static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, 559 - int (*func)(struct mm_struct *mm, struct page *, 560 - enum pt_level), 561 - unsigned long limit) 562 - { 563 - int i, nr, flush = 0; 564 - unsigned hole_low, hole_high; 565 - 566 - /* The limit is the last byte to be touched */ 567 - limit--; 568 - BUG_ON(limit >= FIXADDR_TOP); 569 - 570 - if (xen_feature(XENFEAT_auto_translated_physmap)) 571 - return 0; 572 - 573 - /* 574 - * 64-bit has a great big hole in the middle of the address 575 - * space, which contains the Xen mappings. On 32-bit these 576 - * will end up making a zero-sized hole and so is a no-op. 577 - */ 578 - hole_low = pgd_index(USER_LIMIT); 579 - hole_high = pgd_index(PAGE_OFFSET); 580 - 581 - nr = pgd_index(limit) + 1; 582 - for (i = 0; i < nr; i++) { 583 - p4d_t *p4d; 584 - 585 - if (i >= hole_low && i < hole_high) 586 - continue; 587 - 588 - if (pgd_none(pgd[i])) 589 - continue; 590 - 591 - p4d = p4d_offset(&pgd[i], 0); 592 - if (PTRS_PER_P4D > 1) 593 - flush |= (*func)(mm, virt_to_page(p4d), PT_P4D); 594 - flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit); 595 - } 596 - 597 - /* Do the top level last, so that the callbacks can use it as 598 - a cue to do final things like tlb flushes. */ 599 - flush |= (*func)(mm, virt_to_page(pgd), PT_PGD); 600 - 601 - return flush; 602 - } 603 - 604 - static int xen_pgd_walk(struct mm_struct *mm, 605 - int (*func)(struct mm_struct *mm, struct page *, 606 - enum pt_level), 607 - unsigned long limit) 608 - { 609 - return __xen_pgd_walk(mm, mm->pgd, func, limit); 610 - } 611 - 612 - /* If we're using split pte locks, then take the page's lock and 613 - return a pointer to it. Otherwise return NULL. 
*/ 614 - static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) 615 - { 616 - spinlock_t *ptl = NULL; 617 - 618 - #if USE_SPLIT_PTE_PTLOCKS 619 - ptl = ptlock_ptr(page); 620 - spin_lock_nest_lock(ptl, &mm->page_table_lock); 621 - #endif 622 - 623 - return ptl; 624 - } 625 - 626 - static void xen_pte_unlock(void *v) 627 - { 628 - spinlock_t *ptl = v; 629 - spin_unlock(ptl); 630 - } 631 - 632 - static void xen_do_pin(unsigned level, unsigned long pfn) 633 - { 634 - struct mmuext_op op; 635 - 636 - op.cmd = level; 637 - op.arg1.mfn = pfn_to_mfn(pfn); 638 - 639 - xen_extend_mmuext_op(&op); 640 - } 641 - 642 - static int xen_pin_page(struct mm_struct *mm, struct page *page, 643 - enum pt_level level) 644 - { 645 - unsigned pgfl = TestSetPagePinned(page); 646 - int flush; 647 - 648 - if (pgfl) 649 - flush = 0; /* already pinned */ 650 - else if (PageHighMem(page)) 651 - /* kmaps need flushing if we found an unpinned 652 - highpage */ 653 - flush = 1; 654 - else { 655 - void *pt = lowmem_page_address(page); 656 - unsigned long pfn = page_to_pfn(page); 657 - struct multicall_space mcs = __xen_mc_entry(0); 658 - spinlock_t *ptl; 659 - 660 - flush = 0; 661 - 662 - /* 663 - * We need to hold the pagetable lock between the time 664 - * we make the pagetable RO and when we actually pin 665 - * it. If we don't, then other users may come in and 666 - * attempt to update the pagetable by writing it, 667 - * which will fail because the memory is RO but not 668 - * pinned, so Xen won't do the trap'n'emulate. 669 - * 670 - * If we're using split pte locks, we can't hold the 671 - * entire pagetable's worth of locks during the 672 - * traverse, because we may wrap the preempt count (8 673 - * bits). The solution is to mark RO and pin each PTE 674 - * page while holding the lock. This means the number 675 - * of locks we end up holding is never more than a 676 - * batch size (~32 entries, at present). 
677 - * 678 - * If we're not using split pte locks, we needn't pin 679 - * the PTE pages independently, because we're 680 - * protected by the overall pagetable lock. 681 - */ 682 - ptl = NULL; 683 - if (level == PT_PTE) 684 - ptl = xen_pte_lock(page, mm); 685 - 686 - MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, 687 - pfn_pte(pfn, PAGE_KERNEL_RO), 688 - level == PT_PGD ? UVMF_TLB_FLUSH : 0); 689 - 690 - if (ptl) { 691 - xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); 692 - 693 - /* Queue a deferred unlock for when this batch 694 - is completed. */ 695 - xen_mc_callback(xen_pte_unlock, ptl); 696 - } 697 - } 698 - 699 - return flush; 700 - } 701 - 702 - /* This is called just after a mm has been created, but it has not 703 - been used yet. We need to make sure that its pagetable is all 704 - read-only, and can be pinned. */ 705 - static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) 706 - { 707 - trace_xen_mmu_pgd_pin(mm, pgd); 708 - 709 - xen_mc_batch(); 710 - 711 - if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) { 712 - /* re-enable interrupts for flushing */ 713 - xen_mc_issue(0); 714 - 715 - kmap_flush_unused(); 716 - 717 - xen_mc_batch(); 718 - } 719 - 720 - #ifdef CONFIG_X86_64 721 - { 722 - pgd_t *user_pgd = xen_get_user_pgd(pgd); 723 - 724 - xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); 725 - 726 - if (user_pgd) { 727 - xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); 728 - xen_do_pin(MMUEXT_PIN_L4_TABLE, 729 - PFN_DOWN(__pa(user_pgd))); 730 - } 731 - } 732 - #else /* CONFIG_X86_32 */ 733 - #ifdef CONFIG_X86_PAE 734 - /* Need to make sure unshared kernel PMD is pinnable */ 735 - xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]), 736 - PT_PMD); 737 - #endif 738 - xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); 739 - #endif /* CONFIG_X86_64 */ 740 - xen_mc_issue(0); 741 - } 742 - 743 - static void xen_pgd_pin(struct mm_struct *mm) 744 - { 745 - __xen_pgd_pin(mm, mm->pgd); 746 - } 747 - 748 - /* 749 - * On save, we need to pin all 
pagetables to make sure they get their 750 - * mfns turned into pfns. Search the list for any unpinned pgds and pin 751 - * them (unpinned pgds are not currently in use, probably because the 752 - * process is under construction or destruction). 753 - * 754 - * Expected to be called in stop_machine() ("equivalent to taking 755 - * every spinlock in the system"), so the locking doesn't really 756 - * matter all that much. 757 - */ 758 - void xen_mm_pin_all(void) 759 - { 760 - struct page *page; 761 - 762 - spin_lock(&pgd_lock); 763 - 764 - list_for_each_entry(page, &pgd_list, lru) { 765 - if (!PagePinned(page)) { 766 - __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page)); 767 - SetPageSavePinned(page); 768 - } 769 - } 770 - 771 - spin_unlock(&pgd_lock); 772 - } 773 - 774 - /* 775 - * The init_mm pagetable is really pinned as soon as its created, but 776 - * that's before we have page structures to store the bits. So do all 777 - * the book-keeping now. 778 - */ 779 - static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page, 780 - enum pt_level level) 781 - { 782 - SetPagePinned(page); 783 - return 0; 784 - } 785 - 786 - static void __init xen_mark_init_mm_pinned(void) 787 - { 788 - xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); 789 - } 790 - 791 - static int xen_unpin_page(struct mm_struct *mm, struct page *page, 792 - enum pt_level level) 793 - { 794 - unsigned pgfl = TestClearPagePinned(page); 795 - 796 - if (pgfl && !PageHighMem(page)) { 797 - void *pt = lowmem_page_address(page); 798 - unsigned long pfn = page_to_pfn(page); 799 - spinlock_t *ptl = NULL; 800 - struct multicall_space mcs; 801 - 802 - /* 803 - * Do the converse to pin_page. If we're using split 804 - * pte locks, we must be holding the lock for while 805 - * the pte page is unpinned but still RO to prevent 806 - * concurrent updates from seeing it in this 807 - * partially-pinned state. 
808 - */ 809 - if (level == PT_PTE) { 810 - ptl = xen_pte_lock(page, mm); 811 - 812 - if (ptl) 813 - xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); 814 - } 815 - 816 - mcs = __xen_mc_entry(0); 817 - 818 - MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, 819 - pfn_pte(pfn, PAGE_KERNEL), 820 - level == PT_PGD ? UVMF_TLB_FLUSH : 0); 821 - 822 - if (ptl) { 823 - /* unlock when batch completed */ 824 - xen_mc_callback(xen_pte_unlock, ptl); 825 - } 826 - } 827 - 828 - return 0; /* never need to flush on unpin */ 829 - } 830 - 831 - /* Release a pagetables pages back as normal RW */ 832 - static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd) 833 - { 834 - trace_xen_mmu_pgd_unpin(mm, pgd); 835 - 836 - xen_mc_batch(); 837 - 838 - xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 839 - 840 - #ifdef CONFIG_X86_64 841 - { 842 - pgd_t *user_pgd = xen_get_user_pgd(pgd); 843 - 844 - if (user_pgd) { 845 - xen_do_pin(MMUEXT_UNPIN_TABLE, 846 - PFN_DOWN(__pa(user_pgd))); 847 - xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); 848 - } 849 - } 850 - #endif 851 - 852 - #ifdef CONFIG_X86_PAE 853 - /* Need to make sure unshared kernel PMD is unpinned */ 854 - xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]), 855 - PT_PMD); 856 - #endif 857 - 858 - __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT); 859 - 860 - xen_mc_issue(0); 861 - } 862 - 863 - static void xen_pgd_unpin(struct mm_struct *mm) 864 - { 865 - __xen_pgd_unpin(mm, mm->pgd); 866 - } 867 - 868 - /* 869 - * On resume, undo any pinning done at save, so that the rest of the 870 - * kernel doesn't see any unexpected pinned pagetables. 
871 - */ 872 - void xen_mm_unpin_all(void) 873 - { 874 - struct page *page; 875 - 876 - spin_lock(&pgd_lock); 877 - 878 - list_for_each_entry(page, &pgd_list, lru) { 879 - if (PageSavePinned(page)) { 880 - BUG_ON(!PagePinned(page)); 881 - __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page)); 882 - ClearPageSavePinned(page); 883 - } 884 - } 885 - 886 - spin_unlock(&pgd_lock); 887 - } 888 - 889 - static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) 890 - { 891 - spin_lock(&next->page_table_lock); 892 - xen_pgd_pin(next); 893 - spin_unlock(&next->page_table_lock); 894 - } 895 - 896 - static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) 897 - { 898 - spin_lock(&mm->page_table_lock); 899 - xen_pgd_pin(mm); 900 - spin_unlock(&mm->page_table_lock); 901 - } 902 - 903 - 904 - #ifdef CONFIG_SMP 905 - /* Another cpu may still have their %cr3 pointing at the pagetable, so 906 - we need to repoint it somewhere else before we can unpin it. */ 907 - static void drop_other_mm_ref(void *info) 908 - { 909 - struct mm_struct *mm = info; 910 - struct mm_struct *active_mm; 911 - 912 - active_mm = this_cpu_read(cpu_tlbstate.active_mm); 913 - 914 - if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) 915 - leave_mm(smp_processor_id()); 916 - 917 - /* If this cpu still has a stale cr3 reference, then make sure 918 - it has been flushed. */ 919 - if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd)) 920 - load_cr3(swapper_pg_dir); 921 - } 922 - 923 - static void xen_drop_mm_ref(struct mm_struct *mm) 924 - { 925 - cpumask_var_t mask; 926 - unsigned cpu; 927 - 928 - if (current->active_mm == mm) { 929 - if (current->mm == mm) 930 - load_cr3(swapper_pg_dir); 931 - else 932 - leave_mm(smp_processor_id()); 933 - } 934 - 935 - /* Get the "official" set of cpus referring to our pagetable. 
*/ 936 - if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { 937 - for_each_online_cpu(cpu) { 938 - if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) 939 - && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) 940 - continue; 941 - smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); 942 - } 943 - return; 944 - } 945 - cpumask_copy(mask, mm_cpumask(mm)); 946 - 947 - /* It's possible that a vcpu may have a stale reference to our 948 - cr3, because its in lazy mode, and it hasn't yet flushed 949 - its set of pending hypercalls yet. In this case, we can 950 - look at its actual current cr3 value, and force it to flush 951 - if needed. */ 952 - for_each_online_cpu(cpu) { 953 - if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) 954 - cpumask_set_cpu(cpu, mask); 955 - } 956 - 957 - if (!cpumask_empty(mask)) 958 - smp_call_function_many(mask, drop_other_mm_ref, mm, 1); 959 - free_cpumask_var(mask); 960 - } 961 - #else 962 - static void xen_drop_mm_ref(struct mm_struct *mm) 963 - { 964 - if (current->active_mm == mm) 965 - load_cr3(swapper_pg_dir); 966 - } 967 - #endif 968 - 969 - /* 970 - * While a process runs, Xen pins its pagetables, which means that the 971 - * hypervisor forces it to be read-only, and it controls all updates 972 - * to it. This means that all pagetable updates have to go via the 973 - * hypervisor, which is moderately expensive. 974 - * 975 - * Since we're pulling the pagetable down, we switch to use init_mm, 976 - * unpin old process pagetable and mark it all read-write, which 977 - * allows further operations on it to be simple memory accesses. 978 - * 979 - * The only subtle point is that another CPU may be still using the 980 - * pagetable because of lazy tlb flushing. This means we need need to 981 - * switch all CPUs off this pagetable before we can unpin it. 
982 - */ 983 - static void xen_exit_mmap(struct mm_struct *mm) 984 - { 985 - get_cpu(); /* make sure we don't move around */ 986 - xen_drop_mm_ref(mm); 987 - put_cpu(); 988 - 989 - spin_lock(&mm->page_table_lock); 990 - 991 - /* pgd may not be pinned in the error exit path of execve */ 992 - if (xen_page_pinned(mm->pgd)) 993 - xen_pgd_unpin(mm); 994 - 995 - spin_unlock(&mm->page_table_lock); 996 - } 997 - 998 - static void xen_post_allocator_init(void); 999 - 1000 - static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn) 1001 - { 1002 - struct mmuext_op op; 1003 - 1004 - op.cmd = cmd; 1005 - op.arg1.mfn = pfn_to_mfn(pfn); 1006 - if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) 1007 - BUG(); 1008 - } 1009 - 1010 - #ifdef CONFIG_X86_64 1011 - static void __init xen_cleanhighmap(unsigned long vaddr, 1012 - unsigned long vaddr_end) 1013 - { 1014 - unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; 1015 - pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr); 1016 - 1017 - /* NOTE: The loop is more greedy than the cleanup_highmap variant. 1018 - * We include the PMD passed in on _both_ boundaries. */ 1019 - for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PTRS_PER_PMD)); 1020 - pmd++, vaddr += PMD_SIZE) { 1021 - if (pmd_none(*pmd)) 1022 - continue; 1023 - if (vaddr < (unsigned long) _text || vaddr > kernel_end) 1024 - set_pmd(pmd, __pmd(0)); 1025 - } 1026 - /* In case we did something silly, we should crash in this function 1027 - * instead of somewhere later and be confusing. */ 1028 - xen_mc_flush(); 1029 - } 1030 - 1031 - /* 1032 - * Make a page range writeable and free it. 
1033 - */ 1034 - static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size) 1035 - { 1036 - void *vaddr = __va(paddr); 1037 - void *vaddr_end = vaddr + size; 1038 - 1039 - for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) 1040 - make_lowmem_page_readwrite(vaddr); 1041 - 1042 - memblock_free(paddr, size); 1043 - } 1044 - 1045 - static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin) 1046 - { 1047 - unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK; 1048 - 1049 - if (unpin) 1050 - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa)); 1051 - ClearPagePinned(virt_to_page(__va(pa))); 1052 - xen_free_ro_pages(pa, PAGE_SIZE); 1053 - } 1054 - 1055 - static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin) 1056 - { 1057 - unsigned long pa; 1058 - pte_t *pte_tbl; 1059 - int i; 1060 - 1061 - if (pmd_large(*pmd)) { 1062 - pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK; 1063 - xen_free_ro_pages(pa, PMD_SIZE); 1064 - return; 1065 - } 1066 - 1067 - pte_tbl = pte_offset_kernel(pmd, 0); 1068 - for (i = 0; i < PTRS_PER_PTE; i++) { 1069 - if (pte_none(pte_tbl[i])) 1070 - continue; 1071 - pa = pte_pfn(pte_tbl[i]) << PAGE_SHIFT; 1072 - xen_free_ro_pages(pa, PAGE_SIZE); 1073 - } 1074 - set_pmd(pmd, __pmd(0)); 1075 - xen_cleanmfnmap_free_pgtbl(pte_tbl, unpin); 1076 - } 1077 - 1078 - static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin) 1079 - { 1080 - unsigned long pa; 1081 - pmd_t *pmd_tbl; 1082 - int i; 1083 - 1084 - if (pud_large(*pud)) { 1085 - pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; 1086 - xen_free_ro_pages(pa, PUD_SIZE); 1087 - return; 1088 - } 1089 - 1090 - pmd_tbl = pmd_offset(pud, 0); 1091 - for (i = 0; i < PTRS_PER_PMD; i++) { 1092 - if (pmd_none(pmd_tbl[i])) 1093 - continue; 1094 - xen_cleanmfnmap_pmd(pmd_tbl + i, unpin); 1095 - } 1096 - set_pud(pud, __pud(0)); 1097 - xen_cleanmfnmap_free_pgtbl(pmd_tbl, unpin); 1098 - } 1099 - 1100 - static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin) 1101 - { 1102 - unsigned long pa; 1103 - 
pud_t *pud_tbl; 1104 - int i; 1105 - 1106 - if (p4d_large(*p4d)) { 1107 - pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK; 1108 - xen_free_ro_pages(pa, P4D_SIZE); 1109 - return; 1110 - } 1111 - 1112 - pud_tbl = pud_offset(p4d, 0); 1113 - for (i = 0; i < PTRS_PER_PUD; i++) { 1114 - if (pud_none(pud_tbl[i])) 1115 - continue; 1116 - xen_cleanmfnmap_pud(pud_tbl + i, unpin); 1117 - } 1118 - set_p4d(p4d, __p4d(0)); 1119 - xen_cleanmfnmap_free_pgtbl(pud_tbl, unpin); 1120 - } 1121 - 1122 - /* 1123 - * Since it is well isolated we can (and since it is perhaps large we should) 1124 - * also free the page tables mapping the initial P->M table. 1125 - */ 1126 - static void __init xen_cleanmfnmap(unsigned long vaddr) 1127 - { 1128 - pgd_t *pgd; 1129 - p4d_t *p4d; 1130 - unsigned int i; 1131 - bool unpin; 1132 - 1133 - unpin = (vaddr == 2 * PGDIR_SIZE); 1134 - vaddr &= PMD_MASK; 1135 - pgd = pgd_offset_k(vaddr); 1136 - p4d = p4d_offset(pgd, 0); 1137 - for (i = 0; i < PTRS_PER_P4D; i++) { 1138 - if (p4d_none(p4d[i])) 1139 - continue; 1140 - xen_cleanmfnmap_p4d(p4d + i, unpin); 1141 - } 1142 - if (IS_ENABLED(CONFIG_X86_5LEVEL)) { 1143 - set_pgd(pgd, __pgd(0)); 1144 - xen_cleanmfnmap_free_pgtbl(p4d, unpin); 1145 - } 1146 - } 1147 - 1148 - static void __init xen_pagetable_p2m_free(void) 1149 - { 1150 - unsigned long size; 1151 - unsigned long addr; 1152 - 1153 - size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); 1154 - 1155 - /* No memory or already called. */ 1156 - if ((unsigned long)xen_p2m_addr == xen_start_info->mfn_list) 1157 - return; 1158 - 1159 - /* using __ka address and sticking INVALID_P2M_ENTRY! */ 1160 - memset((void *)xen_start_info->mfn_list, 0xff, size); 1161 - 1162 - addr = xen_start_info->mfn_list; 1163 - /* 1164 - * We could be in __ka space. 1165 - * We roundup to the PMD, which means that if anybody at this stage is 1166 - * using the __ka address of xen_start_info or 1167 - * xen_start_info->shared_info they are in going to crash. 
Fortunatly 1168 - * we have already revectored in xen_setup_kernel_pagetable and in 1169 - * xen_setup_shared_info. 1170 - */ 1171 - size = roundup(size, PMD_SIZE); 1172 - 1173 - if (addr >= __START_KERNEL_map) { 1174 - xen_cleanhighmap(addr, addr + size); 1175 - size = PAGE_ALIGN(xen_start_info->nr_pages * 1176 - sizeof(unsigned long)); 1177 - memblock_free(__pa(addr), size); 1178 - } else { 1179 - xen_cleanmfnmap(addr); 1180 - } 1181 - } 1182 - 1183 - static void __init xen_pagetable_cleanhighmap(void) 1184 - { 1185 - unsigned long size; 1186 - unsigned long addr; 1187 - 1188 - /* At this stage, cleanup_highmap has already cleaned __ka space 1189 - * from _brk_limit way up to the max_pfn_mapped (which is the end of 1190 - * the ramdisk). We continue on, erasing PMD entries that point to page 1191 - * tables - do note that they are accessible at this stage via __va. 1192 - * For good measure we also round up to the PMD - which means that if 1193 - * anybody is using __ka address to the initial boot-stack - and try 1194 - * to use it - they are going to crash. The xen_start_info has been 1195 - * taken care of already in xen_setup_kernel_pagetable. */ 1196 - addr = xen_start_info->pt_base; 1197 - size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE); 1198 - 1199 - xen_cleanhighmap(addr, addr + size); 1200 - xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base)); 1201 - #ifdef DEBUG 1202 - /* This is superfluous and is not necessary, but you know what 1203 - * lets do it. The MODULES_VADDR -> MODULES_END should be clear of 1204 - * anything at this stage. 
*/ 1205 - xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1); 1206 - #endif 1207 - } 1208 - #endif 1209 - 1210 - static void __init xen_pagetable_p2m_setup(void) 1211 - { 1212 - if (xen_feature(XENFEAT_auto_translated_physmap)) 1213 - return; 1214 - 1215 - xen_vmalloc_p2m_tree(); 1216 - 1217 - #ifdef CONFIG_X86_64 1218 - xen_pagetable_p2m_free(); 1219 - 1220 - xen_pagetable_cleanhighmap(); 1221 - #endif 1222 - /* And revector! Bye bye old array */ 1223 - xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; 1224 - } 1225 - 1226 - static void __init xen_pagetable_init(void) 1227 - { 1228 - paging_init(); 1229 - xen_post_allocator_init(); 1230 - 1231 - xen_pagetable_p2m_setup(); 1232 - 1233 - /* Allocate and initialize top and mid mfn levels for p2m structure */ 1234 - xen_build_mfn_list_list(); 1235 - 1236 - /* Remap memory freed due to conflicts with E820 map */ 1237 - if (!xen_feature(XENFEAT_auto_translated_physmap)) 1238 - xen_remap_memory(); 1239 - 1240 - xen_setup_shared_info(); 1241 - } 1242 - static void xen_write_cr2(unsigned long cr2) 1243 - { 1244 - this_cpu_read(xen_vcpu)->arch.cr2 = cr2; 1245 - } 1246 - 1247 - static unsigned long xen_read_cr2(void) 1248 - { 1249 - return this_cpu_read(xen_vcpu)->arch.cr2; 1250 - } 1251 - 1252 - unsigned long xen_read_cr2_direct(void) 1253 - { 1254 - return this_cpu_read(xen_vcpu_info.arch.cr2); 1255 - } 1256 - 1257 158 void xen_flush_tlb_all(void) 1258 159 { 1259 160 struct mmuext_op *op; ··· 61 1386 62 1387 preempt_enable(); 63 1388 } 64 - static void xen_flush_tlb(void) 65 - { 66 - struct mmuext_op *op; 67 - struct multicall_space mcs; 68 - 69 - trace_xen_mmu_flush_tlb(0); 70 - 71 - preempt_disable(); 72 - 73 - mcs = xen_mc_entry(sizeof(*op)); 74 - 75 - op = mcs.args; 76 - op->cmd = MMUEXT_TLB_FLUSH_LOCAL; 77 - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 78 - 79 - xen_mc_issue(PARAVIRT_LAZY_MMU); 80 - 81 - preempt_enable(); 82 - } 83 - 84 - static void xen_flush_tlb_single(unsigned long 
addr) 85 - { 86 - struct mmuext_op *op; 87 - struct multicall_space mcs; 88 - 89 - trace_xen_mmu_flush_tlb_single(addr); 90 - 91 - preempt_disable(); 92 - 93 - mcs = xen_mc_entry(sizeof(*op)); 94 - op = mcs.args; 95 - op->cmd = MMUEXT_INVLPG_LOCAL; 96 - op->arg1.linear_addr = addr & PAGE_MASK; 97 - MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 98 - 99 - xen_mc_issue(PARAVIRT_LAZY_MMU); 100 - 101 - preempt_enable(); 102 - } 103 - 104 - static void xen_flush_tlb_others(const struct cpumask *cpus, 105 - struct mm_struct *mm, unsigned long start, 106 - unsigned long end) 107 - { 108 - struct { 109 - struct mmuext_op op; 110 - #ifdef CONFIG_SMP 111 - DECLARE_BITMAP(mask, num_processors); 112 - #else 113 - DECLARE_BITMAP(mask, NR_CPUS); 114 - #endif 115 - } *args; 116 - struct multicall_space mcs; 117 - 118 - trace_xen_mmu_flush_tlb_others(cpus, mm, start, end); 119 - 120 - if (cpumask_empty(cpus)) 121 - return; /* nothing to do */ 122 - 123 - mcs = xen_mc_entry(sizeof(*args)); 124 - args = mcs.args; 125 - args->op.arg2.vcpumask = to_cpumask(args->mask); 126 - 127 - /* Remove us, and any offline CPUS. 
*/ 128 - cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); 129 - cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask)); 130 - 131 - args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; 132 - if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { 133 - args->op.cmd = MMUEXT_INVLPG_MULTI; 134 - args->op.arg1.linear_addr = start; 135 - } 136 - 137 - MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); 138 - 139 - xen_mc_issue(PARAVIRT_LAZY_MMU); 140 - } 141 - 142 - static unsigned long xen_read_cr3(void) 143 - { 144 - return this_cpu_read(xen_cr3); 145 - } 146 - 147 - static void set_current_cr3(void *v) 148 - { 149 - this_cpu_write(xen_current_cr3, (unsigned long)v); 150 - } 151 - 152 - static void __xen_write_cr3(bool kernel, unsigned long cr3) 153 - { 154 - struct mmuext_op op; 155 - unsigned long mfn; 156 - 157 - trace_xen_mmu_write_cr3(kernel, cr3); 158 - 159 - if (cr3) 160 - mfn = pfn_to_mfn(PFN_DOWN(cr3)); 161 - else 162 - mfn = 0; 163 - 164 - WARN_ON(mfn == 0 && kernel); 165 - 166 - op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; 167 - op.arg1.mfn = mfn; 168 - 169 - xen_extend_mmuext_op(&op); 170 - 171 - if (kernel) { 172 - this_cpu_write(xen_cr3, cr3); 173 - 174 - /* Update xen_current_cr3 once the batch has actually 175 - been submitted. 
*/ 176 - xen_mc_callback(set_current_cr3, (void *)cr3); 177 - } 178 - } 179 - static void xen_write_cr3(unsigned long cr3) 180 - { 181 - BUG_ON(preemptible()); 182 - 183 - xen_mc_batch(); /* disables interrupts */ 184 - 185 - /* Update while interrupts are disabled, so its atomic with 186 - respect to ipis */ 187 - this_cpu_write(xen_cr3, cr3); 188 - 189 - __xen_write_cr3(true, cr3); 190 - 191 - #ifdef CONFIG_X86_64 192 - { 193 - pgd_t *user_pgd = xen_get_user_pgd(__va(cr3)); 194 - if (user_pgd) 195 - __xen_write_cr3(false, __pa(user_pgd)); 196 - else 197 - __xen_write_cr3(false, 0); 198 - } 199 - #endif 200 - 201 - xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ 202 - } 203 - 204 - #ifdef CONFIG_X86_64 205 - /* 206 - * At the start of the day - when Xen launches a guest, it has already 207 - * built pagetables for the guest. We diligently look over them 208 - * in xen_setup_kernel_pagetable and graft as appropriate them in the 209 - * init_level4_pgt and its friends. Then when we are happy we load 210 - * the new init_level4_pgt - and continue on. 211 - * 212 - * The generic code starts (start_kernel) and 'init_mem_mapping' sets 213 - * up the rest of the pagetables. When it has completed it loads the cr3. 214 - * N.B. that baremetal would start at 'start_kernel' (and the early 215 - * #PF handler would create bootstrap pagetables) - so we are running 216 - * with the same assumptions as what to do when write_cr3 is executed 217 - * at this point. 218 - * 219 - * Since there are no user-page tables at all, we have two variants 220 - * of xen_write_cr3 - the early bootup (this one), and the late one 221 - * (xen_write_cr3). The reason we have to do that is that in 64-bit 222 - * the Linux kernel and user-space are both in ring 3 while the 223 - * hypervisor is in ring 0. 
224 - */ 225 - static void __init xen_write_cr3_init(unsigned long cr3) 226 - { 227 - BUG_ON(preemptible()); 228 - 229 - xen_mc_batch(); /* disables interrupts */ 230 - 231 - /* Update while interrupts are disabled, so its atomic with 232 - respect to ipis */ 233 - this_cpu_write(xen_cr3, cr3); 234 - 235 - __xen_write_cr3(true, cr3); 236 - 237 - xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ 238 - } 239 - #endif 240 - 241 - static int xen_pgd_alloc(struct mm_struct *mm) 242 - { 243 - pgd_t *pgd = mm->pgd; 244 - int ret = 0; 245 - 246 - BUG_ON(PagePinned(virt_to_page(pgd))); 247 - 248 - #ifdef CONFIG_X86_64 249 - { 250 - struct page *page = virt_to_page(pgd); 251 - pgd_t *user_pgd; 252 - 253 - BUG_ON(page->private != 0); 254 - 255 - ret = -ENOMEM; 256 - 257 - user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); 258 - page->private = (unsigned long)user_pgd; 259 - 260 - if (user_pgd != NULL) { 261 - #ifdef CONFIG_X86_VSYSCALL_EMULATION 262 - user_pgd[pgd_index(VSYSCALL_ADDR)] = 263 - __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); 264 - #endif 265 - ret = 0; 266 - } 267 - 268 - BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); 269 - } 270 - #endif 271 - return ret; 272 - } 273 - 274 - static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) 275 - { 276 - #ifdef CONFIG_X86_64 277 - pgd_t *user_pgd = xen_get_user_pgd(pgd); 278 - 279 - if (user_pgd) 280 - free_page((unsigned long)user_pgd); 281 - #endif 282 - } 283 - 284 - /* 285 - * Init-time set_pte while constructing initial pagetables, which 286 - * doesn't allow RO page table pages to be remapped RW. 287 - * 288 - * If there is no MFN for this PFN then this page is initially 289 - * ballooned out so clear the PTE (as in decrease_reservation() in 290 - * drivers/xen/balloon.c). 291 - * 292 - * Many of these PTE updates are done on unpinned and writable pages 293 - * and doing a hypercall for these is unnecessary and expensive. 
At 294 - * this point it is not possible to tell if a page is pinned or not, 295 - * so always write the PTE directly and rely on Xen trapping and 296 - * emulating any updates as necessary. 297 - */ 298 - __visible pte_t xen_make_pte_init(pteval_t pte) 299 - { 300 - #ifdef CONFIG_X86_64 301 - unsigned long pfn; 302 - 303 - /* 304 - * Pages belonging to the initial p2m list mapped outside the default 305 - * address range must be mapped read-only. This region contains the 306 - * page tables for mapping the p2m list, too, and page tables MUST be 307 - * mapped read-only. 308 - */ 309 - pfn = (pte & PTE_PFN_MASK) >> PAGE_SHIFT; 310 - if (xen_start_info->mfn_list < __START_KERNEL_map && 311 - pfn >= xen_start_info->first_p2m_pfn && 312 - pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames) 313 - pte &= ~_PAGE_RW; 314 - #endif 315 - pte = pte_pfn_to_mfn(pte); 316 - return native_make_pte(pte); 317 - } 318 - PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init); 319 - 320 - static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) 321 - { 322 - #ifdef CONFIG_X86_32 323 - /* If there's an existing pte, then don't allow _PAGE_RW to be set */ 324 - if (pte_mfn(pte) != INVALID_P2M_ENTRY 325 - && pte_val_ma(*ptep) & _PAGE_PRESENT) 326 - pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) & 327 - pte_val_ma(pte)); 328 - #endif 329 - native_set_pte(ptep, pte); 330 - } 331 - 332 - /* Early in boot, while setting up the initial pagetable, assume 333 - everything is pinned. 
*/ 334 - static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) 335 - { 336 - #ifdef CONFIG_FLATMEM 337 - BUG_ON(mem_map); /* should only be used early */ 338 - #endif 339 - make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 340 - pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); 341 - } 342 - 343 - /* Used for pmd and pud */ 344 - static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) 345 - { 346 - #ifdef CONFIG_FLATMEM 347 - BUG_ON(mem_map); /* should only be used early */ 348 - #endif 349 - make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 350 - } 351 - 352 - /* Early release_pte assumes that all pts are pinned, since there's 353 - only init_mm and anything attached to that is pinned. */ 354 - static void __init xen_release_pte_init(unsigned long pfn) 355 - { 356 - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); 357 - make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 358 - } 359 - 360 - static void __init xen_release_pmd_init(unsigned long pfn) 361 - { 362 - make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 363 - } 364 - 365 - static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn) 366 - { 367 - struct multicall_space mcs; 368 - struct mmuext_op *op; 369 - 370 - mcs = __xen_mc_entry(sizeof(*op)); 371 - op = mcs.args; 372 - op->cmd = cmd; 373 - op->arg1.mfn = pfn_to_mfn(pfn); 374 - 375 - MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); 376 - } 377 - 378 - static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot) 379 - { 380 - struct multicall_space mcs; 381 - unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT); 382 - 383 - mcs = __xen_mc_entry(0); 384 - MULTI_update_va_mapping(mcs.mc, (unsigned long)addr, 385 - pfn_pte(pfn, prot), 0); 386 - } 387 - 388 - /* This needs to make sure the new pte page is pinned iff its being 389 - attached to a pinned pagetable. 
*/ 390 - static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, 391 - unsigned level) 392 - { 393 - bool pinned = PagePinned(virt_to_page(mm->pgd)); 394 - 395 - trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned); 396 - 397 - if (pinned) { 398 - struct page *page = pfn_to_page(pfn); 399 - 400 - SetPagePinned(page); 401 - 402 - if (!PageHighMem(page)) { 403 - xen_mc_batch(); 404 - 405 - __set_pfn_prot(pfn, PAGE_KERNEL_RO); 406 - 407 - if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) 408 - __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); 409 - 410 - xen_mc_issue(PARAVIRT_LAZY_MMU); 411 - } else { 412 - /* make sure there are no stray mappings of 413 - this page */ 414 - kmap_flush_unused(); 415 - } 416 - } 417 - } 418 - 419 - static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn) 420 - { 421 - xen_alloc_ptpage(mm, pfn, PT_PTE); 422 - } 423 - 424 - static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) 425 - { 426 - xen_alloc_ptpage(mm, pfn, PT_PMD); 427 - } 428 - 429 - /* This should never happen until we're OK to use struct page */ 430 - static inline void xen_release_ptpage(unsigned long pfn, unsigned level) 431 - { 432 - struct page *page = pfn_to_page(pfn); 433 - bool pinned = PagePinned(page); 434 - 435 - trace_xen_mmu_release_ptpage(pfn, level, pinned); 436 - 437 - if (pinned) { 438 - if (!PageHighMem(page)) { 439 - xen_mc_batch(); 440 - 441 - if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) 442 - __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); 443 - 444 - __set_pfn_prot(pfn, PAGE_KERNEL); 445 - 446 - xen_mc_issue(PARAVIRT_LAZY_MMU); 447 - } 448 - ClearPagePinned(page); 449 - } 450 - } 451 - 452 - static void xen_release_pte(unsigned long pfn) 453 - { 454 - xen_release_ptpage(pfn, PT_PTE); 455 - } 456 - 457 - static void xen_release_pmd(unsigned long pfn) 458 - { 459 - xen_release_ptpage(pfn, PT_PMD); 460 - } 461 - 462 - #if CONFIG_PGTABLE_LEVELS >= 4 463 - static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) 464 
- { 465 - xen_alloc_ptpage(mm, pfn, PT_PUD); 466 - } 467 - 468 - static void xen_release_pud(unsigned long pfn) 469 - { 470 - xen_release_ptpage(pfn, PT_PUD); 471 - } 472 - #endif 473 - 474 - void __init xen_reserve_top(void) 475 - { 476 - #ifdef CONFIG_X86_32 477 - unsigned long top = HYPERVISOR_VIRT_START; 478 - struct xen_platform_parameters pp; 479 - 480 - if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) 481 - top = pp.virt_start; 482 - 483 - reserve_top_address(-top); 484 - #endif /* CONFIG_X86_32 */ 485 - } 486 - 487 - /* 488 - * Like __va(), but returns address in the kernel mapping (which is 489 - * all we have until the physical memory mapping has been set up. 490 - */ 491 - static void * __init __ka(phys_addr_t paddr) 492 - { 493 - #ifdef CONFIG_X86_64 494 - return (void *)(paddr + __START_KERNEL_map); 495 - #else 496 - return __va(paddr); 497 - #endif 498 - } 499 - 500 - /* Convert a machine address to physical address */ 501 - static unsigned long __init m2p(phys_addr_t maddr) 502 - { 503 - phys_addr_t paddr; 504 - 505 - maddr &= PTE_PFN_MASK; 506 - paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; 507 - 508 - return paddr; 509 - } 510 - 511 - /* Convert a machine address to kernel virtual */ 512 - static void * __init m2v(phys_addr_t maddr) 513 - { 514 - return __ka(m2p(maddr)); 515 - } 516 - 517 - /* Set the page permissions on an identity-mapped pages */ 518 - static void __init set_page_prot_flags(void *addr, pgprot_t prot, 519 - unsigned long flags) 520 - { 521 - unsigned long pfn = __pa(addr) >> PAGE_SHIFT; 522 - pte_t pte = pfn_pte(pfn, prot); 523 - 524 - if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags)) 525 - BUG(); 526 - } 527 - static void __init set_page_prot(void *addr, pgprot_t prot) 528 - { 529 - return set_page_prot_flags(addr, prot, UVMF_NONE); 530 - } 531 - #ifdef CONFIG_X86_32 532 - static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) 533 - { 534 - unsigned pmdidx, pteidx; 
535 - unsigned ident_pte; 536 - unsigned long pfn; 537 - 538 - level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES, 539 - PAGE_SIZE); 540 - 541 - ident_pte = 0; 542 - pfn = 0; 543 - for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { 544 - pte_t *pte_page; 545 - 546 - /* Reuse or allocate a page of ptes */ 547 - if (pmd_present(pmd[pmdidx])) 548 - pte_page = m2v(pmd[pmdidx].pmd); 549 - else { 550 - /* Check for free pte pages */ 551 - if (ident_pte == LEVEL1_IDENT_ENTRIES) 552 - break; 553 - 554 - pte_page = &level1_ident_pgt[ident_pte]; 555 - ident_pte += PTRS_PER_PTE; 556 - 557 - pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); 558 - } 559 - 560 - /* Install mappings */ 561 - for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { 562 - pte_t pte; 563 - 564 - if (pfn > max_pfn_mapped) 565 - max_pfn_mapped = pfn; 566 - 567 - if (!pte_none(pte_page[pteidx])) 568 - continue; 569 - 570 - pte = pfn_pte(pfn, PAGE_KERNEL_EXEC); 571 - pte_page[pteidx] = pte; 572 - } 573 - } 574 - 575 - for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) 576 - set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); 577 - 578 - set_page_prot(pmd, PAGE_KERNEL_RO); 579 - } 580 - #endif 581 - void __init xen_setup_machphys_mapping(void) 582 - { 583 - struct xen_machphys_mapping mapping; 584 - 585 - if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { 586 - machine_to_phys_mapping = (unsigned long *)mapping.v_start; 587 - machine_to_phys_nr = mapping.max_mfn + 1; 588 - } else { 589 - machine_to_phys_nr = MACH2PHYS_NR_ENTRIES; 590 - } 591 - #ifdef CONFIG_X86_32 592 - WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1)) 593 - < machine_to_phys_mapping); 594 - #endif 595 - } 596 - 597 - #ifdef CONFIG_X86_64 598 - static void __init convert_pfn_mfn(void *v) 599 - { 600 - pte_t *pte = v; 601 - int i; 602 - 603 - /* All levels are converted the same way, so just treat them 604 - as ptes. 
*/ 605 - for (i = 0; i < PTRS_PER_PTE; i++) 606 - pte[i] = xen_make_pte(pte[i].pte); 607 - } 608 - static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, 609 - unsigned long addr) 610 - { 611 - if (*pt_base == PFN_DOWN(__pa(addr))) { 612 - set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); 613 - clear_page((void *)addr); 614 - (*pt_base)++; 615 - } 616 - if (*pt_end == PFN_DOWN(__pa(addr))) { 617 - set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); 618 - clear_page((void *)addr); 619 - (*pt_end)--; 620 - } 621 - } 622 - /* 623 - * Set up the initial kernel pagetable. 624 - * 625 - * We can construct this by grafting the Xen provided pagetable into 626 - * head_64.S's preconstructed pagetables. We copy the Xen L2's into 627 - * level2_ident_pgt, and level2_kernel_pgt. This means that only the 628 - * kernel has a physical mapping to start with - but that's enough to 629 - * get __va working. We need to fill in the rest of the physical 630 - * mapping once some sort of allocator has been set up. 631 - */ 632 - void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) 633 - { 634 - pud_t *l3; 635 - pmd_t *l2; 636 - unsigned long addr[3]; 637 - unsigned long pt_base, pt_end; 638 - unsigned i; 639 - 640 - /* max_pfn_mapped is the last pfn mapped in the initial memory 641 - * mappings. Considering that on Xen after the kernel mappings we 642 - * have the mappings of some pages that don't exist in pfn space, we 643 - * set max_pfn_mapped to the last real pfn mapped. 
*/ 644 - if (xen_start_info->mfn_list < __START_KERNEL_map) 645 - max_pfn_mapped = xen_start_info->first_p2m_pfn; 646 - else 647 - max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); 648 - 649 - pt_base = PFN_DOWN(__pa(xen_start_info->pt_base)); 650 - pt_end = pt_base + xen_start_info->nr_pt_frames; 651 - 652 - /* Zap identity mapping */ 653 - init_level4_pgt[0] = __pgd(0); 654 - 655 - if (!xen_feature(XENFEAT_auto_translated_physmap)) { 656 - /* Pre-constructed entries are in pfn, so convert to mfn */ 657 - /* L4[272] -> level3_ident_pgt 658 - * L4[511] -> level3_kernel_pgt */ 659 - convert_pfn_mfn(init_level4_pgt); 660 - 661 - /* L3_i[0] -> level2_ident_pgt */ 662 - convert_pfn_mfn(level3_ident_pgt); 663 - /* L3_k[510] -> level2_kernel_pgt 664 - * L3_k[511] -> level2_fixmap_pgt */ 665 - convert_pfn_mfn(level3_kernel_pgt); 666 - 667 - /* L3_k[511][506] -> level1_fixmap_pgt */ 668 - convert_pfn_mfn(level2_fixmap_pgt); 669 - } 670 - /* We get [511][511] and have Xen's version of level2_kernel_pgt */ 671 - l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); 672 - l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); 673 - 674 - addr[0] = (unsigned long)pgd; 675 - addr[1] = (unsigned long)l3; 676 - addr[2] = (unsigned long)l2; 677 - /* Graft it onto L4[272][0]. Note that we creating an aliasing problem: 678 - * Both L4[272][0] and L4[511][510] have entries that point to the same 679 - * L2 (PMD) tables. Meaning that if you modify it in __va space 680 - * it will be also modified in the __ka space! (But if you just 681 - * modify the PMD table to point to other PTE's or none, then you 682 - * are OK - which is what cleanup_highmap does) */ 683 - copy_page(level2_ident_pgt, l2); 684 - /* Graft it onto L4[511][510] */ 685 - copy_page(level2_kernel_pgt, l2); 686 - 687 - /* Copy the initial P->M table mappings if necessary. 
*/ 688 - i = pgd_index(xen_start_info->mfn_list); 689 - if (i && i < pgd_index(__START_KERNEL_map)) 690 - init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i]; 691 - 692 - if (!xen_feature(XENFEAT_auto_translated_physmap)) { 693 - /* Make pagetable pieces RO */ 694 - set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); 695 - set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); 696 - set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); 697 - set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); 698 - set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); 699 - set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); 700 - set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); 701 - set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO); 702 - 703 - /* Pin down new L4 */ 704 - pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, 705 - PFN_DOWN(__pa_symbol(init_level4_pgt))); 706 - 707 - /* Unpin Xen-provided one */ 708 - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 709 - 710 - /* 711 - * At this stage there can be no user pgd, and no page 712 - * structure to attach it to, so make sure we just set kernel 713 - * pgd. 714 - */ 715 - xen_mc_batch(); 716 - __xen_write_cr3(true, __pa(init_level4_pgt)); 717 - xen_mc_issue(PARAVIRT_LAZY_CPU); 718 - } else 719 - native_write_cr3(__pa(init_level4_pgt)); 720 - 721 - /* We can't that easily rip out L3 and L2, as the Xen pagetables are 722 - * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for 723 - * the initial domain. For guests using the toolstack, they are in: 724 - * [L4], [L3], [L2], [L1], [L1], order .. So for dom0 we can only 725 - * rip out the [L4] (pgd), but for guests we shave off three pages. 
726 - */ 727 - for (i = 0; i < ARRAY_SIZE(addr); i++) 728 - check_pt_base(&pt_base, &pt_end, addr[i]); 729 - 730 - /* Our (by three pages) smaller Xen pagetable that we are using */ 731 - xen_pt_base = PFN_PHYS(pt_base); 732 - xen_pt_size = (pt_end - pt_base) * PAGE_SIZE; 733 - memblock_reserve(xen_pt_base, xen_pt_size); 734 - 735 - /* Revector the xen_start_info */ 736 - xen_start_info = (struct start_info *)__va(__pa(xen_start_info)); 737 - } 738 - 739 - /* 740 - * Read a value from a physical address. 741 - */ 742 - static unsigned long __init xen_read_phys_ulong(phys_addr_t addr) 743 - { 744 - unsigned long *vaddr; 745 - unsigned long val; 746 - 747 - vaddr = early_memremap_ro(addr, sizeof(val)); 748 - val = *vaddr; 749 - early_memunmap(vaddr, sizeof(val)); 750 - return val; 751 - } 752 - 753 - /* 754 - * Translate a virtual address to a physical one without relying on mapped 755 - * page tables. 756 - */ 757 - static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) 758 - { 759 - phys_addr_t pa; 760 - pgd_t pgd; 761 - pud_t pud; 762 - pmd_t pmd; 763 - pte_t pte; 764 - 765 - pa = read_cr3(); 766 - pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) * 767 - sizeof(pgd))); 768 - if (!pgd_present(pgd)) 769 - return 0; 770 - 771 - pa = pgd_val(pgd) & PTE_PFN_MASK; 772 - pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) * 773 - sizeof(pud))); 774 - if (!pud_present(pud)) 775 - return 0; 776 - pa = pud_pfn(pud) << PAGE_SHIFT; 777 - if (pud_large(pud)) 778 - return pa + (vaddr & ~PUD_MASK); 779 - 780 - pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) * 781 - sizeof(pmd))); 782 - if (!pmd_present(pmd)) 783 - return 0; 784 - pa = pmd_pfn(pmd) << PAGE_SHIFT; 785 - if (pmd_large(pmd)) 786 - return pa + (vaddr & ~PMD_MASK); 787 - 788 - pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) * 789 - sizeof(pte))); 790 - if (!pte_present(pte)) 791 - return 0; 792 - pa = pte_pfn(pte) << PAGE_SHIFT; 793 - 794 - 
return pa | (vaddr & ~PAGE_MASK); 795 - } 796 - 797 - /* 798 - * Find a new area for the hypervisor supplied p2m list and relocate the p2m to 799 - * this area. 800 - */ 801 - void __init xen_relocate_p2m(void) 802 - { 803 - phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys; 804 - unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end; 805 - int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d; 806 - pte_t *pt; 807 - pmd_t *pmd; 808 - pud_t *pud; 809 - p4d_t *p4d = NULL; 810 - pgd_t *pgd; 811 - unsigned long *new_p2m; 812 - int save_pud; 813 - 814 - size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); 815 - n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT; 816 - n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT; 817 - n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT; 818 - n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT; 819 - if (PTRS_PER_P4D > 1) 820 - n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT; 821 - else 822 - n_p4d = 0; 823 - n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d; 824 - 825 - new_area = xen_find_free_area(PFN_PHYS(n_frames)); 826 - if (!new_area) { 827 - xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n"); 828 - BUG(); 829 - } 830 - 831 - /* 832 - * Setup the page tables for addressing the new p2m list. 833 - * We have asked the hypervisor to map the p2m list at the user address 834 - * PUD_SIZE. It may have done so, or it may have used a kernel space 835 - * address depending on the Xen version. 836 - * To avoid any possible virtual address collision, just use 837 - * 2 * PUD_SIZE for the new area. 
838 - */ 839 - p4d_phys = new_area; 840 - pud_phys = p4d_phys + PFN_PHYS(n_p4d); 841 - pmd_phys = pud_phys + PFN_PHYS(n_pud); 842 - pt_phys = pmd_phys + PFN_PHYS(n_pmd); 843 - p2m_pfn = PFN_DOWN(pt_phys) + n_pt; 844 - 845 - pgd = __va(read_cr3()); 846 - new_p2m = (unsigned long *)(2 * PGDIR_SIZE); 847 - idx_p4d = 0; 848 - save_pud = n_pud; 849 - do { 850 - if (n_p4d > 0) { 851 - p4d = early_memremap(p4d_phys, PAGE_SIZE); 852 - clear_page(p4d); 853 - n_pud = min(save_pud, PTRS_PER_P4D); 854 - } 855 - for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { 856 - pud = early_memremap(pud_phys, PAGE_SIZE); 857 - clear_page(pud); 858 - for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); 859 - idx_pmd++) { 860 - pmd = early_memremap(pmd_phys, PAGE_SIZE); 861 - clear_page(pmd); 862 - for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); 863 - idx_pt++) { 864 - pt = early_memremap(pt_phys, PAGE_SIZE); 865 - clear_page(pt); 866 - for (idx_pte = 0; 867 - idx_pte < min(n_pte, PTRS_PER_PTE); 868 - idx_pte++) { 869 - set_pte(pt + idx_pte, 870 - pfn_pte(p2m_pfn, PAGE_KERNEL)); 871 - p2m_pfn++; 872 - } 873 - n_pte -= PTRS_PER_PTE; 874 - early_memunmap(pt, PAGE_SIZE); 875 - make_lowmem_page_readonly(__va(pt_phys)); 876 - pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, 877 - PFN_DOWN(pt_phys)); 878 - set_pmd(pmd + idx_pt, 879 - __pmd(_PAGE_TABLE | pt_phys)); 880 - pt_phys += PAGE_SIZE; 881 - } 882 - n_pt -= PTRS_PER_PMD; 883 - early_memunmap(pmd, PAGE_SIZE); 884 - make_lowmem_page_readonly(__va(pmd_phys)); 885 - pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, 886 - PFN_DOWN(pmd_phys)); 887 - set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); 888 - pmd_phys += PAGE_SIZE; 889 - } 890 - n_pmd -= PTRS_PER_PUD; 891 - early_memunmap(pud, PAGE_SIZE); 892 - make_lowmem_page_readonly(__va(pud_phys)); 893 - pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); 894 - if (n_p4d > 0) 895 - set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys)); 896 - else 897 - set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | 
pud_phys)); 898 - pud_phys += PAGE_SIZE; 899 - } 900 - if (n_p4d > 0) { 901 - save_pud -= PTRS_PER_P4D; 902 - early_memunmap(p4d, PAGE_SIZE); 903 - make_lowmem_page_readonly(__va(p4d_phys)); 904 - pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys)); 905 - set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys)); 906 - p4d_phys += PAGE_SIZE; 907 - } 908 - } while (++idx_p4d < n_p4d); 909 - 910 - /* Now copy the old p2m info to the new area. */ 911 - memcpy(new_p2m, xen_p2m_addr, size); 912 - xen_p2m_addr = new_p2m; 913 - 914 - /* Release the old p2m list and set new list info. */ 915 - p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list)); 916 - BUG_ON(!p2m_pfn); 917 - p2m_pfn_end = p2m_pfn + PFN_DOWN(size); 918 - 919 - if (xen_start_info->mfn_list < __START_KERNEL_map) { 920 - pfn = xen_start_info->first_p2m_pfn; 921 - pfn_end = xen_start_info->first_p2m_pfn + 922 - xen_start_info->nr_p2m_frames; 923 - set_pgd(pgd + 1, __pgd(0)); 924 - } else { 925 - pfn = p2m_pfn; 926 - pfn_end = p2m_pfn_end; 927 - } 928 - 929 - memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn)); 930 - while (pfn < pfn_end) { 931 - if (pfn == p2m_pfn) { 932 - pfn = p2m_pfn_end; 933 - continue; 934 - } 935 - make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 936 - pfn++; 937 - } 938 - 939 - xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; 940 - xen_start_info->first_p2m_pfn = PFN_DOWN(new_area); 941 - xen_start_info->nr_p2m_frames = n_frames; 942 - } 943 - 944 - #else /* !CONFIG_X86_64 */ 945 - static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); 946 - static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); 947 - 948 - static void __init xen_write_cr3_init(unsigned long cr3) 949 - { 950 - unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); 951 - 952 - BUG_ON(read_cr3() != __pa(initial_page_table)); 953 - BUG_ON(cr3 != __pa(swapper_pg_dir)); 954 - 955 - /* 956 - * We are switching to swapper_pg_dir for the first time (from 957 - * 
initial_page_table) and therefore need to mark that page 958 - * read-only and then pin it. 959 - * 960 - * Xen disallows sharing of kernel PMDs for PAE 961 - * guests. Therefore we must copy the kernel PMD from 962 - * initial_page_table into a new kernel PMD to be used in 963 - * swapper_pg_dir. 964 - */ 965 - swapper_kernel_pmd = 966 - extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); 967 - copy_page(swapper_kernel_pmd, initial_kernel_pmd); 968 - swapper_pg_dir[KERNEL_PGD_BOUNDARY] = 969 - __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT); 970 - set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO); 971 - 972 - set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); 973 - xen_write_cr3(cr3); 974 - pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn); 975 - 976 - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, 977 - PFN_DOWN(__pa(initial_page_table))); 978 - set_page_prot(initial_page_table, PAGE_KERNEL); 979 - set_page_prot(initial_kernel_pmd, PAGE_KERNEL); 980 - 981 - pv_mmu_ops.write_cr3 = &xen_write_cr3; 982 - } 983 - 984 - /* 985 - * For 32 bit domains xen_start_info->pt_base is the pgd address which might be 986 - * not the first page table in the page table pool. 987 - * Iterate through the initial page tables to find the real page table base. 
988 - */ 989 - static phys_addr_t xen_find_pt_base(pmd_t *pmd) 990 - { 991 - phys_addr_t pt_base, paddr; 992 - unsigned pmdidx; 993 - 994 - pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd)); 995 - 996 - for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) 997 - if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) { 998 - paddr = m2p(pmd[pmdidx].pmd); 999 - pt_base = min(pt_base, paddr); 1000 - } 1001 - 1002 - return pt_base; 1003 - } 1004 - 1005 - void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) 1006 - { 1007 - pmd_t *kernel_pmd; 1008 - 1009 - kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); 1010 - 1011 - xen_pt_base = xen_find_pt_base(kernel_pmd); 1012 - xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE; 1013 - 1014 - initial_kernel_pmd = 1015 - extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE); 1016 - 1017 - max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024); 1018 - 1019 - copy_page(initial_kernel_pmd, kernel_pmd); 1020 - 1021 - xen_map_identity_early(initial_kernel_pmd, max_pfn); 1022 - 1023 - copy_page(initial_page_table, pgd); 1024 - initial_page_table[KERNEL_PGD_BOUNDARY] = 1025 - __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT); 1026 - 1027 - set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO); 1028 - set_page_prot(initial_page_table, PAGE_KERNEL_RO); 1029 - set_page_prot(empty_zero_page, PAGE_KERNEL_RO); 1030 - 1031 - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 1032 - 1033 - pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, 1034 - PFN_DOWN(__pa(initial_page_table))); 1035 - xen_write_cr3(__pa(initial_page_table)); 1036 - 1037 - memblock_reserve(xen_pt_base, xen_pt_size); 1038 - } 1039 - #endif /* CONFIG_X86_64 */ 1040 - 1041 - void __init xen_reserve_special_pages(void) 1042 - { 1043 - phys_addr_t paddr; 1044 - 1045 - memblock_reserve(__pa(xen_start_info), PAGE_SIZE); 1046 - if (xen_start_info->store_mfn) { 1047 - paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn)); 1048 - memblock_reserve(paddr, 
PAGE_SIZE); 1049 - } 1050 - if (!xen_initial_domain()) { 1051 - paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn)); 1052 - memblock_reserve(paddr, PAGE_SIZE); 1053 - } 1054 - } 1055 - 1056 - void __init xen_pt_check_e820(void) 1057 - { 1058 - if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) { 1059 - xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n"); 1060 - BUG(); 1061 - } 1062 - } 1063 - 1064 - static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss; 1065 - 1066 - static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) 1067 - { 1068 - pte_t pte; 1069 - 1070 - phys >>= PAGE_SHIFT; 1071 - 1072 - switch (idx) { 1073 - case FIX_BTMAP_END ... FIX_BTMAP_BEGIN: 1074 - case FIX_RO_IDT: 1075 - #ifdef CONFIG_X86_32 1076 - case FIX_WP_TEST: 1077 - # ifdef CONFIG_HIGHMEM 1078 - case FIX_KMAP_BEGIN ... FIX_KMAP_END: 1079 - # endif 1080 - #elif defined(CONFIG_X86_VSYSCALL_EMULATION) 1081 - case VSYSCALL_PAGE: 1082 - #endif 1083 - case FIX_TEXT_POKE0: 1084 - case FIX_TEXT_POKE1: 1085 - case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END: 1086 - /* All local page mappings */ 1087 - pte = pfn_pte(phys, prot); 1088 - break; 1089 - 1090 - #ifdef CONFIG_X86_LOCAL_APIC 1091 - case FIX_APIC_BASE: /* maps dummy local APIC */ 1092 - pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); 1093 - break; 1094 - #endif 1095 - 1096 - #ifdef CONFIG_X86_IO_APIC 1097 - case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END: 1098 - /* 1099 - * We just don't map the IO APIC - all access is via 1100 - * hypercalls. Keep the address in the pte for reference. 
1101 - */ 1102 - pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL); 1103 - break; 1104 - #endif 1105 - 1106 - case FIX_PARAVIRT_BOOTMAP: 1107 - /* This is an MFN, but it isn't an IO mapping from the 1108 - IO domain */ 1109 - pte = mfn_pte(phys, prot); 1110 - break; 1111 - 1112 - default: 1113 - /* By default, set_fixmap is used for hardware mappings */ 1114 - pte = mfn_pte(phys, prot); 1115 - break; 1116 - } 1117 - 1118 - __native_set_fixmap(idx, pte); 1119 - 1120 - #ifdef CONFIG_X86_VSYSCALL_EMULATION 1121 - /* Replicate changes to map the vsyscall page into the user 1122 - pagetable vsyscall mapping. */ 1123 - if (idx == VSYSCALL_PAGE) { 1124 - unsigned long vaddr = __fix_to_virt(idx); 1125 - set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); 1126 - } 1127 - #endif 1128 - } 1129 - 1130 - static void __init xen_post_allocator_init(void) 1131 - { 1132 - if (xen_feature(XENFEAT_auto_translated_physmap)) 1133 - return; 1134 - 1135 - pv_mmu_ops.set_pte = xen_set_pte; 1136 - pv_mmu_ops.set_pmd = xen_set_pmd; 1137 - pv_mmu_ops.set_pud = xen_set_pud; 1138 - #if CONFIG_PGTABLE_LEVELS >= 4 1139 - pv_mmu_ops.set_p4d = xen_set_p4d; 1140 - #endif 1141 - 1142 - /* This will work as long as patching hasn't happened yet 1143 - (which it hasn't) */ 1144 - pv_mmu_ops.alloc_pte = xen_alloc_pte; 1145 - pv_mmu_ops.alloc_pmd = xen_alloc_pmd; 1146 - pv_mmu_ops.release_pte = xen_release_pte; 1147 - pv_mmu_ops.release_pmd = xen_release_pmd; 1148 - #if CONFIG_PGTABLE_LEVELS >= 4 1149 - pv_mmu_ops.alloc_pud = xen_alloc_pud; 1150 - pv_mmu_ops.release_pud = xen_release_pud; 1151 - #endif 1152 - pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte); 1153 - 1154 - #ifdef CONFIG_X86_64 1155 - pv_mmu_ops.write_cr3 = &xen_write_cr3; 1156 - SetPagePinned(virt_to_page(level3_user_vsyscall)); 1157 - #endif 1158 - xen_mark_init_mm_pinned(); 1159 - } 1160 - 1161 - static void xen_leave_lazy_mmu(void) 1162 - { 1163 - preempt_disable(); 1164 - xen_mc_flush(); 1165 - paravirt_leave_lazy_mmu(); 
1166 - preempt_enable(); 1167 - } 1168 - 1169 - static const struct pv_mmu_ops xen_mmu_ops __initconst = { 1170 - .read_cr2 = xen_read_cr2, 1171 - .write_cr2 = xen_write_cr2, 1172 - 1173 - .read_cr3 = xen_read_cr3, 1174 - .write_cr3 = xen_write_cr3_init, 1175 - 1176 - .flush_tlb_user = xen_flush_tlb, 1177 - .flush_tlb_kernel = xen_flush_tlb, 1178 - .flush_tlb_single = xen_flush_tlb_single, 1179 - .flush_tlb_others = xen_flush_tlb_others, 1180 - 1181 - .pte_update = paravirt_nop, 1182 - 1183 - .pgd_alloc = xen_pgd_alloc, 1184 - .pgd_free = xen_pgd_free, 1185 - 1186 - .alloc_pte = xen_alloc_pte_init, 1187 - .release_pte = xen_release_pte_init, 1188 - .alloc_pmd = xen_alloc_pmd_init, 1189 - .release_pmd = xen_release_pmd_init, 1190 - 1191 - .set_pte = xen_set_pte_init, 1192 - .set_pte_at = xen_set_pte_at, 1193 - .set_pmd = xen_set_pmd_hyper, 1194 - 1195 - .ptep_modify_prot_start = __ptep_modify_prot_start, 1196 - .ptep_modify_prot_commit = __ptep_modify_prot_commit, 1197 - 1198 - .pte_val = PV_CALLEE_SAVE(xen_pte_val), 1199 - .pgd_val = PV_CALLEE_SAVE(xen_pgd_val), 1200 - 1201 - .make_pte = PV_CALLEE_SAVE(xen_make_pte_init), 1202 - .make_pgd = PV_CALLEE_SAVE(xen_make_pgd), 1203 - 1204 - #ifdef CONFIG_X86_PAE 1205 - .set_pte_atomic = xen_set_pte_atomic, 1206 - .pte_clear = xen_pte_clear, 1207 - .pmd_clear = xen_pmd_clear, 1208 - #endif /* CONFIG_X86_PAE */ 1209 - .set_pud = xen_set_pud_hyper, 1210 - 1211 - .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), 1212 - .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), 1213 - 1214 - #if CONFIG_PGTABLE_LEVELS >= 4 1215 - .pud_val = PV_CALLEE_SAVE(xen_pud_val), 1216 - .make_pud = PV_CALLEE_SAVE(xen_make_pud), 1217 - .set_p4d = xen_set_p4d_hyper, 1218 - 1219 - .alloc_pud = xen_alloc_pmd_init, 1220 - .release_pud = xen_release_pmd_init, 1221 - #endif /* CONFIG_PGTABLE_LEVELS == 4 */ 1222 - 1223 - .activate_mm = xen_activate_mm, 1224 - .dup_mmap = xen_dup_mmap, 1225 - .exit_mmap = xen_exit_mmap, 1226 - 1227 - .lazy_mode = { 1228 - .enter = 
paravirt_enter_lazy_mmu, 1229 - .leave = xen_leave_lazy_mmu, 1230 - .flush = paravirt_flush_lazy_mmu, 1231 - }, 1232 - 1233 - .set_fixmap = xen_set_fixmap, 1234 - }; 1235 - 1236 - void __init xen_init_mmu_ops(void) 1237 - { 1238 - x86_init.paging.pagetable_init = xen_pagetable_init; 1239 - 1240 - if (xen_feature(XENFEAT_auto_translated_physmap)) 1241 - return; 1242 - 1243 - pv_mmu_ops = xen_mmu_ops; 1244 - 1245 - memset(dummy_mapping, 0xff, PAGE_SIZE); 1246 - } 1247 - 1248 - /* Protected by xen_reservation_lock. */ 1249 - #define MAX_CONTIG_ORDER 9 /* 2MB */ 1250 - static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER]; 1251 - 1252 - #define VOID_PTE (mfn_pte(0, __pgprot(0))) 1253 - static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order, 1254 - unsigned long *in_frames, 1255 - unsigned long *out_frames) 1256 - { 1257 - int i; 1258 - struct multicall_space mcs; 1259 - 1260 - xen_mc_batch(); 1261 - for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) { 1262 - mcs = __xen_mc_entry(0); 1263 - 1264 - if (in_frames) 1265 - in_frames[i] = virt_to_mfn(vaddr); 1266 - 1267 - MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); 1268 - __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); 1269 - 1270 - if (out_frames) 1271 - out_frames[i] = virt_to_pfn(vaddr); 1272 - } 1273 - xen_mc_issue(0); 1274 - } 1275 - 1276 - /* 1277 - * Update the pfn-to-mfn mappings for a virtual address range, either to 1278 - * point to an array of mfns, or contiguously from a single starting 1279 - * mfn. 
1280 - */ 1281 - static void xen_remap_exchanged_ptes(unsigned long vaddr, int order, 1282 - unsigned long *mfns, 1283 - unsigned long first_mfn) 1284 - { 1285 - unsigned i, limit; 1286 - unsigned long mfn; 1287 - 1288 - xen_mc_batch(); 1289 - 1290 - limit = 1u << order; 1291 - for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) { 1292 - struct multicall_space mcs; 1293 - unsigned flags; 1294 - 1295 - mcs = __xen_mc_entry(0); 1296 - if (mfns) 1297 - mfn = mfns[i]; 1298 - else 1299 - mfn = first_mfn + i; 1300 - 1301 - if (i < (limit - 1)) 1302 - flags = 0; 1303 - else { 1304 - if (order == 0) 1305 - flags = UVMF_INVLPG | UVMF_ALL; 1306 - else 1307 - flags = UVMF_TLB_FLUSH | UVMF_ALL; 1308 - } 1309 - 1310 - MULTI_update_va_mapping(mcs.mc, vaddr, 1311 - mfn_pte(mfn, PAGE_KERNEL), flags); 1312 - 1313 - set_phys_to_machine(virt_to_pfn(vaddr), mfn); 1314 - } 1315 - 1316 - xen_mc_issue(0); 1317 - } 1318 - 1319 - /* 1320 - * Perform the hypercall to exchange a region of our pfns to point to 1321 - * memory with the required contiguous alignment. Takes the pfns as 1322 - * input, and populates mfns as output. 1323 - * 1324 - * Returns a success code indicating whether the hypervisor was able to 1325 - * satisfy the request or not. 
1326 - */ 1327 - static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in, 1328 - unsigned long *pfns_in, 1329 - unsigned long extents_out, 1330 - unsigned int order_out, 1331 - unsigned long *mfns_out, 1332 - unsigned int address_bits) 1333 - { 1334 - long rc; 1335 - int success; 1336 - 1337 - struct xen_memory_exchange exchange = { 1338 - .in = { 1339 - .nr_extents = extents_in, 1340 - .extent_order = order_in, 1341 - .extent_start = pfns_in, 1342 - .domid = DOMID_SELF 1343 - }, 1344 - .out = { 1345 - .nr_extents = extents_out, 1346 - .extent_order = order_out, 1347 - .extent_start = mfns_out, 1348 - .address_bits = address_bits, 1349 - .domid = DOMID_SELF 1350 - } 1351 - }; 1352 - 1353 - BUG_ON(extents_in << order_in != extents_out << order_out); 1354 - 1355 - rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); 1356 - success = (exchange.nr_exchanged == extents_in); 1357 - 1358 - BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); 1359 - BUG_ON(success && (rc != 0)); 1360 - 1361 - return success; 1362 - } 1363 - 1364 - int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, 1365 - unsigned int address_bits, 1366 - dma_addr_t *dma_handle) 1367 - { 1368 - unsigned long *in_frames = discontig_frames, out_frame; 1369 - unsigned long flags; 1370 - int success; 1371 - unsigned long vstart = (unsigned long)phys_to_virt(pstart); 1372 - 1373 - /* 1374 - * Currently an auto-translated guest will not perform I/O, nor will 1375 - * it require PAE page directories below 4GB. Therefore any calls to 1376 - * this function are redundant and can be ignored. 1377 - */ 1378 - 1379 - if (xen_feature(XENFEAT_auto_translated_physmap)) 1380 - return 0; 1381 - 1382 - if (unlikely(order > MAX_CONTIG_ORDER)) 1383 - return -ENOMEM; 1384 - 1385 - memset((void *) vstart, 0, PAGE_SIZE << order); 1386 - 1387 - spin_lock_irqsave(&xen_reservation_lock, flags); 1388 - 1389 - /* 1. Zap current PTEs, remembering MFNs. 
*/ 1390 - xen_zap_pfn_range(vstart, order, in_frames, NULL); 1391 - 1392 - /* 2. Get a new contiguous memory extent. */ 1393 - out_frame = virt_to_pfn(vstart); 1394 - success = xen_exchange_memory(1UL << order, 0, in_frames, 1395 - 1, order, &out_frame, 1396 - address_bits); 1397 - 1398 - /* 3. Map the new extent in place of old pages. */ 1399 - if (success) 1400 - xen_remap_exchanged_ptes(vstart, order, NULL, out_frame); 1401 - else 1402 - xen_remap_exchanged_ptes(vstart, order, in_frames, 0); 1403 - 1404 - spin_unlock_irqrestore(&xen_reservation_lock, flags); 1405 - 1406 - *dma_handle = virt_to_machine(vstart).maddr; 1407 - return success ? 0 : -ENOMEM; 1408 - } 1409 - EXPORT_SYMBOL_GPL(xen_create_contiguous_region); 1410 - 1411 - void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) 1412 - { 1413 - unsigned long *out_frames = discontig_frames, in_frame; 1414 - unsigned long flags; 1415 - int success; 1416 - unsigned long vstart; 1417 - 1418 - if (xen_feature(XENFEAT_auto_translated_physmap)) 1419 - return; 1420 - 1421 - if (unlikely(order > MAX_CONTIG_ORDER)) 1422 - return; 1423 - 1424 - vstart = (unsigned long)phys_to_virt(pstart); 1425 - memset((void *) vstart, 0, PAGE_SIZE << order); 1426 - 1427 - spin_lock_irqsave(&xen_reservation_lock, flags); 1428 - 1429 - /* 1. Find start MFN of contiguous extent. */ 1430 - in_frame = virt_to_mfn(vstart); 1431 - 1432 - /* 2. Zap current PTEs. */ 1433 - xen_zap_pfn_range(vstart, order, NULL, out_frames); 1434 - 1435 - /* 3. Do the exchange for non-contiguous MFNs. */ 1436 - success = xen_exchange_memory(1, order, &in_frame, 1UL << order, 1437 - 0, out_frames, 0); 1438 - 1439 - /* 4. Map new pages in place of old pages. 
*/ 1440 - if (success) 1441 - xen_remap_exchanged_ptes(vstart, order, out_frames, 0); 1442 - else 1443 - xen_remap_exchanged_ptes(vstart, order, NULL, in_frame); 1444 - 1445 - spin_unlock_irqrestore(&xen_reservation_lock, flags); 1446 - } 1447 - EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); 1448 - 1449 - #ifdef CONFIG_XEN_PVHVM 1450 - #ifdef CONFIG_PROC_VMCORE 1451 - /* 1452 - * This function is used in two contexts: 1453 - * - the kdump kernel has to check whether a pfn of the crashed kernel 1454 - * was a ballooned page. vmcore is using this function to decide 1455 - * whether to access a pfn of the crashed kernel. 1456 - * - the kexec kernel has to check whether a pfn was ballooned by the 1457 - * previous kernel. If the pfn is ballooned, handle it properly. 1458 - * Returns 0 if the pfn is not backed by a RAM page, the caller may 1459 - * handle the pfn special in this case. 1460 - */ 1461 - static int xen_oldmem_pfn_is_ram(unsigned long pfn) 1462 - { 1463 - struct xen_hvm_get_mem_type a = { 1464 - .domid = DOMID_SELF, 1465 - .pfn = pfn, 1466 - }; 1467 - int ram; 1468 - 1469 - if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a)) 1470 - return -ENXIO; 1471 - 1472 - switch (a.mem_type) { 1473 - case HVMMEM_mmio_dm: 1474 - ram = 0; 1475 - break; 1476 - case HVMMEM_ram_rw: 1477 - case HVMMEM_ram_ro: 1478 - default: 1479 - ram = 1; 1480 - break; 1481 - } 1482 - 1483 - return ram; 1484 - } 1485 - #endif 1486 - 1487 - static void xen_hvm_exit_mmap(struct mm_struct *mm) 1488 - { 1489 - struct xen_hvm_pagetable_dying a; 1490 - int rc; 1491 - 1492 - a.domid = DOMID_SELF; 1493 - a.gpa = __pa(mm->pgd); 1494 - rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); 1495 - WARN_ON_ONCE(rc < 0); 1496 - } 1497 - 1498 - static int is_pagetable_dying_supported(void) 1499 - { 1500 - struct xen_hvm_pagetable_dying a; 1501 - int rc = 0; 1502 - 1503 - a.domid = DOMID_SELF; 1504 - a.gpa = 0x00; 1505 - rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); 1506 - if (rc < 0) { 1507 - 
printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n"); 1508 - return 0; 1509 - } 1510 - return 1; 1511 - } 1512 - 1513 - void __init xen_hvm_init_mmu_ops(void) 1514 - { 1515 - if (is_pagetable_dying_supported()) 1516 - pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap; 1517 - #ifdef CONFIG_PROC_VMCORE 1518 - register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram); 1519 - #endif 1520 - } 1521 - #endif 1522 1389 1523 1390 #define REMAP_BATCH_SIZE 16 1524 1391 ··· 190 2973 return do_remap_gfn(vma, addr, gfn, nr, err_ptr, prot, domid, pages); 191 2974 } 192 2975 EXPORT_SYMBOL_GPL(xen_remap_domain_gfn_array); 193 - 194 2976 195 2977 /* Returns: 0 success */ 196 2978 int xen_unmap_domain_gfn_range(struct vm_area_struct *vma,
+79
arch/x86/xen/mmu_hvm.c
··· 1 + #include <linux/types.h> 2 + #include <linux/crash_dump.h> 3 + 4 + #include <xen/interface/xen.h> 5 + #include <xen/hvm.h> 6 + 7 + #include "mmu.h" 8 + 9 + #ifdef CONFIG_PROC_VMCORE 10 + /* 11 + * This function is used in two contexts: 12 + * - the kdump kernel has to check whether a pfn of the crashed kernel 13 + * was a ballooned page. vmcore is using this function to decide 14 + * whether to access a pfn of the crashed kernel. 15 + * - the kexec kernel has to check whether a pfn was ballooned by the 16 + * previous kernel. If the pfn is ballooned, handle it properly. 17 + * Returns 0 if the pfn is not backed by a RAM page, the caller may 18 + * handle the pfn special in this case. 19 + */ 20 + static int xen_oldmem_pfn_is_ram(unsigned long pfn) 21 + { 22 + struct xen_hvm_get_mem_type a = { 23 + .domid = DOMID_SELF, 24 + .pfn = pfn, 25 + }; 26 + int ram; 27 + 28 + if (HYPERVISOR_hvm_op(HVMOP_get_mem_type, &a)) 29 + return -ENXIO; 30 + 31 + switch (a.mem_type) { 32 + case HVMMEM_mmio_dm: 33 + ram = 0; 34 + break; 35 + case HVMMEM_ram_rw: 36 + case HVMMEM_ram_ro: 37 + default: 38 + ram = 1; 39 + break; 40 + } 41 + 42 + return ram; 43 + } 44 + #endif 45 + 46 + static void xen_hvm_exit_mmap(struct mm_struct *mm) 47 + { 48 + struct xen_hvm_pagetable_dying a; 49 + int rc; 50 + 51 + a.domid = DOMID_SELF; 52 + a.gpa = __pa(mm->pgd); 53 + rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); 54 + WARN_ON_ONCE(rc < 0); 55 + } 56 + 57 + static int is_pagetable_dying_supported(void) 58 + { 59 + struct xen_hvm_pagetable_dying a; 60 + int rc = 0; 61 + 62 + a.domid = DOMID_SELF; 63 + a.gpa = 0x00; 64 + rc = HYPERVISOR_hvm_op(HVMOP_pagetable_dying, &a); 65 + if (rc < 0) { 66 + printk(KERN_DEBUG "HVMOP_pagetable_dying not supported\n"); 67 + return 0; 68 + } 69 + return 1; 70 + } 71 + 72 + void __init xen_hvm_init_mmu_ops(void) 73 + { 74 + if (is_pagetable_dying_supported()) 75 + pv_mmu_ops.exit_mmap = xen_hvm_exit_mmap; 76 + #ifdef CONFIG_PROC_VMCORE 77 + 
register_oldmem_pfn_is_ram(&xen_oldmem_pfn_is_ram); 78 + #endif 79 + }
+2730
arch/x86/xen/mmu_pv.c
··· 1 + /* 2 + * Xen mmu operations 3 + * 4 + * This file contains the various mmu fetch and update operations. 5 + * The most important job they must perform is the mapping between the 6 + * domain's pfn and the overall machine mfns. 7 + * 8 + * Xen allows guests to directly update the pagetable, in a controlled 9 + * fashion. In other words, the guest modifies the same pagetable 10 + * that the CPU actually uses, which eliminates the overhead of having 11 + * a separate shadow pagetable. 12 + * 13 + * In order to allow this, it falls on the guest domain to map its 14 + * notion of a "physical" pfn - which is just a domain-local linear 15 + * address - into a real "machine address" which the CPU's MMU can 16 + * use. 17 + * 18 + * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be 19 + * inserted directly into the pagetable. When creating a new 20 + * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely, 21 + * when reading the content back with __(pgd|pmd|pte)_val, it converts 22 + * the mfn back into a pfn. 23 + * 24 + * The other constraint is that all pages which make up a pagetable 25 + * must be mapped read-only in the guest. This prevents uncontrolled 26 + * guest updates to the pagetable. Xen strictly enforces this, and 27 + * will disallow any pagetable update which will end up mapping a 28 + * pagetable page RW, and will disallow using any writable page as a 29 + * pagetable. 30 + * 31 + * Naively, when loading %cr3 with the base of a new pagetable, Xen 32 + * would need to validate the whole pagetable before going on. 33 + * Naturally, this is quite slow. The solution is to "pin" a 34 + * pagetable, which enforces all the constraints on the pagetable even 35 + * when it is not actively in use. This menas that Xen can be assured 36 + * that it is still valid when you do load it into %cr3, and doesn't 37 + * need to revalidate it. 
38 + * 39 + * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 40 + */ 41 + #include <linux/sched/mm.h> 42 + #include <linux/highmem.h> 43 + #include <linux/debugfs.h> 44 + #include <linux/bug.h> 45 + #include <linux/vmalloc.h> 46 + #include <linux/export.h> 47 + #include <linux/init.h> 48 + #include <linux/gfp.h> 49 + #include <linux/memblock.h> 50 + #include <linux/seq_file.h> 51 + #include <linux/crash_dump.h> 52 + #ifdef CONFIG_KEXEC_CORE 53 + #include <linux/kexec.h> 54 + #endif 55 + 56 + #include <trace/events/xen.h> 57 + 58 + #include <asm/pgtable.h> 59 + #include <asm/tlbflush.h> 60 + #include <asm/fixmap.h> 61 + #include <asm/mmu_context.h> 62 + #include <asm/setup.h> 63 + #include <asm/paravirt.h> 64 + #include <asm/e820/api.h> 65 + #include <asm/linkage.h> 66 + #include <asm/page.h> 67 + #include <asm/init.h> 68 + #include <asm/pat.h> 69 + #include <asm/smp.h> 70 + 71 + #include <asm/xen/hypercall.h> 72 + #include <asm/xen/hypervisor.h> 73 + 74 + #include <xen/xen.h> 75 + #include <xen/page.h> 76 + #include <xen/interface/xen.h> 77 + #include <xen/interface/hvm/hvm_op.h> 78 + #include <xen/interface/version.h> 79 + #include <xen/interface/memory.h> 80 + #include <xen/hvc-console.h> 81 + 82 + #include "multicalls.h" 83 + #include "mmu.h" 84 + #include "debugfs.h" 85 + 86 + #ifdef CONFIG_X86_32 87 + /* 88 + * Identity map, in addition to plain kernel map. This needs to be 89 + * large enough to allocate page table pages to allocate the rest. 90 + * Each page can map 2MB. 
91 + */ 92 + #define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4) 93 + static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES); 94 + #endif 95 + #ifdef CONFIG_X86_64 96 + /* l3 pud for userspace vsyscall mapping */ 97 + static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; 98 + #endif /* CONFIG_X86_64 */ 99 + 100 + /* 101 + * Note about cr3 (pagetable base) values: 102 + * 103 + * xen_cr3 contains the current logical cr3 value; it contains the 104 + * last set cr3. This may not be the current effective cr3, because 105 + * its update may be being lazily deferred. However, a vcpu looking 106 + * at its own cr3 can use this value knowing that it everything will 107 + * be self-consistent. 108 + * 109 + * xen_current_cr3 contains the actual vcpu cr3; it is set once the 110 + * hypercall to set the vcpu cr3 is complete (so it may be a little 111 + * out of date, but it will never be set early). If one vcpu is 112 + * looking at another vcpu's cr3 value, it should use this variable. 113 + */ 114 + DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ 115 + DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ 116 + 117 + static phys_addr_t xen_pt_base, xen_pt_size __initdata; 118 + 119 + /* 120 + * Just beyond the highest usermode address. STACK_TOP_MAX has a 121 + * redzone above it, so round it up to a PGD boundary. 
122 + */ 123 + #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) 124 + 125 + void make_lowmem_page_readonly(void *vaddr) 126 + { 127 + pte_t *pte, ptev; 128 + unsigned long address = (unsigned long)vaddr; 129 + unsigned int level; 130 + 131 + pte = lookup_address(address, &level); 132 + if (pte == NULL) 133 + return; /* vaddr missing */ 134 + 135 + ptev = pte_wrprotect(*pte); 136 + 137 + if (HYPERVISOR_update_va_mapping(address, ptev, 0)) 138 + BUG(); 139 + } 140 + 141 + void make_lowmem_page_readwrite(void *vaddr) 142 + { 143 + pte_t *pte, ptev; 144 + unsigned long address = (unsigned long)vaddr; 145 + unsigned int level; 146 + 147 + pte = lookup_address(address, &level); 148 + if (pte == NULL) 149 + return; /* vaddr missing */ 150 + 151 + ptev = pte_mkwrite(*pte); 152 + 153 + if (HYPERVISOR_update_va_mapping(address, ptev, 0)) 154 + BUG(); 155 + } 156 + 157 + 158 + static bool xen_page_pinned(void *ptr) 159 + { 160 + struct page *page = virt_to_page(ptr); 161 + 162 + return PagePinned(page); 163 + } 164 + 165 + void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) 166 + { 167 + struct multicall_space mcs; 168 + struct mmu_update *u; 169 + 170 + trace_xen_mmu_set_domain_pte(ptep, pteval, domid); 171 + 172 + mcs = xen_mc_entry(sizeof(*u)); 173 + u = mcs.args; 174 + 175 + /* ptep might be kmapped when using 32-bit HIGHPTE */ 176 + u->ptr = virt_to_machine(ptep).maddr; 177 + u->val = pte_val_ma(pteval); 178 + 179 + MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); 180 + 181 + xen_mc_issue(PARAVIRT_LAZY_MMU); 182 + } 183 + EXPORT_SYMBOL_GPL(xen_set_domain_pte); 184 + 185 + static void xen_extend_mmu_update(const struct mmu_update *update) 186 + { 187 + struct multicall_space mcs; 188 + struct mmu_update *u; 189 + 190 + mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); 191 + 192 + if (mcs.mc != NULL) { 193 + mcs.mc->args[1]++; 194 + } else { 195 + mcs = __xen_mc_entry(sizeof(*u)); 196 + MULTI_mmu_update(mcs.mc, mcs.args, 1, 
NULL, DOMID_SELF); 197 + } 198 + 199 + u = mcs.args; 200 + *u = *update; 201 + } 202 + 203 + static void xen_extend_mmuext_op(const struct mmuext_op *op) 204 + { 205 + struct multicall_space mcs; 206 + struct mmuext_op *u; 207 + 208 + mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u)); 209 + 210 + if (mcs.mc != NULL) { 211 + mcs.mc->args[1]++; 212 + } else { 213 + mcs = __xen_mc_entry(sizeof(*u)); 214 + MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); 215 + } 216 + 217 + u = mcs.args; 218 + *u = *op; 219 + } 220 + 221 + static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) 222 + { 223 + struct mmu_update u; 224 + 225 + preempt_disable(); 226 + 227 + xen_mc_batch(); 228 + 229 + /* ptr may be ioremapped for 64-bit pagetable setup */ 230 + u.ptr = arbitrary_virt_to_machine(ptr).maddr; 231 + u.val = pmd_val_ma(val); 232 + xen_extend_mmu_update(&u); 233 + 234 + xen_mc_issue(PARAVIRT_LAZY_MMU); 235 + 236 + preempt_enable(); 237 + } 238 + 239 + static void xen_set_pmd(pmd_t *ptr, pmd_t val) 240 + { 241 + trace_xen_mmu_set_pmd(ptr, val); 242 + 243 + /* If page is not pinned, we can just update the entry 244 + directly */ 245 + if (!xen_page_pinned(ptr)) { 246 + *ptr = val; 247 + return; 248 + } 249 + 250 + xen_set_pmd_hyper(ptr, val); 251 + } 252 + 253 + /* 254 + * Associate a virtual page frame with a given physical page frame 255 + * and protection flags for that frame. 
256 + */ 257 + void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) 258 + { 259 + set_pte_vaddr(vaddr, mfn_pte(mfn, flags)); 260 + } 261 + 262 + static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) 263 + { 264 + struct mmu_update u; 265 + 266 + if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) 267 + return false; 268 + 269 + xen_mc_batch(); 270 + 271 + u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; 272 + u.val = pte_val_ma(pteval); 273 + xen_extend_mmu_update(&u); 274 + 275 + xen_mc_issue(PARAVIRT_LAZY_MMU); 276 + 277 + return true; 278 + } 279 + 280 + static inline void __xen_set_pte(pte_t *ptep, pte_t pteval) 281 + { 282 + if (!xen_batched_set_pte(ptep, pteval)) { 283 + /* 284 + * Could call native_set_pte() here and trap and 285 + * emulate the PTE write but with 32-bit guests this 286 + * needs two traps (one for each of the two 32-bit 287 + * words in the PTE) so do one hypercall directly 288 + * instead. 289 + */ 290 + struct mmu_update u; 291 + 292 + u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; 293 + u.val = pte_val_ma(pteval); 294 + HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF); 295 + } 296 + } 297 + 298 + static void xen_set_pte(pte_t *ptep, pte_t pteval) 299 + { 300 + trace_xen_mmu_set_pte(ptep, pteval); 301 + __xen_set_pte(ptep, pteval); 302 + } 303 + 304 + static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 305 + pte_t *ptep, pte_t pteval) 306 + { 307 + trace_xen_mmu_set_pte_at(mm, addr, ptep, pteval); 308 + __xen_set_pte(ptep, pteval); 309 + } 310 + 311 + pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, 312 + unsigned long addr, pte_t *ptep) 313 + { 314 + /* Just return the pte as-is. 
   We preserve the bits on commit */
	trace_xen_mmu_ptep_modify_prot_start(mm, addr, ptep, *ptep);
	return *ptep;
}

/* Commit a ptep_modify_prot transaction via a batched mmu_update hypercall. */
void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
				 pte_t *ptep, pte_t pte)
{
	struct mmu_update u;

	trace_xen_mmu_ptep_modify_prot_commit(mm, addr, ptep, pte);
	xen_mc_batch();

	/* PRESERVE_AD: keep hardware-set accessed/dirty bits across the update. */
	u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
	u.val = pte_val_ma(pte);
	xen_extend_mmu_update(&u);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}

/* Assume pteval_t is equivalent to all the other *val_t types. */
static pteval_t pte_mfn_to_pfn(pteval_t val)
{
	if (val & _PAGE_PRESENT) {
		unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
		unsigned long pfn = mfn_to_pfn(mfn);

		pteval_t flags = val & PTE_FLAGS_MASK;
		/* No pfn backs this mfn: hand back a non-present pte. */
		if (unlikely(pfn == ~0))
			val = flags & ~_PAGE_PRESENT;
		else
			val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
	}

	return val;
}

/* Inverse of pte_mfn_to_pfn: translate a pseudo-physical pte to machine. */
static pteval_t pte_pfn_to_mfn(pteval_t val)
{
	if (val & _PAGE_PRESENT) {
		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
		pteval_t flags = val & PTE_FLAGS_MASK;
		unsigned long mfn;

		if (!xen_feature(XENFEAT_auto_translated_physmap))
			mfn = __pfn_to_mfn(pfn);
		else
			mfn = pfn;
		/*
		 * If there's no mfn for the pfn, then just create an
		 * empty non-present pte.  Unfortunately this loses
		 * information about the original pfn, so
		 * pte_mfn_to_pfn is asymmetric.
		 */
		if (unlikely(mfn == INVALID_P2M_ENTRY)) {
			mfn = 0;
			flags = 0;
		} else
			mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
		val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
	}

	return val;
}

__visible pteval_t xen_pte_val(pte_t pte)
{
	pteval_t pteval = pte.pte;

	return pte_mfn_to_pfn(pteval);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);

__visible pgdval_t xen_pgd_val(pgd_t pgd)
{
	return pte_mfn_to_pfn(pgd.pgd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);

__visible pte_t xen_make_pte(pteval_t pte)
{
	pte = pte_pfn_to_mfn(pte);

	return native_make_pte(pte);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);

__visible pgd_t xen_make_pgd(pgdval_t pgd)
{
	pgd = pte_pfn_to_mfn(pgd);
	return native_make_pgd(pgd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);

__visible pmdval_t xen_pmd_val(pmd_t pmd)
{
	return pte_mfn_to_pfn(pmd.pmd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);

/* Update a pud entry unconditionally via the hypervisor (batched). */
static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
{
	struct mmu_update u;

	preempt_disable();

	xen_mc_batch();

	/* ptr may be ioremapped for 64-bit pagetable setup */
	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
	u.val = pud_val_ma(val);
	xen_extend_mmu_update(&u);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

static void xen_set_pud(pud_t *ptr, pud_t val)
{
	trace_xen_mmu_set_pud(ptr, val);

	/* If page is not pinned, we can just update the entry
	   directly */
	if (!xen_page_pinned(ptr)) {
		*ptr = val;
		return;
	}

	xen_set_pud_hyper(ptr, val);
}

#ifdef CONFIG_X86_PAE
static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
{
	trace_xen_mmu_set_pte_atomic(ptep, pte);
	/* 64-bit store so a PAE pte is never seen half-updated. */
	set_64bit((u64 *)ptep, native_pte_val(pte));
}

static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
{
	trace_xen_mmu_pte_clear(mm, addr, ptep);
	/* Fall back to a direct clear if no multicall batch is active. */
	if (!xen_batched_set_pte(ptep, native_make_pte(0)))
		native_pte_clear(mm, addr, ptep);
}

static void xen_pmd_clear(pmd_t *pmdp)
{
	trace_xen_mmu_pmd_clear(pmdp);
	set_pmd(pmdp, __pmd(0));
}
#endif	/* CONFIG_X86_PAE */

__visible pmd_t xen_make_pmd(pmdval_t pmd)
{
	pmd = pte_pfn_to_mfn(pmd);
	return native_make_pmd(pmd);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);

#if CONFIG_PGTABLE_LEVELS == 4
__visible pudval_t xen_pud_val(pud_t pud)
{
	return pte_mfn_to_pfn(pud.pud);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);

__visible pud_t xen_make_pud(pudval_t pud)
{
	pud = pte_pfn_to_mfn(pud);

	return native_make_pud(pud);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);

/*
 * Return the matching entry in the 64-bit user-mode shadow pgd (stored in
 * the pgd page's page->private), or NULL if there is none or the entry is
 * outside the user address range.
 */
static pgd_t *xen_get_user_pgd(pgd_t *pgd)
{
	pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
	unsigned offset = pgd - pgd_page;
	pgd_t *user_ptr = NULL;

	if (offset < pgd_index(USER_LIMIT)) {
		struct page *page = virt_to_page(pgd_page);
		user_ptr = (pgd_t *)page->private;
		if (user_ptr)
			user_ptr += offset;
	}

	return user_ptr;
}

/* Queue a p4d update into the current multicall batch; caller issues it. */
static void __xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
{
	struct mmu_update u;

	u.ptr = virt_to_machine(ptr).maddr;
	u.val = p4d_val_ma(val);
	xen_extend_mmu_update(&u);
}

/*
 * Raw hypercall-based set_p4d, intended for in early boot before
 * there's a page structure.  This implies:
 *  1. The only existing pagetable is the kernel's
 *  2. It is always pinned
 *  3.
 *     It has no user pagetable attached to it
 */
static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val)
{
	preempt_disable();

	xen_mc_batch();

	__xen_set_p4d_hyper(ptr, val);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

static void xen_set_p4d(p4d_t *ptr, p4d_t val)
{
	pgd_t *user_ptr = xen_get_user_pgd((pgd_t *)ptr);
	pgd_t pgd_val;

	trace_xen_mmu_set_p4d(ptr, (p4d_t *)user_ptr, val);

	/* If page is not pinned, we can just update the entry
	   directly */
	if (!xen_page_pinned(ptr)) {
		*ptr = val;
		if (user_ptr) {
			WARN_ON(xen_page_pinned(user_ptr));
			pgd_val.pgd = p4d_val_ma(val);
			*user_ptr = pgd_val;
		}
		return;
	}

	/* If it's pinned, then we can at least batch the kernel and
	   user updates together. */
	xen_mc_batch();

	__xen_set_p4d_hyper(ptr, val);
	if (user_ptr)
		__xen_set_p4d_hyper((p4d_t *)user_ptr, val);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}
#endif	/* CONFIG_PGTABLE_LEVELS == 4 */

/*
 * Walk the pte pages under a pmd, applying @func to each pte page.
 * @last limits the walk to pmd_index(@limit) on the final pmd.
 */
static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
		int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
		bool last, unsigned long limit)
{
	int i, nr, flush = 0;

	nr = last ? pmd_index(limit) + 1 : PTRS_PER_PMD;
	for (i = 0; i < nr; i++) {
		if (!pmd_none(pmd[i]))
			flush |= (*func)(mm, pmd_page(pmd[i]), PT_PTE);
	}
	return flush;
}

/* As xen_pmd_walk, one level up: visit pmd pages then recurse into them. */
static int xen_pud_walk(struct mm_struct *mm, pud_t *pud,
		int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
		bool last, unsigned long limit)
{
	int i, nr, flush = 0;

	nr = last ? pud_index(limit) + 1 : PTRS_PER_PUD;
	for (i = 0; i < nr; i++) {
		pmd_t *pmd;

		if (pud_none(pud[i]))
			continue;

		pmd = pmd_offset(&pud[i], 0);
		if (PTRS_PER_PMD > 1)
			flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
		flush |= xen_pmd_walk(mm, pmd, func,
				last && i == nr - 1, limit);
	}
	return flush;
}

/* As xen_pud_walk, one level up: visit pud pages then recurse into them. */
static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
		int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
		bool last, unsigned long limit)
{
	int i, nr, flush = 0;

	nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D;
	for (i = 0; i < nr; i++) {
		pud_t *pud;

		if (p4d_none(p4d[i]))
			continue;

		pud = pud_offset(&p4d[i], 0);
		if (PTRS_PER_PUD > 1)
			flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
		flush |= xen_pud_walk(mm, pud, func,
				last && i == nr - 1, limit);
	}
	return flush;
}

/*
 * (Yet another) pagetable walker.  This one is intended for pinning a
 * pagetable.  This means that it walks a pagetable and calls the
 * callback function on each page it finds making up the page table,
 * at every level.  It walks the entire pagetable, but it only bothers
 * pinning pte pages which are below limit.  In the normal case this
 * will be STACK_TOP_MAX, but at boot we need to pin up to
 * FIXADDR_TOP.
 *
 * For 32-bit the important bit is that we don't pin beyond there,
 * because then we start getting into Xen's ptes.
 *
 * For 64-bit, we must skip the Xen hole in the middle of the address
 * space, just after the big x86-64 virtual hole.
 */
static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
			  int (*func)(struct mm_struct *mm, struct page *,
				      enum pt_level),
			  unsigned long limit)
{
	int i, nr, flush = 0;
	unsigned hole_low, hole_high;

	/* The limit is the last byte to be touched */
	limit--;
	BUG_ON(limit >= FIXADDR_TOP);

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return 0;

	/*
	 * 64-bit has a great big hole in the middle of the address
	 * space, which contains the Xen mappings.  On 32-bit these
	 * will end up making a zero-sized hole and so is a no-op.
	 */
	hole_low = pgd_index(USER_LIMIT);
	hole_high = pgd_index(PAGE_OFFSET);

	nr = pgd_index(limit) + 1;
	for (i = 0; i < nr; i++) {
		p4d_t *p4d;

		if (i >= hole_low && i < hole_high)
			continue;

		if (pgd_none(pgd[i]))
			continue;

		p4d = p4d_offset(&pgd[i], 0);
		if (PTRS_PER_P4D > 1)
			flush |= (*func)(mm, virt_to_page(p4d), PT_P4D);
		flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
	}

	/* Do the top level last, so that the callbacks can use it as
	   a cue to do final things like tlb flushes. */
	flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);

	return flush;
}

/* Convenience wrapper: walk @mm's own pgd. */
static int xen_pgd_walk(struct mm_struct *mm,
			int (*func)(struct mm_struct *mm, struct page *,
				    enum pt_level),
			unsigned long limit)
{
	return __xen_pgd_walk(mm, mm->pgd, func, limit);
}

/* If we're using split pte locks, then take the page's lock and
   return a pointer to it.  Otherwise return NULL.
 */
static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
{
	spinlock_t *ptl = NULL;

#if USE_SPLIT_PTE_PTLOCKS
	ptl = ptlock_ptr(page);
	spin_lock_nest_lock(ptl, &mm->page_table_lock);
#endif

	return ptl;
}

/* Multicall-completion callback: drop a pte lock taken by xen_pte_lock(). */
static void xen_pte_unlock(void *v)
{
	spinlock_t *ptl = v;
	spin_unlock(ptl);
}

/* Queue a (un)pin mmuext op for @pfn at the given table @level. */
static void xen_do_pin(unsigned level, unsigned long pfn)
{
	struct mmuext_op op;

	op.cmd = level;
	op.arg1.mfn = pfn_to_mfn(pfn);

	xen_extend_mmuext_op(&op);
}

/*
 * Pin callback for __xen_pgd_walk(): mark the page RO and (for pte pages)
 * pin it under its pte lock.  Returns nonzero if kmaps need flushing.
 */
static int xen_pin_page(struct mm_struct *mm, struct page *page,
			enum pt_level level)
{
	unsigned pgfl = TestSetPagePinned(page);
	int flush;

	if (pgfl)
		flush = 0;		/* already pinned */
	else if (PageHighMem(page))
		/* kmaps need flushing if we found an unpinned
		   highpage */
		flush = 1;
	else {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		struct multicall_space mcs = __xen_mc_entry(0);
		spinlock_t *ptl;

		flush = 0;

		/*
		 * We need to hold the pagetable lock between the time
		 * we make the pagetable RO and when we actually pin
		 * it.  If we don't, then other users may come in and
		 * attempt to update the pagetable by writing it,
		 * which will fail because the memory is RO but not
		 * pinned, so Xen won't do the trap'n'emulate.
		 *
		 * If we're using split pte locks, we can't hold the
		 * entire pagetable's worth of locks during the
		 * traverse, because we may wrap the preempt count (8
		 * bits).  The solution is to mark RO and pin each PTE
		 * page while holding the lock.  This means the number
		 * of locks we end up holding is never more than a
		 * batch size (~32 entries, at present).
		 *
		 * If we're not using split pte locks, we needn't pin
		 * the PTE pages independently, because we're
		 * protected by the overall pagetable lock.
		 */
		ptl = NULL;
		if (level == PT_PTE)
			ptl = xen_pte_lock(page, mm);

		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL_RO),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (ptl) {
			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);

			/* Queue a deferred unlock for when this batch
			   is completed. */
			xen_mc_callback(xen_pte_unlock, ptl);
		}
	}

	return flush;
}

/* This is called just after a mm has been created, but it has not
   been used yet.  We need to make sure that its pagetable is all
   read-only, and can be pinned. */
static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
{
	trace_xen_mmu_pgd_pin(mm, pgd);

	xen_mc_batch();

	if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) {
		/* re-enable interrupts for flushing */
		xen_mc_issue(0);

		kmap_flush_unused();

		xen_mc_batch();
	}

#ifdef CONFIG_X86_64
	{
		pgd_t *user_pgd = xen_get_user_pgd(pgd);

		xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));

		if (user_pgd) {
			xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
			xen_do_pin(MMUEXT_PIN_L4_TABLE,
				   PFN_DOWN(__pa(user_pgd)));
		}
	}
#else /* CONFIG_X86_32 */
#ifdef CONFIG_X86_PAE
	/* Need to make sure unshared kernel PMD is pinnable */
	xen_pin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
		     PT_PMD);
#endif
	xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
#endif /* CONFIG_X86_64 */
	xen_mc_issue(0);
}

static void xen_pgd_pin(struct mm_struct *mm)
{
	__xen_pgd_pin(mm, mm->pgd);
}

/*
 * On save, we need to pin all pagetables to make sure they get their
 * mfns turned into pfns.  Search the list for any unpinned pgds and pin
 * them (unpinned pgds are not currently in use, probably because the
 * process is under construction or destruction).
 *
 * Expected to be called in stop_machine() ("equivalent to taking
 * every spinlock in the system"), so the locking doesn't really
 * matter all that much.
 */
void xen_mm_pin_all(void)
{
	struct page *page;

	spin_lock(&pgd_lock);

	list_for_each_entry(page, &pgd_list, lru) {
		if (!PagePinned(page)) {
			__xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
			SetPageSavePinned(page);
		}
	}

	spin_unlock(&pgd_lock);
}

/*
 * The init_mm pagetable is really pinned as soon as it's created, but
 * that's before we have page structures to store the bits.  So do all
 * the book-keeping now.
 */
static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
				  enum pt_level level)
{
	SetPagePinned(page);
	return 0;
}

static void __init xen_mark_init_mm_pinned(void)
{
	xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
}

/* Unpin callback for __xen_pgd_walk(): inverse of xen_pin_page(). */
static int xen_unpin_page(struct mm_struct *mm, struct page *page,
			  enum pt_level level)
{
	unsigned pgfl = TestClearPagePinned(page);

	if (pgfl && !PageHighMem(page)) {
		void *pt = lowmem_page_address(page);
		unsigned long pfn = page_to_pfn(page);
		spinlock_t *ptl = NULL;
		struct multicall_space mcs;

		/*
		 * Do the converse to pin_page.  If we're using split
		 * pte locks, we must be holding the lock for while
		 * the pte page is unpinned but still RO to prevent
		 * concurrent updates from seeing it in this
		 * partially-pinned state.
		 */
		if (level == PT_PTE) {
			ptl = xen_pte_lock(page, mm);

			if (ptl)
				xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
		}

		mcs = __xen_mc_entry(0);

		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
					pfn_pte(pfn, PAGE_KERNEL),
					level == PT_PGD ? UVMF_TLB_FLUSH : 0);

		if (ptl) {
			/* unlock when batch completed */
			xen_mc_callback(xen_pte_unlock, ptl);
		}
	}

	return 0;		/* never need to flush on unpin */
}

/* Release a pagetable's pages back as normal RW */
static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
{
	trace_xen_mmu_pgd_unpin(mm, pgd);

	xen_mc_batch();

	xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

#ifdef CONFIG_X86_64
	{
		pgd_t *user_pgd = xen_get_user_pgd(pgd);

		if (user_pgd) {
			xen_do_pin(MMUEXT_UNPIN_TABLE,
				   PFN_DOWN(__pa(user_pgd)));
			xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
		}
	}
#endif

#ifdef CONFIG_X86_PAE
	/* Need to make sure unshared kernel PMD is unpinned */
	xen_unpin_page(mm, pgd_page(pgd[pgd_index(TASK_SIZE)]),
		       PT_PMD);
#endif

	__xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT);

	xen_mc_issue(0);
}

static void xen_pgd_unpin(struct mm_struct *mm)
{
	__xen_pgd_unpin(mm, mm->pgd);
}

/*
 * On resume, undo any pinning done at save, so that the rest of the
 * kernel doesn't see any unexpected pinned pagetables.
 */
void xen_mm_unpin_all(void)
{
	struct page *page;

	spin_lock(&pgd_lock);

	list_for_each_entry(page, &pgd_list, lru) {
		if (PageSavePinned(page)) {
			BUG_ON(!PagePinned(page));
			__xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
			ClearPageSavePinned(page);
		}
	}

	spin_unlock(&pgd_lock);
}

/* pv-op: pin the incoming mm's pagetable when it becomes active. */
static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
{
	spin_lock(&next->page_table_lock);
	xen_pgd_pin(next);
	spin_unlock(&next->page_table_lock);
}

/* pv-op: pin the new mm's pagetable on fork. */
static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
	spin_lock(&mm->page_table_lock);
	xen_pgd_pin(mm);
	spin_unlock(&mm->page_table_lock);
}


#ifdef CONFIG_SMP
/* Another cpu may still have their %cr3 pointing at the pagetable, so
   we need to repoint it somewhere else before we can unpin it. */
static void drop_other_mm_ref(void *info)
{
	struct mm_struct *mm = info;
	struct mm_struct *active_mm;

	active_mm = this_cpu_read(cpu_tlbstate.active_mm);

	if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
		leave_mm(smp_processor_id());

	/* If this cpu still has a stale cr3 reference, then make sure
	   it has been flushed. */
	if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd))
		load_cr3(swapper_pg_dir);
}

/* Make sure no cpu's %cr3 or lazy-tlb state still references @mm's pgd. */
static void xen_drop_mm_ref(struct mm_struct *mm)
{
	cpumask_var_t mask;
	unsigned cpu;

	if (current->active_mm == mm) {
		if (current->mm == mm)
			load_cr3(swapper_pg_dir);
		else
			leave_mm(smp_processor_id());
	}

	/* Get the "official" set of cpus referring to our pagetable. */
	if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
		/* Allocation failed: fall back to IPIing each candidate cpu. */
		for_each_online_cpu(cpu) {
			if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
			    && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
				continue;
			smp_call_function_single(cpu, drop_other_mm_ref, mm, 1);
		}
		return;
	}
	cpumask_copy(mask, mm_cpumask(mm));

	/* It's possible that a vcpu may have a stale reference to our
	   cr3, because its in lazy mode, and it hasn't yet flushed
	   its set of pending hypercalls yet.  In this case, we can
	   look at its actual current cr3 value, and force it to flush
	   if needed. */
	for_each_online_cpu(cpu) {
		if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
			cpumask_set_cpu(cpu, mask);
	}

	if (!cpumask_empty(mask))
		smp_call_function_many(mask, drop_other_mm_ref, mm, 1);
	free_cpumask_var(mask);
}
#else
static void xen_drop_mm_ref(struct mm_struct *mm)
{
	if (current->active_mm == mm)
		load_cr3(swapper_pg_dir);
}
#endif

/*
 * While a process runs, Xen pins its pagetables, which means that the
 * hypervisor forces it to be read-only, and it controls all updates
 * to it.  This means that all pagetable updates have to go via the
 * hypervisor, which is moderately expensive.
 *
 * Since we're pulling the pagetable down, we switch to use init_mm,
 * unpin old process pagetable and mark it all read-write, which
 * allows further operations on it to be simple memory accesses.
 *
 * The only subtle point is that another CPU may be still using the
 * pagetable because of lazy tlb flushing.  This means we need to
 * switch all CPUs off this pagetable before we can unpin it.
 */
static void xen_exit_mmap(struct mm_struct *mm)
{
	get_cpu();		/* make sure we don't move around */
	xen_drop_mm_ref(mm);
	put_cpu();

	spin_lock(&mm->page_table_lock);

	/* pgd may not be pinned in the error exit path of execve */
	if (xen_page_pinned(mm->pgd))
		xen_pgd_unpin(mm);

	spin_unlock(&mm->page_table_lock);
}

static void xen_post_allocator_init(void);

/* Synchronous (non-batched) pin/unpin of a single pagetable frame. */
static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
{
	struct mmuext_op op;

	op.cmd = cmd;
	op.arg1.mfn = pfn_to_mfn(pfn);
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
		BUG();
}

#ifdef CONFIG_X86_64
/* Clear __ka PMD entries in [vaddr, vaddr_end] outside the kernel image. */
static void __init xen_cleanhighmap(unsigned long vaddr,
				    unsigned long vaddr_end)
{
	unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr);

	/* NOTE: The loop is more greedy than the cleanup_highmap variant.
	 * We include the PMD passed in on _both_ boundaries. */
	for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PTRS_PER_PMD));
			pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > kernel_end)
			set_pmd(pmd, __pmd(0));
	}
	/* In case we did something silly, we should crash in this function
	 * instead of somewhere later and be confusing. */
	xen_mc_flush();
}

/*
 * Make a page range writeable and free it.
 */
static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size)
{
	void *vaddr = __va(paddr);
	void *vaddr_end = vaddr + size;

	for (; vaddr < vaddr_end; vaddr += PAGE_SIZE)
		make_lowmem_page_readwrite(vaddr);

	memblock_free(paddr, size);
}

/* Unpin (if requested) a pagetable page, mark it unpinned and free it RW. */
static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin)
{
	unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK;

	if (unpin)
		pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa));
	ClearPagePinned(virt_to_page(__va(pa)));
	xen_free_ro_pages(pa, PAGE_SIZE);
}

/* Free all memory mapped by a pmd entry, then the pte page itself. */
static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin)
{
	unsigned long pa;
	pte_t *pte_tbl;
	int i;

	/* A huge mapping: free the backing range directly. */
	if (pmd_large(*pmd)) {
		pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK;
		xen_free_ro_pages(pa, PMD_SIZE);
		return;
	}

	pte_tbl = pte_offset_kernel(pmd, 0);
	for (i = 0; i < PTRS_PER_PTE; i++) {
		if (pte_none(pte_tbl[i]))
			continue;
		pa = pte_pfn(pte_tbl[i]) << PAGE_SHIFT;
		xen_free_ro_pages(pa, PAGE_SIZE);
	}
	set_pmd(pmd, __pmd(0));
	xen_cleanmfnmap_free_pgtbl(pte_tbl, unpin);
}

/* As xen_cleanmfnmap_pmd, one level up. */
static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin)
{
	unsigned long pa;
	pmd_t *pmd_tbl;
	int i;

	if (pud_large(*pud)) {
		pa = pud_val(*pud) & PHYSICAL_PAGE_MASK;
		xen_free_ro_pages(pa, PUD_SIZE);
		return;
	}

	pmd_tbl = pmd_offset(pud, 0);
	for (i = 0; i < PTRS_PER_PMD; i++) {
		if (pmd_none(pmd_tbl[i]))
			continue;
		xen_cleanmfnmap_pmd(pmd_tbl + i, unpin);
	}
	set_pud(pud, __pud(0));
	xen_cleanmfnmap_free_pgtbl(pmd_tbl, unpin);
}

/* As xen_cleanmfnmap_pud, one level up. */
static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin)
{
	unsigned long pa;
	pud_t *pud_tbl;
	int i;

	if (p4d_large(*p4d)) {
		pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK;
		xen_free_ro_pages(pa, P4D_SIZE);
		return;
	}

	pud_tbl = pud_offset(p4d, 0);
	for (i = 0; i < PTRS_PER_PUD; i++) {
		if (pud_none(pud_tbl[i]))
			continue;
		xen_cleanmfnmap_pud(pud_tbl + i, unpin);
	}
	set_p4d(p4d, __p4d(0));
	xen_cleanmfnmap_free_pgtbl(pud_tbl, unpin);
}

/*
 * Since it is well isolated we can (and since it is perhaps large we should)
 * also free the page tables mapping the initial P->M table.
 */
static void __init xen_cleanmfnmap(unsigned long vaddr)
{
	pgd_t *pgd;
	p4d_t *p4d;
	unsigned int i;
	bool unpin;

	/* NOTE(review): the 2*PGDIR_SIZE test appears to identify the
	 * hypervisor-placed initial p2m mapping — confirm against the
	 * initial memory layout before changing. */
	unpin = (vaddr == 2 * PGDIR_SIZE);
	vaddr &= PMD_MASK;
	pgd = pgd_offset_k(vaddr);
	p4d = p4d_offset(pgd, 0);
	for (i = 0; i < PTRS_PER_P4D; i++) {
		if (p4d_none(p4d[i]))
			continue;
		xen_cleanmfnmap_p4d(p4d + i, unpin);
	}
	if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
		set_pgd(pgd, __pgd(0));
		xen_cleanmfnmap_free_pgtbl(p4d, unpin);
	}
}

/* Free the initial mfn_list once the p2m tree has taken over. */
static void __init xen_pagetable_p2m_free(void)
{
	unsigned long size;
	unsigned long addr;

	size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));

	/* No memory or already called. */
	if ((unsigned long)xen_p2m_addr == xen_start_info->mfn_list)
		return;

	/* using __ka address and sticking INVALID_P2M_ENTRY! */
	memset((void *)xen_start_info->mfn_list, 0xff, size);

	addr = xen_start_info->mfn_list;
	/*
	 * We could be in __ka space.
	 * We roundup to the PMD, which means that if anybody at this stage is
	 * using the __ka address of xen_start_info or
	 * xen_start_info->shared_info they are in going to crash.  Fortunately
	 * we have already revectored in xen_setup_kernel_pagetable and in
	 * xen_setup_shared_info.
	 */
	size = roundup(size, PMD_SIZE);

	if (addr >= __START_KERNEL_map) {
		xen_cleanhighmap(addr, addr + size);
		size = PAGE_ALIGN(xen_start_info->nr_pages *
				  sizeof(unsigned long));
		memblock_free(__pa(addr), size);
	} else {
		xen_cleanmfnmap(addr);
	}
}

/* Erase leftover __ka mappings of the initial boot pagetables. */
static void __init xen_pagetable_cleanhighmap(void)
{
	unsigned long size;
	unsigned long addr;

	/* At this stage, cleanup_highmap has already cleaned __ka space
	 * from _brk_limit way up to the max_pfn_mapped (which is the end of
	 * the ramdisk). We continue on, erasing PMD entries that point to page
	 * tables - do note that they are accessible at this stage via __va.
	 * For good measure we also round up to the PMD - which means that if
	 * anybody is using __ka address to the initial boot-stack - and try
	 * to use it - they are going to crash. The xen_start_info has been
	 * taken care of already in xen_setup_kernel_pagetable. */
	addr = xen_start_info->pt_base;
	size = roundup(xen_start_info->nr_pt_frames * PAGE_SIZE, PMD_SIZE);

	xen_cleanhighmap(addr, addr + size);
	xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base));
#ifdef DEBUG
	/* This is superfluous and is not necessary, but you know what
	 * lets do it. The MODULES_VADDR -> MODULES_END should be clear of
	 * anything at this stage.
	 */
	xen_cleanhighmap(MODULES_VADDR, roundup(MODULES_VADDR, PUD_SIZE) - 1);
#endif
}
#endif	/* CONFIG_X86_64 */

/* Switch the p2m accounting over to the vmalloc'ed p2m tree. */
static void __init xen_pagetable_p2m_setup(void)
{
	if (xen_feature(XENFEAT_auto_translated_physmap))
		return;

	xen_vmalloc_p2m_tree();

#ifdef CONFIG_X86_64
	xen_pagetable_p2m_free();

	xen_pagetable_cleanhighmap();
#endif
	/* And revector! Bye bye old array */
	xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
}

/* pv-op: full pagetable bring-up once the core mm is ready. */
static void __init xen_pagetable_init(void)
{
	paging_init();
	xen_post_allocator_init();

	xen_pagetable_p2m_setup();

	/* Allocate and initialize top and mid mfn levels for p2m structure */
	xen_build_mfn_list_list();

	/* Remap memory freed due to conflicts with E820 map */
	if (!xen_feature(XENFEAT_auto_translated_physmap))
		xen_remap_memory();

	xen_setup_shared_info();
}

/* %cr2 (fault address) is virtualised through the shared vcpu info. */
static void xen_write_cr2(unsigned long cr2)
{
	this_cpu_read(xen_vcpu)->arch.cr2 = cr2;
}

static unsigned long xen_read_cr2(void)
{
	return this_cpu_read(xen_vcpu)->arch.cr2;
}

unsigned long xen_read_cr2_direct(void)
{
	return this_cpu_read(xen_vcpu_info.arch.cr2);
}

/* Flush this cpu's whole TLB via a batched mmuext hypercall. */
static void xen_flush_tlb(void)
{
	struct mmuext_op *op;
	struct multicall_space mcs;

	trace_xen_mmu_flush_tlb(0);

	preempt_disable();

	mcs = xen_mc_entry(sizeof(*op));

	op = mcs.args;
	op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

/* Invalidate one page on this cpu. */
static void xen_flush_tlb_single(unsigned long addr)
{
	struct mmuext_op *op;
	struct multicall_space mcs;

	trace_xen_mmu_flush_tlb_single(addr);

	preempt_disable();

	mcs = xen_mc_entry(sizeof(*op));
	op = mcs.args;
	op->cmd = MMUEXT_INVLPG_LOCAL;
	op->arg1.linear_addr = addr & PAGE_MASK;
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_MMU);

	preempt_enable();
}

/* Cross-cpu TLB flush: hand the vcpu mask to Xen in a single hypercall. */
static void xen_flush_tlb_others(const struct cpumask *cpus,
				 struct mm_struct *mm, unsigned long start,
				 unsigned long end)
{
	struct {
		struct mmuext_op op;
#ifdef CONFIG_SMP
		DECLARE_BITMAP(mask, num_processors);
#else
		DECLARE_BITMAP(mask, NR_CPUS);
#endif
	} *args;
	struct multicall_space mcs;

	trace_xen_mmu_flush_tlb_others(cpus, mm, start, end);

	if (cpumask_empty(cpus))
		return;		/* nothing to do */

	mcs = xen_mc_entry(sizeof(*args));
	args = mcs.args;
	args->op.arg2.vcpumask = to_cpumask(args->mask);

	/* Remove us, and any offline CPUS. */
	cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask);
	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));

	args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
	/* Single-page range: a targeted invlpg is cheaper than a full flush. */
	if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {
		args->op.cmd = MMUEXT_INVLPG_MULTI;
		args->op.arg1.linear_addr = start;
	}

	MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);

	xen_mc_issue(PARAVIRT_LAZY_MMU);
}

static unsigned long xen_read_cr3(void)
{
	return this_cpu_read(xen_cr3);
}

/* Multicall-completion callback: record the cr3 Xen has actually loaded. */
static void set_current_cr3(void *v)
{
	this_cpu_write(xen_current_cr3, (unsigned long)v);
}

/* Queue a NEW_BASEPTR / NEW_USER_BASEPTR op for the given cr3. */
static void __xen_write_cr3(bool kernel, unsigned long cr3)
{
	struct mmuext_op op;
	unsigned long mfn;

	trace_xen_mmu_write_cr3(kernel, cr3);

	if (cr3)
		mfn = pfn_to_mfn(PFN_DOWN(cr3));
	else
		mfn = 0;

	WARN_ON(mfn == 0 && kernel);

	op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
	op.arg1.mfn = mfn;

	xen_extend_mmuext_op(&op);

	if (kernel) {
		this_cpu_write(xen_cr3, cr3);

		/* Update xen_current_cr3 once the batch has actually
		   been submitted.
 */
		/* Record the new cr3 once the batch has actually gone out. */
		xen_mc_callback(set_current_cr3, (void *)cr3);
	}
}

/*
 * Late (post-boot) cr3 write.  Batches the kernel-cr3 update - and on
 * 64-bit the separate user-pagetable cr3 update - into one multicall.
 */
static void xen_write_cr3(unsigned long cr3)
{
	BUG_ON(preemptible());

	xen_mc_batch();			/* disables interrupts */

	/* Update while interrupts are disabled, so its atomic with
	   respect to ipis */
	this_cpu_write(xen_cr3, cr3);

	__xen_write_cr3(true, cr3);

#ifdef CONFIG_X86_64
	{
		/* Keep the user pagetable root in sync with the kernel one. */
		pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
		if (user_pgd)
			__xen_write_cr3(false, __pa(user_pgd));
		else
			__xen_write_cr3(false, 0);
	}
#endif

	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
}

#ifdef CONFIG_X86_64
/*
 * At the start of the day - when Xen launches a guest, it has already
 * built pagetables for the guest. We diligently look over them
 * in xen_setup_kernel_pagetable and graft as appropriate them in the
 * init_level4_pgt and its friends. Then when we are happy we load
 * the new init_level4_pgt - and continue on.
 *
 * The generic code starts (start_kernel) and 'init_mem_mapping' sets
 * up the rest of the pagetables. When it has completed it loads the cr3.
 * N.B. that baremetal would start at 'start_kernel' (and the early
 * #PF handler would create bootstrap pagetables) - so we are running
 * with the same assumptions as what to do when write_cr3 is executed
 * at this point.
 *
 * Since there are no user-page tables at all, we have two variants
 * of xen_write_cr3 - the early bootup (this one), and the late one
 * (xen_write_cr3). The reason we have to do that is that in 64-bit
 * the Linux kernel and user-space are both in ring 3 while the
 * hypervisor is in ring 0.
 */
static void __init xen_write_cr3_init(unsigned long cr3)
{
	BUG_ON(preemptible());

	xen_mc_batch();			/* disables interrupts */

	/* Update while interrupts are disabled, so its atomic with
	   respect to ipis */
	this_cpu_write(xen_cr3, cr3);

	/* No user pagetables exist this early, so only the kernel cr3. */
	__xen_write_cr3(true, cr3);

	xen_mc_issue(PARAVIRT_LAZY_CPU);  /* interrupts restored */
}
#endif

/*
 * Allocate the per-mm pagetable root.  On 64-bit additionally allocate
 * the shadow "user" pgd (stored in page->private of the kernel pgd's
 * struct page) since kernel and user run in separate address spaces
 * under Xen.  Returns 0 on success, -ENOMEM if the user pgd allocation
 * failed.
 */
static int xen_pgd_alloc(struct mm_struct *mm)
{
	pgd_t *pgd = mm->pgd;
	int ret = 0;

	BUG_ON(PagePinned(virt_to_page(pgd)));

#ifdef CONFIG_X86_64
	{
		struct page *page = virt_to_page(pgd);
		pgd_t *user_pgd;

		BUG_ON(page->private != 0);

		ret = -ENOMEM;

		user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
		/* Stash the user pgd in the kernel pgd's page->private. */
		page->private = (unsigned long)user_pgd;

		if (user_pgd != NULL) {
#ifdef CONFIG_X86_VSYSCALL_EMULATION
			/* Pre-populate the vsyscall slot of the user pgd. */
			user_pgd[pgd_index(VSYSCALL_ADDR)] =
				__pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
#endif
			ret = 0;
		}

		BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
	}
#endif
	return ret;
}

/* Free the 64-bit shadow user pgd allocated by xen_pgd_alloc(). */
static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
#ifdef CONFIG_X86_64
	pgd_t *user_pgd = xen_get_user_pgd(pgd);

	if (user_pgd)
		free_page((unsigned long)user_pgd);
#endif
}

/*
 * Init-time set_pte while constructing initial pagetables, which
 * doesn't allow RO page table pages to be remapped RW.
 *
 * If there is no MFN for this PFN then this page is initially
 * ballooned out so clear the PTE (as in decrease_reservation() in
 * drivers/xen/balloon.c).
 *
 * Many of these PTE updates are done on unpinned and writable pages
 * and doing a hypercall for these is unnecessary and expensive. At
 * this point it is not possible to tell if a page is pinned or not,
 * so always write the PTE directly and rely on Xen trapping and
 * emulating any updates as necessary.
 */
__visible pte_t xen_make_pte_init(pteval_t pte)
{
#ifdef CONFIG_X86_64
	unsigned long pfn;

	/*
	 * Pages belonging to the initial p2m list mapped outside the default
	 * address range must be mapped read-only. This region contains the
	 * page tables for mapping the p2m list, too, and page tables MUST be
	 * mapped read-only.
	 */
	pfn = (pte & PTE_PFN_MASK) >> PAGE_SHIFT;
	if (xen_start_info->mfn_list < __START_KERNEL_map &&
	    pfn >= xen_start_info->first_p2m_pfn &&
	    pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames)
		pte &= ~_PAGE_RW;
#endif
	/* Translate the pfn-based pte into Xen's mfn-based machine pte. */
	pte = pte_pfn_to_mfn(pte);
	return native_make_pte(pte);
}
PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init);

/* Init-time set_pte: write directly, never loosen an existing RO mapping. */
static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
{
#ifdef CONFIG_X86_32
	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
	if (pte_mfn(pte) != INVALID_P2M_ENTRY
	    && pte_val_ma(*ptep) & _PAGE_PRESENT)
		pte = __pte_ma(((pte_val_ma(*ptep) & _PAGE_RW) | ~_PAGE_RW) &
			       pte_val_ma(pte));
#endif
	native_set_pte(ptep, pte);
}

/* Early in boot, while setting up the initial pagetable, assume
   everything is pinned.
 */
static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
{
#ifdef CONFIG_FLATMEM
	BUG_ON(mem_map);	/* should only be used early */
#endif
	/* New L1 table: make it RO first, then tell Xen to pin it. */
	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
	pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
}

/* Used for pmd and pud */
static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
{
#ifdef CONFIG_FLATMEM
	BUG_ON(mem_map);	/* should only be used early */
#endif
	/* Upper-level tables only need to be read-only; no explicit pin. */
	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
}

/* Early release_pte assumes that all pts are pinned, since there's
   only init_mm and anything attached to that is pinned. */
static void __init xen_release_pte_init(unsigned long pfn)
{
	/* Unpin first, only then is Xen willing to let us remap it RW. */
	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}

/* Early release for pmd/pud level tables: just make the page RW again. */
static void __init xen_release_pmd_init(unsigned long pfn)
{
	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
}

/* Queue an MMUEXT pin/unpin op for @pfn on the current multicall batch. */
static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
{
	struct multicall_space mcs;
	struct mmuext_op *op;

	mcs = __xen_mc_entry(sizeof(*op));
	op = mcs.args;
	op->cmd = cmd;
	op->arg1.mfn = pfn_to_mfn(pfn);

	MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
}

/* Queue a remap of @pfn's lowmem mapping to protection @prot (batched). */
static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot)
{
	struct multicall_space mcs;
	unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT);

	mcs = __xen_mc_entry(0);
	MULTI_update_va_mapping(mcs.mc, (unsigned long)addr,
				pfn_pte(pfn, prot), 0);
}

/* This needs to make sure the new pte page is pinned iff its being
   attached to a pinned pagetable.
 */
static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn,
				    unsigned level)
{
	bool pinned = PagePinned(virt_to_page(mm->pgd));

	trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned);

	if (pinned) {
		struct page *page = pfn_to_page(pfn);

		SetPagePinned(page);

		if (!PageHighMem(page)) {
			xen_mc_batch();

			/* Must be RO before Xen will accept it as a pagetable. */
			__set_pfn_prot(pfn, PAGE_KERNEL_RO);

			if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
				__pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);

			xen_mc_issue(PARAVIRT_LAZY_MMU);
		} else {
			/* make sure there are no stray mappings of
			   this page */
			kmap_flush_unused();
		}
	}
}

/* alloc_pte hook: new L1 (pte) pagetable page. */
static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
{
	xen_alloc_ptpage(mm, pfn, PT_PTE);
}

/* alloc_pmd hook: new L2 (pmd) pagetable page. */
static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
{
	xen_alloc_ptpage(mm, pfn, PT_PMD);
}

/* This should never happen until we're OK to use struct page */
static inline void xen_release_ptpage(unsigned long pfn, unsigned level)
{
	struct page *page = pfn_to_page(pfn);
	bool pinned = PagePinned(page);

	trace_xen_mmu_release_ptpage(pfn, level, pinned);

	if (pinned) {
		if (!PageHighMem(page)) {
			xen_mc_batch();

			/* Reverse of alloc: unpin first, then remap RW. */
			if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS)
				__pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);

			__set_pfn_prot(pfn, PAGE_KERNEL);

			xen_mc_issue(PARAVIRT_LAZY_MMU);
		}
		ClearPagePinned(page);
	}
}

/* release_pte hook: L1 pagetable page is being freed. */
static void xen_release_pte(unsigned long pfn)
{
	xen_release_ptpage(pfn, PT_PTE);
}

/* release_pmd hook: L2 pagetable page is being freed. */
static void xen_release_pmd(unsigned long pfn)
{
	xen_release_ptpage(pfn, PT_PMD);
}

#if CONFIG_PGTABLE_LEVELS >= 4
/* alloc_pud hook: new L3 (pud) pagetable page (4-level pagetables only). */
static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
{
	xen_alloc_ptpage(mm, pfn, PT_PUD);
}

/* release_pud hook: L3 pagetable page is being freed. */
static void xen_release_pud(unsigned long pfn)
{
	xen_release_ptpage(pfn, PT_PUD);
}
#endif

/*
 * On 32-bit, carve the hypervisor hole out of the top of the kernel
 * address space by asking Xen where its virtual area starts.
 */
void __init xen_reserve_top(void)
{
#ifdef CONFIG_X86_32
	unsigned long top = HYPERVISOR_VIRT_START;
	struct xen_platform_parameters pp;

	if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
		top = pp.virt_start;

	reserve_top_address(-top);
#endif	/* CONFIG_X86_32 */
}

/*
 * Like __va(), but returns address in the kernel mapping (which is
 * all we have until the physical memory mapping has been set up).
 */
static void * __init __ka(phys_addr_t paddr)
{
#ifdef CONFIG_X86_64
	return (void *)(paddr + __START_KERNEL_map);
#else
	return __va(paddr);
#endif
}

/* Convert a machine address to physical address */
static unsigned long __init m2p(phys_addr_t maddr)
{
	phys_addr_t paddr;

	maddr &= PTE_PFN_MASK;
	paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;

	return paddr;
}

/* Convert a machine address to kernel virtual */
static void * __init m2v(phys_addr_t maddr)
{
	return __ka(m2p(maddr));
}

/* Set the page permissions on an identity-mapped pages */
static void __init set_page_prot_flags(void *addr, pgprot_t prot,
				       unsigned long flags)
{
	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
	pte_t pte = pfn_pte(pfn, prot);

	/* Direct hypercall - a failure here is unrecoverable at boot. */
	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags))
		BUG();
}

/* set_page_prot_flags() with no TLB-flush flags. */
static void __init set_page_prot(void *addr, pgprot_t prot)
{
	return set_page_prot_flags(addr, prot, UVMF_NONE);
}
#ifdef CONFIG_X86_32
/*
 * Build an identity (pfn == va offset) mapping of the first @max_pfn
 * pages into @pmd, reusing Xen-provided pte pages where present and
 * allocating the rest from the brk-reserved level1_ident_pgt pool.
 * All pte pages used are then made read-only, as required for
 * pagetable pages under Xen.
 */
static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
{
	unsigned pmdidx, pteidx;
	unsigned ident_pte;
	unsigned long pfn;

	level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
				      PAGE_SIZE);

	ident_pte = 0;
	pfn = 0;
	for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
		pte_t *pte_page;

		/* Reuse or allocate a page of ptes */
		if (pmd_present(pmd[pmdidx]))
			pte_page = m2v(pmd[pmdidx].pmd);
		else {
			/* Check for free pte pages */
			if (ident_pte == LEVEL1_IDENT_ENTRIES)
				break;

			pte_page = &level1_ident_pgt[ident_pte];
			ident_pte += PTRS_PER_PTE;

			pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
		}

		/* Install mappings */
		for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
			pte_t pte;

			if (pfn > max_pfn_mapped)
				max_pfn_mapped = pfn;

			/* Don't overwrite ptes Xen already installed. */
			if (!pte_none(pte_page[pteidx]))
				continue;

			pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
			pte_page[pteidx] = pte;
		}
	}

	/* Pagetable pages must be read-only before use. */
	for (pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
		set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);

	set_page_prot(pmd, PAGE_KERNEL_RO);
}
#endif

/*
 * Query Xen for the location/size of the machine-to-physical table,
 * falling back to the ABI default when the hypercall is unavailable.
 */
void __init xen_setup_machphys_mapping(void)
{
	struct xen_machphys_mapping mapping;

	if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
		machine_to_phys_mapping = (unsigned long *)mapping.v_start;
		machine_to_phys_nr = mapping.max_mfn + 1;
	} else {
		machine_to_phys_nr = MACH2PHYS_NR_ENTRIES;
	}
#ifdef CONFIG_X86_32
	/* Guard against the table wrapping the 32-bit address space. */
	WARN_ON((machine_to_phys_mapping + (machine_to_phys_nr - 1))
		< machine_to_phys_mapping);
#endif
}

#ifdef CONFIG_X86_64

/* Rewrite every entry of a pagetable page in place from pfn to mfn form. */
static void __init convert_pfn_mfn(void *v)
{
	pte_t *pte = v;
	int i;

	/* All levels are converted the same way, so just treat them
	   as ptes. */
	for (i = 0; i < PTRS_PER_PTE; i++)
		pte[i] = xen_make_pte(pte[i].pte);
}

/*
 * If @addr is the first (or last) frame of the Xen-provided pagetable
 * area, make it writable, clear it and shrink the [pt_base, pt_end)
 * range accordingly so the frame can be handed back.
 */
static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end,
				 unsigned long addr)
{
	if (*pt_base == PFN_DOWN(__pa(addr))) {
		set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
		clear_page((void *)addr);
		(*pt_base)++;
	}
	if (*pt_end == PFN_DOWN(__pa(addr))) {
		set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG);
		clear_page((void *)addr);
		(*pt_end)--;
	}
}

/*
 * Set up the initial kernel pagetable.
 *
 * We can construct this by grafting the Xen provided pagetable into
 * head_64.S's preconstructed pagetables.  We copy the Xen L2's into
 * level2_ident_pgt, and level2_kernel_pgt.  This means that only the
 * kernel has a physical mapping to start with - but that's enough to
 * get __va working.  We need to fill in the rest of the physical
 * mapping once some sort of allocator has been set up.
 */
void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
{
	pud_t *l3;
	pmd_t *l2;
	unsigned long addr[3];
	unsigned long pt_base, pt_end;
	unsigned i;

	/* max_pfn_mapped is the last pfn mapped in the initial memory
	 * mappings. Considering that on Xen after the kernel mappings we
	 * have the mappings of some pages that don't exist in pfn space, we
	 * set max_pfn_mapped to the last real pfn mapped. */
	if (xen_start_info->mfn_list < __START_KERNEL_map)
		max_pfn_mapped = xen_start_info->first_p2m_pfn;
	else
		max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));

	pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
	pt_end = pt_base + xen_start_info->nr_pt_frames;

	/* Zap identity mapping */
	init_level4_pgt[0] = __pgd(0);

	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
		/* Pre-constructed entries are in pfn, so convert to mfn */
		/* L4[272] -> level3_ident_pgt
		 * L4[511] -> level3_kernel_pgt */
		convert_pfn_mfn(init_level4_pgt);

		/* L3_i[0] -> level2_ident_pgt */
		convert_pfn_mfn(level3_ident_pgt);
		/* L3_k[510] -> level2_kernel_pgt
		 * L3_k[511] -> level2_fixmap_pgt */
		convert_pfn_mfn(level3_kernel_pgt);

		/* L3_k[511][506] -> level1_fixmap_pgt */
		convert_pfn_mfn(level2_fixmap_pgt);
	}
	/* We get [511][511] and have Xen's version of level2_kernel_pgt */
	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);

	addr[0] = (unsigned long)pgd;
	addr[1] = (unsigned long)l3;
	addr[2] = (unsigned long)l2;
	/* Graft it onto L4[272][0]. Note that we creating an aliasing problem:
	 * Both L4[272][0] and L4[511][510] have entries that point to the same
	 * L2 (PMD) tables. Meaning that if you modify it in __va space
	 * it will be also modified in the __ka space! (But if you just
	 * modify the PMD table to point to other PTE's or none, then you
	 * are OK - which is what cleanup_highmap does) */
	copy_page(level2_ident_pgt, l2);
	/* Graft it onto L4[511][510] */
	copy_page(level2_kernel_pgt, l2);

	/* Copy the initial P->M table mappings if necessary. */
	i = pgd_index(xen_start_info->mfn_list);
	if (i && i < pgd_index(__START_KERNEL_map))
		init_level4_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];

	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
		/* Make pagetable pieces RO */
		set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
		set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
		set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
		set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
		set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
		set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
		set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
		set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO);

		/* Pin down new L4 */
		pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
				  PFN_DOWN(__pa_symbol(init_level4_pgt)));

		/* Unpin Xen-provided one */
		pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

		/*
		 * At this stage there can be no user pgd, and no page
		 * structure to attach it to, so make sure we just set kernel
		 * pgd.
		 */
		xen_mc_batch();
		__xen_write_cr3(true, __pa(init_level4_pgt));
		xen_mc_issue(PARAVIRT_LAZY_CPU);
	} else
		native_write_cr3(__pa(init_level4_pgt));

	/* We can't that easily rip out L3 and L2, as the Xen pagetables are
	 * set out this way: [L4], [L1], [L2], [L3], [L1], [L1] ... for
	 * the initial domain. For guests using the toolstack, they are in:
	 * [L4], [L3], [L2], [L1], [L1], order .. So for dom0 we can only
	 * rip out the [L4] (pgd), but for guests we shave off three pages.
	 */
	for (i = 0; i < ARRAY_SIZE(addr); i++)
		check_pt_base(&pt_base, &pt_end, addr[i]);

	/* Our (by three pages) smaller Xen pagetable that we are using */
	xen_pt_base = PFN_PHYS(pt_base);
	xen_pt_size = (pt_end - pt_base) * PAGE_SIZE;
	memblock_reserve(xen_pt_base, xen_pt_size);

	/* Revector the xen_start_info */
	xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
}

/*
 * Read a value from a physical address.
 */
static unsigned long __init xen_read_phys_ulong(phys_addr_t addr)
{
	unsigned long *vaddr;
	unsigned long val;

	vaddr = early_memremap_ro(addr, sizeof(val));
	val = *vaddr;
	early_memunmap(vaddr, sizeof(val));
	return val;
}

/*
 * Translate a virtual address to a physical one without relying on mapped
 * page tables.  Walks the pagetable pointed to by cr3 one level at a
 * time via physical reads; returns 0 if any level is not present.
 */
static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
{
	phys_addr_t pa;
	pgd_t pgd;
	pud_t pud;
	pmd_t pmd;
	pte_t pte;

	pa = read_cr3();
	pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
						       sizeof(pgd)));
	if (!pgd_present(pgd))
		return 0;

	pa = pgd_val(pgd) & PTE_PFN_MASK;
	pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) *
						       sizeof(pud)));
	if (!pud_present(pud))
		return 0;
	pa = pud_pfn(pud) << PAGE_SHIFT;
	/* Large (1G) mapping: pa already covers the offset granularity. */
	if (pud_large(pud))
		return pa + (vaddr & ~PUD_MASK);

	pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) *
						       sizeof(pmd)));
	if (!pmd_present(pmd))
		return 0;
	pa = pmd_pfn(pmd) << PAGE_SHIFT;
	/* Large (2M) mapping. */
	if (pmd_large(pmd))
		return pa + (vaddr & ~PMD_MASK);

	pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) *
						       sizeof(pte)));
	if (!pte_present(pte))
		return 0;
	pa = pte_pfn(pte) << PAGE_SHIFT;

	return pa | (vaddr & ~PAGE_MASK);
}

/*
 * Find a new area for the hypervisor supplied p2m list and relocate the p2m to
 * this area.
 */
void __init xen_relocate_p2m(void)
{
	phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys;
	unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
	int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d;
	pte_t *pt;
	pmd_t *pmd;
	pud_t *pud;
	p4d_t *p4d = NULL;
	pgd_t *pgd;
	unsigned long *new_p2m;
	int save_pud;

	/* Frames needed for the p2m data plus each pagetable level. */
	size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
	n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT;
	n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
	n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
	n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT;
	if (PTRS_PER_P4D > 1)
		n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
	else
		n_p4d = 0;
	n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d;

	new_area = xen_find_free_area(PFN_PHYS(n_frames));
	if (!new_area) {
		xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n");
		BUG();
	}

	/*
	 * Setup the page tables for addressing the new p2m list.
	 * We have asked the hypervisor to map the p2m list at the user address
	 * PUD_SIZE. It may have done so, or it may have used a kernel space
	 * address depending on the Xen version.
	 * To avoid any possible virtual address collision, just use
	 * 2 * PUD_SIZE for the new area.
	 */
	p4d_phys = new_area;
	pud_phys = p4d_phys + PFN_PHYS(n_p4d);
	pmd_phys = pud_phys + PFN_PHYS(n_pud);
	pt_phys = pmd_phys + PFN_PHYS(n_pmd);
	p2m_pfn = PFN_DOWN(pt_phys) + n_pt;

	pgd = __va(read_cr3());
	new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
	idx_p4d = 0;
	save_pud = n_pud;
	/* Build the new pagetable tree top-down; each completed table is
	   made read-only and pinned before being linked into its parent. */
	do {
		if (n_p4d > 0) {
			p4d = early_memremap(p4d_phys, PAGE_SIZE);
			clear_page(p4d);
			n_pud = min(save_pud, PTRS_PER_P4D);
		}
		for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
			pud = early_memremap(pud_phys, PAGE_SIZE);
			clear_page(pud);
			for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
					idx_pmd++) {
				pmd = early_memremap(pmd_phys, PAGE_SIZE);
				clear_page(pmd);
				for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
						idx_pt++) {
					pt = early_memremap(pt_phys, PAGE_SIZE);
					clear_page(pt);
					for (idx_pte = 0;
							idx_pte < min(n_pte, PTRS_PER_PTE);
							idx_pte++) {
						set_pte(pt + idx_pte,
								pfn_pte(p2m_pfn, PAGE_KERNEL));
						p2m_pfn++;
					}
					n_pte -= PTRS_PER_PTE;
					early_memunmap(pt, PAGE_SIZE);
					make_lowmem_page_readonly(__va(pt_phys));
					pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
							PFN_DOWN(pt_phys));
					set_pmd(pmd + idx_pt,
							__pmd(_PAGE_TABLE | pt_phys));
					pt_phys += PAGE_SIZE;
				}
				n_pt -= PTRS_PER_PMD;
				early_memunmap(pmd, PAGE_SIZE);
				make_lowmem_page_readonly(__va(pmd_phys));
				pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
						PFN_DOWN(pmd_phys));
				set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
				pmd_phys += PAGE_SIZE;
			}
			n_pmd -= PTRS_PER_PUD;
			early_memunmap(pud, PAGE_SIZE);
			make_lowmem_page_readonly(__va(pud_phys));
			pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
			if (n_p4d > 0)
				set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys));
			else
				set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
			pud_phys += PAGE_SIZE;
		}
		if (n_p4d > 0) {
			save_pud -= PTRS_PER_P4D;
			early_memunmap(p4d, PAGE_SIZE);
			make_lowmem_page_readonly(__va(p4d_phys));
			pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys));
			set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys));
			p4d_phys += PAGE_SIZE;
		}
	} while (++idx_p4d < n_p4d);

	/* Now copy the old p2m info to the new area. */
	memcpy(new_p2m, xen_p2m_addr, size);
	xen_p2m_addr = new_p2m;

	/* Release the old p2m list and set new list info. */
	p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list));
	BUG_ON(!p2m_pfn);
	p2m_pfn_end = p2m_pfn + PFN_DOWN(size);

	if (xen_start_info->mfn_list < __START_KERNEL_map) {
		pfn = xen_start_info->first_p2m_pfn;
		pfn_end = xen_start_info->first_p2m_pfn +
			  xen_start_info->nr_p2m_frames;
		set_pgd(pgd + 1, __pgd(0));
	} else {
		pfn = p2m_pfn;
		pfn_end = p2m_pfn_end;
	}

	memblock_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
	while (pfn < pfn_end) {
		/* The p2m data frames themselves stay read-only; skip them. */
		if (pfn == p2m_pfn) {
			pfn = p2m_pfn_end;
			continue;
		}
		make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
		pfn++;
	}

	xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
	xen_start_info->first_p2m_pfn = PFN_DOWN(new_area);
	xen_start_info->nr_p2m_frames = n_frames;
}

#else	/* !CONFIG_X86_64 */
static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);

/* 32-bit early cr3 write: first switch from initial_page_table to
   swapper_pg_dir, after which the normal xen_write_cr3 takes over. */
static void __init xen_write_cr3_init(unsigned long cr3)
{
	unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));

	BUG_ON(read_cr3() != __pa(initial_page_table));
	BUG_ON(cr3 != __pa(swapper_pg_dir));

	/*
	 * We are switching to swapper_pg_dir for the first time (from
	 * initial_page_table) and therefore need to mark that page
	 * read-only and then pin it.
	 *
	 * Xen disallows sharing of kernel PMDs for PAE
	 * guests. Therefore we must copy the kernel PMD from
	 * initial_page_table into a new kernel PMD to be used in
	 * swapper_pg_dir.
	 */
	swapper_kernel_pmd =
		extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
	copy_page(swapper_kernel_pmd, initial_kernel_pmd);
	swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
		__pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
	set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);

	set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
	xen_write_cr3(cr3);
	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);

	/* The old initial pagetable can now be unpinned and made writable. */
	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
			  PFN_DOWN(__pa(initial_page_table)));
	set_page_prot(initial_page_table, PAGE_KERNEL);
	set_page_prot(initial_kernel_pmd, PAGE_KERNEL);

	/* From now on the late cr3 path is used. */
	pv_mmu_ops.write_cr3 = &xen_write_cr3;
}

/*
 * For 32 bit domains xen_start_info->pt_base is the pgd address which might be
 * not the first page table in the page table pool.
 * Iterate through the initial page tables to find the real page table base.
 */
static phys_addr_t xen_find_pt_base(pmd_t *pmd)
{
	phys_addr_t pt_base, paddr;
	unsigned pmdidx;

	pt_base = min(__pa(xen_start_info->pt_base), __pa(pmd));

	/* The lowest physical address of any referenced pte page is the
	   true start of the pagetable pool. */
	for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++)
		if (pmd_present(pmd[pmdidx]) && !pmd_large(pmd[pmdidx])) {
			paddr = m2p(pmd[pmdidx].pmd);
			pt_base = min(pt_base, paddr);
		}

	return pt_base;
}

/* 32-bit variant: build initial_page_table from the Xen-provided pgd. */
void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
{
	pmd_t *kernel_pmd;

	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);

	xen_pt_base = xen_find_pt_base(kernel_pmd);
	xen_pt_size = xen_start_info->nr_pt_frames * PAGE_SIZE;

	initial_kernel_pmd =
		extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);

	/* 512kB of slack beyond the pagetable area. */
	max_pfn_mapped = PFN_DOWN(xen_pt_base + xen_pt_size + 512 * 1024);

	copy_page(initial_kernel_pmd, kernel_pmd);

	xen_map_identity_early(initial_kernel_pmd, max_pfn);

	copy_page(initial_page_table, pgd);
	initial_page_table[KERNEL_PGD_BOUNDARY] =
		__pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);

	/* Pagetable pages must be RO before pinning/loading. */
	set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
	set_page_prot(initial_page_table, PAGE_KERNEL_RO);
	set_page_prot(empty_zero_page, PAGE_KERNEL_RO);

	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
			  PFN_DOWN(__pa(initial_page_table)));
	xen_write_cr3(__pa(initial_page_table));

	memblock_reserve(xen_pt_base, xen_pt_size);
}
#endif	/* CONFIG_X86_64 */

/*
 * Reserve the special pages Xen hands to the guest (start_info,
 * xenstore ring, domU console ring) so the allocator never reuses them.
 */
void __init xen_reserve_special_pages(void)
{
	phys_addr_t paddr;

	memblock_reserve(__pa(xen_start_info), PAGE_SIZE);
	if (xen_start_info->store_mfn) {
		paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn));
		memblock_reserve(paddr, PAGE_SIZE);
	}
	if (!xen_initial_domain()) {
		paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn));
		memblock_reserve(paddr, PAGE_SIZE);
	}
}

/* Die early if the Xen pagetable area collides with E820-reserved memory. */
void __init xen_pt_check_e820(void)
{
	if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) {
		xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n");
		BUG();
	}
}

/* Backing page used to satisfy fixmap slots we deliberately don't map. */
static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;

/* pv_mmu_ops.set_fixmap: decide per-slot whether @phys is a pfn (local
   page) or an mfn (machine/hardware page) and install the fixmap pte. */
static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
{
	pte_t pte;

	phys >>= PAGE_SHIFT;

	switch (idx) {
	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
	case FIX_RO_IDT:
#ifdef CONFIG_X86_32
	case FIX_WP_TEST:
# ifdef CONFIG_HIGHMEM
	case FIX_KMAP_BEGIN ... FIX_KMAP_END:
# endif
#elif defined(CONFIG_X86_VSYSCALL_EMULATION)
	case VSYSCALL_PAGE:
#endif
	case FIX_TEXT_POKE0:
	case FIX_TEXT_POKE1:
	case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END:
		/* All local page mappings */
		pte = pfn_pte(phys, prot);
		break;

#ifdef CONFIG_X86_LOCAL_APIC
	case FIX_APIC_BASE:	/* maps dummy local APIC */
		pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
		break;
#endif

#ifdef CONFIG_X86_IO_APIC
	case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
		/*
		 * We just don't map the IO APIC - all access is via
		 * hypercalls.  Keep the address in the pte for reference.
		 */
		pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
		break;
#endif

	case FIX_PARAVIRT_BOOTMAP:
		/* This is an MFN, but it isn't an IO mapping from the
		   IO domain */
		pte = mfn_pte(phys, prot);
		break;

	default:
		/* By default, set_fixmap is used for hardware mappings */
		pte = mfn_pte(phys, prot);
		break;
	}

	__native_set_fixmap(idx, pte);

#ifdef CONFIG_X86_VSYSCALL_EMULATION
	/* Replicate changes to map the vsyscall page into the user
	   pagetable vsyscall mapping. */
	if (idx == VSYSCALL_PAGE) {
		unsigned long vaddr = __fix_to_virt(idx);
		set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
	}
#endif
}

/* Once the real allocator is up, switch the pv_mmu_ops from the early
   (init-time) variants to the final ones. */
static void __init xen_post_allocator_init(void)
{
	if (xen_feature(XENFEAT_auto_translated_physmap))
		return;

	pv_mmu_ops.set_pte = xen_set_pte;
	pv_mmu_ops.set_pmd = xen_set_pmd;
	pv_mmu_ops.set_pud = xen_set_pud;
#if CONFIG_PGTABLE_LEVELS >= 4
	pv_mmu_ops.set_p4d = xen_set_p4d;
#endif

	/* This will work as long as patching hasn't happened yet
	   (which it hasn't) */
	pv_mmu_ops.alloc_pte = xen_alloc_pte;
	pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
	pv_mmu_ops.release_pte = xen_release_pte;
	pv_mmu_ops.release_pmd = xen_release_pmd;
#if CONFIG_PGTABLE_LEVELS >= 4
	pv_mmu_ops.alloc_pud = xen_alloc_pud;
	pv_mmu_ops.release_pud = xen_release_pud;
#endif
	pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte);

#ifdef CONFIG_X86_64
	pv_mmu_ops.write_cr3 = &xen_write_cr3;
	SetPagePinned(virt_to_page(level3_user_vsyscall));
#endif
	xen_mark_init_mm_pinned();
}

/* Flush any pending multicalls before leaving lazy-MMU mode. */
static void xen_leave_lazy_mmu(void)
{
	preempt_disable();
	xen_mc_flush();
	paravirt_leave_lazy_mmu();
	preempt_enable();
}

/* Xen PV implementations of the paravirt MMU operations.  Installed by
   xen_init_mmu_ops(); several entries are init-time variants that
   xen_post_allocator_init() later replaces. */
static const struct pv_mmu_ops xen_mmu_ops __initconst = {
	.read_cr2 = xen_read_cr2,
	.write_cr2 = xen_write_cr2,

	.read_cr3 = xen_read_cr3,
	.write_cr3 = xen_write_cr3_init,

	.flush_tlb_user = xen_flush_tlb,
	.flush_tlb_kernel = xen_flush_tlb,
	.flush_tlb_single = xen_flush_tlb_single,
	.flush_tlb_others = xen_flush_tlb_others,

	.pte_update = paravirt_nop,

	.pgd_alloc = xen_pgd_alloc,
	.pgd_free = xen_pgd_free,

	.alloc_pte = xen_alloc_pte_init,
	.release_pte = xen_release_pte_init,
	.alloc_pmd = xen_alloc_pmd_init,
	.release_pmd = xen_release_pmd_init,

	.set_pte = xen_set_pte_init,
	.set_pte_at = xen_set_pte_at,
	.set_pmd = xen_set_pmd_hyper,

	.ptep_modify_prot_start = __ptep_modify_prot_start,
	.ptep_modify_prot_commit = __ptep_modify_prot_commit,

	.pte_val = PV_CALLEE_SAVE(xen_pte_val),
	.pgd_val = PV_CALLEE_SAVE(xen_pgd_val),

	.make_pte = PV_CALLEE_SAVE(xen_make_pte_init),
	.make_pgd = PV_CALLEE_SAVE(xen_make_pgd),

#ifdef CONFIG_X86_PAE
	.set_pte_atomic = xen_set_pte_atomic,
	.pte_clear = xen_pte_clear,
	.pmd_clear = xen_pmd_clear,
#endif	/* CONFIG_X86_PAE */
	.set_pud = xen_set_pud_hyper,

	.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
	.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),

#if CONFIG_PGTABLE_LEVELS >= 4
	.pud_val = PV_CALLEE_SAVE(xen_pud_val),
	.make_pud = PV_CALLEE_SAVE(xen_make_pud),
	.set_p4d = xen_set_p4d_hyper,

	.alloc_pud = xen_alloc_pmd_init,
	.release_pud = xen_release_pmd_init,
#endif	/* CONFIG_PGTABLE_LEVELS == 4 */

	.activate_mm = xen_activate_mm,
	.dup_mmap = xen_dup_mmap,
	.exit_mmap = xen_exit_mmap,

	.lazy_mode = {
		.enter = paravirt_enter_lazy_mmu,
		.leave = xen_leave_lazy_mmu,
		.flush = paravirt_flush_lazy_mmu,
	},

	.set_fixmap = xen_set_fixmap,
};

/* Install the Xen MMU paravirt ops (PV guests only). */
void __init xen_init_mmu_ops(void)
{
	x86_init.paging.pagetable_init = xen_pagetable_init;

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return;

	pv_mmu_ops = xen_mmu_ops;

	memset(dummy_mapping, 0xff, PAGE_SIZE);
}

/* Protected by xen_reservation_lock. */
#define MAX_CONTIG_ORDER 9 /* 2MB */
static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];

#define VOID_PTE (mfn_pte(0, __pgprot(0)))
/* Clear the ptes and p2m entries for 2^order pages starting at @vaddr,
   optionally recording the old mfns (in_frames) and pfns (out_frames). */
static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
			      unsigned long *in_frames,
			      unsigned long *out_frames)
{
	int i;
	struct multicall_space mcs;

	xen_mc_batch();
	for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
		mcs = __xen_mc_entry(0);

		if (in_frames)
			in_frames[i] = virt_to_mfn(vaddr);

		MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
		__set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);

		if (out_frames)
			out_frames[i] = virt_to_pfn(vaddr);
	}
	xen_mc_issue(0);
}

/*
 * Update the pfn-to-mfn mappings for a virtual address range, either to
 * point to an array of mfns, or contiguously from a single starting
 * mfn.
2553 + */ 2554 + static void xen_remap_exchanged_ptes(unsigned long vaddr, int order, 2555 + unsigned long *mfns, 2556 + unsigned long first_mfn) 2557 + { 2558 + unsigned i, limit; 2559 + unsigned long mfn; 2560 + 2561 + xen_mc_batch(); 2562 + 2563 + limit = 1u << order; 2564 + for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) { 2565 + struct multicall_space mcs; 2566 + unsigned flags; 2567 + 2568 + mcs = __xen_mc_entry(0); 2569 + if (mfns) 2570 + mfn = mfns[i]; 2571 + else 2572 + mfn = first_mfn + i; 2573 + 2574 + if (i < (limit - 1)) 2575 + flags = 0; 2576 + else { 2577 + if (order == 0) 2578 + flags = UVMF_INVLPG | UVMF_ALL; 2579 + else 2580 + flags = UVMF_TLB_FLUSH | UVMF_ALL; 2581 + } 2582 + 2583 + MULTI_update_va_mapping(mcs.mc, vaddr, 2584 + mfn_pte(mfn, PAGE_KERNEL), flags); 2585 + 2586 + set_phys_to_machine(virt_to_pfn(vaddr), mfn); 2587 + } 2588 + 2589 + xen_mc_issue(0); 2590 + } 2591 + 2592 + /* 2593 + * Perform the hypercall to exchange a region of our pfns to point to 2594 + * memory with the required contiguous alignment. Takes the pfns as 2595 + * input, and populates mfns as output. 2596 + * 2597 + * Returns a success code indicating whether the hypervisor was able to 2598 + * satisfy the request or not. 
2599 + */ 2600 + static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in, 2601 + unsigned long *pfns_in, 2602 + unsigned long extents_out, 2603 + unsigned int order_out, 2604 + unsigned long *mfns_out, 2605 + unsigned int address_bits) 2606 + { 2607 + long rc; 2608 + int success; 2609 + 2610 + struct xen_memory_exchange exchange = { 2611 + .in = { 2612 + .nr_extents = extents_in, 2613 + .extent_order = order_in, 2614 + .extent_start = pfns_in, 2615 + .domid = DOMID_SELF 2616 + }, 2617 + .out = { 2618 + .nr_extents = extents_out, 2619 + .extent_order = order_out, 2620 + .extent_start = mfns_out, 2621 + .address_bits = address_bits, 2622 + .domid = DOMID_SELF 2623 + } 2624 + }; 2625 + 2626 + BUG_ON(extents_in << order_in != extents_out << order_out); 2627 + 2628 + rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange); 2629 + success = (exchange.nr_exchanged == extents_in); 2630 + 2631 + BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0))); 2632 + BUG_ON(success && (rc != 0)); 2633 + 2634 + return success; 2635 + } 2636 + 2637 + int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, 2638 + unsigned int address_bits, 2639 + dma_addr_t *dma_handle) 2640 + { 2641 + unsigned long *in_frames = discontig_frames, out_frame; 2642 + unsigned long flags; 2643 + int success; 2644 + unsigned long vstart = (unsigned long)phys_to_virt(pstart); 2645 + 2646 + /* 2647 + * Currently an auto-translated guest will not perform I/O, nor will 2648 + * it require PAE page directories below 4GB. Therefore any calls to 2649 + * this function are redundant and can be ignored. 2650 + */ 2651 + 2652 + if (xen_feature(XENFEAT_auto_translated_physmap)) 2653 + return 0; 2654 + 2655 + if (unlikely(order > MAX_CONTIG_ORDER)) 2656 + return -ENOMEM; 2657 + 2658 + memset((void *) vstart, 0, PAGE_SIZE << order); 2659 + 2660 + spin_lock_irqsave(&xen_reservation_lock, flags); 2661 + 2662 + /* 1. Zap current PTEs, remembering MFNs. 
*/ 2663 + xen_zap_pfn_range(vstart, order, in_frames, NULL); 2664 + 2665 + /* 2. Get a new contiguous memory extent. */ 2666 + out_frame = virt_to_pfn(vstart); 2667 + success = xen_exchange_memory(1UL << order, 0, in_frames, 2668 + 1, order, &out_frame, 2669 + address_bits); 2670 + 2671 + /* 3. Map the new extent in place of old pages. */ 2672 + if (success) 2673 + xen_remap_exchanged_ptes(vstart, order, NULL, out_frame); 2674 + else 2675 + xen_remap_exchanged_ptes(vstart, order, in_frames, 0); 2676 + 2677 + spin_unlock_irqrestore(&xen_reservation_lock, flags); 2678 + 2679 + *dma_handle = virt_to_machine(vstart).maddr; 2680 + return success ? 0 : -ENOMEM; 2681 + } 2682 + EXPORT_SYMBOL_GPL(xen_create_contiguous_region); 2683 + 2684 + void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order) 2685 + { 2686 + unsigned long *out_frames = discontig_frames, in_frame; 2687 + unsigned long flags; 2688 + int success; 2689 + unsigned long vstart; 2690 + 2691 + if (xen_feature(XENFEAT_auto_translated_physmap)) 2692 + return; 2693 + 2694 + if (unlikely(order > MAX_CONTIG_ORDER)) 2695 + return; 2696 + 2697 + vstart = (unsigned long)phys_to_virt(pstart); 2698 + memset((void *) vstart, 0, PAGE_SIZE << order); 2699 + 2700 + spin_lock_irqsave(&xen_reservation_lock, flags); 2701 + 2702 + /* 1. Find start MFN of contiguous extent. */ 2703 + in_frame = virt_to_mfn(vstart); 2704 + 2705 + /* 2. Zap current PTEs. */ 2706 + xen_zap_pfn_range(vstart, order, NULL, out_frames); 2707 + 2708 + /* 3. Do the exchange for non-contiguous MFNs. */ 2709 + success = xen_exchange_memory(1, order, &in_frame, 1UL << order, 2710 + 0, out_frames, 0); 2711 + 2712 + /* 4. Map new pages in place of old pages. 
*/ 2713 + if (success) 2714 + xen_remap_exchanged_ptes(vstart, order, out_frames, 0); 2715 + else 2716 + xen_remap_exchanged_ptes(vstart, order, NULL, in_frame); 2717 + 2718 + spin_unlock_irqrestore(&xen_reservation_lock, flags); 2719 + } 2720 + EXPORT_SYMBOL_GPL(xen_destroy_contiguous_region); 2721 + 2722 + #ifdef CONFIG_KEXEC_CORE 2723 + phys_addr_t paddr_vmcoreinfo_note(void) 2724 + { 2725 + if (xen_pv_domain()) 2726 + return virt_to_machine(&vmcoreinfo_note).maddr; 2727 + else 2728 + return __pa_symbol(&vmcoreinfo_note); 2729 + } 2730 + #endif /* CONFIG_KEXEC_CORE */
+5
arch/x86/xen/pmu.h
··· 4 4 #include <xen/interface/xenpmu.h> 5 5 6 6 irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id); 7 + #ifdef CONFIG_XEN_HAVE_VPMU 7 8 void xen_pmu_init(int cpu); 8 9 void xen_pmu_finish(int cpu); 10 + #else 11 + static inline void xen_pmu_init(int cpu) {} 12 + static inline void xen_pmu_finish(int cpu) {} 13 + #endif 9 14 bool is_xen_pmu(int cpu); 10 15 bool pmu_msr_read(unsigned int msr, uint64_t *val, int *err); 11 16 bool pmu_msr_write(unsigned int msr, uint32_t low, uint32_t high, int *err);
+8 -509
arch/x86/xen/smp.c
··· 1 - /* 2 - * Xen SMP support 3 - * 4 - * This file implements the Xen versions of smp_ops. SMP under Xen is 5 - * very straightforward. Bringing a CPU up is simply a matter of 6 - * loading its initial context and setting it running. 7 - * 8 - * IPIs are handled through the Xen event mechanism. 9 - * 10 - * Because virtual CPUs can be scheduled onto any real CPU, there's no 11 - * useful topology information for the kernel to make use of. As a 12 - * result, all CPUs are treated as if they're single-core and 13 - * single-threaded. 14 - */ 15 - #include <linux/sched.h> 16 - #include <linux/err.h> 17 - #include <linux/slab.h> 18 1 #include <linux/smp.h> 19 - #include <linux/irq_work.h> 20 - #include <linux/tick.h> 21 - #include <linux/nmi.h> 2 + #include <linux/slab.h> 3 + #include <linux/cpumask.h> 4 + #include <linux/percpu.h> 22 5 23 - #include <asm/paravirt.h> 24 - #include <asm/desc.h> 25 - #include <asm/pgtable.h> 26 - #include <asm/cpu.h> 27 - 28 - #include <xen/interface/xen.h> 29 - #include <xen/interface/vcpu.h> 30 - #include <xen/interface/xenpmu.h> 31 - 32 - #include <asm/xen/interface.h> 33 - #include <asm/xen/hypercall.h> 34 - 35 - #include <xen/xen.h> 36 - #include <xen/page.h> 37 6 #include <xen/events.h> 38 7 39 8 #include <xen/hvc-console.h> 40 9 #include "xen-ops.h" 41 - #include "mmu.h" 42 10 #include "smp.h" 43 - #include "pmu.h" 44 11 45 - cpumask_var_t xen_cpu_initialized_map; 46 - 47 - struct xen_common_irq { 48 - int irq; 49 - char *name; 50 - }; 51 12 static DEFINE_PER_CPU(struct xen_common_irq, xen_resched_irq) = { .irq = -1 }; 52 13 static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 }; 53 14 static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 }; 54 - static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 }; 55 15 static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 }; 56 - static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = 
-1 }; 57 16 58 17 static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); 59 18 static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); 60 - static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id); 61 19 62 20 /* 63 21 * Reschedule call back. ··· 26 68 scheduler_ipi(); 27 69 28 70 return IRQ_HANDLED; 29 - } 30 - 31 - static void cpu_bringup(void) 32 - { 33 - int cpu; 34 - 35 - cpu_init(); 36 - touch_softlockup_watchdog(); 37 - preempt_disable(); 38 - 39 - /* PVH runs in ring 0 and allows us to do native syscalls. Yay! */ 40 - if (!xen_feature(XENFEAT_supervisor_mode_kernel)) { 41 - xen_enable_sysenter(); 42 - xen_enable_syscall(); 43 - } 44 - cpu = smp_processor_id(); 45 - smp_store_cpu_info(cpu); 46 - cpu_data(cpu).x86_max_cores = 1; 47 - set_cpu_sibling_map(cpu); 48 - 49 - xen_setup_cpu_clockevents(); 50 - 51 - notify_cpu_starting(cpu); 52 - 53 - set_cpu_online(cpu, true); 54 - 55 - cpu_set_state_online(cpu); /* Implies full memory barrier. */ 56 - 57 - /* We can take interrupts now: we're officially "up". 
*/ 58 - local_irq_enable(); 59 - } 60 - 61 - asmlinkage __visible void cpu_bringup_and_idle(void) 62 - { 63 - cpu_bringup(); 64 - cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); 65 71 } 66 72 67 73 void xen_smp_intr_free(unsigned int cpu) ··· 55 133 kfree(per_cpu(xen_callfuncsingle_irq, cpu).name); 56 134 per_cpu(xen_callfuncsingle_irq, cpu).name = NULL; 57 135 } 58 - if (xen_hvm_domain()) 59 - return; 136 + } 60 137 61 - if (per_cpu(xen_irq_work, cpu).irq >= 0) { 62 - unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL); 63 - per_cpu(xen_irq_work, cpu).irq = -1; 64 - kfree(per_cpu(xen_irq_work, cpu).name); 65 - per_cpu(xen_irq_work, cpu).name = NULL; 66 - } 67 - 68 - if (per_cpu(xen_pmu_irq, cpu).irq >= 0) { 69 - unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL); 70 - per_cpu(xen_pmu_irq, cpu).irq = -1; 71 - kfree(per_cpu(xen_pmu_irq, cpu).name); 72 - per_cpu(xen_pmu_irq, cpu).name = NULL; 73 - } 74 - }; 75 138 int xen_smp_intr_init(unsigned int cpu) 76 139 { 77 140 int rc; 78 - char *resched_name, *callfunc_name, *debug_name, *pmu_name; 141 + char *resched_name, *callfunc_name, *debug_name; 79 142 80 143 resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu); 81 144 rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, ··· 107 200 per_cpu(xen_callfuncsingle_irq, cpu).irq = rc; 108 201 per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name; 109 202 110 - /* 111 - * The IRQ worker on PVHVM goes through the native path and uses the 112 - * IPI mechanism. 
113 - */ 114 - if (xen_hvm_domain()) 115 - return 0; 116 - 117 - callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu); 118 - rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR, 119 - cpu, 120 - xen_irq_work_interrupt, 121 - IRQF_PERCPU|IRQF_NOBALANCING, 122 - callfunc_name, 123 - NULL); 124 - if (rc < 0) 125 - goto fail; 126 - per_cpu(xen_irq_work, cpu).irq = rc; 127 - per_cpu(xen_irq_work, cpu).name = callfunc_name; 128 - 129 - if (is_xen_pmu(cpu)) { 130 - pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu); 131 - rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu, 132 - xen_pmu_irq_handler, 133 - IRQF_PERCPU|IRQF_NOBALANCING, 134 - pmu_name, NULL); 135 - if (rc < 0) 136 - goto fail; 137 - per_cpu(xen_pmu_irq, cpu).irq = rc; 138 - per_cpu(xen_pmu_irq, cpu).name = pmu_name; 139 - } 140 - 141 203 return 0; 142 204 143 205 fail: ··· 114 238 return rc; 115 239 } 116 240 117 - static void __init xen_fill_possible_map(void) 118 - { 119 - int i, rc; 120 - 121 - if (xen_initial_domain()) 122 - return; 123 - 124 - for (i = 0; i < nr_cpu_ids; i++) { 125 - rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); 126 - if (rc >= 0) { 127 - num_processors++; 128 - set_cpu_possible(i, true); 129 - } 130 - } 131 - } 132 - 133 - static void __init xen_filter_cpu_maps(void) 134 - { 135 - int i, rc; 136 - unsigned int subtract = 0; 137 - 138 - if (!xen_initial_domain()) 139 - return; 140 - 141 - num_processors = 0; 142 - disabled_cpus = 0; 143 - for (i = 0; i < nr_cpu_ids; i++) { 144 - rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); 145 - if (rc >= 0) { 146 - num_processors++; 147 - set_cpu_possible(i, true); 148 - } else { 149 - set_cpu_possible(i, false); 150 - set_cpu_present(i, false); 151 - subtract++; 152 - } 153 - } 154 - #ifdef CONFIG_HOTPLUG_CPU 155 - /* This is akin to using 'nr_cpus' on the Linux command line. 156 - * Which is OK as when we use 'dom0_max_vcpus=X' we can only 157 - * have up to X, while nr_cpu_ids is greater than X. 
This 158 - * normally is not a problem, except when CPU hotplugging 159 - * is involved and then there might be more than X CPUs 160 - * in the guest - which will not work as there is no 161 - * hypercall to expand the max number of VCPUs an already 162 - * running guest has. So cap it up to X. */ 163 - if (subtract) 164 - nr_cpu_ids = nr_cpu_ids - subtract; 165 - #endif 166 - 167 - } 168 - 169 - static void __init xen_smp_prepare_boot_cpu(void) 170 - { 171 - BUG_ON(smp_processor_id() != 0); 172 - native_smp_prepare_boot_cpu(); 173 - 174 - if (xen_pv_domain()) { 175 - if (!xen_feature(XENFEAT_writable_page_tables)) 176 - /* We've switched to the "real" per-cpu gdt, so make 177 - * sure the old memory can be recycled. */ 178 - make_lowmem_page_readwrite(xen_initial_gdt); 179 - 180 - #ifdef CONFIG_X86_32 181 - /* 182 - * Xen starts us with XEN_FLAT_RING1_DS, but linux code 183 - * expects __USER_DS 184 - */ 185 - loadsegment(ds, __USER_DS); 186 - loadsegment(es, __USER_DS); 187 - #endif 188 - 189 - xen_filter_cpu_maps(); 190 - xen_setup_vcpu_info_placement(); 191 - } 192 - 193 - /* 194 - * Setup vcpu_info for boot CPU. 195 - */ 196 - if (xen_hvm_domain()) 197 - xen_vcpu_setup(0); 198 - 199 - /* 200 - * The alternative logic (which patches the unlock/lock) runs before 201 - * the smp bootup up code is activated. Hence we need to set this up 202 - * the core kernel is being patched. Otherwise we will have only 203 - * modules patched but not core code. 204 - */ 205 - xen_init_spinlocks(); 206 - } 207 - 208 - static void __init xen_smp_prepare_cpus(unsigned int max_cpus) 209 - { 210 - unsigned cpu; 211 - unsigned int i; 212 - 213 - if (skip_ioapic_setup) { 214 - char *m = (max_cpus == 0) ? 
215 - "The nosmp parameter is incompatible with Xen; " \ 216 - "use Xen dom0_max_vcpus=1 parameter" : 217 - "The noapic parameter is incompatible with Xen"; 218 - 219 - xen_raw_printk(m); 220 - panic(m); 221 - } 222 - xen_init_lock_cpu(0); 223 - 224 - smp_store_boot_cpu_info(); 225 - cpu_data(0).x86_max_cores = 1; 226 - 227 - for_each_possible_cpu(i) { 228 - zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); 229 - zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); 230 - zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); 231 - } 232 - set_cpu_sibling_map(0); 233 - 234 - xen_pmu_init(0); 235 - 236 - if (xen_smp_intr_init(0)) 237 - BUG(); 238 - 239 - if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL)) 240 - panic("could not allocate xen_cpu_initialized_map\n"); 241 - 242 - cpumask_copy(xen_cpu_initialized_map, cpumask_of(0)); 243 - 244 - /* Restrict the possible_map according to max_cpus. */ 245 - while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { 246 - for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--) 247 - continue; 248 - set_cpu_possible(cpu, false); 249 - } 250 - 251 - for_each_possible_cpu(cpu) 252 - set_cpu_present(cpu, true); 253 - } 254 - 255 - static int 256 - cpu_initialize_context(unsigned int cpu, struct task_struct *idle) 257 - { 258 - struct vcpu_guest_context *ctxt; 259 - struct desc_struct *gdt; 260 - unsigned long gdt_mfn; 261 - 262 - /* used to tell cpu_init() that it can proceed with initialization */ 263 - cpumask_set_cpu(cpu, cpu_callout_mask); 264 - if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) 265 - return 0; 266 - 267 - ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); 268 - if (ctxt == NULL) 269 - return -ENOMEM; 270 - 271 - gdt = get_cpu_gdt_rw(cpu); 272 - 273 - #ifdef CONFIG_X86_32 274 - ctxt->user_regs.fs = __KERNEL_PERCPU; 275 - ctxt->user_regs.gs = __KERNEL_STACK_CANARY; 276 - #endif 277 - memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); 278 - 279 - 
ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; 280 - ctxt->flags = VGCF_IN_KERNEL; 281 - ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ 282 - ctxt->user_regs.ds = __USER_DS; 283 - ctxt->user_regs.es = __USER_DS; 284 - ctxt->user_regs.ss = __KERNEL_DS; 285 - 286 - xen_copy_trap_info(ctxt->trap_ctxt); 287 - 288 - ctxt->ldt_ents = 0; 289 - 290 - BUG_ON((unsigned long)gdt & ~PAGE_MASK); 291 - 292 - gdt_mfn = arbitrary_virt_to_mfn(gdt); 293 - make_lowmem_page_readonly(gdt); 294 - make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); 295 - 296 - ctxt->gdt_frames[0] = gdt_mfn; 297 - ctxt->gdt_ents = GDT_ENTRIES; 298 - 299 - ctxt->kernel_ss = __KERNEL_DS; 300 - ctxt->kernel_sp = idle->thread.sp0; 301 - 302 - #ifdef CONFIG_X86_32 303 - ctxt->event_callback_cs = __KERNEL_CS; 304 - ctxt->failsafe_callback_cs = __KERNEL_CS; 305 - #else 306 - ctxt->gs_base_kernel = per_cpu_offset(cpu); 307 - #endif 308 - ctxt->event_callback_eip = 309 - (unsigned long)xen_hypervisor_callback; 310 - ctxt->failsafe_callback_eip = 311 - (unsigned long)xen_failsafe_callback; 312 - ctxt->user_regs.cs = __KERNEL_CS; 313 - per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); 314 - 315 - ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); 316 - ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir)); 317 - if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt)) 318 - BUG(); 319 - 320 - kfree(ctxt); 321 - return 0; 322 - } 323 - 324 - static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) 325 - { 326 - int rc; 327 - 328 - common_cpu_up(cpu, idle); 329 - 330 - xen_setup_runstate_info(cpu); 331 - 332 - /* 333 - * PV VCPUs are always successfully taken down (see 'while' loop 334 - * in xen_cpu_die()), so -EBUSY is an error. 
335 - */ 336 - rc = cpu_check_up_prepare(cpu); 337 - if (rc) 338 - return rc; 339 - 340 - /* make sure interrupts start blocked */ 341 - per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; 342 - 343 - rc = cpu_initialize_context(cpu, idle); 344 - if (rc) 345 - return rc; 346 - 347 - xen_pmu_init(cpu); 348 - 349 - rc = HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL); 350 - BUG_ON(rc); 351 - 352 - while (cpu_report_state(cpu) != CPU_ONLINE) 353 - HYPERVISOR_sched_op(SCHEDOP_yield, NULL); 354 - 355 - return 0; 356 - } 357 - 358 - static void xen_smp_cpus_done(unsigned int max_cpus) 359 - { 360 - } 361 - 362 - #ifdef CONFIG_HOTPLUG_CPU 363 - static int xen_cpu_disable(void) 364 - { 365 - unsigned int cpu = smp_processor_id(); 366 - if (cpu == 0) 367 - return -EBUSY; 368 - 369 - cpu_disable_common(); 370 - 371 - load_cr3(swapper_pg_dir); 372 - return 0; 373 - } 374 - 375 - static void xen_cpu_die(unsigned int cpu) 376 - { 377 - while (xen_pv_domain() && HYPERVISOR_vcpu_op(VCPUOP_is_up, 378 - xen_vcpu_nr(cpu), NULL)) { 379 - __set_current_state(TASK_UNINTERRUPTIBLE); 380 - schedule_timeout(HZ/10); 381 - } 382 - 383 - if (common_cpu_die(cpu) == 0) { 384 - xen_smp_intr_free(cpu); 385 - xen_uninit_lock_cpu(cpu); 386 - xen_teardown_timer(cpu); 387 - xen_pmu_finish(cpu); 388 - } 389 - } 390 - 391 - static void xen_play_dead(void) /* used only with HOTPLUG_CPU */ 392 - { 393 - play_dead_common(); 394 - HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(smp_processor_id()), NULL); 395 - cpu_bringup(); 396 - /* 397 - * commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down) 398 - * clears certain data that the cpu_idle loop (which called us 399 - * and that we return from) expects. 
The only way to get that 400 - * data back is to call: 401 - */ 402 - tick_nohz_idle_enter(); 403 - 404 - cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); 405 - } 406 - 407 - #else /* !CONFIG_HOTPLUG_CPU */ 408 - static int xen_cpu_disable(void) 409 - { 410 - return -ENOSYS; 411 - } 412 - 413 - static void xen_cpu_die(unsigned int cpu) 414 - { 415 - BUG(); 416 - } 417 - 418 - static void xen_play_dead(void) 419 - { 420 - BUG(); 421 - } 422 - 423 - #endif 424 - static void stop_self(void *v) 425 - { 426 - int cpu = smp_processor_id(); 427 - 428 - /* make sure we're not pinning something down */ 429 - load_cr3(swapper_pg_dir); 430 - /* should set up a minimal gdt */ 431 - 432 - set_cpu_online(cpu, false); 433 - 434 - HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL); 435 - BUG(); 436 - } 437 - 438 - static void xen_stop_other_cpus(int wait) 439 - { 440 - smp_call_function(stop_self, NULL, wait); 441 - } 442 - 443 - static void xen_smp_send_reschedule(int cpu) 241 + void xen_smp_send_reschedule(int cpu) 444 242 { 445 243 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); 446 244 } ··· 128 578 xen_send_IPI_one(cpu, vector); 129 579 } 130 580 131 - static void xen_smp_send_call_function_ipi(const struct cpumask *mask) 581 + void xen_smp_send_call_function_ipi(const struct cpumask *mask) 132 582 { 133 583 int cpu; 134 584 ··· 143 593 } 144 594 } 145 595 146 - static void xen_smp_send_call_function_single_ipi(int cpu) 596 + void xen_smp_send_call_function_single_ipi(int cpu) 147 597 { 148 598 __xen_send_IPI_mask(cpumask_of(cpu), 149 599 XEN_CALL_FUNCTION_SINGLE_VECTOR); ··· 247 697 irq_exit(); 248 698 249 699 return IRQ_HANDLED; 250 - } 251 - 252 - static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id) 253 - { 254 - irq_enter(); 255 - irq_work_run(); 256 - inc_irq_stat(apic_irq_work_irqs); 257 - irq_exit(); 258 - 259 - return IRQ_HANDLED; 260 - } 261 - 262 - static const struct smp_ops xen_smp_ops __initconst = { 263 - .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, 
264 - .smp_prepare_cpus = xen_smp_prepare_cpus, 265 - .smp_cpus_done = xen_smp_cpus_done, 266 - 267 - .cpu_up = xen_cpu_up, 268 - .cpu_die = xen_cpu_die, 269 - .cpu_disable = xen_cpu_disable, 270 - .play_dead = xen_play_dead, 271 - 272 - .stop_other_cpus = xen_stop_other_cpus, 273 - .smp_send_reschedule = xen_smp_send_reschedule, 274 - 275 - .send_call_func_ipi = xen_smp_send_call_function_ipi, 276 - .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi, 277 - }; 278 - 279 - void __init xen_smp_init(void) 280 - { 281 - smp_ops = xen_smp_ops; 282 - xen_fill_possible_map(); 283 - } 284 - 285 - static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) 286 - { 287 - native_smp_prepare_cpus(max_cpus); 288 - WARN_ON(xen_smp_intr_init(0)); 289 - 290 - xen_init_lock_cpu(0); 291 - } 292 - 293 - void __init xen_hvm_smp_init(void) 294 - { 295 - smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; 296 - smp_ops.smp_send_reschedule = xen_smp_send_reschedule; 297 - smp_ops.cpu_die = xen_cpu_die; 298 - smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; 299 - smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; 300 - smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu; 301 700 }
+16
arch/x86/xen/smp.h
··· 11 11 12 12 extern int xen_smp_intr_init(unsigned int cpu); 13 13 extern void xen_smp_intr_free(unsigned int cpu); 14 + int xen_smp_intr_init_pv(unsigned int cpu); 15 + void xen_smp_intr_free_pv(unsigned int cpu); 14 16 17 + void xen_smp_send_reschedule(int cpu); 18 + void xen_smp_send_call_function_ipi(const struct cpumask *mask); 19 + void xen_smp_send_call_function_single_ipi(int cpu); 20 + 21 + struct xen_common_irq { 22 + int irq; 23 + char *name; 24 + }; 15 25 #else /* CONFIG_SMP */ 16 26 17 27 static inline int xen_smp_intr_init(unsigned int cpu) ··· 29 19 return 0; 30 20 } 31 21 static inline void xen_smp_intr_free(unsigned int cpu) {} 22 + 23 + static inline int xen_smp_intr_init_pv(unsigned int cpu) 24 + { 25 + return 0; 26 + } 27 + static inline void xen_smp_intr_free_pv(unsigned int cpu) {} 32 28 #endif /* CONFIG_SMP */ 33 29 34 30 #endif
+63
arch/x86/xen/smp_hvm.c
··· 1 + #include <asm/smp.h> 2 + 3 + #include <xen/events.h> 4 + 5 + #include "xen-ops.h" 6 + #include "smp.h" 7 + 8 + 9 + static void __init xen_hvm_smp_prepare_boot_cpu(void) 10 + { 11 + BUG_ON(smp_processor_id() != 0); 12 + native_smp_prepare_boot_cpu(); 13 + 14 + /* 15 + * Setup vcpu_info for boot CPU. 16 + */ 17 + xen_vcpu_setup(0); 18 + 19 + /* 20 + * The alternative logic (which patches the unlock/lock) runs before 21 + * the smp bootup up code is activated. Hence we need to set this up 22 + * the core kernel is being patched. Otherwise we will have only 23 + * modules patched but not core code. 24 + */ 25 + xen_init_spinlocks(); 26 + } 27 + 28 + static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) 29 + { 30 + native_smp_prepare_cpus(max_cpus); 31 + WARN_ON(xen_smp_intr_init(0)); 32 + 33 + xen_init_lock_cpu(0); 34 + } 35 + 36 + #ifdef CONFIG_HOTPLUG_CPU 37 + static void xen_hvm_cpu_die(unsigned int cpu) 38 + { 39 + if (common_cpu_die(cpu) == 0) { 40 + xen_smp_intr_free(cpu); 41 + xen_uninit_lock_cpu(cpu); 42 + xen_teardown_timer(cpu); 43 + } 44 + } 45 + #else 46 + static void xen_hvm_cpu_die(unsigned int cpu) 47 + { 48 + BUG(); 49 + } 50 + #endif 51 + 52 + void __init xen_hvm_smp_init(void) 53 + { 54 + if (!xen_have_vector_callback) 55 + return; 56 + 57 + smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; 58 + smp_ops.smp_send_reschedule = xen_smp_send_reschedule; 59 + smp_ops.cpu_die = xen_hvm_cpu_die; 60 + smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; 61 + smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; 62 + smp_ops.smp_prepare_boot_cpu = xen_hvm_smp_prepare_boot_cpu; 63 + }
+490
arch/x86/xen/smp_pv.c
··· 1 + /* 2 + * Xen SMP support 3 + * 4 + * This file implements the Xen versions of smp_ops. SMP under Xen is 5 + * very straightforward. Bringing a CPU up is simply a matter of 6 + * loading its initial context and setting it running. 7 + * 8 + * IPIs are handled through the Xen event mechanism. 9 + * 10 + * Because virtual CPUs can be scheduled onto any real CPU, there's no 11 + * useful topology information for the kernel to make use of. As a 12 + * result, all CPUs are treated as if they're single-core and 13 + * single-threaded. 14 + */ 15 + #include <linux/sched.h> 16 + #include <linux/err.h> 17 + #include <linux/slab.h> 18 + #include <linux/smp.h> 19 + #include <linux/irq_work.h> 20 + #include <linux/tick.h> 21 + #include <linux/nmi.h> 22 + 23 + #include <asm/paravirt.h> 24 + #include <asm/desc.h> 25 + #include <asm/pgtable.h> 26 + #include <asm/cpu.h> 27 + 28 + #include <xen/interface/xen.h> 29 + #include <xen/interface/vcpu.h> 30 + #include <xen/interface/xenpmu.h> 31 + 32 + #include <asm/xen/interface.h> 33 + #include <asm/xen/hypercall.h> 34 + 35 + #include <xen/xen.h> 36 + #include <xen/page.h> 37 + #include <xen/events.h> 38 + 39 + #include <xen/hvc-console.h> 40 + #include "xen-ops.h" 41 + #include "mmu.h" 42 + #include "smp.h" 43 + #include "pmu.h" 44 + 45 + cpumask_var_t xen_cpu_initialized_map; 46 + 47 + static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 }; 48 + static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 }; 49 + 50 + static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id); 51 + 52 + static void cpu_bringup(void) 53 + { 54 + int cpu; 55 + 56 + cpu_init(); 57 + touch_softlockup_watchdog(); 58 + preempt_disable(); 59 + 60 + /* PVH runs in ring 0 and allows us to do native syscalls. Yay! 
*/ 61 + if (!xen_feature(XENFEAT_supervisor_mode_kernel)) { 62 + xen_enable_sysenter(); 63 + xen_enable_syscall(); 64 + } 65 + cpu = smp_processor_id(); 66 + smp_store_cpu_info(cpu); 67 + cpu_data(cpu).x86_max_cores = 1; 68 + set_cpu_sibling_map(cpu); 69 + 70 + xen_setup_cpu_clockevents(); 71 + 72 + notify_cpu_starting(cpu); 73 + 74 + set_cpu_online(cpu, true); 75 + 76 + cpu_set_state_online(cpu); /* Implies full memory barrier. */ 77 + 78 + /* We can take interrupts now: we're officially "up". */ 79 + local_irq_enable(); 80 + } 81 + 82 + asmlinkage __visible void cpu_bringup_and_idle(void) 83 + { 84 + cpu_bringup(); 85 + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); 86 + } 87 + 88 + void xen_smp_intr_free_pv(unsigned int cpu) 89 + { 90 + if (per_cpu(xen_irq_work, cpu).irq >= 0) { 91 + unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL); 92 + per_cpu(xen_irq_work, cpu).irq = -1; 93 + kfree(per_cpu(xen_irq_work, cpu).name); 94 + per_cpu(xen_irq_work, cpu).name = NULL; 95 + } 96 + 97 + if (per_cpu(xen_pmu_irq, cpu).irq >= 0) { 98 + unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL); 99 + per_cpu(xen_pmu_irq, cpu).irq = -1; 100 + kfree(per_cpu(xen_pmu_irq, cpu).name); 101 + per_cpu(xen_pmu_irq, cpu).name = NULL; 102 + } 103 + } 104 + 105 + int xen_smp_intr_init_pv(unsigned int cpu) 106 + { 107 + int rc; 108 + char *callfunc_name, *pmu_name; 109 + 110 + callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu); 111 + rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR, 112 + cpu, 113 + xen_irq_work_interrupt, 114 + IRQF_PERCPU|IRQF_NOBALANCING, 115 + callfunc_name, 116 + NULL); 117 + if (rc < 0) 118 + goto fail; 119 + per_cpu(xen_irq_work, cpu).irq = rc; 120 + per_cpu(xen_irq_work, cpu).name = callfunc_name; 121 + 122 + if (is_xen_pmu(cpu)) { 123 + pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu); 124 + rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu, 125 + xen_pmu_irq_handler, 126 + IRQF_PERCPU|IRQF_NOBALANCING, 127 + pmu_name, NULL); 128 + if (rc < 0) 129 + goto 
fail; 130 + per_cpu(xen_pmu_irq, cpu).irq = rc; 131 + per_cpu(xen_pmu_irq, cpu).name = pmu_name; 132 + } 133 + 134 + return 0; 135 + 136 + fail: 137 + xen_smp_intr_free_pv(cpu); 138 + return rc; 139 + } 140 + 141 + static void __init xen_fill_possible_map(void) 142 + { 143 + int i, rc; 144 + 145 + if (xen_initial_domain()) 146 + return; 147 + 148 + for (i = 0; i < nr_cpu_ids; i++) { 149 + rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); 150 + if (rc >= 0) { 151 + num_processors++; 152 + set_cpu_possible(i, true); 153 + } 154 + } 155 + } 156 + 157 + static void __init xen_filter_cpu_maps(void) 158 + { 159 + int i, rc; 160 + unsigned int subtract = 0; 161 + 162 + if (!xen_initial_domain()) 163 + return; 164 + 165 + num_processors = 0; 166 + disabled_cpus = 0; 167 + for (i = 0; i < nr_cpu_ids; i++) { 168 + rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); 169 + if (rc >= 0) { 170 + num_processors++; 171 + set_cpu_possible(i, true); 172 + } else { 173 + set_cpu_possible(i, false); 174 + set_cpu_present(i, false); 175 + subtract++; 176 + } 177 + } 178 + #ifdef CONFIG_HOTPLUG_CPU 179 + /* This is akin to using 'nr_cpus' on the Linux command line. 180 + * Which is OK as when we use 'dom0_max_vcpus=X' we can only 181 + * have up to X, while nr_cpu_ids is greater than X. This 182 + * normally is not a problem, except when CPU hotplugging 183 + * is involved and then there might be more than X CPUs 184 + * in the guest - which will not work as there is no 185 + * hypercall to expand the max number of VCPUs an already 186 + * running guest has. So cap it up to X. */ 187 + if (subtract) 188 + nr_cpu_ids = nr_cpu_ids - subtract; 189 + #endif 190 + 191 + } 192 + 193 + static void __init xen_pv_smp_prepare_boot_cpu(void) 194 + { 195 + BUG_ON(smp_processor_id() != 0); 196 + native_smp_prepare_boot_cpu(); 197 + 198 + if (!xen_feature(XENFEAT_writable_page_tables)) 199 + /* We've switched to the "real" per-cpu gdt, so make 200 + * sure the old memory can be recycled. 
*/ 201 + make_lowmem_page_readwrite(xen_initial_gdt); 202 + 203 + #ifdef CONFIG_X86_32 204 + /* 205 + * Xen starts us with XEN_FLAT_RING1_DS, but linux code 206 + * expects __USER_DS 207 + */ 208 + loadsegment(ds, __USER_DS); 209 + loadsegment(es, __USER_DS); 210 + #endif 211 + 212 + xen_filter_cpu_maps(); 213 + xen_setup_vcpu_info_placement(); 214 + 215 + /* 216 + * The alternative logic (which patches the unlock/lock) runs before 217 + * the smp bootup up code is activated. Hence we need to set this up 218 + * the core kernel is being patched. Otherwise we will have only 219 + * modules patched but not core code. 220 + */ 221 + xen_init_spinlocks(); 222 + } 223 + 224 + static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus) 225 + { 226 + unsigned cpu; 227 + unsigned int i; 228 + 229 + if (skip_ioapic_setup) { 230 + char *m = (max_cpus == 0) ? 231 + "The nosmp parameter is incompatible with Xen; " \ 232 + "use Xen dom0_max_vcpus=1 parameter" : 233 + "The noapic parameter is incompatible with Xen"; 234 + 235 + xen_raw_printk(m); 236 + panic(m); 237 + } 238 + xen_init_lock_cpu(0); 239 + 240 + smp_store_boot_cpu_info(); 241 + cpu_data(0).x86_max_cores = 1; 242 + 243 + for_each_possible_cpu(i) { 244 + zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); 245 + zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); 246 + zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); 247 + } 248 + set_cpu_sibling_map(0); 249 + 250 + xen_pmu_init(0); 251 + 252 + if (xen_smp_intr_init(0) || xen_smp_intr_init_pv(0)) 253 + BUG(); 254 + 255 + if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL)) 256 + panic("could not allocate xen_cpu_initialized_map\n"); 257 + 258 + cpumask_copy(xen_cpu_initialized_map, cpumask_of(0)); 259 + 260 + /* Restrict the possible_map according to max_cpus. 
*/ 261 + while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { 262 + for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--) 263 + continue; 264 + set_cpu_possible(cpu, false); 265 + } 266 + 267 + for_each_possible_cpu(cpu) 268 + set_cpu_present(cpu, true); 269 + } 270 + 271 + static int 272 + cpu_initialize_context(unsigned int cpu, struct task_struct *idle) 273 + { 274 + struct vcpu_guest_context *ctxt; 275 + struct desc_struct *gdt; 276 + unsigned long gdt_mfn; 277 + 278 + /* used to tell cpu_init() that it can proceed with initialization */ 279 + cpumask_set_cpu(cpu, cpu_callout_mask); 280 + if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map)) 281 + return 0; 282 + 283 + ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL); 284 + if (ctxt == NULL) 285 + return -ENOMEM; 286 + 287 + gdt = get_cpu_gdt_rw(cpu); 288 + 289 + #ifdef CONFIG_X86_32 290 + ctxt->user_regs.fs = __KERNEL_PERCPU; 291 + ctxt->user_regs.gs = __KERNEL_STACK_CANARY; 292 + #endif 293 + memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt)); 294 + 295 + ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; 296 + ctxt->flags = VGCF_IN_KERNEL; 297 + ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ 298 + ctxt->user_regs.ds = __USER_DS; 299 + ctxt->user_regs.es = __USER_DS; 300 + ctxt->user_regs.ss = __KERNEL_DS; 301 + 302 + xen_copy_trap_info(ctxt->trap_ctxt); 303 + 304 + ctxt->ldt_ents = 0; 305 + 306 + BUG_ON((unsigned long)gdt & ~PAGE_MASK); 307 + 308 + gdt_mfn = arbitrary_virt_to_mfn(gdt); 309 + make_lowmem_page_readonly(gdt); 310 + make_lowmem_page_readonly(mfn_to_virt(gdt_mfn)); 311 + 312 + ctxt->gdt_frames[0] = gdt_mfn; 313 + ctxt->gdt_ents = GDT_ENTRIES; 314 + 315 + ctxt->kernel_ss = __KERNEL_DS; 316 + ctxt->kernel_sp = idle->thread.sp0; 317 + 318 + #ifdef CONFIG_X86_32 319 + ctxt->event_callback_cs = __KERNEL_CS; 320 + ctxt->failsafe_callback_cs = __KERNEL_CS; 321 + #else 322 + ctxt->gs_base_kernel = per_cpu_offset(cpu); 323 + #endif 324 + ctxt->event_callback_eip = 325 + 
(unsigned long)xen_hypervisor_callback; 326 + ctxt->failsafe_callback_eip = 327 + (unsigned long)xen_failsafe_callback; 328 + ctxt->user_regs.cs = __KERNEL_CS; 329 + per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); 330 + 331 + ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); 332 + ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir)); 333 + if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt)) 334 + BUG(); 335 + 336 + kfree(ctxt); 337 + return 0; 338 + } 339 + 340 + static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle) 341 + { 342 + int rc; 343 + 344 + common_cpu_up(cpu, idle); 345 + 346 + xen_setup_runstate_info(cpu); 347 + 348 + /* 349 + * PV VCPUs are always successfully taken down (see 'while' loop 350 + * in xen_cpu_die()), so -EBUSY is an error. 351 + */ 352 + rc = cpu_check_up_prepare(cpu); 353 + if (rc) 354 + return rc; 355 + 356 + /* make sure interrupts start blocked */ 357 + per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; 358 + 359 + rc = cpu_initialize_context(cpu, idle); 360 + if (rc) 361 + return rc; 362 + 363 + xen_pmu_init(cpu); 364 + 365 + rc = HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL); 366 + BUG_ON(rc); 367 + 368 + while (cpu_report_state(cpu) != CPU_ONLINE) 369 + HYPERVISOR_sched_op(SCHEDOP_yield, NULL); 370 + 371 + return 0; 372 + } 373 + 374 + static void xen_pv_smp_cpus_done(unsigned int max_cpus) 375 + { 376 + } 377 + 378 + #ifdef CONFIG_HOTPLUG_CPU 379 + static int xen_pv_cpu_disable(void) 380 + { 381 + unsigned int cpu = smp_processor_id(); 382 + if (cpu == 0) 383 + return -EBUSY; 384 + 385 + cpu_disable_common(); 386 + 387 + load_cr3(swapper_pg_dir); 388 + return 0; 389 + } 390 + 391 + static void xen_pv_cpu_die(unsigned int cpu) 392 + { 393 + while (HYPERVISOR_vcpu_op(VCPUOP_is_up, 394 + xen_vcpu_nr(cpu), NULL)) { 395 + __set_current_state(TASK_UNINTERRUPTIBLE); 396 + schedule_timeout(HZ/10); 397 + } 398 + 399 + if (common_cpu_die(cpu) == 0) { 400 + xen_smp_intr_free(cpu); 401 
+ xen_uninit_lock_cpu(cpu); 402 + xen_teardown_timer(cpu); 403 + xen_pmu_finish(cpu); 404 + } 405 + } 406 + 407 + static void xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */ 408 + { 409 + play_dead_common(); 410 + HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(smp_processor_id()), NULL); 411 + cpu_bringup(); 412 + /* 413 + * commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down) 414 + * clears certain data that the cpu_idle loop (which called us 415 + * and that we return from) expects. The only way to get that 416 + * data back is to call: 417 + */ 418 + tick_nohz_idle_enter(); 419 + 420 + cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); 421 + } 422 + 423 + #else /* !CONFIG_HOTPLUG_CPU */ 424 + static int xen_pv_cpu_disable(void) 425 + { 426 + return -ENOSYS; 427 + } 428 + 429 + static void xen_pv_cpu_die(unsigned int cpu) 430 + { 431 + BUG(); 432 + } 433 + 434 + static void xen_pv_play_dead(void) 435 + { 436 + BUG(); 437 + } 438 + 439 + #endif 440 + static void stop_self(void *v) 441 + { 442 + int cpu = smp_processor_id(); 443 + 444 + /* make sure we're not pinning something down */ 445 + load_cr3(swapper_pg_dir); 446 + /* should set up a minimal gdt */ 447 + 448 + set_cpu_online(cpu, false); 449 + 450 + HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL); 451 + BUG(); 452 + } 453 + 454 + static void xen_pv_stop_other_cpus(int wait) 455 + { 456 + smp_call_function(stop_self, NULL, wait); 457 + } 458 + 459 + static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id) 460 + { 461 + irq_enter(); 462 + irq_work_run(); 463 + inc_irq_stat(apic_irq_work_irqs); 464 + irq_exit(); 465 + 466 + return IRQ_HANDLED; 467 + } 468 + 469 + static const struct smp_ops xen_smp_ops __initconst = { 470 + .smp_prepare_boot_cpu = xen_pv_smp_prepare_boot_cpu, 471 + .smp_prepare_cpus = xen_pv_smp_prepare_cpus, 472 + .smp_cpus_done = xen_pv_smp_cpus_done, 473 + 474 + .cpu_up = xen_pv_cpu_up, 475 + .cpu_die = xen_pv_cpu_die, 476 + .cpu_disable = xen_pv_cpu_disable, 477 + 
.play_dead = xen_pv_play_dead, 478 + 479 + .stop_other_cpus = xen_pv_stop_other_cpus, 480 + .smp_send_reschedule = xen_smp_send_reschedule, 481 + 482 + .send_call_func_ipi = xen_smp_send_call_function_ipi, 483 + .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi, 484 + }; 485 + 486 + void __init xen_smp_init(void) 487 + { 488 + smp_ops = xen_smp_ops; 489 + xen_fill_possible_map(); 490 + }
-54
arch/x86/xen/suspend.c
··· 14 14 #include "mmu.h" 15 15 #include "pmu.h" 16 16 17 - static void xen_pv_pre_suspend(void) 18 - { 19 - xen_mm_pin_all(); 20 - 21 - xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); 22 - xen_start_info->console.domU.mfn = 23 - mfn_to_pfn(xen_start_info->console.domU.mfn); 24 - 25 - BUG_ON(!irqs_disabled()); 26 - 27 - HYPERVISOR_shared_info = &xen_dummy_shared_info; 28 - if (HYPERVISOR_update_va_mapping(fix_to_virt(FIX_PARAVIRT_BOOTMAP), 29 - __pte_ma(0), 0)) 30 - BUG(); 31 - } 32 - 33 - static void xen_hvm_post_suspend(int suspend_cancelled) 34 - { 35 - #ifdef CONFIG_XEN_PVHVM 36 - int cpu; 37 - if (!suspend_cancelled) 38 - xen_hvm_init_shared_info(); 39 - xen_callback_vector(); 40 - xen_unplug_emulated_devices(); 41 - if (xen_feature(XENFEAT_hvm_safe_pvclock)) { 42 - for_each_online_cpu(cpu) { 43 - xen_setup_runstate_info(cpu); 44 - } 45 - } 46 - #endif 47 - } 48 - 49 - static void xen_pv_post_suspend(int suspend_cancelled) 50 - { 51 - xen_build_mfn_list_list(); 52 - 53 - xen_setup_shared_info(); 54 - 55 - if (suspend_cancelled) { 56 - xen_start_info->store_mfn = 57 - pfn_to_mfn(xen_start_info->store_mfn); 58 - xen_start_info->console.domU.mfn = 59 - pfn_to_mfn(xen_start_info->console.domU.mfn); 60 - } else { 61 - #ifdef CONFIG_SMP 62 - BUG_ON(xen_cpu_initialized_map == NULL); 63 - cpumask_copy(xen_cpu_initialized_map, cpu_online_mask); 64 - #endif 65 - xen_vcpu_restore(); 66 - } 67 - 68 - xen_mm_unpin_all(); 69 - } 70 - 71 17 void xen_arch_pre_suspend(void) 72 18 { 73 19 if (xen_pv_domain())
+22
arch/x86/xen/suspend_hvm.c
··· 1 + #include <linux/types.h> 2 + 3 + #include <xen/xen.h> 4 + #include <xen/features.h> 5 + #include <xen/interface/features.h> 6 + 7 + #include "xen-ops.h" 8 + 9 + void xen_hvm_post_suspend(int suspend_cancelled) 10 + { 11 + int cpu; 12 + 13 + if (!suspend_cancelled) 14 + xen_hvm_init_shared_info(); 15 + xen_callback_vector(); 16 + xen_unplug_emulated_devices(); 17 + if (xen_feature(XENFEAT_hvm_safe_pvclock)) { 18 + for_each_online_cpu(cpu) { 19 + xen_setup_runstate_info(cpu); 20 + } 21 + } 22 + }
+46
arch/x86/xen/suspend_pv.c
··· 1 + #include <linux/types.h> 2 + 3 + #include <asm/fixmap.h> 4 + 5 + #include <asm/xen/hypercall.h> 6 + #include <asm/xen/page.h> 7 + 8 + #include "xen-ops.h" 9 + 10 + void xen_pv_pre_suspend(void) 11 + { 12 + xen_mm_pin_all(); 13 + 14 + xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); 15 + xen_start_info->console.domU.mfn = 16 + mfn_to_pfn(xen_start_info->console.domU.mfn); 17 + 18 + BUG_ON(!irqs_disabled()); 19 + 20 + HYPERVISOR_shared_info = &xen_dummy_shared_info; 21 + if (HYPERVISOR_update_va_mapping(fix_to_virt(FIX_PARAVIRT_BOOTMAP), 22 + __pte_ma(0), 0)) 23 + BUG(); 24 + } 25 + 26 + void xen_pv_post_suspend(int suspend_cancelled) 27 + { 28 + xen_build_mfn_list_list(); 29 + 30 + xen_setup_shared_info(); 31 + 32 + if (suspend_cancelled) { 33 + xen_start_info->store_mfn = 34 + pfn_to_mfn(xen_start_info->store_mfn); 35 + xen_start_info->console.domU.mfn = 36 + pfn_to_mfn(xen_start_info->console.domU.mfn); 37 + } else { 38 + #ifdef CONFIG_SMP 39 + BUG_ON(xen_cpu_initialized_map == NULL); 40 + cpumask_copy(xen_cpu_initialized_map, cpu_online_mask); 41 + #endif 42 + xen_vcpu_restore(); 43 + } 44 + 45 + xen_mm_unpin_all(); 46 + }
+8
arch/x86/xen/time.c
··· 436 436 437 437 void __init xen_hvm_init_time_ops(void) 438 438 { 439 + /* 440 + * vector callback is needed otherwise we cannot receive interrupts 441 + * on cpu > 0 and at this point we don't know how many cpus are 442 + * available. 443 + */ 444 + if (!xen_have_vector_callback) 445 + return; 446 + 439 447 if (!xen_feature(XENFEAT_hvm_safe_pvclock)) { 440 448 printk(KERN_INFO "Xen doesn't support pvclock on HVM," 441 449 "disable pv timer\n");
+4
arch/x86/xen/xen-head.S
··· 16 16 #include <xen/interface/xen-mca.h> 17 17 #include <asm/xen/interface.h> 18 18 19 + #ifdef CONFIG_XEN_PV 19 20 __INIT 20 21 ENTRY(startup_xen) 21 22 cld ··· 35 34 jmp xen_start_kernel 36 35 37 36 __FINIT 37 + #endif 38 38 39 39 .pushsection .text 40 40 .balign PAGE_SIZE ··· 60 58 /* Map the p2m table to a 512GB-aligned user address. */ 61 59 ELFNOTE(Xen, XEN_ELFNOTE_INIT_P2M, .quad PGDIR_SIZE) 62 60 #endif 61 + #ifdef CONFIG_XEN_PV 63 62 ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) 63 + #endif 64 64 ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) 65 65 ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, 66 66 .ascii "!writable_page_tables|pae_pgdir_above_4gb")
+22
arch/x86/xen/xen-ops.h
··· 76 76 77 77 bool xen_vcpu_stolen(int vcpu); 78 78 79 + extern int xen_have_vcpu_info_placement; 80 + 79 81 void xen_vcpu_setup(int cpu); 80 82 void xen_setup_vcpu_info_placement(void); 81 83 ··· 147 145 __visible void xen_adjust_exception_frame(void); 148 146 149 147 extern int xen_panic_handler_init(void); 148 + 149 + int xen_cpuhp_setup(int (*cpu_up_prepare_cb)(unsigned int), 150 + int (*cpu_dead_cb)(unsigned int)); 151 + 152 + void xen_pin_vcpu(int cpu); 153 + 154 + void xen_emergency_restart(void); 155 + #ifdef CONFIG_XEN_PV 156 + void xen_pv_pre_suspend(void); 157 + void xen_pv_post_suspend(int suspend_cancelled); 158 + #else 159 + static inline void xen_pv_pre_suspend(void) {} 160 + static inline void xen_pv_post_suspend(int suspend_cancelled) {} 161 + #endif 162 + 163 + #ifdef CONFIG_XEN_PVHVM 164 + void xen_hvm_post_suspend(int suspend_cancelled); 165 + #else 166 + static inline void xen_hvm_post_suspend(int suspend_cancelled) {} 167 + #endif 150 168 151 169 #endif /* XEN_OPS_H */
+1 -1
drivers/scsi/xen-scsifront.c
··· 434 434 435 435 if (seg_grants) { 436 436 page = virt_to_page(seg); 437 - off = (unsigned long)seg & ~PAGE_MASK; 437 + off = offset_in_page(seg); 438 438 len = sizeof(struct scsiif_request_segment) * data_grants; 439 439 while (len > 0) { 440 440 bytes = min_t(unsigned int, len, PAGE_SIZE - off);
+20 -10
drivers/xen/balloon.c
··· 709 709 } 710 710 EXPORT_SYMBOL(free_xenballooned_pages); 711 711 712 + #ifdef CONFIG_XEN_PV 712 713 static void __init balloon_add_region(unsigned long start_pfn, 713 714 unsigned long pages) 714 715 { ··· 733 732 734 733 balloon_stats.total_pages += extra_pfn_end - start_pfn; 735 734 } 735 + #endif 736 736 737 737 static int __init balloon_init(void) 738 738 { 739 - int i; 740 - 741 739 if (!xen_domain()) 742 740 return -ENODEV; 743 741 744 742 pr_info("Initialising balloon driver\n"); 745 743 744 + #ifdef CONFIG_XEN_PV 746 745 balloon_stats.current_pages = xen_pv_domain() 747 746 ? min(xen_start_info->nr_pages - xen_released_pages, max_pfn) 748 747 : get_num_physpages(); 748 + #else 749 + balloon_stats.current_pages = get_num_physpages(); 750 + #endif 749 751 balloon_stats.target_pages = balloon_stats.current_pages; 750 752 balloon_stats.balloon_low = 0; 751 753 balloon_stats.balloon_high = 0; ··· 765 761 register_sysctl_table(xen_root); 766 762 #endif 767 763 768 - /* 769 - * Initialize the balloon with pages from the extra memory 770 - * regions (see arch/x86/xen/setup.c). 771 - */ 772 - for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) 773 - if (xen_extra_mem[i].n_pfns) 774 - balloon_add_region(xen_extra_mem[i].start_pfn, 775 - xen_extra_mem[i].n_pfns); 764 + #ifdef CONFIG_XEN_PV 765 + { 766 + int i; 767 + 768 + /* 769 + * Initialize the balloon with pages from the extra memory 770 + * regions (see arch/x86/xen/setup.c). 771 + */ 772 + for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) 773 + if (xen_extra_mem[i].n_pfns) 774 + balloon_add_region(xen_extra_mem[i].start_pfn, 775 + xen_extra_mem[i].n_pfns); 776 + } 777 + #endif 776 778 777 779 return 0; 778 780 }
+18
drivers/xen/efi.c
··· 26 26 #include <xen/interface/xen.h> 27 27 #include <xen/interface/platform.h> 28 28 #include <xen/xen.h> 29 + #include <xen/xen-ops.h> 29 30 30 31 #include <asm/page.h> 31 32 ··· 264 263 return efi_data(op).status; 265 264 } 266 265 EXPORT_SYMBOL_GPL(xen_efi_query_capsule_caps); 266 + 267 + void xen_efi_reset_system(int reset_type, efi_status_t status, 268 + unsigned long data_size, efi_char16_t *data) 269 + { 270 + switch (reset_type) { 271 + case EFI_RESET_COLD: 272 + case EFI_RESET_WARM: 273 + xen_reboot(SHUTDOWN_reboot); 274 + break; 275 + case EFI_RESET_SHUTDOWN: 276 + xen_reboot(SHUTDOWN_poweroff); 277 + break; 278 + default: 279 + BUG(); 280 + } 281 + } 282 + EXPORT_SYMBOL_GPL(xen_efi_reset_system);
+17 -8
drivers/xen/events/events_base.c
··· 1312 1312 if (!VALID_EVTCHN(evtchn)) 1313 1313 return -1; 1314 1314 1315 + if (!xen_support_evtchn_rebind()) 1316 + return -1; 1317 + 1315 1318 /* Send future instances of this interrupt to other vcpu. */ 1316 1319 bind_vcpu.port = evtchn; 1317 1320 bind_vcpu.vcpu = xen_vcpu_nr(tcpu); ··· 1649 1646 int rc; 1650 1647 uint64_t callback_via; 1651 1648 1652 - callback_via = HVM_CALLBACK_VECTOR(HYPERVISOR_CALLBACK_VECTOR); 1653 - rc = xen_set_callback_via(callback_via); 1654 - BUG_ON(rc); 1655 - pr_info("Xen HVM callback vector for event delivery is enabled\n"); 1656 - /* in the restore case the vector has already been allocated */ 1657 - if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) 1658 - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, 1659 - xen_hvm_callback_vector); 1649 + if (xen_have_vector_callback) { 1650 + callback_via = HVM_CALLBACK_VECTOR(HYPERVISOR_CALLBACK_VECTOR); 1651 + rc = xen_set_callback_via(callback_via); 1652 + if (rc) { 1653 + pr_err("Request for Xen HVM callback vector failed\n"); 1654 + xen_have_vector_callback = 0; 1655 + return; 1656 + } 1657 + pr_info("Xen HVM callback vector for event delivery is enabled\n"); 1658 + /* in the restore case the vector has already been allocated */ 1659 + if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) 1660 + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, 1661 + xen_hvm_callback_vector); 1662 + } 1660 1663 } 1661 1664 #else 1662 1665 void xen_callback_vector(void) {}
+4 -10
drivers/xen/platform-pci.c
··· 90 90 static int platform_pci_resume(struct pci_dev *pdev) 91 91 { 92 92 int err; 93 - if (!xen_pv_domain()) 93 + 94 + if (xen_have_vector_callback) 94 95 return 0; 96 + 95 97 err = xen_set_callback_via(callback_via); 96 98 if (err) { 97 99 dev_err(&pdev->dev, "platform_pci_resume failure!\n"); ··· 139 137 140 138 platform_mmio = mmio_addr; 141 139 platform_mmiolen = mmio_len; 142 - 143 - /* 144 - * Xen HVM guests always use the vector callback mechanism. 145 - * L1 Dom0 in a nested Xen environment is a PV guest inside in an 146 - * HVM environment. It needs the platform-pci driver to get 147 - * notifications from L0 Xen, but it cannot use the vector callback 148 - * as it is not exported by L1 Xen. 149 - */ 150 - if (xen_pv_domain()) { 140 + if (!xen_have_vector_callback) { 151 141 ret = xen_allocate_irq(pdev); 152 142 if (ret) { 153 143 dev_warn(&pdev->dev, "request_irq failed err=%d\n", ret);
+4 -4
drivers/xen/swiotlb-xen.c
··· 693 693 unsigned long attrs) 694 694 { 695 695 #if defined(CONFIG_ARM) || defined(CONFIG_ARM64) 696 - if (__generic_dma_ops(dev)->mmap) 697 - return __generic_dma_ops(dev)->mmap(dev, vma, cpu_addr, 696 + if (xen_get_dma_ops(dev)->mmap) 697 + return xen_get_dma_ops(dev)->mmap(dev, vma, cpu_addr, 698 698 dma_addr, size, attrs); 699 699 #endif 700 700 return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size); ··· 711 711 unsigned long attrs) 712 712 { 713 713 #if defined(CONFIG_ARM) || defined(CONFIG_ARM64) 714 - if (__generic_dma_ops(dev)->get_sgtable) { 714 + if (xen_get_dma_ops(dev)->get_sgtable) { 715 715 #if 0 716 716 /* 717 717 * This check verifies that the page belongs to the current domain and ··· 721 721 unsigned long bfn = PHYS_PFN(dma_to_phys(dev, handle)); 722 722 BUG_ON (!page_is_ram(bfn)); 723 723 #endif 724 - return __generic_dma_ops(dev)->get_sgtable(dev, sgt, cpu_addr, 724 + return xen_get_dma_ops(dev)->get_sgtable(dev, sgt, cpu_addr, 725 725 handle, size, attrs); 726 726 } 727 727 #endif
+17 -9
include/xen/arm/page-coherent.h
··· 2 2 #define _ASM_ARM_XEN_PAGE_COHERENT_H 3 3 4 4 #include <asm/page.h> 5 + #include <asm/dma-mapping.h> 5 6 #include <linux/dma-mapping.h> 7 + 8 + static inline const struct dma_map_ops *xen_get_dma_ops(struct device *dev) 9 + { 10 + if (dev && dev->archdata.dev_dma_ops) 11 + return dev->archdata.dev_dma_ops; 12 + return get_arch_dma_ops(NULL); 13 + } 6 14 7 15 void __xen_dma_map_page(struct device *hwdev, struct page *page, 8 16 dma_addr_t dev_addr, unsigned long offset, size_t size, ··· 27 19 static inline void *xen_alloc_coherent_pages(struct device *hwdev, size_t size, 28 20 dma_addr_t *dma_handle, gfp_t flags, unsigned long attrs) 29 21 { 30 - return __generic_dma_ops(hwdev)->alloc(hwdev, size, dma_handle, flags, attrs); 22 + return xen_get_dma_ops(hwdev)->alloc(hwdev, size, dma_handle, flags, attrs); 31 23 } 32 24 33 25 static inline void xen_free_coherent_pages(struct device *hwdev, size_t size, 34 26 void *cpu_addr, dma_addr_t dma_handle, unsigned long attrs) 35 27 { 36 - __generic_dma_ops(hwdev)->free(hwdev, size, cpu_addr, dma_handle, attrs); 28 + xen_get_dma_ops(hwdev)->free(hwdev, size, cpu_addr, dma_handle, attrs); 37 29 } 38 30 39 31 static inline void xen_dma_map_page(struct device *hwdev, struct page *page, ··· 57 49 * specific function. 58 50 */ 59 51 if (local) 60 - __generic_dma_ops(hwdev)->map_page(hwdev, page, offset, size, dir, attrs); 52 + xen_get_dma_ops(hwdev)->map_page(hwdev, page, offset, size, dir, attrs); 61 53 else 62 54 __xen_dma_map_page(hwdev, page, dev_addr, offset, size, dir, attrs); 63 55 } ··· 75 67 * specific function. 
76 68 */ 77 69 if (pfn_valid(pfn)) { 78 - if (__generic_dma_ops(hwdev)->unmap_page) 79 - __generic_dma_ops(hwdev)->unmap_page(hwdev, handle, size, dir, attrs); 70 + if (xen_get_dma_ops(hwdev)->unmap_page) 71 + xen_get_dma_ops(hwdev)->unmap_page(hwdev, handle, size, dir, attrs); 80 72 } else 81 73 __xen_dma_unmap_page(hwdev, handle, size, dir, attrs); 82 74 } ··· 86 78 { 87 79 unsigned long pfn = PFN_DOWN(handle); 88 80 if (pfn_valid(pfn)) { 89 - if (__generic_dma_ops(hwdev)->sync_single_for_cpu) 90 - __generic_dma_ops(hwdev)->sync_single_for_cpu(hwdev, handle, size, dir); 81 + if (xen_get_dma_ops(hwdev)->sync_single_for_cpu) 82 + xen_get_dma_ops(hwdev)->sync_single_for_cpu(hwdev, handle, size, dir); 91 83 } else 92 84 __xen_dma_sync_single_for_cpu(hwdev, handle, size, dir); 93 85 } ··· 97 89 { 98 90 unsigned long pfn = PFN_DOWN(handle); 99 91 if (pfn_valid(pfn)) { 100 - if (__generic_dma_ops(hwdev)->sync_single_for_device) 101 - __generic_dma_ops(hwdev)->sync_single_for_device(hwdev, handle, size, dir); 92 + if (xen_get_dma_ops(hwdev)->sync_single_for_device) 93 + xen_get_dma_ops(hwdev)->sync_single_for_device(hwdev, handle, size, dir); 102 94 } else 103 95 __xen_dma_sync_single_for_device(hwdev, handle, size, dir); 104 96 }
+36
include/xen/interface/io/9pfs.h
··· 1 + /* 2 + * 9pfs.h -- Xen 9PFS transport 3 + * 4 + * Permission is hereby granted, free of charge, to any person obtaining a copy 5 + * of this software and associated documentation files (the "Software"), to 6 + * deal in the Software without restriction, including without limitation the 7 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 8 + * sell copies of the Software, and to permit persons to whom the Software is 9 + * furnished to do so, subject to the following conditions: 10 + * 11 + * The above copyright notice and this permission notice shall be included in 12 + * all copies or substantial portions of the Software. 13 + * 14 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20 + * DEALINGS IN THE SOFTWARE. 21 + * 22 + * Copyright (C) 2017 Stefano Stabellini <stefano@aporeto.com> 23 + */ 24 + 25 + #ifndef __XEN_PUBLIC_IO_9PFS_H__ 26 + #define __XEN_PUBLIC_IO_9PFS_H__ 27 + 28 + #include "xen/interface/io/ring.h" 29 + 30 + /* 31 + * See docs/misc/9pfs.markdown in xen.git for the full specification: 32 + * https://xenbits.xen.org/docs/unstable/misc/9pfs.html 33 + */ 34 + DEFINE_XEN_FLEX_RING_AND_INTF(xen_9pfs); 35 + 36 + #endif
+854
include/xen/interface/io/displif.h
··· 1 + /****************************************************************************** 2 + * displif.h 3 + * 4 + * Unified display device I/O interface for Xen guest OSes. 5 + * 6 + * Permission is hereby granted, free of charge, to any person obtaining a copy 7 + * of this software and associated documentation files (the "Software"), to 8 + * deal in the Software without restriction, including without limitation the 9 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 10 + * sell copies of the Software, and to permit persons to whom the Software is 11 + * furnished to do so, subject to the following conditions: 12 + * 13 + * The above copyright notice and this permission notice shall be included in 14 + * all copies or substantial portions of the Software. 15 + * 16 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 + * DEALINGS IN THE SOFTWARE. 23 + * 24 + * Copyright (C) 2016-2017 EPAM Systems Inc. 
25 + * 26 + * Authors: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com> 27 + * Oleksandr Grytsov <oleksandr_grytsov@epam.com> 28 + */ 29 + 30 + #ifndef __XEN_PUBLIC_IO_DISPLIF_H__ 31 + #define __XEN_PUBLIC_IO_DISPLIF_H__ 32 + 33 + #include "ring.h" 34 + #include "../grant_table.h" 35 + 36 + /* 37 + ****************************************************************************** 38 + * Protocol version 39 + ****************************************************************************** 40 + */ 41 + #define XENDISPL_PROTOCOL_VERSION "1" 42 + 43 + /* 44 + ****************************************************************************** 45 + * Main features provided by the protocol 46 + ****************************************************************************** 47 + * This protocol aims to provide a unified protocol which fits more 48 + * sophisticated use-cases than a framebuffer device can handle. At the 49 + * moment basic functionality is supported with the intention to be extended: 50 + * o multiple dynamically allocated/destroyed framebuffers 51 + * o buffers of arbitrary sizes 52 + * o buffer allocation at either back or front end 53 + * o better configuration options including multiple display support 54 + * 55 + * Note: existing fbif can be used together with displif running at the 56 + * same time, e.g. on Linux one provides framebuffer and another DRM/KMS 57 + * 58 + * Note: display resolution (XenStore's "resolution" property) defines 59 + * visible area of the virtual display. At the same time resolution of 60 + * the display and frame buffers may differ: buffers can be smaller, equal 61 + * or bigger than the visible area. This is to enable use-cases, where backend 62 + * may do some post-processing of the display and frame buffers supplied, 63 + * e.g. those buffers can be just a part of the final composition. 
64 + * 65 + ****************************************************************************** 66 + * Direction of improvements 67 + ****************************************************************************** 68 + * Future extensions to the existing protocol may include: 69 + * o display/connector cloning 70 + * o allocation of objects other than display buffers 71 + * o plane/overlay support 72 + * o scaling support 73 + * o rotation support 74 + * 75 + ****************************************************************************** 76 + * Feature and Parameter Negotiation 77 + ****************************************************************************** 78 + * 79 + * Front->back notifications: when enqueuing a new request, sending a 80 + * notification can be made conditional on xendispl_req (i.e., the generic 81 + * hold-off mechanism provided by the ring macros). Backends must set 82 + * xendispl_req appropriately (e.g., using RING_FINAL_CHECK_FOR_REQUESTS()). 83 + * 84 + * Back->front notifications: when enqueuing a new response, sending a 85 + * notification can be made conditional on xendispl_resp (i.e., the generic 86 + * hold-off mechanism provided by the ring macros). Frontends must set 87 + * xendispl_resp appropriately (e.g., using RING_FINAL_CHECK_FOR_RESPONSES()). 88 + * 89 + * The two halves of a para-virtual display driver utilize nodes within 90 + * XenStore to communicate capabilities and to negotiate operating parameters. 91 + * This section enumerates these nodes which reside in the respective front and 92 + * backend portions of XenStore, following the XenBus convention. 93 + * 94 + * All data in XenStore is stored as strings. Nodes specifying numeric 95 + * values are encoded in decimal. Integer value ranges listed below are 96 + * expressed as fixed sized integer types capable of storing the conversion 97 + * of a properly formated node string, without loss of information. 
98 + * 99 + ****************************************************************************** 100 + * Example configuration 101 + ****************************************************************************** 102 + * 103 + * Note: depending on the use-case backend can expose more display connectors 104 + * than the underlying HW physically has by employing SW graphics compositors 105 + * 106 + * This is an example of backend and frontend configuration: 107 + * 108 + *--------------------------------- Backend ----------------------------------- 109 + * 110 + * /local/domain/0/backend/vdispl/1/0/frontend-id = "1" 111 + * /local/domain/0/backend/vdispl/1/0/frontend = "/local/domain/1/device/vdispl/0" 112 + * /local/domain/0/backend/vdispl/1/0/state = "4" 113 + * /local/domain/0/backend/vdispl/1/0/versions = "1,2" 114 + * 115 + *--------------------------------- Frontend ---------------------------------- 116 + * 117 + * /local/domain/1/device/vdispl/0/backend-id = "0" 118 + * /local/domain/1/device/vdispl/0/backend = "/local/domain/0/backend/vdispl/1/0" 119 + * /local/domain/1/device/vdispl/0/state = "4" 120 + * /local/domain/1/device/vdispl/0/version = "1" 121 + * /local/domain/1/device/vdispl/0/be-alloc = "1" 122 + * 123 + *-------------------------- Connector 0 configuration ------------------------ 124 + * 125 + * /local/domain/1/device/vdispl/0/0/resolution = "1920x1080" 126 + * /local/domain/1/device/vdispl/0/0/req-ring-ref = "2832" 127 + * /local/domain/1/device/vdispl/0/0/req-event-channel = "15" 128 + * /local/domain/1/device/vdispl/0/0/evt-ring-ref = "387" 129 + * /local/domain/1/device/vdispl/0/0/evt-event-channel = "16" 130 + * 131 + *-------------------------- Connector 1 configuration ------------------------ 132 + * 133 + * /local/domain/1/device/vdispl/0/1/resolution = "800x600" 134 + * /local/domain/1/device/vdispl/0/1/req-ring-ref = "2833" 135 + * /local/domain/1/device/vdispl/0/1/req-event-channel = "17" 136 + * 
/local/domain/1/device/vdispl/0/1/evt-ring-ref = "388" 137 + * /local/domain/1/device/vdispl/0/1/evt-event-channel = "18" 138 + * 139 + ****************************************************************************** 140 + * Backend XenBus Nodes 141 + ****************************************************************************** 142 + * 143 + *----------------------------- Protocol version ------------------------------ 144 + * 145 + * versions 146 + * Values: <string> 147 + * 148 + * List of XENDISPL_LIST_SEPARATOR separated protocol versions supported 149 + * by the backend. For example "1,2,3". 150 + * 151 + ****************************************************************************** 152 + * Frontend XenBus Nodes 153 + ****************************************************************************** 154 + * 155 + *-------------------------------- Addressing --------------------------------- 156 + * 157 + * dom-id 158 + * Values: <uint16_t> 159 + * 160 + * Domain identifier. 161 + * 162 + * dev-id 163 + * Values: <uint16_t> 164 + * 165 + * Device identifier. 166 + * 167 + * conn-idx 168 + * Values: <uint8_t> 169 + * 170 + * Zero based contigous index of the connector. 171 + * /local/domain/<dom-id>/device/vdispl/<dev-id>/<conn-idx>/... 172 + * 173 + *----------------------------- Protocol version ------------------------------ 174 + * 175 + * version 176 + * Values: <string> 177 + * 178 + * Protocol version, chosen among the ones supported by the backend. 179 + * 180 + *------------------------- Backend buffer allocation ------------------------- 181 + * 182 + * be-alloc 183 + * Values: "0", "1" 184 + * 185 + * If value is set to "1", then backend can be a buffer provider/allocator 186 + * for this domain during XENDISPL_OP_DBUF_CREATE operation (see below 187 + * for negotiation). 188 + * If value is not "1" or omitted frontend must allocate buffers itself. 
189 + * 190 + *----------------------------- Connector settings ---------------------------- 191 + * 192 + * resolution 193 + * Values: <width, uint32_t>x<height, uint32_t> 194 + * 195 + * Width and height of the connector in pixels separated by 196 + * XENDISPL_RESOLUTION_SEPARATOR. This defines visible area of the 197 + * display. 198 + * 199 + *------------------ Connector Request Transport Parameters ------------------- 200 + * 201 + * This communication path is used to deliver requests from frontend to backend 202 + * and get the corresponding responses from backend to frontend, 203 + * set up per connector. 204 + * 205 + * req-event-channel 206 + * Values: <uint32_t> 207 + * 208 + * The identifier of the Xen connector's control event channel 209 + * used to signal activity in the ring buffer. 210 + * 211 + * req-ring-ref 212 + * Values: <uint32_t> 213 + * 214 + * The Xen grant reference granting permission for the backend to map 215 + * a sole page of connector's control ring buffer. 216 + * 217 + *------------------- Connector Event Transport Parameters -------------------- 218 + * 219 + * This communication path is used to deliver asynchronous events from backend 220 + * to frontend, set up per connector. 221 + * 222 + * evt-event-channel 223 + * Values: <uint32_t> 224 + * 225 + * The identifier of the Xen connector's event channel 226 + * used to signal activity in the ring buffer. 227 + * 228 + * evt-ring-ref 229 + * Values: <uint32_t> 230 + * 231 + * The Xen grant reference granting permission for the backend to map 232 + * a sole page of connector's event ring buffer. 233 + */ 234 + 235 + /* 236 + ****************************************************************************** 237 + * STATE DIAGRAMS 238 + ****************************************************************************** 239 + * 240 + * Tool stack creates front and back state nodes with initial state 241 + * XenbusStateInitialising. 
242 + * Tool stack creates and sets up frontend display configuration 243 + * nodes per domain. 244 + * 245 + *-------------------------------- Normal flow -------------------------------- 246 + * 247 + * Front Back 248 + * ================================= ===================================== 249 + * XenbusStateInitialising XenbusStateInitialising 250 + * o Query backend device identification 251 + * data. 252 + * o Open and validate backend device. 253 + * | 254 + * | 255 + * V 256 + * XenbusStateInitWait 257 + * 258 + * o Query frontend configuration 259 + * o Allocate and initialize 260 + * event channels per configured 261 + * connector. 262 + * o Publish transport parameters 263 + * that will be in effect during 264 + * this connection. 265 + * | 266 + * | 267 + * V 268 + * XenbusStateInitialised 269 + * 270 + * o Query frontend transport parameters. 271 + * o Connect to the event channels. 272 + * | 273 + * | 274 + * V 275 + * XenbusStateConnected 276 + * 277 + * o Create and initialize OS 278 + * virtual display connectors 279 + * as per configuration. 280 + * | 281 + * | 282 + * V 283 + * XenbusStateConnected 284 + * 285 + * XenbusStateUnknown 286 + * XenbusStateClosed 287 + * XenbusStateClosing 288 + * o Remove virtual display device 289 + * o Remove event channels 290 + * | 291 + * | 292 + * V 293 + * XenbusStateClosed 294 + * 295 + *------------------------------- Recovery flow ------------------------------- 296 + * 297 + * In case of frontend unrecoverable errors backend handles that as 298 + * if frontend goes into the XenbusStateClosed state. 299 + * 300 + * In case of backend unrecoverable errors frontend tries removing 301 + * the virtualized device. If this is possible at the moment of error, 302 + * then frontend goes into the XenbusStateInitialising state and is ready for 303 + * new connection with backend. 
If the virtualized device is still in use and 304 + * cannot be removed, then frontend goes into the XenbusStateReconfiguring state 305 + * until either the virtualized device is removed or backend initiates a new 306 + * connection. On the virtualized device removal frontend goes into the 307 + * XenbusStateInitialising state. 308 + * 309 + * Note on XenbusStateReconfiguring state of the frontend: if backend has 310 + * unrecoverable errors then frontend cannot send requests to the backend 311 + * and thus cannot provide functionality of the virtualized device anymore. 312 + * After backend is back to normal the virtualized device may still hold some 313 + * state: configuration in use, allocated buffers, client application state etc. 314 + * In most cases, this will require frontend to implement complex recovery 315 + * reconnect logic. Instead, by going into XenbusStateReconfiguring state, 316 + * frontend will make sure no new clients of the virtualized device are 317 + * accepted, allow existing client(s) to exit gracefully by signaling error 318 + * state etc. 319 + * Once all the clients are gone frontend can reinitialize the virtualized 320 + * device and get into XenbusStateInitialising state again signaling the 321 + * backend that a new connection can be made. 322 + * 323 + * There are multiple conditions possible under which frontend will go from 324 + * XenbusStateReconfiguring into XenbusStateInitialising, some of them are OS 325 + * specific. For example: 326 + * 1. The underlying OS framework may provide callbacks to signal that the last 327 + * client of the virtualized device has gone and the device can be removed 328 + * 2. Frontend can schedule a deferred work (timer/tasklet/workqueue) 329 + * to periodically check if this is the right time to re-try removal of 330 + * the virtualized device. 331 + * 3. By any other means. 
332 + * 333 + ****************************************************************************** 334 + * REQUEST CODES 335 + ****************************************************************************** 336 + * Request codes [0; 15] are reserved and must not be used 337 + */ 338 + 339 + #define XENDISPL_OP_DBUF_CREATE 0x10 340 + #define XENDISPL_OP_DBUF_DESTROY 0x11 341 + #define XENDISPL_OP_FB_ATTACH 0x12 342 + #define XENDISPL_OP_FB_DETACH 0x13 343 + #define XENDISPL_OP_SET_CONFIG 0x14 344 + #define XENDISPL_OP_PG_FLIP 0x15 345 + 346 + /* 347 + ****************************************************************************** 348 + * EVENT CODES 349 + ****************************************************************************** 350 + */ 351 + #define XENDISPL_EVT_PG_FLIP 0x00 352 + 353 + /* 354 + ****************************************************************************** 355 + * XENSTORE FIELD AND PATH NAME STRINGS, HELPERS 356 + ****************************************************************************** 357 + */ 358 + #define XENDISPL_DRIVER_NAME "vdispl" 359 + 360 + #define XENDISPL_LIST_SEPARATOR "," 361 + #define XENDISPL_RESOLUTION_SEPARATOR "x" 362 + 363 + #define XENDISPL_FIELD_BE_VERSIONS "versions" 364 + #define XENDISPL_FIELD_FE_VERSION "version" 365 + #define XENDISPL_FIELD_REQ_RING_REF "req-ring-ref" 366 + #define XENDISPL_FIELD_REQ_CHANNEL "req-event-channel" 367 + #define XENDISPL_FIELD_EVT_RING_REF "evt-ring-ref" 368 + #define XENDISPL_FIELD_EVT_CHANNEL "evt-event-channel" 369 + #define XENDISPL_FIELD_RESOLUTION "resolution" 370 + #define XENDISPL_FIELD_BE_ALLOC "be-alloc" 371 + 372 + /* 373 + ****************************************************************************** 374 + * STATUS RETURN CODES 375 + ****************************************************************************** 376 + * 377 + * Status return code is zero on success and -XEN_EXX on failure. 
378 + * 379 + ****************************************************************************** 380 + * Assumptions 381 + ****************************************************************************** 382 + * o usage of grant reference 0 as invalid grant reference: 383 + * grant reference 0 is valid, but never exposed to a PV driver, 384 + * because of the fact it is already in use/reserved by the PV console. 385 + * o all references in this document to page sizes must be treated 386 + * as pages of size XEN_PAGE_SIZE unless otherwise noted. 387 + * 388 + ****************************************************************************** 389 + * Description of the protocol between frontend and backend driver 390 + ****************************************************************************** 391 + * 392 + * The two halves of a Para-virtual display driver communicate with 393 + * each other using shared pages and event channels. 394 + * Shared page contains a ring with request/response packets. 395 + * 396 + * All reserved fields in the structures below must be 0. 397 + * Display buffer's cookie of value 0 is treated as invalid. 398 + * Framebuffer's cookie of value 0 is treated as invalid. 
399 + * 400 + * For all request/response/event packets that use cookies: 401 + * dbuf_cookie - uint64_t, unique to guest domain value used by the backend 402 + * to map remote display buffer to its local one 403 + * fb_cookie - uint64_t, unique to guest domain value used by the backend 404 + * to map remote framebuffer to its local one 405 + * 406 + *---------------------------------- Requests --------------------------------- 407 + * 408 + * All requests/responses, which are not connector specific, must be sent over 409 + * control ring of the connector which has the index value of 0: 410 + * /local/domain/<dom-id>/device/vdispl/<dev-id>/0/req-ring-ref 411 + * 412 + * All request packets have the same length (64 octets) 413 + * All request packets have common header: 414 + * 0 1 2 3 octet 415 + * +----------------+----------------+----------------+----------------+ 416 + * | id | operation | reserved | 4 417 + * +----------------+----------------+----------------+----------------+ 418 + * | reserved | 8 419 + * +----------------+----------------+----------------+----------------+ 420 + * id - uint16_t, private guest value, echoed in response 421 + * operation - uint8_t, operation code, XENDISPL_OP_??? 422 + * 423 + * Request dbuf creation - request creation of a display buffer. 
424 + * 0 1 2 3 octet 425 + * +----------------+----------------+----------------+----------------+ 426 + * | id |_OP_DBUF_CREATE | reserved | 4 427 + * +----------------+----------------+----------------+----------------+ 428 + * | reserved | 8 429 + * +----------------+----------------+----------------+----------------+ 430 + * | dbuf_cookie low 32-bit | 12 431 + * +----------------+----------------+----------------+----------------+ 432 + * | dbuf_cookie high 32-bit | 16 433 + * +----------------+----------------+----------------+----------------+ 434 + * | width | 20 435 + * +----------------+----------------+----------------+----------------+ 436 + * | height | 24 437 + * +----------------+----------------+----------------+----------------+ 438 + * | bpp | 28 439 + * +----------------+----------------+----------------+----------------+ 440 + * | buffer_sz | 32 441 + * +----------------+----------------+----------------+----------------+ 442 + * | flags | 36 443 + * +----------------+----------------+----------------+----------------+ 444 + * | gref_directory | 40 445 + * +----------------+----------------+----------------+----------------+ 446 + * | reserved | 44 447 + * +----------------+----------------+----------------+----------------+ 448 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 449 + * +----------------+----------------+----------------+----------------+ 450 + * | reserved | 64 451 + * +----------------+----------------+----------------+----------------+ 452 + * 453 + * Must be sent over control ring of the connector which has the index 454 + * value of 0: 455 + * /local/domain/<dom-id>/device/vdispl/<dev-id>/0/req-ring-ref 456 + * All unused bits in flags field must be set to 0. 457 + * 458 + * An attempt to create multiple display buffers with the same dbuf_cookie is 459 + * an error. dbuf_cookie can be re-used after destroying the corresponding 460 + * display buffer. 
461 + * 462 + * Width and height of the display buffers can be smaller, equal or bigger 463 + * than the connector's resolution. Depth/pixel format of the individual 464 + * buffers can differ as well. 465 + * 466 + * width - uint32_t, width in pixels 467 + * height - uint32_t, height in pixels 468 + * bpp - uint32_t, bits per pixel 469 + * buffer_sz - uint32_t, buffer size to be allocated, octets 470 + * flags - uint32_t, flags of the operation 471 + * o XENDISPL_DBUF_FLG_REQ_ALLOC - if set, then backend is requested 472 + * to allocate the buffer with the parameters provided in this request. 473 + * Page directory is handled as follows: 474 + * Frontend on request: 475 + * o allocates pages for the directory (gref_directory, 476 + * gref_dir_next_page(s) 477 + * o grants permissions for the pages of the directory to the backend 478 + * o sets gref_dir_next_page fields 479 + * Backend on response: 480 + * o grants permissions for the pages of the buffer allocated to 481 + * the frontend 482 + * o fills in page directory with grant references 483 + * (gref[] in struct xendispl_page_directory) 484 + * gref_directory - grant_ref_t, a reference to the first shared page 485 + * describing shared buffer references. At least one page exists. 
If shared 486 + * buffer size (buffer_sz) exceeds what can be addressed by this single page, 487 + * then reference to the next page must be supplied (see gref_dir_next_page 488 + * below) 489 + */ 490 + 491 + #define XENDISPL_DBUF_FLG_REQ_ALLOC (1 << 0) 492 + 493 + struct xendispl_dbuf_create_req { 494 + uint64_t dbuf_cookie; 495 + uint32_t width; 496 + uint32_t height; 497 + uint32_t bpp; 498 + uint32_t buffer_sz; 499 + uint32_t flags; 500 + grant_ref_t gref_directory; 501 + }; 502 + 503 + /* 504 + * Shared page for XENDISPL_OP_DBUF_CREATE buffer descriptor (gref_directory in 505 + * the request) employs a list of pages, describing all pages of the shared 506 + * data buffer: 507 + * 0 1 2 3 octet 508 + * +----------------+----------------+----------------+----------------+ 509 + * | gref_dir_next_page | 4 510 + * +----------------+----------------+----------------+----------------+ 511 + * | gref[0] | 8 512 + * +----------------+----------------+----------------+----------------+ 513 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 514 + * +----------------+----------------+----------------+----------------+ 515 + * | gref[i] | i*4+8 516 + * +----------------+----------------+----------------+----------------+ 517 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 518 + * +----------------+----------------+----------------+----------------+ 519 + * | gref[N - 1] | N*4+8 520 + * +----------------+----------------+----------------+----------------+ 521 + * 522 + * gref_dir_next_page - grant_ref_t, reference to the next page describing 523 + * page directory. Must be 0 if there are no more pages in the list. 
524 + * gref[i] - grant_ref_t, reference to a shared page of the buffer 525 + * allocated at XENDISPL_OP_DBUF_CREATE 526 + * 527 + * Number of grant_ref_t entries in the whole page directory is not 528 + * passed, but instead can be calculated as: 529 + * num_grefs_total = (XENDISPL_OP_DBUF_CREATE.buffer_sz + XEN_PAGE_SIZE - 1) / 530 + * XEN_PAGE_SIZE 531 + */ 532 + 533 + struct xendispl_page_directory { 534 + grant_ref_t gref_dir_next_page; 535 + grant_ref_t gref[1]; /* Variable length */ 536 + }; 537 + 538 + /* 539 + * Request dbuf destruction - destroy a previously allocated display buffer: 540 + * 0 1 2 3 octet 541 + * +----------------+----------------+----------------+----------------+ 542 + * | id |_OP_DBUF_DESTROY| reserved | 4 543 + * +----------------+----------------+----------------+----------------+ 544 + * | reserved | 8 545 + * +----------------+----------------+----------------+----------------+ 546 + * | dbuf_cookie low 32-bit | 12 547 + * +----------------+----------------+----------------+----------------+ 548 + * | dbuf_cookie high 32-bit | 16 549 + * +----------------+----------------+----------------+----------------+ 550 + * | reserved | 20 551 + * +----------------+----------------+----------------+----------------+ 552 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 553 + * +----------------+----------------+----------------+----------------+ 554 + * | reserved | 64 555 + * +----------------+----------------+----------------+----------------+ 556 + * 557 + * Must be sent over control ring of the connector which has the index 558 + * value of 0: 559 + * /local/domain/<dom-id>/device/vdispl/<dev-id>/0/req-ring-ref 560 + */ 561 + 562 + struct xendispl_dbuf_destroy_req { 563 + uint64_t dbuf_cookie; 564 + }; 565 + 566 + /* 567 + * Request framebuffer attachment - request attachment of a framebuffer to 568 + * previously created display buffer. 
569 + * 0 1 2 3 octet 570 + * +----------------+----------------+----------------+----------------+ 571 + * | id | _OP_FB_ATTACH | reserved | 4 572 + * +----------------+----------------+----------------+----------------+ 573 + * | reserved | 8 574 + * +----------------+----------------+----------------+----------------+ 575 + * | dbuf_cookie low 32-bit | 12 576 + * +----------------+----------------+----------------+----------------+ 577 + * | dbuf_cookie high 32-bit | 16 578 + * +----------------+----------------+----------------+----------------+ 579 + * | fb_cookie low 32-bit | 20 580 + * +----------------+----------------+----------------+----------------+ 581 + * | fb_cookie high 32-bit | 24 582 + * +----------------+----------------+----------------+----------------+ 583 + * | width | 28 584 + * +----------------+----------------+----------------+----------------+ 585 + * | height | 32 586 + * +----------------+----------------+----------------+----------------+ 587 + * | pixel_format | 36 588 + * +----------------+----------------+----------------+----------------+ 589 + * | reserved | 40 590 + * +----------------+----------------+----------------+----------------+ 591 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 592 + * +----------------+----------------+----------------+----------------+ 593 + * | reserved | 64 594 + * +----------------+----------------+----------------+----------------+ 595 + * 596 + * Must be sent over control ring of the connector which has the index 597 + * value of 0: 598 + * /local/domain/<dom-id>/device/vdispl/<dev-id>/0/req-ring-ref 599 + * Width and height can be smaller, equal or bigger than the connector's 600 + * resolution. 601 + * 602 + * An attempt to create multiple frame buffers with the same fb_cookie is 603 + * an error. fb_cookie can be re-used after destroying the corresponding 604 + * frame buffer. 
605 + * 606 + * width - uint32_t, width in pixels 607 + * height - uint32_t, height in pixels 608 + * pixel_format - uint32_t, pixel format of the framebuffer, FOURCC code 609 + */ 610 + 611 + struct xendispl_fb_attach_req { 612 + uint64_t dbuf_cookie; 613 + uint64_t fb_cookie; 614 + uint32_t width; 615 + uint32_t height; 616 + uint32_t pixel_format; 617 + }; 618 + 619 + /* 620 + * Request framebuffer detach - detach a previously 621 + * attached framebuffer from the display buffer in request: 622 + * 0 1 2 3 octet 623 + * +----------------+----------------+----------------+----------------+ 624 + * | id | _OP_FB_DETACH | reserved | 4 625 + * +----------------+----------------+----------------+----------------+ 626 + * | reserved | 8 627 + * +----------------+----------------+----------------+----------------+ 628 + * | fb_cookie low 32-bit | 12 629 + * +----------------+----------------+----------------+----------------+ 630 + * | fb_cookie high 32-bit | 16 631 + * +----------------+----------------+----------------+----------------+ 632 + * | reserved | 20 633 + * +----------------+----------------+----------------+----------------+ 634 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 635 + * +----------------+----------------+----------------+----------------+ 636 + * | reserved | 64 637 + * +----------------+----------------+----------------+----------------+ 638 + * 639 + * Must be sent over control ring of the connector which has the index 640 + * value of 0: 641 + * /local/domain/<dom-id>/device/vdispl/<dev-id>/0/req-ring-ref 642 + */ 643 + 644 + struct xendispl_fb_detach_req { 645 + uint64_t fb_cookie; 646 + }; 647 + 648 + /* 649 + * Request configuration set/reset - request to set or reset 650 + * the configuration/mode of the display: 651 + * 0 1 2 3 octet 652 + * +----------------+----------------+----------------+----------------+ 653 + * | id | _OP_SET_CONFIG | reserved | 4 654 + * 
+----------------+----------------+----------------+----------------+ 654 + * | reserved | 8 655 + * +----------------+----------------+----------------+----------------+ 656 + * | fb_cookie low 32-bit | 12 657 + * +----------------+----------------+----------------+----------------+ 658 + * | fb_cookie high 32-bit | 16 659 + * +----------------+----------------+----------------+----------------+ 660 + * | x | 20 661 + * +----------------+----------------+----------------+----------------+ 662 + * | y | 24 663 + * +----------------+----------------+----------------+----------------+ 664 + * | width | 28 665 + * +----------------+----------------+----------------+----------------+ 666 + * | height | 32 667 + * +----------------+----------------+----------------+----------------+ 668 + * | bpp | 36 669 + * +----------------+----------------+----------------+----------------+ 670 + * | reserved | 40 671 + * +----------------+----------------+----------------+----------------+ 672 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 673 + * +----------------+----------------+----------------+----------------+ 674 + * | reserved | 64 675 + * +----------------+----------------+----------------+----------------+ 676 + * 677 + * Pass all zeros to reset, otherwise command is treated as 678 + * configuration set. 679 + * Framebuffer's cookie defines which framebuffer/dbuf must be 680 + * displayed while enabling display (applying configuration). 681 + * x, y, width and height are bound by the connector's resolution and must not 682 + * exceed it. 
684 + * 685 + * x - uint32_t, starting position in pixels by X axis 686 + * y - uint32_t, starting position in pixels by Y axis 687 + * width - uint32_t, width in pixels 688 + * height - uint32_t, height in pixels 689 + * bpp - uint32_t, bits per pixel 690 + */ 691 + 692 + struct xendispl_set_config_req { 693 + uint64_t fb_cookie; 694 + uint32_t x; 695 + uint32_t y; 696 + uint32_t width; 697 + uint32_t height; 698 + uint32_t bpp; 699 + }; 700 + 701 + /* 702 + * Request page flip - request to flip a page identified by the framebuffer 703 + * cookie: 704 + * 0 1 2 3 octet 705 + * +----------------+----------------+----------------+----------------+ 706 + * | id | _OP_PG_FLIP | reserved | 4 707 + * +----------------+----------------+----------------+----------------+ 708 + * | reserved | 8 709 + * +----------------+----------------+----------------+----------------+ 710 + * | fb_cookie low 32-bit | 12 711 + * +----------------+----------------+----------------+----------------+ 712 + * | fb_cookie high 32-bit | 16 713 + * +----------------+----------------+----------------+----------------+ 714 + * | reserved | 20 715 + * +----------------+----------------+----------------+----------------+ 716 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 717 + * +----------------+----------------+----------------+----------------+ 718 + * | reserved | 64 719 + * +----------------+----------------+----------------+----------------+ 720 + */ 721 + 722 + struct xendispl_page_flip_req { 723 + uint64_t fb_cookie; 724 + }; 725 + 726 + /* 727 + *---------------------------------- Responses -------------------------------- 728 + * 729 + * All response packets have the same length (64 octets) 730 + * 731 + * All response packets have common header: 732 + * 0 1 2 3 octet 733 + * +----------------+----------------+----------------+----------------+ 734 + * | id | reserved | 4 735 + * +----------------+----------------+----------------+----------------+ 736 + * | 
status | 8 737 + * +----------------+----------------+----------------+----------------+ 738 + * | reserved | 12 739 + * +----------------+----------------+----------------+----------------+ 740 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 741 + * +----------------+----------------+----------------+----------------+ 742 + * | reserved | 64 743 + * +----------------+----------------+----------------+----------------+ 744 + * 745 + * id - uint16_t, private guest value, echoed from request 746 + * status - int32_t, response status, zero on success and -XEN_EXX on failure 747 + * 748 + *----------------------------------- Events ---------------------------------- 749 + * 750 + * Events are sent via a shared page allocated by the front and propagated by 751 + * evt-event-channel/evt-ring-ref XenStore entries 752 + * All event packets have the same length (64 octets) 753 + * All event packets have common header: 754 + * 0 1 2 3 octet 755 + * +----------------+----------------+----------------+----------------+ 756 + * | id | type | reserved | 4 757 + * +----------------+----------------+----------------+----------------+ 758 + * | reserved | 8 759 + * +----------------+----------------+----------------+----------------+ 760 + * 761 + * id - uint16_t, event id, may be used by front 762 + * type - uint8_t, type of the event 763 + * 764 + * 765 + * Page flip complete event - event from back to front on page flip completed: 766 + * 0 1 2 3 octet 767 + * +----------------+----------------+----------------+----------------+ 768 + * | id | _EVT_PG_FLIP | reserved | 4 769 + * +----------------+----------------+----------------+----------------+ 770 + * | reserved | 8 771 + * +----------------+----------------+----------------+----------------+ 772 + * | fb_cookie low 32-bit | 12 773 + * +----------------+----------------+----------------+----------------+ 774 + * | fb_cookie high 32-bit | 16 775 + * 
+----------------+----------------+----------------+----------------+ 776 + * | reserved | 20 777 + * +----------------+----------------+----------------+----------------+ 778 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 779 + * +----------------+----------------+----------------+----------------+ 780 + * | reserved | 64 781 + * +----------------+----------------+----------------+----------------+ 782 + */ 783 + 784 + struct xendispl_pg_flip_evt { 785 + uint64_t fb_cookie; 786 + }; 787 + 788 + struct xendispl_req { 789 + uint16_t id; 790 + uint8_t operation; 791 + uint8_t reserved[5]; 792 + union { 793 + struct xendispl_dbuf_create_req dbuf_create; 794 + struct xendispl_dbuf_destroy_req dbuf_destroy; 795 + struct xendispl_fb_attach_req fb_attach; 796 + struct xendispl_fb_detach_req fb_detach; 797 + struct xendispl_set_config_req set_config; 798 + struct xendispl_page_flip_req pg_flip; 799 + uint8_t reserved[56]; 800 + } op; 801 + }; 802 + 803 + struct xendispl_resp { 804 + uint16_t id; 805 + uint8_t operation; 806 + uint8_t reserved; 807 + int32_t status; 808 + uint8_t reserved1[56]; 809 + }; 810 + 811 + struct xendispl_evt { 812 + uint16_t id; 813 + uint8_t type; 814 + uint8_t reserved[5]; 815 + union { 816 + struct xendispl_pg_flip_evt pg_flip; 817 + uint8_t reserved[56]; 818 + } op; 819 + }; 820 + 821 + DEFINE_RING_TYPES(xen_displif, struct xendispl_req, struct xendispl_resp); 822 + 823 + /* 824 + ****************************************************************************** 825 + * Back to front events delivery 826 + ****************************************************************************** 827 + * In order to deliver asynchronous events from back to front a shared page is 828 + * allocated by front and its granted reference propagated to back via 829 + * XenStore entries (evt-ring-ref/evt-event-channel). 
830 + * This page has a common header used by both front and back to synchronize 831 + * access and control event's ring buffer, while back being a producer of the 832 + * events and front being a consumer. The rest of the page after the header 833 + * is used for event packets. 834 + * 835 + * Upon reception of an event(s) front may confirm its reception 836 + * for either each event, group of events or none. 837 + */ 838 + 839 + struct xendispl_event_page { 840 + uint32_t in_cons; 841 + uint32_t in_prod; 842 + uint8_t reserved[56]; 843 + }; 844 + 845 + #define XENDISPL_EVENT_PAGE_SIZE XEN_PAGE_SIZE 846 + #define XENDISPL_IN_RING_OFFS (sizeof(struct xendispl_event_page)) 847 + #define XENDISPL_IN_RING_SIZE (XENDISPL_EVENT_PAGE_SIZE - XENDISPL_IN_RING_OFFS) 848 + #define XENDISPL_IN_RING_LEN (XENDISPL_IN_RING_SIZE / sizeof(struct xendispl_evt)) 849 + #define XENDISPL_IN_RING(page) \ 850 + ((struct xendispl_evt *)((char *)(page) + XENDISPL_IN_RING_OFFS)) 851 + #define XENDISPL_IN_RING_REF(page, idx) \ 852 + (XENDISPL_IN_RING((page))[(idx) % XENDISPL_IN_RING_LEN]) 853 + 854 + #endif /* __XEN_PUBLIC_IO_DISPLIF_H__ */
+433 -29
include/xen/interface/io/kbdif.h
··· 26 26 #ifndef __XEN_PUBLIC_IO_KBDIF_H__ 27 27 #define __XEN_PUBLIC_IO_KBDIF_H__ 28 28 29 - /* In events (backend -> frontend) */ 29 + /* 30 + ***************************************************************************** 31 + * Feature and Parameter Negotiation 32 + ***************************************************************************** 33 + * 34 + * The two halves of a para-virtual driver utilize nodes within 35 + * XenStore to communicate capabilities and to negotiate operating parameters. 36 + * This section enumerates these nodes which reside in the respective front and 37 + * backend portions of XenStore, following XenBus convention. 38 + * 39 + * All data in XenStore is stored as strings. Nodes specifying numeric 40 + * values are encoded in decimal. Integer value ranges listed below are 41 + * expressed as fixed sized integer types capable of storing the conversion 42 + * of a properly formatted node string, without loss of information. 43 + * 44 + ***************************************************************************** 45 + * Backend XenBus Nodes 46 + ***************************************************************************** 47 + * 48 + *---------------------------- Features supported ---------------------------- 49 + * 50 + * Capable backend advertises supported features by publishing 51 + * corresponding entries in XenStore and puts 1 as the value of the entry. 52 + * If a feature is not supported then 0 must be set or feature entry omitted. 53 + * 54 + * feature-abs-pointer 55 + * Values: <uint> 56 + * 57 + * Backends, which support reporting of absolute coordinates for pointer 58 + * device should set this to 1. 59 + * 60 + * feature-multi-touch 61 + * Values: <uint> 62 + * 63 + * Backends, which support reporting of multi-touch events 64 + * should set this to 1. 
65 + * 66 + *------------------------- Pointer Device Parameters ------------------------ 67 + * 68 + * width 69 + * Values: <uint> 70 + * 71 + * Maximum X coordinate (width) to be used by the frontend 72 + * while reporting input events, pixels, [0; UINT32_MAX]. 73 + * 74 + * height 75 + * Values: <uint> 76 + * 77 + * Maximum Y coordinate (height) to be used by the frontend 78 + * while reporting input events, pixels, [0; UINT32_MAX]. 79 + * 80 + ***************************************************************************** 81 + * Frontend XenBus Nodes 82 + ***************************************************************************** 83 + * 84 + *------------------------------ Feature request ----------------------------- 85 + * 86 + * Capable frontend requests features from backend via setting corresponding 87 + * entries to 1 in XenStore. Requests for features not advertised as supported 88 + * by the backend have no effect. 89 + * 90 + * request-abs-pointer 91 + * Values: <uint> 92 + * 93 + * Request backend to report absolute pointer coordinates 94 + * (XENKBD_TYPE_POS) instead of relative ones (XENKBD_TYPE_MOTION). 95 + * 96 + * request-multi-touch 97 + * Values: <uint> 98 + * 99 + * Request backend to report multi-touch events. 100 + * 101 + *----------------------- Request Transport Parameters ----------------------- 102 + * 103 + * event-channel 104 + * Values: <uint> 105 + * 106 + * The identifier of the Xen event channel used to signal activity 107 + * in the ring buffer. 108 + * 109 + * page-gref 110 + * Values: <uint> 111 + * 112 + * The Xen grant reference granting permission for the backend to map 113 + * a sole page in a single page sized event ring buffer. 114 + * 115 + * page-ref 116 + * Values: <uint> 117 + * 118 + * OBSOLETE, not recommended for use. 119 + * PFN of the shared page. 
120 + * 121 + *----------------------- Multi-touch Device Parameters ----------------------- 122 + * 123 + * multi-touch-num-contacts 124 + * Values: <uint> 125 + * 126 + * Number of simultaneous touches reported. 127 + * 128 + * multi-touch-width 129 + * Values: <uint> 130 + * 131 + * Width of the touch area to be used by the frontend 132 + * while reporting input events, pixels, [0; UINT32_MAX]. 133 + * 134 + * multi-touch-height 135 + * Values: <uint> 136 + * 137 + * Height of the touch area to be used by the frontend 138 + * while reporting input events, pixels, [0; UINT32_MAX]. 139 + */ 30 140 31 141 /* 142 + * EVENT CODES. 143 + */ 144 + 145 + #define XENKBD_TYPE_MOTION 1 146 + #define XENKBD_TYPE_RESERVED 2 147 + #define XENKBD_TYPE_KEY 3 148 + #define XENKBD_TYPE_POS 4 149 + #define XENKBD_TYPE_MTOUCH 5 150 + 151 + /* Multi-touch event sub-codes */ 152 + 153 + #define XENKBD_MT_EV_DOWN 0 154 + #define XENKBD_MT_EV_UP 1 155 + #define XENKBD_MT_EV_MOTION 2 156 + #define XENKBD_MT_EV_SYN 3 157 + #define XENKBD_MT_EV_SHAPE 4 158 + #define XENKBD_MT_EV_ORIENT 5 159 + 160 + /* 161 + * CONSTANTS, XENSTORE FIELD AND PATH NAME STRINGS, HELPERS. 
162 + */ 163 + 164 + #define XENKBD_DRIVER_NAME "vkbd" 165 + 166 + #define XENKBD_FIELD_FEAT_ABS_POINTER "feature-abs-pointer" 167 + #define XENKBD_FIELD_FEAT_MTOUCH "feature-multi-touch" 168 + #define XENKBD_FIELD_REQ_ABS_POINTER "request-abs-pointer" 169 + #define XENKBD_FIELD_REQ_MTOUCH "request-multi-touch" 170 + #define XENKBD_FIELD_RING_GREF "page-gref" 171 + #define XENKBD_FIELD_EVT_CHANNEL "event-channel" 172 + #define XENKBD_FIELD_WIDTH "width" 173 + #define XENKBD_FIELD_HEIGHT "height" 174 + #define XENKBD_FIELD_MT_WIDTH "multi-touch-width" 175 + #define XENKBD_FIELD_MT_HEIGHT "multi-touch-height" 176 + #define XENKBD_FIELD_MT_NUM_CONTACTS "multi-touch-num-contacts" 177 + 178 + /* OBSOLETE, not recommended for use */ 179 + #define XENKBD_FIELD_RING_REF "page-ref" 180 + 181 + /* 182 + ***************************************************************************** 183 + * Description of the protocol between frontend and backend driver. 184 + ***************************************************************************** 185 + * 186 + * The two halves of a Para-virtual driver communicate with 187 + * each other using a shared page and an event channel. 188 + * Shared page contains a ring with event structures. 189 + * 190 + * All reserved fields in the structures below must be 0. 191 + * 192 + ***************************************************************************** 193 + * Backend to frontend events 194 + ***************************************************************************** 195 + * 32 196 * Frontends should ignore unknown in events. 197 + * All event packets have the same length (40 octets) 198 + * All event packets have common header: 199 + * 200 + * 0 octet 201 + * +-----------------+ 202 + * | type | 203 + * +-----------------+ 204 + * type - uint8_t, event code, XENKBD_TYPE_??? 
205 + * 206 + * 207 + * Pointer relative movement event 208 + * 0 1 2 3 octet 209 + * +----------------+----------------+----------------+----------------+ 210 + * | _TYPE_MOTION | reserved | 4 211 + * +----------------+----------------+----------------+----------------+ 212 + * | rel_x | 8 213 + * +----------------+----------------+----------------+----------------+ 214 + * | rel_y | 12 215 + * +----------------+----------------+----------------+----------------+ 216 + * | rel_z | 16 217 + * +----------------+----------------+----------------+----------------+ 218 + * | reserved | 20 219 + * +----------------+----------------+----------------+----------------+ 220 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 221 + * +----------------+----------------+----------------+----------------+ 222 + * | reserved | 40 223 + * +----------------+----------------+----------------+----------------+ 224 + * 225 + * rel_x - int32_t, relative X motion 226 + * rel_y - int32_t, relative Y motion 227 + * rel_z - int32_t, relative Z motion (wheel) 33 228 */ 34 - 35 - /* Pointer movement event */ 36 - #define XENKBD_TYPE_MOTION 1 37 - /* Event type 2 currently not used */ 38 - /* Key event (includes pointer buttons) */ 39 - #define XENKBD_TYPE_KEY 3 40 - /* 41 - * Pointer position event 42 - * Capable backend sets feature-abs-pointer in xenstore. 43 - * Frontend requests ot instead of XENKBD_TYPE_MOTION by setting 44 - * request-abs-update in xenstore. 
45 - */ 46 - #define XENKBD_TYPE_POS 4 47 229 48 230 struct xenkbd_motion { 49 - uint8_t type; /* XENKBD_TYPE_MOTION */ 50 - int32_t rel_x; /* relative X motion */ 51 - int32_t rel_y; /* relative Y motion */ 52 - int32_t rel_z; /* relative Z motion (wheel) */ 231 + uint8_t type; 232 + int32_t rel_x; 233 + int32_t rel_y; 234 + int32_t rel_z; 53 235 }; 236 + 237 + /* 238 + * Key event (includes pointer buttons) 239 + * 0 1 2 3 octet 240 + * +----------------+----------------+----------------+----------------+ 241 + * | _TYPE_KEY | pressed | reserved | 4 242 + * +----------------+----------------+----------------+----------------+ 243 + * | keycode | 8 244 + * +----------------+----------------+----------------+----------------+ 245 + * | reserved | 12 246 + * +----------------+----------------+----------------+----------------+ 247 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 248 + * +----------------+----------------+----------------+----------------+ 249 + * | reserved | 40 250 + * +----------------+----------------+----------------+----------------+ 251 + * 252 + * pressed - uint8_t, 1 if pressed; 0 otherwise 253 + * keycode - uint32_t, KEY_* from linux/input.h 254 + */ 54 255 55 256 struct xenkbd_key { 56 - uint8_t type; /* XENKBD_TYPE_KEY */ 57 - uint8_t pressed; /* 1 if pressed; 0 otherwise */ 58 - uint32_t keycode; /* KEY_* from linux/input.h */ 257 + uint8_t type; 258 + uint8_t pressed; 259 + uint32_t keycode; 59 260 }; 60 261 262 + /* 263 + * Pointer absolute position event 264 + * 0 1 2 3 octet 265 + * +----------------+----------------+----------------+----------------+ 266 + * | _TYPE_POS | reserved | 4 267 + * +----------------+----------------+----------------+----------------+ 268 + * | abs_x | 8 269 + * +----------------+----------------+----------------+----------------+ 270 + * | abs_y | 12 271 + * +----------------+----------------+----------------+----------------+ 272 + * | rel_z | 16 273 + * 
+----------------+----------------+----------------+----------------+ 352 + * | reserved | 8 353 + * +----------------+----------------+----------------+----------------+ 354 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 355 + * +----------------+----------------+----------------+----------------+ 356 + * | reserved | 40 357 + * +----------------+----------------+----------------+----------------+ 358 + * 359 + * Multi-touch motion event 360 + * 0 1 2 3 octet 361 + * +----------------+----------------+----------------+----------------+ 362 + * | _TYPE_MTOUCH | _MT_EV_MOTION | contact_id | reserved | 4 363 + * +----------------+----------------+----------------+----------------+ 364 + * | reserved | 8 365 + * +----------------+----------------+----------------+----------------+ 366 + * | abs_x | 12 367 + * +----------------+----------------+----------------+----------------+ 368 + * | abs_y | 16 369 + * +----------------+----------------+----------------+----------------+ 370 + * | reserved | 20 371 + * +----------------+----------------+----------------+----------------+ 372 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 373 + * +----------------+----------------+----------------+----------------+ 374 + * | reserved | 40 375 + * +----------------+----------------+----------------+----------------+ 376 + * 377 + * abs_x - int32_t, absolute X position, in pixels, 378 + * abs_y - int32_t, absolute Y position, in pixels, 379 + * 380 + * Multi-touch input synchronization event - shows end of a set of events 381 + * which logically belong together.
313 + * Contact ID may be reused after XENKBD_MT_EV_UP event and 314 + * is in the [0; XENKBD_FIELD_NUM_CONTACTS - 1] range. 315 + * 316 + * For further information please refer to documentation on Wayland [1], 317 + * Linux [2] and Windows [3] multi-touch support. 318 + * 319 + * [1] https://cgit.freedesktop.org/wayland/wayland/tree/protocol/wayland.xml 320 + * [2] https://www.kernel.org/doc/Documentation/input/multi-touch-protocol.txt 321 + * [3] https://msdn.microsoft.com/en-us/library/jj151564(v=vs.85).aspx 322 + * 323 + * 324 + * Multi-touch down event - sent when a new touch is made: touch is assigned 325 + * a unique contact ID, sent with this and consequent events related 326 + * to this touch. 327 + * 0 1 2 3 octet 328 + * +----------------+----------------+----------------+----------------+ 329 + * | _TYPE_MTOUCH | _MT_EV_DOWN | contact_id | reserved | 4 330 + * +----------------+----------------+----------------+----------------+ 331 + * | reserved | 8 332 + * +----------------+----------------+----------------+----------------+ 333 + * | abs_x | 12 334 + * +----------------+----------------+----------------+----------------+ 335 + * | abs_y | 16 336 + * +----------------+----------------+----------------+----------------+ 337 + * | reserved | 20 338 + * +----------------+----------------+----------------+----------------+ 339 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 340 + * +----------------+----------------+----------------+----------------+ 341 + * | reserved | 40 342 + * +----------------+----------------+----------------+----------------+ 343 + * 344 + * abs_x - int32_t, absolute X position, in pixels 345 + * abs_y - int32_t, absolute Y position, in pixels 346 + * 347 + * Multi-touch contact release event 348 + * 0 1 2 3 octet 349 + * +----------------+----------------+----------------+----------------+ 350 + * | _TYPE_MTOUCH | _MT_EV_UP | contact_id | reserved | 4 351 + * 
+----------------+----------------+----------------+----------------+ 352 + * | reserved | 8 353 + * +----------------+----------------+----------------+----------------+ 354 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 355 + * +----------------+----------------+----------------+----------------+ 356 + * | reserved | 40 357 + * +----------------+----------------+----------------+----------------+ 358 + * 359 + * Multi-touch motion event 360 + * 0 1 2 3 octet 361 + * +----------------+----------------+----------------+----------------+ 362 + * | _TYPE_MTOUCH | _MT_EV_MOTION | contact_id | reserved | 4 363 + * +----------------+----------------+----------------+----------------+ 364 + * | reserved | 8 365 + * +----------------+----------------+----------------+----------------+ 366 + * | abs_x | 12 367 + * +----------------+----------------+----------------+----------------+ 368 + * | abs_y | 16 369 + * +----------------+----------------+----------------+----------------+ 370 + * | reserved | 20 371 + * +----------------+----------------+----------------+----------------+ 372 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 373 + * +----------------+----------------+----------------+----------------+ 374 + * | reserved | 40 375 + * +----------------+----------------+----------------+----------------+ 376 + * 377 + * abs_x - int32_t, absolute X position, in pixels, 378 + * abs_y - int32_t, absolute Y position, in pixels, 379 + * 380 + * Multi-touch input synchronization event - shows end of a set of events 381 + * which logically belong together. 
382 + * 0 1 2 3 octet 383 + * +----------------+----------------+----------------+----------------+ 384 + * | _TYPE_MTOUCH | _MT_EV_SYN | contact_id | reserved | 4 385 + * +----------------+----------------+----------------+----------------+ 386 + * | reserved | 8 387 + * +----------------+----------------+----------------+----------------+ 388 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 389 + * +----------------+----------------+----------------+----------------+ 390 + * | reserved | 40 391 + * +----------------+----------------+----------------+----------------+ 392 + * 393 + * Multi-touch shape event - touch point's shape has changed. 394 + * Shape is approximated by an ellipse through the major and minor axis 395 + * lengths: major is the longer diameter of the ellipse and minor is the 396 + * shorter one. Center of the ellipse is reported via 397 + * XENKBD_MT_EV_DOWN/XENKBD_MT_EV_MOTION events. 398 + * 0 1 2 3 octet 399 + * +----------------+----------------+----------------+----------------+ 400 + * | _TYPE_MTOUCH | _MT_EV_SHAPE | contact_id | reserved | 4 401 + * +----------------+----------------+----------------+----------------+ 402 + * | reserved | 8 403 + * +----------------+----------------+----------------+----------------+ 404 + * | major | 12 405 + * +----------------+----------------+----------------+----------------+ 406 + * | minor | 16 407 + * +----------------+----------------+----------------+----------------+ 408 + * | reserved | 20 409 + * +----------------+----------------+----------------+----------------+ 410 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 411 + * +----------------+----------------+----------------+----------------+ 412 + * | reserved | 40 413 + * +----------------+----------------+----------------+----------------+ 414 + * 415 + * major - uint32_t, length of the major axis, pixels 416 + * minor - uint32_t, length of the minor axis, pixels 417 + * 418 + * 
Multi-touch orientation event - touch point's shape has changed 419 + * its orientation: calculated as a clockwise angle between the major axis 420 + * of the ellipse and positive Y axis in degrees, [-180; +180]. 421 + * 0 1 2 3 octet 422 + * +----------------+----------------+----------------+----------------+ 423 + * | _TYPE_MTOUCH | _MT_EV_ORIENT | contact_id | reserved | 4 424 + * +----------------+----------------+----------------+----------------+ 425 + * | reserved | 8 426 + * +----------------+----------------+----------------+----------------+ 427 + * | orientation | reserved | 12 428 + * +----------------+----------------+----------------+----------------+ 429 + * | reserved | 16 430 + * +----------------+----------------+----------------+----------------+ 431 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 432 + * +----------------+----------------+----------------+----------------+ 433 + * | reserved | 40 434 + * +----------------+----------------+----------------+----------------+ 435 + * 436 + * orientation - int16_t, clockwise angle of the major axis 437 + */ 438 + 439 + struct xenkbd_mtouch { 440 + uint8_t type; /* XENKBD_TYPE_MTOUCH */ 441 + uint8_t event_type; /* XENKBD_MT_EV_??? 
*/ 442 + uint8_t contact_id; 443 + uint8_t reserved[5]; /* reserved for the future use */ 444 + union { 445 + struct { 446 + int32_t abs_x; /* absolute X position, pixels */ 447 + int32_t abs_y; /* absolute Y position, pixels */ 448 + } pos; 449 + struct { 450 + uint32_t major; /* length of the major axis, pixels */ 451 + uint32_t minor; /* length of the minor axis, pixels */ 452 + } shape; 453 + int16_t orientation; /* clockwise angle of the major axis */ 454 + } u; 66 455 }; 67 456 68 457 #define XENKBD_IN_EVENT_SIZE 40 ··· 461 72 struct xenkbd_motion motion; 462 73 struct xenkbd_key key; 463 74 struct xenkbd_position pos; 75 + struct xenkbd_mtouch mtouch; 464 76 char pad[XENKBD_IN_EVENT_SIZE]; 465 77 }; 466 78 467 - /* Out events (frontend -> backend) */ 468 - 469 79 /* 80 + ***************************************************************************** 81 + * Frontend to backend events 82 + ***************************************************************************** 83 + * 470 84 * Out events may be sent only when requested by backend, and receipt 471 85 * of an unknown out event is an error. 472 86 * No out events currently defined. 87 + 88 + * All event packets have the same length (40 octets) 89 + * All event packets have common header: 90 + * 0 octet 91 + * +-----------------+ 92 + * | type | 93 + * +-----------------+ 94 + * type - uint8_t, event code 473 95 */ 474 96 475 97 #define XENKBD_OUT_EVENT_SIZE 40 ··· 490 90 char pad[XENKBD_OUT_EVENT_SIZE]; 491 91 }; 492 92 493 - /* shared page */ 93 + /* 94 + ***************************************************************************** 95 + * Shared page 96 + ***************************************************************************** 97 + */ 494 98 495 99 #define XENKBD_IN_RING_SIZE 2048 496 100 #define XENKBD_IN_RING_LEN (XENKBD_IN_RING_SIZE / XENKBD_IN_EVENT_SIZE) ··· 517 113 uint32_t out_cons, out_prod; 518 114 }; 519 115 520 - #endif 116 + #endif /* __XEN_PUBLIC_IO_KBDIF_H__ */
+143
include/xen/interface/io/ring.h
··· 283 283 (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \ 284 284 } while (0) 285 285 286 + 287 + /* 288 + * DEFINE_XEN_FLEX_RING_AND_INTF defines two monodirectional rings and 289 + * functions to check if there is data on the ring, and to read and 290 + * write to them. 291 + * 292 + * DEFINE_XEN_FLEX_RING is similar to DEFINE_XEN_FLEX_RING_AND_INTF, but 293 + * does not define the indexes page. As different protocols can have 294 + * extensions to the basic format, this macro allow them to define their 295 + * own struct. 296 + * 297 + * XEN_FLEX_RING_SIZE 298 + * Convenience macro to calculate the size of one of the two rings 299 + * from the overall order. 300 + * 301 + * $NAME_mask 302 + * Function to apply the size mask to an index, to reduce the index 303 + * within the range [0-size]. 304 + * 305 + * $NAME_read_packet 306 + * Function to read data from the ring. The amount of data to read is 307 + * specified by the "size" argument. 308 + * 309 + * $NAME_write_packet 310 + * Function to write data to the ring. The amount of data to write is 311 + * specified by the "size" argument. 312 + * 313 + * $NAME_get_ring_ptr 314 + * Convenience function that returns a pointer to read/write to the 315 + * ring at the right location. 316 + * 317 + * $NAME_data_intf 318 + * Indexes page, shared between frontend and backend. It also 319 + * contains the array of grant refs. 320 + * 321 + * $NAME_queued 322 + * Function to calculate how many bytes are currently on the ring, 323 + * ready to be read. It can also be used to calculate how much free 324 + * space is currently on the ring (XEN_FLEX_RING_SIZE() - 325 + * $NAME_queued()). 326 + */ 327 + 328 + #ifndef XEN_PAGE_SHIFT 329 + /* The PAGE_SIZE for ring protocols and hypercall interfaces is always 330 + * 4K, regardless of the architecture, and page granularity chosen by 331 + * operating systems. 
332 + */ 333 + #define XEN_PAGE_SHIFT 12 334 + #endif 335 + #define XEN_FLEX_RING_SIZE(order) \ 336 + (1UL << ((order) + XEN_PAGE_SHIFT - 1)) 337 + 338 + #define DEFINE_XEN_FLEX_RING(name) \ 339 + static inline RING_IDX name##_mask(RING_IDX idx, RING_IDX ring_size) \ 340 + { \ 341 + return idx & (ring_size - 1); \ 342 + } \ 343 + \ 344 + static inline unsigned char *name##_get_ring_ptr(unsigned char *buf, \ 345 + RING_IDX idx, \ 346 + RING_IDX ring_size) \ 347 + { \ 348 + return buf + name##_mask(idx, ring_size); \ 349 + } \ 350 + \ 351 + static inline void name##_read_packet(void *opaque, \ 352 + const unsigned char *buf, \ 353 + size_t size, \ 354 + RING_IDX masked_prod, \ 355 + RING_IDX *masked_cons, \ 356 + RING_IDX ring_size) \ 357 + { \ 358 + if (*masked_cons < masked_prod || \ 359 + size <= ring_size - *masked_cons) { \ 360 + memcpy(opaque, buf + *masked_cons, size); \ 361 + } else { \ 362 + memcpy(opaque, buf + *masked_cons, ring_size - *masked_cons); \ 363 + memcpy((unsigned char *)opaque + ring_size - *masked_cons, buf, \ 364 + size - (ring_size - *masked_cons)); \ 365 + } \ 366 + *masked_cons = name##_mask(*masked_cons + size, ring_size); \ 367 + } \ 368 + \ 369 + static inline void name##_write_packet(unsigned char *buf, \ 370 + const void *opaque, \ 371 + size_t size, \ 372 + RING_IDX *masked_prod, \ 373 + RING_IDX masked_cons, \ 374 + RING_IDX ring_size) \ 375 + { \ 376 + if (*masked_prod < masked_cons || \ 377 + size <= ring_size - *masked_prod) { \ 378 + memcpy(buf + *masked_prod, opaque, size); \ 379 + } else { \ 380 + memcpy(buf + *masked_prod, opaque, ring_size - *masked_prod); \ 381 + memcpy(buf, (unsigned char *)opaque + (ring_size - *masked_prod), \ 382 + size - (ring_size - *masked_prod)); \ 383 + } \ 384 + *masked_prod = name##_mask(*masked_prod + size, ring_size); \ 385 + } \ 386 + \ 387 + static inline RING_IDX name##_queued(RING_IDX prod, \ 388 + RING_IDX cons, \ 389 + RING_IDX ring_size) \ 390 + { \ 391 + RING_IDX size; \ 392 + \ 393 + 
if (prod == cons) \ 394 + return 0; \ 395 + \ 396 + prod = name##_mask(prod, ring_size); \ 397 + cons = name##_mask(cons, ring_size); \ 398 + \ 399 + if (prod == cons) \ 400 + return ring_size; \ 401 + \ 402 + if (prod > cons) \ 403 + size = prod - cons; \ 404 + else \ 405 + size = ring_size - (cons - prod); \ 406 + return size; \ 407 + } \ 408 + \ 409 + struct name##_data { \ 410 + unsigned char *in; /* half of the allocation */ \ 411 + unsigned char *out; /* half of the allocation */ \ 412 + } 413 + 414 + #define DEFINE_XEN_FLEX_RING_AND_INTF(name) \ 415 + struct name##_data_intf { \ 416 + RING_IDX in_cons, in_prod; \ 417 + \ 418 + uint8_t pad1[56]; \ 419 + \ 420 + RING_IDX out_cons, out_prod; \ 421 + \ 422 + uint8_t pad2[56]; \ 423 + \ 424 + RING_IDX ring_order; \ 425 + grant_ref_t ref[]; \ 426 + }; \ 427 + DEFINE_XEN_FLEX_RING(name) 428 + 286 429 #endif /* __XEN_PUBLIC_IO_RING_H__ */
+793
include/xen/interface/io/sndif.h
··· 1 + /****************************************************************************** 2 + * sndif.h 3 + * 4 + * Unified sound-device I/O interface for Xen guest OSes. 5 + * 6 + * Permission is hereby granted, free of charge, to any person obtaining a copy 7 + * of this software and associated documentation files (the "Software"), to 8 + * deal in the Software without restriction, including without limitation the 9 + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 10 + * sell copies of the Software, and to permit persons to whom the Software is 11 + * furnished to do so, subject to the following conditions: 12 + * 13 + * The above copyright notice and this permission notice shall be included in 14 + * all copies or substantial portions of the Software. 15 + * 16 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 + * DEALINGS IN THE SOFTWARE. 23 + * 24 + * Copyright (C) 2013-2015 GlobalLogic Inc. 25 + * Copyright (C) 2016-2017 EPAM Systems Inc. 
26 + * 27 + * Authors: Oleksandr Andrushchenko <oleksandr_andrushchenko@epam.com> 28 + * Oleksandr Grytsov <oleksandr_grytsov@epam.com> 29 + * Oleksandr Dmytryshyn <oleksandr.dmytryshyn@globallogic.com> 30 + * Iurii Konovalenko <iurii.konovalenko@globallogic.com> 31 + */ 32 + 33 + #ifndef __XEN_PUBLIC_IO_SNDIF_H__ 34 + #define __XEN_PUBLIC_IO_SNDIF_H__ 35 + 36 + #include "ring.h" 37 + #include "../grant_table.h" 38 + 39 + /* 40 + ****************************************************************************** 41 + * Feature and Parameter Negotiation 42 + ****************************************************************************** 43 + * 44 + * Front->back notifications: when enqueuing a new request, sending a 45 + * notification can be made conditional on xensnd_req (i.e., the generic 46 + * hold-off mechanism provided by the ring macros). Backends must set 47 + * xensnd_req appropriately (e.g., using RING_FINAL_CHECK_FOR_REQUESTS()). 48 + * 49 + * Back->front notifications: when enqueuing a new response, sending a 50 + * notification can be made conditional on xensnd_resp (i.e., the generic 51 + * hold-off mechanism provided by the ring macros). Frontends must set 52 + * xensnd_resp appropriately (e.g., using RING_FINAL_CHECK_FOR_RESPONSES()). 53 + * 54 + * The two halves of a para-virtual sound card driver utilize nodes within 55 + * XenStore to communicate capabilities and to negotiate operating parameters. 56 + * This section enumerates these nodes which reside in the respective front and 57 + * backend portions of XenStore, following the XenBus convention. 58 + * 59 + * All data in XenStore is stored as strings. Nodes specifying numeric 60 + * values are encoded in decimal. Integer value ranges listed below are 61 + * expressed as fixed sized integer types capable of storing the conversion 62 + * of a properly formatted node string, without loss of information. 
63 + * 64 + ****************************************************************************** 65 + * Example configuration 66 + ****************************************************************************** 67 + * 68 + * Note: depending on the use-case backend can expose more sound cards and 69 + * PCM devices/streams than the underlying HW physically has by employing 70 + * SW mixers, configuring virtual sound streams, channels etc. 71 + * 72 + * This is an example of backend and frontend configuration: 73 + * 74 + *--------------------------------- Backend ----------------------------------- 75 + * 76 + * /local/domain/0/backend/vsnd/1/0/frontend-id = "1" 77 + * /local/domain/0/backend/vsnd/1/0/frontend = "/local/domain/1/device/vsnd/0" 78 + * /local/domain/0/backend/vsnd/1/0/state = "4" 79 + * /local/domain/0/backend/vsnd/1/0/versions = "1,2" 80 + * 81 + *--------------------------------- Frontend ---------------------------------- 82 + * 83 + * /local/domain/1/device/vsnd/0/backend-id = "0" 84 + * /local/domain/1/device/vsnd/0/backend = "/local/domain/0/backend/vsnd/1/0" 85 + * /local/domain/1/device/vsnd/0/state = "4" 86 + * /local/domain/1/device/vsnd/0/version = "1" 87 + * 88 + *----------------------------- Card configuration ---------------------------- 89 + * 90 + * /local/domain/1/device/vsnd/0/short-name = "Card short name" 91 + * /local/domain/1/device/vsnd/0/long-name = "Card long name" 92 + * /local/domain/1/device/vsnd/0/sample-rates = "8000,32000,44100,48000,96000" 93 + * /local/domain/1/device/vsnd/0/sample-formats = "s8,u8,s16_le,s16_be" 94 + * /local/domain/1/device/vsnd/0/buffer-size = "262144" 95 + * 96 + *------------------------------- PCM device 0 -------------------------------- 97 + * 98 + * /local/domain/1/device/vsnd/0/0/name = "General analog" 99 + * /local/domain/1/device/vsnd/0/0/channels-max = "5" 100 + * 101 + *----------------------------- Stream 0, playback ---------------------------- 102 + * 103 + * 
/local/domain/1/device/vsnd/0/0/0/type = "p" 104 + * /local/domain/1/device/vsnd/0/0/0/sample-formats = "s8,u8" 105 + * /local/domain/1/device/vsnd/0/0/0/unique-id = "0" 106 + * 107 + * /local/domain/1/device/vsnd/0/0/0/ring-ref = "386" 108 + * /local/domain/1/device/vsnd/0/0/0/event-channel = "15" 109 + * 110 + *------------------------------ Stream 1, capture ---------------------------- 111 + * 112 + * /local/domain/1/device/vsnd/0/0/1/type = "c" 113 + * /local/domain/1/device/vsnd/0/0/1/channels-max = "2" 114 + * /local/domain/1/device/vsnd/0/0/1/unique-id = "1" 115 + * 116 + * /local/domain/1/device/vsnd/0/0/1/ring-ref = "384" 117 + * /local/domain/1/device/vsnd/0/0/1/event-channel = "13" 118 + * 119 + *------------------------------- PCM device 1 -------------------------------- 120 + * 121 + * /local/domain/1/device/vsnd/0/1/name = "HDMI-0" 122 + * /local/domain/1/device/vsnd/0/1/sample-rates = "8000,32000,44100" 123 + * 124 + *------------------------------ Stream 0, capture ---------------------------- 125 + * 126 + * /local/domain/1/device/vsnd/0/1/0/type = "c" 127 + * /local/domain/1/device/vsnd/0/1/0/unique-id = "2" 128 + * 129 + * /local/domain/1/device/vsnd/0/1/0/ring-ref = "387" 130 + * /local/domain/1/device/vsnd/0/1/0/event-channel = "151" 131 + * 132 + *------------------------------- PCM device 2 -------------------------------- 133 + * 134 + * /local/domain/1/device/vsnd/0/2/name = "SPDIF" 135 + * 136 + *----------------------------- Stream 0, playback ---------------------------- 137 + * 138 + * /local/domain/1/device/vsnd/0/2/0/type = "p" 139 + * /local/domain/1/device/vsnd/0/2/0/unique-id = "3" 140 + * 141 + * /local/domain/1/device/vsnd/0/2/0/ring-ref = "389" 142 + * /local/domain/1/device/vsnd/0/2/0/event-channel = "152" 143 + * 144 + ****************************************************************************** 145 + * Backend XenBus Nodes 146 + ****************************************************************************** 147 + * 148 + 
*----------------------------- Protocol version ------------------------------ 149 + * 150 + * versions 151 + * Values: <string> 152 + * 153 + * List of XENSND_LIST_SEPARATOR separated protocol versions supported 154 + * by the backend. For example "1,2,3". 155 + * 156 + ****************************************************************************** 157 + * Frontend XenBus Nodes 158 + ****************************************************************************** 159 + * 160 + *-------------------------------- Addressing --------------------------------- 161 + * 162 + * dom-id 163 + * Values: <uint16_t> 164 + * 165 + * Domain identifier. 166 + * 167 + * dev-id 168 + * Values: <uint16_t> 169 + * 170 + * Device identifier. 171 + * 172 + * pcm-dev-idx 173 + * Values: <uint8_t> 174 + * 175 + * Zero based contiguous index of the PCM device. 176 + * 177 + * stream-idx 178 + * Values: <uint8_t> 179 + * 180 + * Zero based contiguous index of the stream of the PCM device. 181 + * 182 + * The following pattern is used for addressing: 183 + * /local/domain/<dom-id>/device/vsnd/<dev-id>/<pcm-dev-idx>/<stream-idx>/... 184 + * 185 + *----------------------------- Protocol version ------------------------------ 186 + * 187 + * version 188 + * Values: <string> 189 + * 190 + * Protocol version, chosen among the ones supported by the backend. 191 + * 192 + *------------------------------- PCM settings -------------------------------- 193 + * 194 + * Every virtualized sound frontend has a set of PCM devices and streams, each 195 + * could be individually configured. Part of the PCM configuration can be 196 + * defined at higher level of the hierarchy and be fully or partially re-used 197 + * by the underlying layers. These configuration values are: 198 + * o number of channels (min/max) 199 + * o supported sample rates 200 + * o supported sample formats. 201 + * E.g. one can define these values for the whole card, device or stream. 
202 + * Every underlying layer in turn can re-define some or all of them to better 203 + * fit its needs. For example, card may define number of channels to be 204 + * in [1; 8] range, and some particular stream may be limited to [1; 2] only. 205 + * The rule is that the underlying layer must be a subset of the upper layer 206 + * range. 207 + * 208 + * channels-min 209 + * Values: <uint8_t> 210 + * 211 + * The minimum amount of channels that is supported, [1; channels-max]. 212 + * Optional, if not set or omitted a value of 1 is used. 213 + * 214 + * channels-max 215 + * Values: <uint8_t> 216 + * 217 + * The maximum amount of channels that is supported. 218 + * Must be at least <channels-min>. 219 + * 220 + * sample-rates 221 + * Values: <list of uint32_t> 222 + * 223 + * List of supported sample rates separated by XENSND_LIST_SEPARATOR. 224 + * Sample rates are expressed as a list of decimal values w/o any 225 + * ordering requirement. 226 + * 227 + * sample-formats 228 + * Values: <list of XENSND_PCM_FORMAT_XXX_STR> 229 + * 230 + * List of supported sample formats separated by XENSND_LIST_SEPARATOR. 231 + * Items must not exceed XENSND_SAMPLE_FORMAT_MAX_LEN length. 232 + * 233 + * buffer-size 234 + * Values: <uint32_t> 235 + * 236 + * The maximum size in octets of the buffer to allocate per stream. 237 + * 238 + *----------------------- Virtual sound card settings ------------------------- 239 + * short-name 240 + * Values: <char[32]> 241 + * 242 + * Short name of the virtual sound card. Optional. 243 + * 244 + * long-name 245 + * Values: <char[80]> 246 + * 247 + * Long name of the virtual sound card. Optional. 248 + * 249 + *----------------------------- Device settings ------------------------------- 250 + * name 251 + * Values: <char[80]> 252 + * 253 + * Name of the sound device within the virtual sound card. Optional. 
254 + * 255 + *----------------------------- Stream settings ------------------------------- 256 + * 257 + * type 258 + * Values: "p", "c" 259 + * 260 + * Stream type: "p" - playback stream, "c" - capture stream 261 + * 262 + * If both capture and playback are needed then two streams need to be 263 + * defined under the same device. 264 + * 265 + * unique-id 266 + * Values: <uint32_t> 267 + * 268 + * After stream initialization it is assigned a unique ID (within the front 269 + * driver), so every stream of the frontend can be identified by the 270 + * backend by this ID. This is not equal to stream-idx as the latter is 271 + * zero based within the device, but this index is contiguous within the 272 + * driver. 273 + * 274 + *-------------------- Stream Request Transport Parameters -------------------- 275 + * 276 + * event-channel 277 + * Values: <uint32_t> 278 + * 279 + * The identifier of the Xen event channel used to signal activity 280 + * in the ring buffer. 281 + * 282 + * ring-ref 283 + * Values: <uint32_t> 284 + * 285 + * The Xen grant reference granting permission for the backend to map 286 + * a sole page in a single page sized ring buffer. 287 + * 288 + ****************************************************************************** 289 + * STATE DIAGRAMS 290 + ****************************************************************************** 291 + * 292 + * Tool stack creates front and back state nodes with initial state 293 + * XenbusStateInitialising. 294 + * Tool stack creates and sets up frontend sound configuration nodes per domain. 295 + * 296 + * Front Back 297 + * ================================= ===================================== 298 + * XenbusStateInitialising XenbusStateInitialising 299 + * o Query backend device identification 300 + * data. 301 + * o Open and validate backend device. 
302 + * | 303 + * | 304 + * V 305 + * XenbusStateInitWait 306 + * 307 + * o Query frontend configuration 308 + * o Allocate and initialize 309 + * event channels per configured 310 + * playback/capture stream. 311 + * o Publish transport parameters 312 + * that will be in effect during 313 + * this connection. 314 + * | 315 + * | 316 + * V 317 + * XenbusStateInitialised 318 + * 319 + * o Query frontend transport parameters. 320 + * o Connect to the event channels. 321 + * | 322 + * | 323 + * V 324 + * XenbusStateConnected 325 + * 326 + * o Create and initialize OS 327 + * virtual sound device instances 328 + * as per configuration. 329 + * | 330 + * | 331 + * V 332 + * XenbusStateConnected 333 + * 334 + * XenbusStateUnknown 335 + * XenbusStateClosed 336 + * XenbusStateClosing 337 + * o Remove virtual sound device 338 + * o Remove event channels 339 + * | 340 + * | 341 + * V 342 + * XenbusStateClosed 343 + * 344 + *------------------------------- Recovery flow ------------------------------- 345 + * 346 + * In case of frontend unrecoverable errors backend handles that as 347 + * if frontend goes into the XenbusStateClosed state. 348 + * 349 + * In case of backend unrecoverable errors frontend tries removing 350 + * the virtualized device. If this is possible at the moment of error, 351 + * then frontend goes into the XenbusStateInitialising state and is ready for 352 + * new connection with backend. If the virtualized device is still in use and 353 + * cannot be removed, then frontend goes into the XenbusStateReconfiguring state 354 + * until either the virtualized device removed or backend initiates a new 355 + * connection. On the virtualized device removal frontend goes into the 356 + * XenbusStateInitialising state. 357 + * 358 + * Note on XenbusStateReconfiguring state of the frontend: if backend has 359 + * unrecoverable errors then frontend cannot send requests to the backend 360 + * and thus cannot provide functionality of the virtualized device anymore. 
361 + * After backend is back to normal the virtualized device may still hold some 362 + * state: configuration in use, allocated buffers, client application state etc. 363 + * So, in most cases, this will require frontend to implement complex recovery 364 + * reconnect logic. Instead, by going into XenbusStateReconfiguring state, 365 + * frontend will make sure no new clients of the virtualized device are 366 + * accepted, allow existing client(s) to exit gracefully by signaling error 367 + * state etc. 368 + * Once all the clients are gone frontend can reinitialize the virtualized 369 + * device and get into XenbusStateInitialising state again signaling the 370 + * backend that a new connection can be made. 371 + * 372 + * There are multiple conditions possible under which frontend will go from 373 + * XenbusStateReconfiguring into XenbusStateInitialising, some of them are OS 374 + * specific. For example: 375 + * 1. The underlying OS framework may provide callbacks to signal that the last 376 + * client of the virtualized device has gone and the device can be removed 377 + * 2. Frontend can schedule a deferred work (timer/tasklet/workqueue) 378 + * to periodically check if this is the right time to re-try removal of 379 + * the virtualized device. 380 + * 3. By any other means. 381 + * 382 + ****************************************************************************** 383 + * PCM FORMATS 384 + ****************************************************************************** 385 + * 386 + * XENSND_PCM_FORMAT_<format>[_<endian>] 387 + * 388 + * format: <S/U/F><bits> or <name> 389 + * S - signed, U - unsigned, F - float 390 + * bits - 8, 16, 24, 32 391 + * name - MU_LAW, GSM, etc. 
392 + * 393 + * endian: <LE/BE>, may be absent 394 + * LE - Little endian, BE - Big endian 395 + */ 396 + #define XENSND_PCM_FORMAT_S8 0 397 + #define XENSND_PCM_FORMAT_U8 1 398 + #define XENSND_PCM_FORMAT_S16_LE 2 399 + #define XENSND_PCM_FORMAT_S16_BE 3 400 + #define XENSND_PCM_FORMAT_U16_LE 4 401 + #define XENSND_PCM_FORMAT_U16_BE 5 402 + #define XENSND_PCM_FORMAT_S24_LE 6 403 + #define XENSND_PCM_FORMAT_S24_BE 7 404 + #define XENSND_PCM_FORMAT_U24_LE 8 405 + #define XENSND_PCM_FORMAT_U24_BE 9 406 + #define XENSND_PCM_FORMAT_S32_LE 10 407 + #define XENSND_PCM_FORMAT_S32_BE 11 408 + #define XENSND_PCM_FORMAT_U32_LE 12 409 + #define XENSND_PCM_FORMAT_U32_BE 13 410 + #define XENSND_PCM_FORMAT_F32_LE 14 /* 4-byte float, IEEE-754 32-bit, */ 411 + #define XENSND_PCM_FORMAT_F32_BE 15 /* range -1.0 to 1.0 */ 412 + #define XENSND_PCM_FORMAT_F64_LE 16 /* 8-byte float, IEEE-754 64-bit, */ 413 + #define XENSND_PCM_FORMAT_F64_BE 17 /* range -1.0 to 1.0 */ 414 + #define XENSND_PCM_FORMAT_IEC958_SUBFRAME_LE 18 415 + #define XENSND_PCM_FORMAT_IEC958_SUBFRAME_BE 19 416 + #define XENSND_PCM_FORMAT_MU_LAW 20 417 + #define XENSND_PCM_FORMAT_A_LAW 21 418 + #define XENSND_PCM_FORMAT_IMA_ADPCM 22 419 + #define XENSND_PCM_FORMAT_MPEG 23 420 + #define XENSND_PCM_FORMAT_GSM 24 421 + 422 + /* 423 + ****************************************************************************** 424 + * REQUEST CODES 425 + ****************************************************************************** 426 + */ 427 + #define XENSND_OP_OPEN 0 428 + #define XENSND_OP_CLOSE 1 429 + #define XENSND_OP_READ 2 430 + #define XENSND_OP_WRITE 3 431 + #define XENSND_OP_SET_VOLUME 4 432 + #define XENSND_OP_GET_VOLUME 5 433 + #define XENSND_OP_MUTE 6 434 + #define XENSND_OP_UNMUTE 7 435 + 436 + /* 437 + ****************************************************************************** 438 + * XENSTORE FIELD AND PATH NAME STRINGS, HELPERS 439 + ****************************************************************************** 440 + 
*/ 441 + #define XENSND_DRIVER_NAME "vsnd" 442 + 443 + #define XENSND_LIST_SEPARATOR "," 444 + /* Field names */ 445 + #define XENSND_FIELD_BE_VERSIONS "versions" 446 + #define XENSND_FIELD_FE_VERSION "version" 447 + #define XENSND_FIELD_VCARD_SHORT_NAME "short-name" 448 + #define XENSND_FIELD_VCARD_LONG_NAME "long-name" 449 + #define XENSND_FIELD_RING_REF "ring-ref" 450 + #define XENSND_FIELD_EVT_CHNL "event-channel" 451 + #define XENSND_FIELD_DEVICE_NAME "name" 452 + #define XENSND_FIELD_TYPE "type" 453 + #define XENSND_FIELD_STREAM_UNIQUE_ID "unique-id" 454 + #define XENSND_FIELD_CHANNELS_MIN "channels-min" 455 + #define XENSND_FIELD_CHANNELS_MAX "channels-max" 456 + #define XENSND_FIELD_SAMPLE_RATES "sample-rates" 457 + #define XENSND_FIELD_SAMPLE_FORMATS "sample-formats" 458 + #define XENSND_FIELD_BUFFER_SIZE "buffer-size" 459 + 460 + /* Stream type field values. */ 461 + #define XENSND_STREAM_TYPE_PLAYBACK "p" 462 + #define XENSND_STREAM_TYPE_CAPTURE "c" 463 + /* Sample rate max string length */ 464 + #define XENSND_SAMPLE_RATE_MAX_LEN 11 465 + /* Sample format field values */ 466 + #define XENSND_SAMPLE_FORMAT_MAX_LEN 24 467 + 468 + #define XENSND_PCM_FORMAT_S8_STR "s8" 469 + #define XENSND_PCM_FORMAT_U8_STR "u8" 470 + #define XENSND_PCM_FORMAT_S16_LE_STR "s16_le" 471 + #define XENSND_PCM_FORMAT_S16_BE_STR "s16_be" 472 + #define XENSND_PCM_FORMAT_U16_LE_STR "u16_le" 473 + #define XENSND_PCM_FORMAT_U16_BE_STR "u16_be" 474 + #define XENSND_PCM_FORMAT_S24_LE_STR "s24_le" 475 + #define XENSND_PCM_FORMAT_S24_BE_STR "s24_be" 476 + #define XENSND_PCM_FORMAT_U24_LE_STR "u24_le" 477 + #define XENSND_PCM_FORMAT_U24_BE_STR "u24_be" 478 + #define XENSND_PCM_FORMAT_S32_LE_STR "s32_le" 479 + #define XENSND_PCM_FORMAT_S32_BE_STR "s32_be" 480 + #define XENSND_PCM_FORMAT_U32_LE_STR "u32_le" 481 + #define XENSND_PCM_FORMAT_U32_BE_STR "u32_be" 482 + #define XENSND_PCM_FORMAT_F32_LE_STR "float_le" 483 + #define XENSND_PCM_FORMAT_F32_BE_STR "float_be" 484 + #define 
XENSND_PCM_FORMAT_F64_LE_STR "float64_le" 485 + #define XENSND_PCM_FORMAT_F64_BE_STR "float64_be" 486 + #define XENSND_PCM_FORMAT_IEC958_SUBFRAME_LE_STR "iec958_subframe_le" 487 + #define XENSND_PCM_FORMAT_IEC958_SUBFRAME_BE_STR "iec958_subframe_be" 488 + #define XENSND_PCM_FORMAT_MU_LAW_STR "mu_law" 489 + #define XENSND_PCM_FORMAT_A_LAW_STR "a_law" 490 + #define XENSND_PCM_FORMAT_IMA_ADPCM_STR "ima_adpcm" 491 + #define XENSND_PCM_FORMAT_MPEG_STR "mpeg" 492 + #define XENSND_PCM_FORMAT_GSM_STR "gsm" 493 + 494 + 495 + /* 496 + ****************************************************************************** 497 + * STATUS RETURN CODES 498 + ****************************************************************************** 499 + * 500 + * Status return code is zero on success and -XEN_EXX on failure. 501 + * 502 + ****************************************************************************** 503 + * Assumptions 504 + ****************************************************************************** 505 + * o usage of grant reference 0 as invalid grant reference: 506 + * grant reference 0 is valid, but never exposed to a PV driver, 507 + * because of the fact it is already in use/reserved by the PV console. 508 + * o all references in this document to page sizes must be treated 509 + * as pages of size XEN_PAGE_SIZE unless otherwise noted. 510 + * 511 + ****************************************************************************** 512 + * Description of the protocol between frontend and backend driver 513 + ****************************************************************************** 514 + * 515 + * The two halves of a Para-virtual sound driver communicate with 516 + * each other using shared pages and event channels. 517 + * Shared page contains a ring with request/response packets. 518 + * 519 + * Packets, used for input/output operations, e.g. 
read/write, set/get volume, 520 + * etc., provide offset/length fields in order to allow asynchronous protocol 521 + * operation with buffer space sharing: part of the buffer allocated at 522 + * XENSND_OP_OPEN can be used for audio samples and part, for example, 523 + * for volume control. 524 + * 525 + * All reserved fields in the structures below must be 0. 526 + * 527 + *---------------------------------- Requests --------------------------------- 528 + * 529 + * All request packets have the same length (32 octets) 530 + * All request packets have common header: 531 + * 0 1 2 3 octet 532 + * +----------------+----------------+----------------+----------------+ 533 + * | id | operation | reserved | 4 534 + * +----------------+----------------+----------------+----------------+ 535 + * | reserved | 8 536 + * +----------------+----------------+----------------+----------------+ 537 + * id - uint16_t, private guest value, echoed in response 538 + * operation - uint8_t, operation code, XENSND_OP_??? 539 + * 540 + * For all packets which use offset and length: 541 + * offset - uint32_t, read or write data offset within the shared buffer, 542 + * passed with XENSND_OP_OPEN request, octets, 543 + * [0; XENSND_OP_OPEN.buffer_sz - 1]. 
544 + * length - uint32_t, read or write data length, octets 545 + * 546 + * Request open - open a PCM stream for playback or capture: 547 + * 548 + * 0 1 2 3 octet 549 + * +----------------+----------------+----------------+----------------+ 550 + * | id | XENSND_OP_OPEN | reserved | 4 551 + * +----------------+----------------+----------------+----------------+ 552 + * | reserved | 8 553 + * +----------------+----------------+----------------+----------------+ 554 + * | pcm_rate | 12 555 + * +----------------+----------------+----------------+----------------+ 556 + * | pcm_format | pcm_channels | reserved | 16 557 + * +----------------+----------------+----------------+----------------+ 558 + * | buffer_sz | 20 559 + * +----------------+----------------+----------------+----------------+ 560 + * | gref_directory | 24 561 + * +----------------+----------------+----------------+----------------+ 562 + * | reserved | 28 563 + * +----------------+----------------+----------------+----------------+ 564 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 565 + * +----------------+----------------+----------------+----------------+ 566 + * | reserved | 32 567 + * +----------------+----------------+----------------+----------------+ 568 + * 569 + * pcm_rate - uint32_t, stream data rate, Hz 570 + * pcm_format - uint8_t, XENSND_PCM_FORMAT_XXX value 571 + * pcm_channels - uint8_t, number of channels of this stream, 572 + * [channels-min; channels-max] 573 + * buffer_sz - uint32_t, buffer size to be allocated, octets 574 + * gref_directory - grant_ref_t, a reference to the first shared page 575 + * describing shared buffer references. At least one page exists. 
If shared 576 + * buffer size (buffer_sz) exceeds what can be addressed by this single page, 577 + * then reference to the next page must be supplied (see gref_dir_next_page 578 + * below) 579 + */ 580 + 581 + struct xensnd_open_req { 582 + uint32_t pcm_rate; 583 + uint8_t pcm_format; 584 + uint8_t pcm_channels; 585 + uint16_t reserved; 586 + uint32_t buffer_sz; 587 + grant_ref_t gref_directory; 588 + }; 589 + 590 + /* 591 + * Shared page for XENSND_OP_OPEN buffer descriptor (gref_directory in the 592 + * request) employs a list of pages, describing all pages of the shared data 593 + * buffer: 594 + * 0 1 2 3 octet 595 + * +----------------+----------------+----------------+----------------+ 596 + * | gref_dir_next_page | 4 597 + * +----------------+----------------+----------------+----------------+ 598 + * | gref[0] | 8 599 + * +----------------+----------------+----------------+----------------+ 600 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 601 + * +----------------+----------------+----------------+----------------+ 602 + * | gref[i] | i*4+8 603 + * +----------------+----------------+----------------+----------------+ 604 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 605 + * +----------------+----------------+----------------+----------------+ 606 + * | gref[N - 1] | N*4+8 607 + * +----------------+----------------+----------------+----------------+ 608 + * 609 + * gref_dir_next_page - grant_ref_t, reference to the next page describing 610 + * page directory. Must be 0 if there are no more pages in the list. 
611 + * gref[i] - grant_ref_t, reference to a shared page of the buffer 612 + * allocated at XENSND_OP_OPEN 613 + * 614 + * Number of grant_ref_t entries in the whole page directory is not 615 + * passed, but instead can be calculated as: 616 + * num_grefs_total = (XENSND_OP_OPEN.buffer_sz + XEN_PAGE_SIZE - 1) / 617 + * XEN_PAGE_SIZE 618 + */ 619 + 620 + struct xensnd_page_directory { 621 + grant_ref_t gref_dir_next_page; 622 + grant_ref_t gref[1]; /* Variable length */ 623 + }; 624 + 625 + /* 626 + * Request close - close an opened pcm stream: 627 + * 0 1 2 3 octet 628 + * +----------------+----------------+----------------+----------------+ 629 + * | id | XENSND_OP_CLOSE| reserved | 4 630 + * +----------------+----------------+----------------+----------------+ 631 + * | reserved | 8 632 + * +----------------+----------------+----------------+----------------+ 633 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 634 + * +----------------+----------------+----------------+----------------+ 635 + * | reserved | 32 636 + * +----------------+----------------+----------------+----------------+ 637 + * 638 + * Request read/write - used for read (for capture) or write (for playback): 639 + * 0 1 2 3 octet 640 + * +----------------+----------------+----------------+----------------+ 641 + * | id | operation | reserved | 4 642 + * +----------------+----------------+----------------+----------------+ 643 + * | reserved | 8 644 + * +----------------+----------------+----------------+----------------+ 645 + * | offset | 12 646 + * +----------------+----------------+----------------+----------------+ 647 + * | length | 16 648 + * +----------------+----------------+----------------+----------------+ 649 + * | reserved | 20 650 + * +----------------+----------------+----------------+----------------+ 651 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 652 + * +----------------+----------------+----------------+----------------+ 
653 + * | reserved | 32 654 + * +----------------+----------------+----------------+----------------+ 655 + * 656 + * operation - XENSND_OP_READ for read or XENSND_OP_WRITE for write 657 + */ 658 + 659 + struct xensnd_rw_req { 660 + uint32_t offset; 661 + uint32_t length; 662 + }; 663 + 664 + /* 665 + * Request set/get volume - set/get channels' volume of the stream given: 666 + * 0 1 2 3 octet 667 + * +----------------+----------------+----------------+----------------+ 668 + * | id | operation | reserved | 4 669 + * +----------------+----------------+----------------+----------------+ 670 + * | reserved | 8 671 + * +----------------+----------------+----------------+----------------+ 672 + * | offset | 12 673 + * +----------------+----------------+----------------+----------------+ 674 + * | length | 16 675 + * +----------------+----------------+----------------+----------------+ 676 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 677 + * +----------------+----------------+----------------+----------------+ 678 + * | reserved | 32 679 + * +----------------+----------------+----------------+----------------+ 680 + * 681 + * operation - XENSND_OP_SET_VOLUME for volume set 682 + * or XENSND_OP_GET_VOLUME for volume get 683 + * Buffer passed with XENSND_OP_OPEN is used to exchange volume 684 + * values: 685 + * 686 + * 0 1 2 3 octet 687 + * +----------------+----------------+----------------+----------------+ 688 + * | channel[0] | 4 689 + * +----------------+----------------+----------------+----------------+ 690 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 691 + * +----------------+----------------+----------------+----------------+ 692 + * | channel[i] | i*4 693 + * +----------------+----------------+----------------+----------------+ 694 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 695 + * +----------------+----------------+----------------+----------------+ 696 + * | channel[N - 
1] | (N-1)*4 697 + * +----------------+----------------+----------------+----------------+ 698 + * 699 + * N = XENSND_OP_OPEN.pcm_channels 700 + * i - uint8_t, index of a channel 701 + * channel[i] - sint32_t, volume of i-th channel 702 + * Volume is expressed as a signed value in steps of 0.001 dB, 703 + * while 0 being 0 dB. 704 + * 705 + * Request mute/unmute - mute/unmute stream: 706 + * 0 1 2 3 octet 707 + * +----------------+----------------+----------------+----------------+ 708 + * | id | operation | reserved | 4 709 + * +----------------+----------------+----------------+----------------+ 710 + * | reserved | 8 711 + * +----------------+----------------+----------------+----------------+ 712 + * | offset | 12 713 + * +----------------+----------------+----------------+----------------+ 714 + * | length | 16 715 + * +----------------+----------------+----------------+----------------+ 716 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 717 + * +----------------+----------------+----------------+----------------+ 718 + * | reserved | 32 719 + * +----------------+----------------+----------------+----------------+ 720 + * 721 + * operation - XENSND_OP_MUTE for mute or XENSND_OP_UNMUTE for unmute 722 + * Buffer passed with XENSND_OP_OPEN is used to exchange mute/unmute 723 + * values: 724 + * 725 + * 0 octet 726 + * +----------------+----------------+----------------+----------------+ 727 + * | channel[0] | 4 728 + * +----------------+----------------+----------------+----------------+ 729 + * +/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 730 + * +----------------+----------------+----------------+----------------+ 731 + * | channel[i] | i*4 732 + * +----------------+----------------+----------------+----------------+ 733 + * +/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 734 + * +----------------+----------------+----------------+----------------+ 735 + * | channel[N - 1] | (N-1)*4 736 + 
* +----------------+----------------+----------------+----------------+ 737 + * 738 + * N = XENSND_OP_OPEN.pcm_channels 739 + * i - uint8_t, index of a channel 740 + * channel[i] - uint8_t, non-zero if i-th channel needs to be muted/unmuted 741 + * 742 + *------------------------------------ N.B. ----------------------------------- 743 + * 744 + * The 'struct xensnd_rw_req' is also used for XENSND_OP_SET_VOLUME, 745 + * XENSND_OP_GET_VOLUME, XENSND_OP_MUTE, XENSND_OP_UNMUTE. 746 + */ 747 + 748 + /* 749 + *---------------------------------- Responses -------------------------------- 750 + * 751 + * All response packets have the same length (32 octets) 752 + * 753 + * Response for all requests: 754 + * 0 1 2 3 octet 755 + * +----------------+----------------+----------------+----------------+ 756 + * | id | operation | reserved | 4 757 + * +----------------+----------------+----------------+----------------+ 758 + * | status | 8 759 + * +----------------+----------------+----------------+----------------+ 760 + * | reserved | 12 761 + * +----------------+----------------+----------------+----------------+ 762 + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| 763 + * +----------------+----------------+----------------+----------------+ 764 + * | reserved | 32 765 + * +----------------+----------------+----------------+----------------+ 766 + * 767 + * id - uint16_t, copied from the request 768 + * operation - uint8_t, XENSND_OP_* - copied from request 769 + * status - int32_t, response status, zero on success and -XEN_EXX on failure 770 + */ 771 + 772 + struct xensnd_req { 773 + uint16_t id; 774 + uint8_t operation; 775 + uint8_t reserved[5]; 776 + union { 777 + struct xensnd_open_req open; 778 + struct xensnd_rw_req rw; 779 + uint8_t reserved[24]; 780 + } op; 781 + }; 782 + 783 + struct xensnd_resp { 784 + uint16_t id; 785 + uint8_t operation; 786 + uint8_t reserved; 787 + int32_t status; 788 + uint8_t reserved1[24]; 789 + }; 790 + 791 + 
DEFINE_RING_TYPES(xen_sndif, struct xensnd_req, struct xensnd_resp); 792 + 793 + #endif /* __XEN_PUBLIC_IO_SNDIF_H__ */
+19
include/xen/xen-ops.h
··· 22 22 void xen_arch_resume(void); 23 23 void xen_arch_suspend(void); 24 24 25 + void xen_reboot(int reason); 26 + 25 27 void xen_resume_notifier_register(struct notifier_block *nb); 26 28 void xen_resume_notifier_unregister(struct notifier_block *nb); 27 29 ··· 36 34 int xen_setup_shutdown_event(void); 37 35 38 36 extern unsigned long *xen_contiguous_bitmap; 37 + 38 + #ifdef CONFIG_XEN_PV 39 39 int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, 40 40 unsigned int address_bits, 41 41 dma_addr_t *dma_handle); 42 42 43 43 void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order); 44 + #else 45 + static inline int xen_create_contiguous_region(phys_addr_t pstart, 46 + unsigned int order, 47 + unsigned int address_bits, 48 + dma_addr_t *dma_handle) 49 + { 50 + return 0; 51 + } 52 + 53 + static inline void xen_destroy_contiguous_region(phys_addr_t pstart, 54 + unsigned int order) { } 55 + #endif 44 56 45 57 struct vm_area_struct; 46 58 ··· 136 120 efi_status_t xen_efi_query_capsule_caps(efi_capsule_header_t **capsules, 137 121 unsigned long count, u64 *max_size, 138 122 int *reset_type); 123 + void xen_efi_reset_system(int reset_type, efi_status_t status, 124 + unsigned long data_size, efi_char16_t *data); 125 + 139 126 140 127 #ifdef CONFIG_PREEMPT 141 128
+9
net/9p/Kconfig
··· 22 22 This builds support for transports between 23 23 guest partitions and a host partition. 24 24 25 + config NET_9P_XEN 26 + depends on XEN 27 + select XEN_XENBUS_FRONTEND 28 + tristate "9P Xen Transport" 29 + help 30 + This builds support for a transport for 9pfs between 31 + two Xen domains. 32 + 33 + 25 34 config NET_9P_RDMA 26 35 depends on INET && INFINIBAND && INFINIBAND_ADDR_TRANS 27 36 tristate "9P RDMA Transport (Experimental)"
+4
net/9p/Makefile
··· 1 1 obj-$(CONFIG_NET_9P) := 9pnet.o 2 + obj-$(CONFIG_NET_9P_XEN) += 9pnet_xen.o 2 3 obj-$(CONFIG_NET_9P_VIRTIO) += 9pnet_virtio.o 3 4 obj-$(CONFIG_NET_9P_RDMA) += 9pnet_rdma.o 4 5 ··· 14 13 15 14 9pnet_virtio-objs := \ 16 15 trans_virtio.o \ 16 + 17 + 9pnet_xen-objs := \ 18 + trans_xen.o \ 17 19 18 20 9pnet_rdma-objs := \ 19 21 trans_rdma.o \
+545
net/9p/trans_xen.c
··· 1 + /* 2 + * linux/fs/9p/trans_xen 3 + * 4 + * Xen transport layer. 5 + * 6 + * Copyright (C) 2017 by Stefano Stabellini <stefano@aporeto.com> 7 + * 8 + * This program is free software; you can redistribute it and/or 9 + * modify it under the terms of the GNU General Public License version 2 10 + * as published by the Free Software Foundation; or, when distributed 11 + * separately from the Linux kernel or incorporated into other 12 + * software packages, subject to the following license: 13 + * 14 + * Permission is hereby granted, free of charge, to any person obtaining a copy 15 + * of this source file (the "Software"), to deal in the Software without 16 + * restriction, including without limitation the rights to use, copy, modify, 17 + * merge, publish, distribute, sublicense, and/or sell copies of the Software, 18 + * and to permit persons to whom the Software is furnished to do so, subject to 19 + * the following conditions: 20 + * 21 + * The above copyright notice and this permission notice shall be included in 22 + * all copies or substantial portions of the Software. 23 + * 24 + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 25 + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 26 + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 27 + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 28 + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 29 + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 30 + * IN THE SOFTWARE. 
31 + */ 32 + 33 + #include <xen/events.h> 34 + #include <xen/grant_table.h> 35 + #include <xen/xen.h> 36 + #include <xen/xenbus.h> 37 + #include <xen/interface/io/9pfs.h> 38 + 39 + #include <linux/module.h> 40 + #include <linux/spinlock.h> 41 + #include <linux/rwlock.h> 42 + #include <net/9p/9p.h> 43 + #include <net/9p/client.h> 44 + #include <net/9p/transport.h> 45 + 46 + #define XEN_9PFS_NUM_RINGS 2 47 + #define XEN_9PFS_RING_ORDER 6 48 + #define XEN_9PFS_RING_SIZE XEN_FLEX_RING_SIZE(XEN_9PFS_RING_ORDER) 49 + 50 + struct xen_9pfs_header { 51 + uint32_t size; 52 + uint8_t id; 53 + uint16_t tag; 54 + 55 + /* uint8_t sdata[]; */ 56 + } __attribute__((packed)); 57 + 58 + /* One per ring, more than one per 9pfs share */ 59 + struct xen_9pfs_dataring { 60 + struct xen_9pfs_front_priv *priv; 61 + 62 + struct xen_9pfs_data_intf *intf; 63 + grant_ref_t ref; 64 + int evtchn; 65 + int irq; 66 + /* protect a ring from concurrent accesses */ 67 + spinlock_t lock; 68 + 69 + struct xen_9pfs_data data; 70 + wait_queue_head_t wq; 71 + struct work_struct work; 72 + }; 73 + 74 + /* One per 9pfs share */ 75 + struct xen_9pfs_front_priv { 76 + struct list_head list; 77 + struct xenbus_device *dev; 78 + char *tag; 79 + struct p9_client *client; 80 + 81 + int num_rings; 82 + struct xen_9pfs_dataring *rings; 83 + }; 84 + 85 + static LIST_HEAD(xen_9pfs_devs); 86 + static DEFINE_RWLOCK(xen_9pfs_lock); 87 + 88 + /* We don't currently allow canceling of requests */ 89 + static int p9_xen_cancel(struct p9_client *client, struct p9_req_t *req) 90 + { 91 + return 1; 92 + } 93 + 94 + static int p9_xen_create(struct p9_client *client, const char *addr, char *args) 95 + { 96 + struct xen_9pfs_front_priv *priv; 97 + 98 + read_lock(&xen_9pfs_lock); 99 + list_for_each_entry(priv, &xen_9pfs_devs, list) { 100 + if (!strcmp(priv->tag, addr)) { 101 + priv->client = client; 102 + read_unlock(&xen_9pfs_lock); 103 + return 0; 104 + } 105 + } 106 + read_unlock(&xen_9pfs_lock); 107 + return -EINVAL; 108 + } 
109 + 110 + static void p9_xen_close(struct p9_client *client) 111 + { 112 + struct xen_9pfs_front_priv *priv; 113 + 114 + read_lock(&xen_9pfs_lock); 115 + list_for_each_entry(priv, &xen_9pfs_devs, list) { 116 + if (priv->client == client) { 117 + priv->client = NULL; 118 + read_unlock(&xen_9pfs_lock); 119 + return; 120 + } 121 + } 122 + read_unlock(&xen_9pfs_lock); 123 + } 124 + 125 + static bool p9_xen_write_todo(struct xen_9pfs_dataring *ring, RING_IDX size) 126 + { 127 + RING_IDX cons, prod; 128 + 129 + cons = ring->intf->out_cons; 130 + prod = ring->intf->out_prod; 131 + virt_mb(); 132 + 133 + return XEN_9PFS_RING_SIZE - 134 + xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE) >= size; 135 + } 136 + 137 + static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req) 138 + { 139 + struct xen_9pfs_front_priv *priv = NULL; 140 + RING_IDX cons, prod, masked_cons, masked_prod; 141 + unsigned long flags; 142 + u32 size = p9_req->tc->size; 143 + struct xen_9pfs_dataring *ring; 144 + int num; 145 + 146 + read_lock(&xen_9pfs_lock); 147 + list_for_each_entry(priv, &xen_9pfs_devs, list) { 148 + if (priv->client == client) 149 + break; 150 + } 151 + read_unlock(&xen_9pfs_lock); 152 + if (!priv || priv->client != client) 153 + return -EINVAL; 154 + 155 + num = p9_req->tc->tag % priv->num_rings; 156 + ring = &priv->rings[num]; 157 + 158 + again: 159 + while (wait_event_interruptible(ring->wq, 160 + p9_xen_write_todo(ring, size)) != 0) 161 + ; 162 + 163 + spin_lock_irqsave(&ring->lock, flags); 164 + cons = ring->intf->out_cons; 165 + prod = ring->intf->out_prod; 166 + virt_mb(); 167 + 168 + if (XEN_9PFS_RING_SIZE - xen_9pfs_queued(prod, cons, 169 + XEN_9PFS_RING_SIZE) < size) { 170 + spin_unlock_irqrestore(&ring->lock, flags); 171 + goto again; 172 + } 173 + 174 + masked_prod = xen_9pfs_mask(prod, XEN_9PFS_RING_SIZE); 175 + masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE); 176 + 177 + xen_9pfs_write_packet(ring->data.out, p9_req->tc->sdata, size, 178 + 
&masked_prod, masked_cons, XEN_9PFS_RING_SIZE); 179 + 180 + p9_req->status = REQ_STATUS_SENT; 181 + virt_wmb(); /* write ring before updating pointer */ 182 + prod += size; 183 + ring->intf->out_prod = prod; 184 + spin_unlock_irqrestore(&ring->lock, flags); 185 + notify_remote_via_irq(ring->irq); 186 + 187 + return 0; 188 + } 189 + 190 + static void p9_xen_response(struct work_struct *work) 191 + { 192 + struct xen_9pfs_front_priv *priv; 193 + struct xen_9pfs_dataring *ring; 194 + RING_IDX cons, prod, masked_cons, masked_prod; 195 + struct xen_9pfs_header h; 196 + struct p9_req_t *req; 197 + int status; 198 + 199 + ring = container_of(work, struct xen_9pfs_dataring, work); 200 + priv = ring->priv; 201 + 202 + while (1) { 203 + cons = ring->intf->in_cons; 204 + prod = ring->intf->in_prod; 205 + virt_rmb(); 206 + 207 + if (xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE) < 208 + sizeof(h)) { 209 + notify_remote_via_irq(ring->irq); 210 + return; 211 + } 212 + 213 + masked_prod = xen_9pfs_mask(prod, XEN_9PFS_RING_SIZE); 214 + masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE); 215 + 216 + /* First, read just the header */ 217 + xen_9pfs_read_packet(&h, ring->data.in, sizeof(h), 218 + masked_prod, &masked_cons, 219 + XEN_9PFS_RING_SIZE); 220 + 221 + req = p9_tag_lookup(priv->client, h.tag); 222 + if (!req || req->status != REQ_STATUS_SENT) { 223 + dev_warn(&priv->dev->dev, "Wrong req tag=%x\n", h.tag); 224 + cons += h.size; 225 + virt_mb(); 226 + ring->intf->in_cons = cons; 227 + continue; 228 + } 229 + 230 + memcpy(req->rc, &h, sizeof(h)); 231 + req->rc->offset = 0; 232 + 233 + masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE); 234 + /* Then, read the whole packet (including the header) */ 235 + xen_9pfs_read_packet(req->rc->sdata, ring->data.in, h.size, 236 + masked_prod, &masked_cons, 237 + XEN_9PFS_RING_SIZE); 238 + 239 + virt_mb(); 240 + cons += h.size; 241 + ring->intf->in_cons = cons; 242 + 243 + status = (req->status != REQ_STATUS_ERROR) ? 
244 + REQ_STATUS_RCVD : REQ_STATUS_ERROR; 245 + 246 + p9_client_cb(priv->client, req, status); 247 + } 248 + } 249 + 250 + static irqreturn_t xen_9pfs_front_event_handler(int irq, void *r) 251 + { 252 + struct xen_9pfs_dataring *ring = r; 253 + 254 + if (!ring || !ring->priv->client) { 255 + /* ignore spurious interrupt */ 256 + return IRQ_HANDLED; 257 + } 258 + 259 + wake_up_interruptible(&ring->wq); 260 + schedule_work(&ring->work); 261 + 262 + return IRQ_HANDLED; 263 + } 264 + 265 + static struct p9_trans_module p9_xen_trans = { 266 + .name = "xen", 267 + .maxsize = 1 << (XEN_9PFS_RING_ORDER + XEN_PAGE_SHIFT), 268 + .def = 1, 269 + .create = p9_xen_create, 270 + .close = p9_xen_close, 271 + .request = p9_xen_request, 272 + .cancel = p9_xen_cancel, 273 + .owner = THIS_MODULE, 274 + }; 275 + 276 + static const struct xenbus_device_id xen_9pfs_front_ids[] = { 277 + { "9pfs" }, 278 + { "" } 279 + }; 280 + 281 + static void xen_9pfs_front_free(struct xen_9pfs_front_priv *priv) 282 + { 283 + int i, j; 284 + 285 + write_lock(&xen_9pfs_lock); 286 + list_del(&priv->list); 287 + write_unlock(&xen_9pfs_lock); 288 + 289 + for (i = 0; i < priv->num_rings; i++) { 290 + if (!priv->rings[i].intf) 291 + break; 292 + if (priv->rings[i].irq > 0) 293 + unbind_from_irqhandler(priv->rings[i].irq, priv->dev); 294 + if (priv->rings[i].data.in) { 295 + for (j = 0; j < (1 << XEN_9PFS_RING_ORDER); j++) { 296 + grant_ref_t ref; 297 + 298 + ref = priv->rings[i].intf->ref[j]; 299 + gnttab_end_foreign_access(ref, 0, 0); 300 + } 301 + free_pages((unsigned long)priv->rings[i].data.in, 302 + XEN_9PFS_RING_ORDER - 303 + (PAGE_SHIFT - XEN_PAGE_SHIFT)); 304 + } 305 + gnttab_end_foreign_access(priv->rings[i].ref, 0, 0); 306 + free_page((unsigned long)priv->rings[i].intf); 307 + } 308 + kfree(priv->rings); 309 + kfree(priv->tag); 310 + kfree(priv); 311 + } 312 + 313 + static int xen_9pfs_front_remove(struct xenbus_device *dev) 314 + { 315 + struct xen_9pfs_front_priv *priv = 
dev_get_drvdata(&dev->dev); 316 + 317 + dev_set_drvdata(&dev->dev, NULL); 318 + xen_9pfs_front_free(priv); 319 + return 0; 320 + } 321 + 322 + static int xen_9pfs_front_alloc_dataring(struct xenbus_device *dev, 323 + struct xen_9pfs_dataring *ring) 324 + { 325 + int i = 0; 326 + int ret = -ENOMEM; 327 + void *bytes = NULL; 328 + 329 + init_waitqueue_head(&ring->wq); 330 + spin_lock_init(&ring->lock); 331 + INIT_WORK(&ring->work, p9_xen_response); 332 + 333 + ring->intf = (struct xen_9pfs_data_intf *)get_zeroed_page(GFP_KERNEL); 334 + if (!ring->intf) 335 + return ret; 336 + ret = gnttab_grant_foreign_access(dev->otherend_id, 337 + virt_to_gfn(ring->intf), 0); 338 + if (ret < 0) 339 + goto out; 340 + ring->ref = ret; 341 + bytes = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 342 + XEN_9PFS_RING_ORDER - (PAGE_SHIFT - XEN_PAGE_SHIFT)); 343 + if (!bytes) { 344 + ret = -ENOMEM; 345 + goto out; 346 + } 347 + for (; i < (1 << XEN_9PFS_RING_ORDER); i++) { 348 + ret = gnttab_grant_foreign_access( 349 + dev->otherend_id, virt_to_gfn(bytes) + i, 0); 350 + if (ret < 0) 351 + goto out; 352 + ring->intf->ref[i] = ret; 353 + } 354 + ring->intf->ring_order = XEN_9PFS_RING_ORDER; 355 + ring->data.in = bytes; 356 + ring->data.out = bytes + XEN_9PFS_RING_SIZE; 357 + 358 + ret = xenbus_alloc_evtchn(dev, &ring->evtchn); 359 + if (ret) 360 + goto out; 361 + ring->irq = bind_evtchn_to_irqhandler(ring->evtchn, 362 + xen_9pfs_front_event_handler, 363 + 0, "xen_9pfs-frontend", ring); 364 + if (ring->irq >= 0) 365 + return 0; 366 + 367 + xenbus_free_evtchn(dev, ring->evtchn); 368 + ret = ring->irq; 369 + out: 370 + if (bytes) { 371 + for (i--; i >= 0; i--) 372 + gnttab_end_foreign_access(ring->intf->ref[i], 0, 0); 373 + free_pages((unsigned long)bytes, 374 + XEN_9PFS_RING_ORDER - 375 + (PAGE_SHIFT - XEN_PAGE_SHIFT)); 376 + } 377 + gnttab_end_foreign_access(ring->ref, 0, 0); 378 + free_page((unsigned long)ring->intf); 379 + return ret; 380 + } 381 + 382 + static int 
xen_9pfs_front_probe(struct xenbus_device *dev, 383 + const struct xenbus_device_id *id) 384 + { 385 + int ret, i; 386 + struct xenbus_transaction xbt; 387 + struct xen_9pfs_front_priv *priv = NULL; 388 + char *versions; 389 + unsigned int max_rings, max_ring_order, len = 0; 390 + 391 + versions = xenbus_read(XBT_NIL, dev->otherend, "versions", &len); 392 + if (!len) 393 + return -EINVAL; 394 + if (strcmp(versions, "1")) { 395 + kfree(versions); 396 + return -EINVAL; 397 + } 398 + kfree(versions); 399 + max_rings = xenbus_read_unsigned(dev->otherend, "max-rings", 0); 400 + if (max_rings < XEN_9PFS_NUM_RINGS) 401 + return -EINVAL; 402 + max_ring_order = xenbus_read_unsigned(dev->otherend, 403 + "max-ring-page-order", 0); 404 + if (max_ring_order < XEN_9PFS_RING_ORDER) 405 + return -EINVAL; 406 + 407 + priv = kzalloc(sizeof(*priv), GFP_KERNEL); 408 + if (!priv) 409 + return -ENOMEM; 410 + 411 + priv->dev = dev; 412 + priv->num_rings = XEN_9PFS_NUM_RINGS; 413 + priv->rings = kcalloc(priv->num_rings, sizeof(*priv->rings), 414 + GFP_KERNEL); 415 + if (!priv->rings) { 416 + kfree(priv); 417 + return -ENOMEM; 418 + } 419 + 420 + for (i = 0; i < priv->num_rings; i++) { 421 + priv->rings[i].priv = priv; 422 + ret = xen_9pfs_front_alloc_dataring(dev, &priv->rings[i]); 423 + if (ret < 0) 424 + goto error; 425 + } 426 + 427 + again: 428 + ret = xenbus_transaction_start(&xbt); 429 + if (ret) { 430 + xenbus_dev_fatal(dev, ret, "starting transaction"); 431 + goto error; 432 + } 433 + ret = xenbus_printf(xbt, dev->nodename, "version", "%u", 1); 434 + if (ret) 435 + goto error_xenbus; 436 + ret = xenbus_printf(xbt, dev->nodename, "num-rings", "%u", 437 + priv->num_rings); 438 + if (ret) 439 + goto error_xenbus; 440 + for (i = 0; i < priv->num_rings; i++) { 441 + char str[16]; 442 + 443 + BUILD_BUG_ON(XEN_9PFS_NUM_RINGS > 9); 444 + sprintf(str, "ring-ref%u", i); 445 + ret = xenbus_printf(xbt, dev->nodename, str, "%d", 446 + priv->rings[i].ref); 447 + if (ret) 448 + goto 
error_xenbus; 449 + 450 + sprintf(str, "event-channel-%u", i); 451 + ret = xenbus_printf(xbt, dev->nodename, str, "%u", 452 + priv->rings[i].evtchn); 453 + if (ret) 454 + goto error_xenbus; 455 + } 456 + priv->tag = xenbus_read(xbt, dev->nodename, "tag", NULL); 457 + if (!priv->tag) { 458 + ret = -EINVAL; 459 + goto error_xenbus; 460 + } 461 + ret = xenbus_transaction_end(xbt, 0); 462 + if (ret) { 463 + if (ret == -EAGAIN) 464 + goto again; 465 + xenbus_dev_fatal(dev, ret, "completing transaction"); 466 + goto error; 467 + } 468 + 469 + write_lock(&xen_9pfs_lock); 470 + list_add_tail(&priv->list, &xen_9pfs_devs); 471 + write_unlock(&xen_9pfs_lock); 472 + dev_set_drvdata(&dev->dev, priv); 473 + xenbus_switch_state(dev, XenbusStateInitialised); 474 + 475 + return 0; 476 + 477 + error_xenbus: 478 + xenbus_transaction_end(xbt, 1); 479 + xenbus_dev_fatal(dev, ret, "writing xenstore"); 480 + error: 481 + dev_set_drvdata(&dev->dev, NULL); 482 + xen_9pfs_front_free(priv); 483 + return ret; 484 + } 485 + 486 + static int xen_9pfs_front_resume(struct xenbus_device *dev) 487 + { 488 + dev_warn(&dev->dev, "suspsend/resume unsupported\n"); 489 + return 0; 490 + } 491 + 492 + static void xen_9pfs_front_changed(struct xenbus_device *dev, 493 + enum xenbus_state backend_state) 494 + { 495 + switch (backend_state) { 496 + case XenbusStateReconfiguring: 497 + case XenbusStateReconfigured: 498 + case XenbusStateInitialising: 499 + case XenbusStateInitialised: 500 + case XenbusStateUnknown: 501 + break; 502 + 503 + case XenbusStateInitWait: 504 + break; 505 + 506 + case XenbusStateConnected: 507 + xenbus_switch_state(dev, XenbusStateConnected); 508 + break; 509 + 510 + case XenbusStateClosed: 511 + if (dev->state == XenbusStateClosed) 512 + break; 513 + /* Missed the backend's CLOSING state -- fallthrough */ 514 + case XenbusStateClosing: 515 + xenbus_frontend_closed(dev); 516 + break; 517 + } 518 + } 519 + 520 + static struct xenbus_driver xen_9pfs_front_driver = { 521 + .ids = 
xen_9pfs_front_ids, 522 + .probe = xen_9pfs_front_probe, 523 + .remove = xen_9pfs_front_remove, 524 + .resume = xen_9pfs_front_resume, 525 + .otherend_changed = xen_9pfs_front_changed, 526 + }; 527 + 528 + int p9_trans_xen_init(void) 529 + { 530 + if (!xen_domain()) 531 + return -ENODEV; 532 + 533 + pr_info("Initialising Xen transport for 9pfs\n"); 534 + 535 + v9fs_register_trans(&p9_xen_trans); 536 + return xenbus_register_frontend(&xen_9pfs_front_driver); 537 + } 538 + module_init(p9_trans_xen_init); 539 + 540 + void p9_trans_xen_exit(void) 541 + { 542 + v9fs_unregister_trans(&p9_xen_trans); 543 + return xenbus_unregister_driver(&xen_9pfs_front_driver); 544 + } 545 + module_exit(p9_trans_xen_exit);