Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'kvmarm-5.13' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD

KVM/arm64 updates for Linux 5.13

New features:

- Stage-2 isolation for the host kernel when running in protected mode
- Guest SVE support when running in nVHE mode
- Force W^X hypervisor mappings in nVHE mode
- ITS save/restore for guests using direct injection with GICv4.1
- nVHE panics now produce readable backtraces
- Guest support for PTP using the ptp_kvm driver
- Performance improvements in the S2 fault handler
- Alexandru is now a reviewer (not really a new feature...)

Fixes:
- Proper emulation of the GICR_TYPER register
- Handle the complete set of relocations in the nVHE EL2 object
- Get rid of the oprofile dependency in the PMU code (and of the
oprofile body parts at the same time)
- Debug and SPE fixes
- Fix vcpu reset

+6301 -862
+14
Documentation/ABI/testing/sysfs-bus-coresight-devices-trbe
···
+ What:		/sys/bus/coresight/devices/trbe<cpu>/align
+ Date:		March 2021
+ KernelVersion:	5.13
+ Contact:	Anshuman Khandual <anshuman.khandual@arm.com>
+ Description:	(Read) Shows the TRBE write pointer alignment. This value
+		is fetched from the TRBIDR register.
+
+ What:		/sys/bus/coresight/devices/trbe<cpu>/flag
+ Date:		March 2021
+ KernelVersion:	5.13
+ Contact:	Anshuman Khandual <anshuman.khandual@arm.com>
+ Description:	(Read) Shows if TRBE updates in the memory are with access
+		and dirty flag updates as well. This value is fetched from
+		the TRBIDR register.
+1 -2
Documentation/admin-guide/kernel-parameters.txt
···
  			state is kept private from the host.
  			Not valid if the kernel is running in EL2.
  
- 			Defaults to VHE/nVHE based on hardware support and
- 			the value of CONFIG_ARM64_VHE.
+ 			Defaults to VHE/nVHE based on hardware support.
  
  	kvm-arm.vgic_v3_group0_trap=
  			[KVM,ARM] Trap guest accesses to GICv3 group-0
+75
Documentation/devicetree/bindings/arm/ete.yaml
···
+ # SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+ # Copyright 2021, Arm Ltd
+ %YAML 1.2
+ ---
+ $id: "http://devicetree.org/schemas/arm/ete.yaml#"
+ $schema: "http://devicetree.org/meta-schemas/core.yaml#"
+
+ title: ARM Embedded Trace Extensions
+
+ maintainers:
+   - Suzuki K Poulose <suzuki.poulose@arm.com>
+   - Mathieu Poirier <mathieu.poirier@linaro.org>
+
+ description: |
+   Arm Embedded Trace Extension(ETE) is a per CPU trace component that
+   allows tracing the CPU execution. It overlaps with the CoreSight ETMv4
+   architecture and has extended support for future architecture changes.
+   The trace generated by the ETE could be stored via legacy CoreSight
+   components (e.g, TMC-ETR) or other means (e.g, using a per CPU buffer
+   Arm Trace Buffer Extension (TRBE)). Since the ETE can be connected to
+   legacy CoreSight components, a node must be listed per instance, along
+   with any optional connection graph as per the coresight bindings.
+   See bindings/arm/coresight.txt.
+
+ properties:
+   $nodename:
+     pattern: "^ete([0-9a-f]+)$"
+   compatible:
+     items:
+       - const: arm,embedded-trace-extension
+
+   cpu:
+     description: |
+       Handle to the cpu this ETE is bound to.
+     $ref: /schemas/types.yaml#/definitions/phandle
+
+   out-ports:
+     description: |
+       Output connections from the ETE to legacy CoreSight trace bus.
+     $ref: /schemas/graph.yaml#/properties/ports
+     properties:
+       port:
+         description: Output connection from the ETE to legacy CoreSight Trace bus.
+         $ref: /schemas/graph.yaml#/properties/port
+
+ required:
+   - compatible
+   - cpu
+
+ additionalProperties: false
+
+ examples:
+
+ # An ETE node without legacy CoreSight connections
+   - |
+     ete0 {
+       compatible = "arm,embedded-trace-extension";
+       cpu = <&cpu_0>;
+     };
+ # An ETE node with legacy CoreSight connections
+   - |
+     ete1 {
+       compatible = "arm,embedded-trace-extension";
+       cpu = <&cpu_1>;
+
+       out-ports { /* legacy coresight connection */
+         port {
+           ete1_out_port: endpoint {
+             remote-endpoint = <&funnel_in_port0>;
+           };
+         };
+       };
+     };
+
+ ...
+49
Documentation/devicetree/bindings/arm/trbe.yaml
···
+ # SPDX-License-Identifier: GPL-2.0-only or BSD-2-Clause
+ # Copyright 2021, Arm Ltd
+ %YAML 1.2
+ ---
+ $id: "http://devicetree.org/schemas/arm/trbe.yaml#"
+ $schema: "http://devicetree.org/meta-schemas/core.yaml#"
+
+ title: ARM Trace Buffer Extensions
+
+ maintainers:
+   - Anshuman Khandual <anshuman.khandual@arm.com>
+
+ description: |
+   Arm Trace Buffer Extension (TRBE) is a per CPU component
+   for storing trace generated on the CPU to memory. It is
+   accessed via CPU system registers. The software can verify
+   if it is permitted to use the component by checking the
+   TRBIDR register.
+
+ properties:
+   $nodename:
+     const: "trbe"
+   compatible:
+     items:
+       - const: arm,trace-buffer-extension
+
+   interrupts:
+     description: |
+       Exactly 1 PPI must be listed. For heterogeneous systems where
+       TRBE is only supported on a subset of the CPUs, please consult
+       the arm,gic-v3 binding for details on describing a PPI partition.
+     maxItems: 1
+
+ required:
+   - compatible
+   - interrupts
+
+ additionalProperties: false
+
+ examples:
+
+   - |
+     #include <dt-bindings/interrupt-controller/arm-gic.h>
+
+     trbe {
+       compatible = "arm,trace-buffer-extension";
+       interrupts = <GIC_PPI 15 IRQ_TYPE_LEVEL_HIGH>;
+     };
+ ...
+38
Documentation/trace/coresight/coresight-trbe.rst
···
+ .. SPDX-License-Identifier: GPL-2.0
+
+ ==============================
+ Trace Buffer Extension (TRBE).
+ ==============================
+
+     :Author:   Anshuman Khandual <anshuman.khandual@arm.com>
+     :Date:     November 2020
+
+ Hardware Description
+ --------------------
+
+ Trace Buffer Extension (TRBE) is a percpu hardware which captures in system
+ memory, CPU traces generated from a corresponding percpu tracing unit. This
+ gets plugged in as a coresight sink device because the corresponding trace
+ generators (ETE), are plugged in as source device.
+
+ The TRBE is not compliant to CoreSight architecture specifications, but is
+ driven via the CoreSight driver framework to support the ETE (which is
+ CoreSight compliant) integration.
+
+ Sysfs files and directories
+ ---------------------------
+
+ The TRBE devices appear on the existing coresight bus alongside the other
+ coresight devices::
+
+	>$ ls /sys/bus/coresight/devices
+	trbe0  trbe1  trbe2  trbe3
+
+ The ``trbe<N>`` named TRBEs are associated with a CPU.::
+
+	>$ ls /sys/bus/coresight/devices/trbe0/
+	align  flag
+
+ *Key file items are:-*
+    * ``align``: TRBE write pointer alignment
+    * ``flag``: TRBE updates memory with access and dirty flags
+23 -1
Documentation/virt/kvm/api.rst
···
  registers to their initial values.  If this is not called, KVM_RUN will
  return ENOEXEC for that vcpu.
  
+ The initial values are defined as:
+   - Processor state:
+     * AArch64: EL1h, D, A, I and F bits set. All other bits
+       are cleared.
+     * AArch32: SVC, A, I and F bits set. All other bits are
+       cleared.
+   - General Purpose registers, including PC and SP: set to 0
+   - FPSIMD/NEON registers: set to 0
+   - SVE registers: set to 0
+   - System registers: Reset to their architecturally defined
+     values as for a warm reset to EL1 (resp. SVC)
+
  Note that because some registers reflect machine topology, all vcpus
  should be created before this ioctl is invoked.
···
  flags which can include the following:
  
  - KVM_GUESTDBG_USE_SW_BP:     using software breakpoints [x86, arm64]
- - KVM_GUESTDBG_USE_HW_BP:     using hardware breakpoints [x86, s390, arm64]
+ - KVM_GUESTDBG_USE_HW_BP:     using hardware breakpoints [x86, s390]
+ - KVM_GUESTDBG_USE_HW:        using hardware debug events [arm64]
  - KVM_GUESTDBG_INJECT_DB:     inject DB type exception [x86]
  - KVM_GUESTDBG_INJECT_BP:     inject BP type exception [x86]
  - KVM_GUESTDBG_EXIT_PENDING:  trigger an immediate guest exit [s390]
···
  an implementation for these despite the in kernel acceleration.
  
  This capability is always enabled.
+
+ 8.32 KVM_CAP_PTP_KVM
+ --------------------
+
+ :Architectures: arm64
+
+ This capability indicates that the KVM virtual PTP service is
+ supported in the host. A VMM can check whether the service is
+ available to the guest on migration.
+1
Documentation/virt/kvm/arm/index.rst
···
    hyp-abi
    psci
    pvtime
+   ptp_kvm
+25
Documentation/virt/kvm/arm/ptp_kvm.rst
···
+ .. SPDX-License-Identifier: GPL-2.0
+
+ PTP_KVM support for arm/arm64
+ =============================
+
+ PTP_KVM is used for high precision time sync between host and guests.
+ It relies on transferring the wall clock and counter value from the
+ host to the guest using a KVM-specific hypercall.
+
+ * ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: 0x86000001
+
+ This hypercall uses the SMC32/HVC32 calling convention:
+
+ ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID
+     ==============    ========    =====================================
+     Function ID:      (uint32)    0x86000001
+     Arguments:        (uint32)    KVM_PTP_VIRT_COUNTER(0)
+                                   KVM_PTP_PHYS_COUNTER(1)
+     Return Values:    (int32)     NOT_SUPPORTED(-1) on error, or
+                       (uint32)    Upper 32 bits of wall clock time (r0)
+                       (uint32)    Lower 32 bits of wall clock time (r1)
+                       (uint32)    Upper 32 bits of counter (r2)
+                       (uint32)    Lower 32 bits of counter (r3)
+     Endianness:       No Restrictions.
+     ==============    ========    =====================================
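A guest driver consuming this hypercall has to recombine the four 32-bit register values into 64-bit wall clock and counter values. A minimal sketch of that recombination (the struct and function names here are illustrative, not the kernel's):

```c
#include <assert.h>
#include <stdint.h>

/* Hypothetical capture of r0..r3 as returned by the SMC32/HVC32
 * convention described in the table above. */
struct ptp_kvm_regs {
	uint32_t r0;	/* upper 32 bits of wall clock time */
	uint32_t r1;	/* lower 32 bits of wall clock time */
	uint32_t r2;	/* upper 32 bits of counter */
	uint32_t r3;	/* lower 32 bits of counter */
};

/* Recombine the register halves into 64-bit values. */
static void ptp_kvm_unpack(const struct ptp_kvm_regs *regs,
			   uint64_t *wall_clock, uint64_t *counter)
{
	*wall_clock = ((uint64_t)regs->r0 << 32) | regs->r1;
	*counter    = ((uint64_t)regs->r2 << 32) | regs->r3;
}
```

The error case (an `int32` of -1, i.e. all-ones in r0) must be checked before unpacking, per the table.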
+1 -1
Documentation/virt/kvm/devices/arm-vgic-its.rst
···
  -EFAULT  Invalid guest ram access
  -EBUSY   One or more VCPUS are running
  -EACCES  The virtual ITS is backed by a physical GICv4 ITS, and the
-          state is not available
+          state is not available without GICv4.1
  =======  ==========================================================
  
  KVM_DEV_ARM_VGIC_GRP_ITS_REGS
+1 -1
Documentation/virt/kvm/devices/arm-vgic-v3.rst
···
  
  KVM_DEV_ARM_VGIC_CTRL_INIT
    request the initialization of the VGIC, no additional parameter in
-   kvm_device_attr.addr.
+   kvm_device_attr.addr. Must be called after all VCPUs have been created.
  KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES
    save all LPI pending bits into guest RAM pending tables.
  
+4 -2
MAINTAINERS
···
  F:	Documentation/devicetree/bindings/arm/coresight-cpu-debug.txt
  F:	Documentation/devicetree/bindings/arm/coresight-cti.yaml
  F:	Documentation/devicetree/bindings/arm/coresight.txt
+ F:	Documentation/devicetree/bindings/arm/ete.yaml
+ F:	Documentation/devicetree/bindings/arm/trbe.yaml
  F:	Documentation/trace/coresight/*
  F:	drivers/hwtracing/coresight/*
  F:	include/dt-bindings/arm/coresight-cti-dt.h
···
  KERNEL VIRTUAL MACHINE FOR ARM64 (KVM/arm64)
  M:	Marc Zyngier <maz@kernel.org>
  R:	James Morse <james.morse@arm.com>
- R:	Julien Thierry <julien.thierry.kdev@gmail.com>
+ R:	Alexandru Elisei <alexandru.elisei@arm.com>
  R:	Suzuki K Poulose <suzuki.poulose@arm.com>
  L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
- L:	kvmarm@lists.cs.columbia.edu
+ L:	kvmarm@lists.cs.columbia.edu (moderated for non-subscribers)
  S:	Maintained
  T:	git git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm.git
  F:	arch/arm64/include/asm/kvm*
+3
arch/arm/include/asm/hypervisor.h
···
  
  #include <asm/xen/hypervisor.h>
  
+ void kvm_init_hyp_services(void);
+ bool kvm_arm_hyp_service_available(u32 func_id);
+
  #endif
-20
arch/arm64/Kconfig
···
  	  built with binutils >= 2.25 in order for the new instructions
  	  to be used.
  
- config ARM64_VHE
- 	bool "Enable support for Virtualization Host Extensions (VHE)"
- 	default y
- 	help
- 	  Virtualization Host Extensions (VHE) allow the kernel to run
- 	  directly at EL2 (instead of EL1) on processors that support
- 	  it. This leads to better performance for KVM, as they reduce
- 	  the cost of the world switch.
-
- 	  Selecting this option allows the VHE feature to be detected
- 	  at runtime, and does not affect processors that do not
- 	  implement this feature.
-
  endmenu
  
  menu "ARMv8.2 architectural features"
···
  config ARM64_SVE
  	bool "ARM Scalable Vector Extension support"
  	default y
- 	depends on !KVM || ARM64_VHE
  	help
  	  The Scalable Vector Extension (SVE) is an extension to the AArch64
  	  execution state which complements and extends the SIMD functionality
···
  	  fixed. Otherwise, you may experience firmware panics or lockups when
  	  booting the kernel. If unsure and you are not observing these
  	  symptoms, you should assume that it is safe to say Y.
-
- 	  CPUs that support SVE are architecturally required to support the
- 	  Virtualization Host Extensions (VHE), so the kernel makes no
- 	  provision for supporting SVE alongside KVM without VHE enabled.
- 	  Thus, you will need to enable CONFIG_ARM64_VHE if you want to support
- 	  KVM in the same kernel image.
  
  config ARM64_MODULE_PLTS
  	bool "Use PLTs to allow module memory to spill over into vmalloc area"
+1 -1
arch/arm64/crypto/aes-modes.S
···
  	cbz	w5, .Lmacout
  	encrypt_block	v0, w2, x1, x7, w8
  	st1	{v0.16b}, [x4]			/* return dg */
- 	cond_yield	.Lmacout, x7
+ 	cond_yield	.Lmacout, x7, x8
  	b	.Lmacloop4x
  .Lmac1x:
  	add	w3, w3, #4
+1 -1
arch/arm64/crypto/sha1-ce-core.S
···
  	add	dgav.4s, dgav.4s, dg0v.4s
  
  	cbz	w2, 2f
- 	cond_yield	3f, x5
+ 	cond_yield	3f, x5, x6
  	b	0b
  
  	/*
+1 -1
arch/arm64/crypto/sha2-ce-core.S
···
  
  	/* handled all input blocks? */
  	cbz	w2, 2f
- 	cond_yield	3f, x5
+ 	cond_yield	3f, x5, x6
  	b	0b
  
  	/*
+2 -2
arch/arm64/crypto/sha3-ce-core.S
···
  	eor	v0.16b, v0.16b, v31.16b
  
  	cbnz	w8, 3b
- 	cond_yield	3f, x8
+ 	cond_yield	4f, x8, x9
  	cbnz	w2, 0b
  
  	/* save state */
- 3:	st1	{ v0.1d- v3.1d}, [x0], #32
+ 4:	st1	{ v0.1d- v3.1d}, [x0], #32
  	st1	{ v4.1d- v7.1d}, [x0], #32
  	st1	{ v8.1d-v11.1d}, [x0], #32
  	st1	{v12.1d-v15.1d}, [x0], #32
+1 -1
arch/arm64/crypto/sha512-ce-core.S
···
  	add	v10.2d, v10.2d, v2.2d
  	add	v11.2d, v11.2d, v3.2d
  
- 	cond_yield	3f, x4
+ 	cond_yield	3f, x4, x5
  	/* handled all input blocks? */
  	cbnz	w2, 0b
  
+51 -78
arch/arm64/include/asm/assembler.h
···
  #include <asm-generic/export.h>
  
  #include <asm/asm-offsets.h>
+ #include <asm/alternative.h>
+ #include <asm/asm-bug.h>
  #include <asm/cpufeature.h>
  #include <asm/cputype.h>
  #include <asm/debug-monitors.h>
···
  #include <asm/pgtable-hwdef.h>
  #include <asm/ptrace.h>
  #include <asm/thread_info.h>
+
+ /*
+  * Provide a wxN alias for each wN register so that we can paste a xN
+  * reference after a 'w' to obtain the 32-bit version.
+  */
+ 	.irp	n,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30
+ 	wx\n	.req	w\n
+ 	.endr
  
  	.macro save_and_disable_daif, flags
  	mrs	\flags, daif
···
  * provide the system wide safe value from arm64_ftr_reg_ctrel0.sys_val
  */
  	.macro	read_ctr, reg
+ #ifndef __KVM_NVHE_HYPERVISOR__
  alternative_if_not ARM64_MISMATCHED_CACHE_TYPE
  	mrs	\reg, ctr_el0			// read CTR
  	nop
  alternative_else
  	ldr_l	\reg, arm64_ftr_reg_ctrel0 + ARM64_FTR_SYSVAL
  alternative_endif
+ #else
+ alternative_if_not ARM64_KVM_PROTECTED_MODE
+ 	ASM_BUG()
+ alternative_else_nop_endif
+ alternative_cb kvm_compute_final_ctr_el0
+ 	movz	\reg, #0
+ 	movk	\reg, #0, lsl #16
+ 	movk	\reg, #0, lsl #32
+ 	movk	\reg, #0, lsl #48
+ alternative_cb_end
+ #endif
  	.endm
···
  	.endm
  
  /*
-  * Set SCTLR_EL1 to the passed value, and invalidate the local icache
+  * Set SCTLR_ELx to the @reg value, and invalidate the local icache
   * in the process. This is called when setting the MMU on.
   */
- .macro set_sctlr_el1, reg
- 	msr	sctlr_el1, \reg
+ .macro set_sctlr, sreg, reg
+ 	msr	\sreg, \reg
  	isb
  	/*
  	 * Invalidate the local I-cache so that any instructions fetched
···
  	isb
  	.endm
  
- /*
-  * Check whether to yield to another runnable task from kernel mode NEON code
-  * (which runs with preemption disabled).
-  *
-  * if_will_cond_yield_neon
-  *	// pre-yield patchup code
-  * do_cond_yield_neon
-  *	// post-yield patchup code
-  * endif_yield_neon	<label>
-  *
-  * where <label> is optional, and marks the point where execution will resume
-  * after a yield has been performed. If omitted, execution resumes right after
-  * the endif_yield_neon invocation. Note that the entire sequence, including
-  * the provided patchup code, will be omitted from the image if
-  * CONFIG_PREEMPTION is not defined.
-  *
-  * As a convenience, in the case where no patchup code is required, the above
-  * sequence may be abbreviated to
-  *
-  * cond_yield_neon <label>
-  *
-  * Note that the patchup code does not support assembler directives that change
-  * the output section, any use of such directives is undefined.
-  *
-  * The yield itself consists of the following:
-  * - Check whether the preempt count is exactly 1 and a reschedule is also
-  *   needed. If so, calling of preempt_enable() in kernel_neon_end() will
-  *   trigger a reschedule. If it is not the case, yielding is pointless.
-  * - Disable and re-enable kernel mode NEON, and branch to the yield fixup
-  *   code.
-  *
-  * This macro sequence may clobber all CPU state that is not guaranteed by the
-  * AAPCS to be preserved across an ordinary function call.
-  */
+ 	.macro set_sctlr_el1, reg
+ 	set_sctlr	sctlr_el1, \reg
+ 	.endm
  
- 	.macro		cond_yield_neon, lbl
- 	if_will_cond_yield_neon
- 	do_cond_yield_neon
- 	endif_yield_neon	\lbl
- 	.endm
-
- 	.macro		if_will_cond_yield_neon
- #ifdef CONFIG_PREEMPTION
- 	get_current_task	x0
- 	ldr		x0, [x0, #TSK_TI_PREEMPT]
- 	sub		x0, x0, #PREEMPT_DISABLE_OFFSET
- 	cbz		x0, .Lyield_\@
- 	/* fall through to endif_yield_neon */
- 	.subsection	1
- .Lyield_\@ :
- #else
- 	.section	".discard.cond_yield_neon", "ax"
- #endif
- 	.endm
-
- 	.macro		do_cond_yield_neon
- 	bl		kernel_neon_end
- 	bl		kernel_neon_begin
- 	.endm
-
- 	.macro		endif_yield_neon, lbl
- 	.ifnb		\lbl
- 	b		\lbl
- 	.else
- 	b		.Lyield_out_\@
- 	.endif
- 	.previous
- .Lyield_out_\@ :
- 	.endm
+ 	.macro set_sctlr_el2, reg
+ 	set_sctlr	sctlr_el2, \reg
+ 	.endm
  
  /*
-  * Check whether preempt-disabled code should yield as soon as it
-  * is able. This is the case if re-enabling preemption a single
-  * time results in a preempt count of zero, and the TIF_NEED_RESCHED
-  * flag is set. (Note that the latter is stored negated in the
-  * top word of the thread_info::preempt_count field)
+  * Check whether preempt/bh-disabled asm code should yield as soon as
+  * it is able. This is the case if we are currently running in task
+  * context, and either a softirq is pending, or the TIF_NEED_RESCHED
+  * flag is set and re-enabling preemption a single time would result in
+  * a preempt count of zero. (Note that the TIF_NEED_RESCHED flag is
+  * stored negated in the top word of the thread_info::preempt_count
+  * field)
  */
- 	.macro		cond_yield, lbl:req, tmp:req
- #ifdef CONFIG_PREEMPTION
+ 	.macro		cond_yield, lbl:req, tmp:req, tmp2:req
  	get_current_task \tmp
  	ldr		\tmp, [\tmp, #TSK_TI_PREEMPT]
+ 	/*
+ 	 * If we are serving a softirq, there is no point in yielding: the
+ 	 * softirq will not be preempted no matter what we do, so we should
+ 	 * run to completion as quickly as we can.
+ 	 */
+ 	tbnz		\tmp, #SOFTIRQ_SHIFT, .Lnoyield_\@
+ #ifdef CONFIG_PREEMPTION
  	sub		\tmp, \tmp, #PREEMPT_DISABLE_OFFSET
  	cbz		\tmp, \lbl
  #endif
+ 	adr_l		\tmp, irq_stat + IRQ_CPUSTAT_SOFTIRQ_PENDING
+ 	this_cpu_offset	\tmp2
+ 	ldr		w\tmp, [\tmp, \tmp2]
+ 	cbnz		w\tmp, \lbl	// yield on pending softirq in task context
+ .Lnoyield_\@:
  	.endm
  
  /*
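The decision the reworked cond_yield macro encodes can be modelled as a pure C predicate. This is an illustrative model of the logic only, not kernel code; the SOFTIRQ_SHIFT value and the flat parameters stand in for the packed thread_info::preempt_count and per-CPU softirq state:

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define SOFTIRQ_SHIFT 8	/* assumption: bit marking "currently serving a softirq" */

/* Model of the yield check: never yield while serving a softirq;
 * otherwise yield if re-enabling preemption once would reach a count
 * of zero with a reschedule pending, or if a softirq is pending in
 * task context. */
static bool should_yield(uint32_t preempt_count, bool need_resched,
			 uint32_t pending_softirqs)
{
	if (preempt_count & (1u << SOFTIRQ_SHIFT))
		return false;	/* run the softirq to completion */
	if (need_resched && preempt_count == 1)
		return true;	/* preempt_enable() would reschedule */
	return pending_softirqs != 0;
}
```

Note that the real macro reads TIF_NEED_RESCHED negated from the top word of the preempt count; the model takes it as a plain boolean for clarity.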
+1
arch/arm64/include/asm/barrier.h
···
  #define dsb(opt)	asm volatile("dsb " #opt : : : "memory")
  
  #define psb_csync()	asm volatile("hint #17" : : : "memory")
+ #define tsb_csync()	asm volatile("hint #18" : : : "memory")
  #define csdb()		asm volatile("hint #20" : : : "memory")
  
  #define spec_bar()	asm volatile(ALTERNATIVE("dsb nsh\nisb\n", \
+17
arch/arm64/include/asm/cpufeature.h
···
  	s64		safe_val; /* safe value for FTR_EXACT features */
  };
  
+ /*
+  * Describe the early feature override to the core override code:
+  *
+  * @val			Values that are to be merged into the final
+  *			sanitised value of the register. Only the bitfields
+  *			set to 1 in @mask are valid
+  * @mask		Mask of the features that are overridden by @val
+  *
+  * A @mask field set to full-1 indicates that the corresponding field
+  * in @val is a valid override.
+  *
+  * A @mask field set to full-0 with the corresponding @val field set
+  * to full-0 denotes that this field has no override
+  *
+  * A @mask field set to full-0 with the corresponding @val field set
+  * to full-1 denotes that this field has an invalid override.
+  */
  struct arm64_ftr_override {
  	u64		val;
  	u64		mask;
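The three override states described in that comment can be sketched as a small classifier over one 4-bit ID-register field. The function and enum names below are illustrative, not from the kernel:

```c
#include <assert.h>
#include <stdint.h>

/* Model of a 4-bit feature-field override, following the rules in the
 * arm64_ftr_override comment: full-1 mask = @val carries a valid
 * override; full-0 mask with full-0 val = no override; full-0 mask
 * with full-1 val = an invalid override was requested. */
enum override_state { NO_OVERRIDE, VALID_OVERRIDE, INVALID_OVERRIDE };

static enum override_state classify_field(uint64_t val, uint64_t mask,
					  unsigned int shift)
{
	uint64_t field_mask = (mask >> shift) & 0xf;
	uint64_t field_val  = (val  >> shift) & 0xf;

	if (field_mask == 0xf)
		return VALID_OVERRIDE;
	if (field_mask == 0 && field_val == 0xf)
		return INVALID_OVERRIDE;
	return NO_OVERRIDE;
}
```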
+13
arch/arm64/include/asm/el2_setup.h
···
  						// use EL1&0 translation.
  
  .Lskip_spe_\@:
+ 	/* Trace buffer */
+ 	ubfx	x0, x1, #ID_AA64DFR0_TRBE_SHIFT, #4
+ 	cbz	x0, .Lskip_trace_\@		// Skip if TraceBuffer is not present
+
+ 	mrs_s	x0, SYS_TRBIDR_EL1
+ 	and	x0, x0, TRBIDR_PROG
+ 	cbnz	x0, .Lskip_trace_\@		// If TRBE is available at EL2
+
+ 	mov	x0, #(MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT)
+ 	orr	x2, x2, x0			// allow the EL1&0 translation
+ 						// to own it.
+
+ .Lskip_trace_\@:
  	msr	mdcr_el2, x2			// Configure debug traps
  	.endm
+11
arch/arm64/include/asm/fpsimd.h
···
  	sysreg_clear_set(cpacr_el1, 0, CPACR_EL1_ZEN_EL0EN);
  }
  
+ #define sve_cond_update_zcr_vq(val, reg)		\
+ 	do {						\
+ 		u64 __zcr = read_sysreg_s((reg));	\
+ 		u64 __new = __zcr & ~ZCR_ELx_LEN_MASK;	\
+ 		__new |= (val) & ZCR_ELx_LEN_MASK;	\
+ 		if (__zcr != __new)			\
+ 			write_sysreg_s(__new, (reg));	\
+ 	} while (0)
+
  /*
   * Probing and setup functions.
   * Calls to these functions must be serialised with one another.
···
  
  static inline void sve_user_disable(void) { BUILD_BUG(); }
  static inline void sve_user_enable(void) { BUILD_BUG(); }
+
+ #define sve_cond_update_zcr_vq(val, reg) do { } while (0)
  
  static inline void sve_init_vq_map(void) { }
  static inline void sve_update_vq_map(void) { }
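The point of sve_cond_update_zcr_vq is to skip the system-register write entirely when the LEN field already holds the desired value, since ZCR writes are self-synchronizing and therefore costly. The same read-modify-write-if-changed pattern, sketched against a plain variable (the ZCR_ELx_LEN_MASK value of 0x1ff is an assumption for illustration):

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define ZCR_ELx_LEN_MASK 0x1ffULL	/* assumption: low 9 bits hold LEN */

/* Update the LEN field of a (simulated) ZCR register only if it would
 * change; return whether a write was performed. */
static bool cond_update_zcr_vq(uint64_t *zcr, uint64_t val)
{
	uint64_t new = (*zcr & ~ZCR_ELx_LEN_MASK) | (val & ZCR_ELx_LEN_MASK);

	if (new == *zcr)
		return false;	/* no write, no synchronization cost */
	*zcr = new;
	return true;
}
```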
+8 -2
arch/arm64/include/asm/fpsimdmacros.h
···
   * Author: Catalin Marinas <catalin.marinas@arm.com>
   */
  
+ #include <asm/assembler.h>
+
  .macro fpsimd_save state, tmpnr
  	stp	q0, q1, [\state, #16 * 0]
  	stp	q2, q3, [\state, #16 * 2]
···
  	str	w\nxtmp, [\xpfpsr, #4]
  .endm
  
- .macro sve_load nxbase, xpfpsr, xvqminus1, nxtmp, xtmp2
- 	sve_load_vq	\xvqminus1, x\nxtmp, \xtmp2
+ .macro __sve_load nxbase, xpfpsr, nxtmp
  _for n, 0, 31,	_sve_ldr_v	\n, \nxbase, \n - 34
  	_sve_ldr_p	0, \nxbase
  	_sve_wrffr	0
···
  	msr	fpsr, x\nxtmp
  	ldr	w\nxtmp, [\xpfpsr, #4]
  	msr	fpcr, x\nxtmp
+ .endm
+
+ .macro sve_load nxbase, xpfpsr, xvqminus1, nxtmp, xtmp2
+ 	sve_load_vq	\xvqminus1, x\nxtmp, \xtmp2
+ 	__sve_load	\nxbase, \xpfpsr, \nxtmp
  .endm
+7
arch/arm64/include/asm/hyp_image.h
···
  #define __HYP_CONCAT(a, b)	a ## b
  #define HYP_CONCAT(a, b)	__HYP_CONCAT(a, b)
  
+ #ifndef __KVM_NVHE_HYPERVISOR__
  /*
   * KVM nVHE code has its own symbol namespace prefixed with __kvm_nvhe_,
   * to separate it from the kernel proper.
   */
  #define kvm_nvhe_sym(sym)	__kvm_nvhe_##sym
+ #else
+ #define kvm_nvhe_sym(sym)	sym
+ #endif
  
  #ifdef LINKER_SCRIPT
···
   * KVM nVHE hyp code.
   */
  #define KVM_NVHE_ALIAS(sym)	kvm_nvhe_sym(sym) = sym;
+
+ /* Defines a linker script alias for KVM nVHE hyp symbols */
+ #define KVM_NVHE_ALIAS_HYP(first, sec)	kvm_nvhe_sym(first) = kvm_nvhe_sym(sec);
  
  #endif /* LINKER_SCRIPT */
  
+3
arch/arm64/include/asm/hypervisor.h
···
  
  #include <asm/xen/hypervisor.h>
  
+ void kvm_init_hyp_services(void);
+ bool kvm_arm_hyp_service_available(u32 func_id);
+
  #endif
+2
arch/arm64/include/asm/kvm_arm.h
···
  #define CPTR_EL2_DEFAULT	CPTR_EL2_RES1
  
  /* Hyp Debug Configuration Register bits */
+ #define MDCR_EL2_E2TB_MASK	(UL(0x3))
+ #define MDCR_EL2_E2TB_SHIFT	(UL(24))
  #define MDCR_EL2_TTRF		(1 << 19)
  #define MDCR_EL2_TPMS		(1 << 14)
  #define MDCR_EL2_E2PB_MASK	(UL(0x3))
+9
arch/arm64/include/asm/kvm_asm.h
···
  #define __KVM_HOST_SMCCC_FUNC___kvm_get_mdcr_el2		12
  #define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs		13
  #define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs		14
+ #define __KVM_HOST_SMCCC_FUNC___pkvm_init			15
+ #define __KVM_HOST_SMCCC_FUNC___pkvm_create_mappings		16
+ #define __KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping	17
+ #define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector		18
+ #define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize		19
+ #define __KVM_HOST_SMCCC_FUNC___pkvm_mark_hyp			20
  
  #ifndef __ASSEMBLY__
···
  	unsigned long tpidr_el2;
  	unsigned long stack_hyp_va;
  	phys_addr_t pgd_pa;
+ 	unsigned long hcr_el2;
+ 	unsigned long vttbr;
+ 	unsigned long vtcr;
  };
  
  /* Translate a kernel address @ptr into its equivalent linear mapping */
+28 -18
arch/arm64/include/asm/kvm_host.h
···
  	/* The last vcpu id that ran on each physical CPU */
  	int __percpu *last_vcpu_ran;
  
- 	struct kvm *kvm;
+ 	struct kvm_arch *arch;
  };
  
  struct kvm_arch_memory_slot {
···
  		struct kvm_guest_debug_arch regs;
  		/* Statistical profiling extension */
  		u64 pmscr_el1;
+ 		/* Self-hosted trace */
+ 		u64 trfcr_el1;
  	} host_debug_state;
  
  	/* VGIC state */
···
  };
  
  /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
- #define vcpu_sve_pffr(vcpu) ((void *)((char *)((vcpu)->arch.sve_state) +	\
- 			     sve_ffr_offset((vcpu)->arch.sve_max_vl)))
+ #define vcpu_sve_pffr(vcpu) (kern_hyp_va((vcpu)->arch.sve_state) +	\
+ 			     sve_ffr_offset((vcpu)->arch.sve_max_vl))
+
+ #define vcpu_sve_max_vq(vcpu)	sve_vq_from_vl((vcpu)->arch.sve_max_vl)
  
  #define vcpu_sve_state_size(vcpu) ({					\
···
  	if (WARN_ON(!sve_vl_valid((vcpu)->arch.sve_max_vl))) {		\
  		__size_ret = 0;						\
  	} else {							\
- 		__vcpu_vq = sve_vq_from_vl((vcpu)->arch.sve_max_vl);	\
+ 		__vcpu_vq = vcpu_sve_max_vq(vcpu);			\
  		__size_ret = SVE_SIG_REGS_SIZE(__vcpu_vq);		\
  	}								\
  									\
···
  #define KVM_ARM64_GUEST_HAS_PTRAUTH	(1 << 7) /* PTRAUTH exposed to guest */
  #define KVM_ARM64_PENDING_EXCEPTION	(1 << 8) /* Exception pending */
  #define KVM_ARM64_EXCEPT_MASK		(7 << 9) /* Target EL/MODE */
+ #define KVM_ARM64_DEBUG_STATE_SAVE_SPE	(1 << 12) /* Save SPE context if active */
+ #define KVM_ARM64_DEBUG_STATE_SAVE_TRBE	(1 << 13) /* Save TRBE context if active */
  
  #define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE | \
  				 KVM_GUESTDBG_USE_SW_BP | \
···
  void kvm_arm_halt_guest(struct kvm *kvm);
  void kvm_arm_resume_guest(struct kvm *kvm);
  
+ #ifndef __KVM_NVHE_HYPERVISOR__
  #define kvm_call_hyp_nvhe(f, ...)					\
  	({								\
  		struct arm_smccc_res res;				\
···
  									\
  		ret;							\
  	})
+ #else /* __KVM_NVHE_HYPERVISOR__ */
+ #define kvm_call_hyp(f, ...) f(__VA_ARGS__)
+ #define kvm_call_hyp_ret(f, ...) f(__VA_ARGS__)
+ #define kvm_call_hyp_nvhe(f, ...) f(__VA_ARGS__)
+ #endif /* __KVM_NVHE_HYPERVISOR__ */
  
  void force_vm_exit(const cpumask_t *mask);
- void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
  
  int handle_exit(struct kvm_vcpu *vcpu, int exception_index);
  void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index);
···
  	ctxt_sys_reg(cpu_ctxt, MPIDR_EL1) = read_cpuid_mpidr();
  }
  
- static inline bool kvm_arch_requires_vhe(void)
- {
- 	/*
- 	 * The Arm architecture specifies that implementation of SVE
- 	 * requires VHE also to be implemented. The KVM code for arm64
- 	 * relies on this when SVE is present:
- 	 */
- 	if (system_supports_sve())
- 		return true;
-
- 	return false;
- }
-
  void kvm_arm_vcpu_ptrauth_trap(struct kvm_vcpu *vcpu);
  
  static inline void kvm_arch_hardware_unsetup(void) {}
···
  static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {}
  
  void kvm_arm_init_debug(void);
+ void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu);
  void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
  void kvm_arm_clear_debug(struct kvm_vcpu *vcpu);
  void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu);
···
  {
  	return (!has_vhe() && attr->exclude_host);
  }
+
+ /* Flags for host debug state */
+ void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu);
+ void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu);
  
  #ifdef CONFIG_KVM /* Avoid conflicts with core headers if CONFIG_KVM=n */
  static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
···
  	(test_bit(KVM_ARM_VCPU_PMU_V3, (vcpu)->arch.features))
  
  int kvm_trng_call(struct kvm_vcpu *vcpu);
+ #ifdef CONFIG_KVM
+ extern phys_addr_t hyp_mem_base;
+ extern phys_addr_t hyp_mem_size;
+ void __init kvm_hyp_reserve(void);
+ #else
+ static inline void kvm_hyp_reserve(void) { }
+ #endif
  
  #endif /* __ARM64_KVM_HOST_H__ */
+13 -1
arch/arm64/include/asm/kvm_hyp.h
···
  
  void __fpsimd_save_state(struct user_fpsimd_state *fp_regs);
  void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs);
+ void __sve_save_state(void *sve_pffr, u32 *fpsr);
+ void __sve_restore_state(void *sve_pffr, u32 *fpsr);
  
  #ifndef __KVM_NVHE_HYPERVISOR__
  void activate_traps_vhe_load(struct kvm_vcpu *vcpu);
···
  
  bool kvm_host_psci_handler(struct kvm_cpu_context *host_ctxt);
  
- void __noreturn hyp_panic(void);
  #ifdef __KVM_NVHE_HYPERVISOR__
  void __noreturn __hyp_do_panic(struct kvm_cpu_context *host_ctxt, u64 spsr,
  			       u64 elr, u64 par);
  #endif
+
+ #ifdef __KVM_NVHE_HYPERVISOR__
+ void __pkvm_init_switch_pgd(phys_addr_t phys, unsigned long size,
+ 			    phys_addr_t pgd, void *sp, void *cont_fn);
+ int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus,
+ 		unsigned long *per_cpu_base, u32 hyp_va_bits);
+ void __noreturn __host_enter(struct kvm_cpu_context *host_ctxt);
+ #endif
+
+ extern u64 kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val);
+ extern u64 kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val);
  
  #endif /* __ARM64_KVM_HYP_H__ */
+22 -3
arch/arm64/include/asm/kvm_mmu.h
··· 121 121 void kvm_compute_layout(void); 122 122 void kvm_apply_hyp_relocations(void); 123 123 124 + #define __hyp_pa(x) (((phys_addr_t)(x)) + hyp_physvirt_offset) 125 + 124 126 static __always_inline unsigned long __kern_hyp_va(unsigned long v) 125 127 { 126 128 asm volatile(ALTERNATIVE_CB("and %0, %0, #1\n" ··· 168 166 169 167 phys_addr_t kvm_mmu_get_httbr(void); 170 168 phys_addr_t kvm_get_idmap_vector(void); 171 - int kvm_mmu_init(void); 169 + int kvm_mmu_init(u32 *hyp_va_bits); 170 + 171 + static inline void *__kvm_vector_slot2addr(void *base, 172 + enum arm64_hyp_spectre_vector slot) 173 + { 174 + int idx = slot - (slot != HYP_VECTOR_DIRECT); 175 + 176 + return base + (idx * SZ_2K); 177 + } 172 178 173 179 struct kvm; 174 180 ··· 272 262 * Must be called from hyp code running at EL2 with an updated VTTBR 273 263 * and interrupts disabled. 274 264 */ 275 - static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu) 265 + static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, unsigned long vtcr) 276 266 { 277 - write_sysreg(kern_hyp_va(mmu->kvm)->arch.vtcr, vtcr_el2); 267 + write_sysreg(vtcr, vtcr_el2); 278 268 write_sysreg(kvm_get_vttbr(mmu), vttbr_el2); 279 269 280 270 /* ··· 285 275 asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); 286 276 } 287 277 278 + static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu) 279 + { 280 + __load_stage2(mmu, kern_hyp_va(mmu->arch)->vtcr); 281 + } 282 + 283 + static inline struct kvm *kvm_s2_mmu_to_kvm(struct kvm_s2_mmu *mmu) 284 + { 285 + return container_of(mmu->arch, struct kvm, arch); 286 + } 288 287 #endif /* __ASSEMBLY__ */ 289 288 #endif /* __ARM64_KVM_MMU_H__ */
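The kvm_mmu.h hunk above introduces a shared `__kvm_vector_slot2addr()` helper for turning a Spectre vector slot into an address inside the vectors page (the arm.c hunk further down drops its private copy of the same computation). The indexing trick is compact: the first two slots share the first 2K bank, so the subtraction collapses them onto index 0. A userspace sketch, with enum values assumed to mirror the kernel's `enum arm64_hyp_spectre_vector`:

```c
#include <assert.h>
#include <stddef.h>

#define SZ_2K 0x800

/* Assumed to mirror enum arm64_hyp_spectre_vector */
enum hyp_spectre_vector {
	HYP_VECTOR_DIRECT,		/* 0: no mitigation */
	HYP_VECTOR_SPECTRE_DIRECT,	/* 1: shares the first 2K bank */
	HYP_VECTOR_INDIRECT,		/* 2 */
	HYP_VECTOR_SPECTRE_INDIRECT,	/* 3 */
};

static char vector_base[4 * SZ_2K];	/* stand-in for the vectors page */

/*
 * Re-implementation of __kvm_vector_slot2addr(): slots 0 and 1 both map
 * to index 0, slots 2 and 3 to indices 1 and 2, giving three 2K banks
 * for four logical slots.
 */
static void *vector_slot2addr(void *base, enum hyp_spectre_vector slot)
{
	int idx = slot - (slot != HYP_VECTOR_DIRECT);

	return (char *)base + (idx * SZ_2K);
}
```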
+148 -16
arch/arm64/include/asm/kvm_pgtable.h
··· 11 11 #include <linux/kvm_host.h> 12 12 #include <linux/types.h> 13 13 14 + #define KVM_PGTABLE_MAX_LEVELS 4U 15 + 16 + static inline u64 kvm_get_parange(u64 mmfr0) 17 + { 18 + u64 parange = cpuid_feature_extract_unsigned_field(mmfr0, 19 + ID_AA64MMFR0_PARANGE_SHIFT); 20 + if (parange > ID_AA64MMFR0_PARANGE_MAX) 21 + parange = ID_AA64MMFR0_PARANGE_MAX; 22 + 23 + return parange; 24 + } 25 + 14 26 typedef u64 kvm_pte_t; 27 + 28 + /** 29 + * struct kvm_pgtable_mm_ops - Memory management callbacks. 30 + * @zalloc_page: Allocate a single zeroed memory page. The @arg parameter 31 + * can be used by the walker to pass a memcache. The 32 + * initial refcount of the page is 1. 33 + * @zalloc_pages_exact: Allocate an exact number of zeroed memory pages. The 34 + * @size parameter is in bytes, and is rounded-up to the 35 + * next page boundary. The resulting allocation is 36 + * physically contiguous. 37 + * @free_pages_exact: Free an exact number of memory pages previously 38 + * allocated by zalloc_pages_exact. 39 + * @get_page: Increment the refcount on a page. 40 + * @put_page: Decrement the refcount on a page. When the refcount 41 + * reaches 0 the page is automatically freed. 42 + * @page_count: Return the refcount of a page. 43 + * @phys_to_virt: Convert a physical address into a virtual address mapped 44 + * in the current context. 45 + * @virt_to_phys: Convert a virtual address mapped in the current context 46 + * into a physical address. 47 + */ 48 + struct kvm_pgtable_mm_ops { 49 + void* (*zalloc_page)(void *arg); 50 + void* (*zalloc_pages_exact)(size_t size); 51 + void (*free_pages_exact)(void *addr, size_t size); 52 + void (*get_page)(void *addr); 53 + void (*put_page)(void *addr); 54 + int (*page_count)(void *addr); 55 + void* (*phys_to_virt)(phys_addr_t phys); 56 + phys_addr_t (*virt_to_phys)(void *addr); 57 + }; 58 + 59 + /** 60 + * enum kvm_pgtable_stage2_flags - Stage-2 page-table flags. 
61 + * @KVM_PGTABLE_S2_NOFWB: Don't enforce Normal-WB even if the CPUs have 62 + * ARM64_HAS_STAGE2_FWB. 63 + * @KVM_PGTABLE_S2_IDMAP: Only use identity mappings. 64 + */ 65 + enum kvm_pgtable_stage2_flags { 66 + KVM_PGTABLE_S2_NOFWB = BIT(0), 67 + KVM_PGTABLE_S2_IDMAP = BIT(1), 68 + }; 15 69 16 70 /** 17 71 * struct kvm_pgtable - KVM page-table. 18 72 * @ia_bits: Maximum input address size, in bits. 19 73 * @start_level: Level at which the page-table walk starts. 20 74 * @pgd: Pointer to the first top-level entry of the page-table. 75 + * @mm_ops: Memory management callbacks. 21 76 * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables. 22 77 */ 23 78 struct kvm_pgtable { 24 79 u32 ia_bits; 25 80 u32 start_level; 26 81 kvm_pte_t *pgd; 82 + struct kvm_pgtable_mm_ops *mm_ops; 27 83 28 84 /* Stage-2 only */ 29 85 struct kvm_s2_mmu *mmu; 86 + enum kvm_pgtable_stage2_flags flags; 30 87 }; 31 88 32 89 /** ··· 105 48 #define PAGE_HYP_EXEC (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X) 106 49 #define PAGE_HYP_RO (KVM_PGTABLE_PROT_R) 107 50 #define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE) 51 + 52 + /** 53 + * struct kvm_mem_range - Range of Intermediate Physical Addresses 54 + * @start: Start of the range. 55 + * @end: End of the range. 56 + */ 57 + struct kvm_mem_range { 58 + u64 start; 59 + u64 end; 60 + }; 108 61 109 62 /** 110 63 * enum kvm_pgtable_walk_flags - Flags to control a depth-first page-table walk. ··· 153 86 * kvm_pgtable_hyp_init() - Initialise a hypervisor stage-1 page-table. 154 87 * @pgt: Uninitialised page-table structure to initialise. 155 88 * @va_bits: Maximum virtual address bits. 89 + * @mm_ops: Memory management callbacks. 156 90 * 157 91 * Return: 0 on success, negative error code on failure. 
158 92 */ 159 - int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits); 93 + int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, 94 + struct kvm_pgtable_mm_ops *mm_ops); 160 95 161 96 /** 162 97 * kvm_pgtable_hyp_destroy() - Destroy an unused hypervisor stage-1 page-table. ··· 192 123 enum kvm_pgtable_prot prot); 193 124 194 125 /** 195 - * kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table. 126 + * kvm_get_vtcr() - Helper to construct VTCR_EL2 127 + * @mmfr0: Sanitized value of SYS_ID_AA64MMFR0_EL1 register. 128 + * @mmfr1: Sanitized value of SYS_ID_AA64MMFR1_EL1 register. 129 + * @phys_shift: Value to set in VTCR_EL2.T0SZ. 130 + * 131 + * The VTCR value is common across all the physical CPUs on the system. 132 + * We use system wide sanitised values to fill in different fields, 133 + * except for Hardware Management of Access Flags. HA Flag is set 134 + * unconditionally on all CPUs, as it is safe to run with or without 135 + * the feature and the bit is RES0 on CPUs that don't support it. 136 + * 137 + * Return: VTCR_EL2 value 138 + */ 139 + u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift); 140 + 141 + /** 142 + * kvm_pgtable_stage2_init_flags() - Initialise a guest stage-2 page-table. 196 143 * @pgt: Uninitialised page-table structure to initialise. 197 - * @kvm: KVM structure representing the guest virtual machine. 144 + * @arch: Arch-specific KVM structure representing the guest virtual 145 + * machine. 146 + * @mm_ops: Memory management callbacks. 147 + * @flags: Stage-2 configuration flags. 198 148 * 199 149 * Return: 0 on success, negative error code on failure. 
200 150 */ 201 - int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm); 151 + int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch, 152 + struct kvm_pgtable_mm_ops *mm_ops, 153 + enum kvm_pgtable_stage2_flags flags); 154 + 155 + #define kvm_pgtable_stage2_init(pgt, arch, mm_ops) \ 156 + kvm_pgtable_stage2_init_flags(pgt, arch, mm_ops, 0) 202 157 203 158 /** 204 159 * kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table. 205 - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). 160 + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). 206 161 * 207 162 * The page-table is assumed to be unreachable by any hardware walkers prior 208 163 * to freeing and therefore no TLB invalidation is performed. ··· 235 142 236 143 /** 237 144 * kvm_pgtable_stage2_map() - Install a mapping in a guest stage-2 page-table. 238 - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). 145 + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). 239 146 * @addr: Intermediate physical address at which to place the mapping. 240 147 * @size: Size of the mapping. 241 148 * @phys: Physical address of the memory to map. 242 149 * @prot: Permissions and attributes for the mapping. 243 - * @mc: Cache of pre-allocated GFP_PGTABLE_USER memory from which to 244 - * allocate page-table pages. 150 + * @mc: Cache of pre-allocated and zeroed memory from which to allocate 151 + * page-table pages. 
245 152 * 246 153 * The offset of @addr within a page is ignored, @size is rounded-up to 247 154 * the next page boundary and @phys is rounded-down to the previous page ··· 263 170 */ 264 171 int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, 265 172 u64 phys, enum kvm_pgtable_prot prot, 266 - struct kvm_mmu_memory_cache *mc); 173 + void *mc); 174 + 175 + /** 176 + * kvm_pgtable_stage2_set_owner() - Unmap and annotate pages in the IPA space to 177 + * track ownership. 178 + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). 179 + * @addr: Base intermediate physical address to annotate. 180 + * @size: Size of the annotated range. 181 + * @mc: Cache of pre-allocated and zeroed memory from which to allocate 182 + * page-table pages. 183 + * @owner_id: Unique identifier for the owner of the page. 184 + * 185 + * By default, all page-tables are owned by identifier 0. This function can be 186 + * used to mark portions of the IPA space as owned by other entities. When a 187 + * stage 2 is used with identity-mappings, these annotations allow to use the 188 + * page-table data structure as a simple rmap. 189 + * 190 + * Return: 0 on success, negative error code on failure. 191 + */ 192 + int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, 193 + void *mc, u8 owner_id); 267 194 268 195 /** 269 196 * kvm_pgtable_stage2_unmap() - Remove a mapping from a guest stage-2 page-table. 270 - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). 197 + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). 271 198 * @addr: Intermediate physical address from which to remove the mapping. 272 199 * @size: Size of the mapping. 273 200 * ··· 307 194 /** 308 195 * kvm_pgtable_stage2_wrprotect() - Write-protect guest stage-2 address range 309 196 * without TLB invalidation. 310 - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). 
197 + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). 311 198 * @addr: Intermediate physical address from which to write-protect, 312 199 * @size: Size of the range. 313 200 * ··· 324 211 325 212 /** 326 213 * kvm_pgtable_stage2_mkyoung() - Set the access flag in a page-table entry. 327 - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). 214 + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). 328 215 * @addr: Intermediate physical address to identify the page-table entry. 329 216 * 330 217 * The offset of @addr within a page is ignored. ··· 338 225 339 226 /** 340 227 * kvm_pgtable_stage2_mkold() - Clear the access flag in a page-table entry. 341 - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). 228 + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). 342 229 * @addr: Intermediate physical address to identify the page-table entry. 343 230 * 344 231 * The offset of @addr within a page is ignored. ··· 357 244 /** 358 245 * kvm_pgtable_stage2_relax_perms() - Relax the permissions enforced by a 359 246 * page-table entry. 360 - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). 247 + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). 361 248 * @addr: Intermediate physical address to identify the page-table entry. 362 249 * @prot: Additional permissions to grant for the mapping. 363 250 * ··· 376 263 /** 377 264 * kvm_pgtable_stage2_is_young() - Test whether a page-table entry has the 378 265 * access flag set. 379 - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). 266 + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). 380 267 * @addr: Intermediate physical address to identify the page-table entry. 381 268 * 382 269 * The offset of @addr within a page is ignored. 
··· 389 276 * kvm_pgtable_stage2_flush_range() - Clean and invalidate data cache to Point 390 277 * of Coherency for guest stage-2 address 391 278 * range. 392 - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init(). 279 + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). 393 280 * @addr: Intermediate physical address from which to flush. 394 281 * @size: Size of the range. 395 282 * ··· 424 311 int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, 425 312 struct kvm_pgtable_walker *walker); 426 313 314 + /** 315 + * kvm_pgtable_stage2_find_range() - Find a range of Intermediate Physical 316 + * Addresses with compatible permission 317 + * attributes. 318 + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). 319 + * @addr: Address that must be covered by the range. 320 + * @prot: Protection attributes that the range must be compatible with. 321 + * @range: Range structure used to limit the search space at call time and 322 + * that will hold the result. 323 + * 324 + * The offset of @addr within a page is ignored. An IPA is compatible with @prot 325 + * iff its corresponding stage-2 page-table entry has default ownership and, if 326 + * valid, is mapped with protection attributes identical to @prot. 327 + * 328 + * Return: 0 on success, negative error code on failure. 329 + */ 330 + int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr, 331 + enum kvm_pgtable_prot prot, 332 + struct kvm_mem_range *range); 427 333 #endif /* __ARM64_KVM_PGTABLE_H__ */
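The headline change in kvm_pgtable.h is that page-table memory management is now abstracted behind `struct kvm_pgtable_mm_ops`, so the same walker code can serve both the host and the protected-mode hypervisor, which use different allocators. Below is a minimal userspace sketch of that callback-table idea, with an invented header-based refcount purely for illustration (the kernel tracks page refcounts elsewhere, not like this):

```c
#include <assert.h>
#include <stdlib.h>

#define TOY_PAGE_SIZE 4096

/* Reduced version of struct kvm_pgtable_mm_ops, names as in the hunk above */
struct mm_ops {
	void *(*zalloc_page)(void *arg);
	void  (*get_page)(void *addr);
	void  (*put_page)(void *addr);
	int   (*page_count)(void *addr);
};

/* Invented bookkeeping: a refcount header placed in front of each page */
struct page_hdr { int refcount; };

static struct page_hdr *hdr(void *addr)
{
	return (struct page_hdr *)addr - 1;
}

static void *toy_zalloc_page(void *arg)
{
	struct page_hdr *p = calloc(1, sizeof(*p) + TOY_PAGE_SIZE);

	(void)arg;		/* the kernel passes a memcache here */
	p->refcount = 1;	/* initial refcount is 1, as documented */
	return p + 1;
}

static void toy_get_page(void *addr)
{
	hdr(addr)->refcount++;
}

static void toy_put_page(void *addr)
{
	/* the page is automatically freed when the refcount hits 0 */
	if (--hdr(addr)->refcount == 0)
		free(hdr(addr));
}

static int toy_page_count(void *addr)
{
	return hdr(addr)->refcount;
}

static const struct mm_ops toy_mm_ops = {
	.zalloc_page = toy_zalloc_page,
	.get_page    = toy_get_page,
	.put_page    = toy_put_page,
	.page_count  = toy_page_count,
};
```

The point of the indirection is that a `struct kvm_pgtable` carries its `mm_ops` with it, so walkers never call an allocator directly.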
+2 -2
arch/arm64/include/asm/pgtable-prot.h
··· 71 71 #define PAGE_KERNEL_EXEC __pgprot(PROT_NORMAL & ~PTE_PXN) 72 72 #define PAGE_KERNEL_EXEC_CONT __pgprot((PROT_NORMAL & ~PTE_PXN) | PTE_CONT) 73 73 74 - #define PAGE_S2_MEMATTR(attr) \ 74 + #define PAGE_S2_MEMATTR(attr, has_fwb) \ 75 75 ({ \ 76 76 u64 __val; \ 77 - if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) \ 77 + if (has_fwb) \ 78 78 __val = PTE_S2_MEMATTR(MT_S2_FWB_ ## attr); \ 79 79 else \ 80 80 __val = PTE_S2_MEMATTR(MT_S2_ ## attr); \
+1
arch/arm64/include/asm/sections.h
··· 13 13 extern char __hyp_text_start[], __hyp_text_end[]; 14 14 extern char __hyp_rodata_start[], __hyp_rodata_end[]; 15 15 extern char __hyp_reloc_begin[], __hyp_reloc_end[]; 16 + extern char __hyp_bss_start[], __hyp_bss_end[]; 16 17 extern char __idmap_text_start[], __idmap_text_end[]; 17 18 extern char __initdata_begin[], __initdata_end[]; 18 19 extern char __inittext_begin[], __inittext_end[];
+56 -3
arch/arm64/include/asm/sysreg.h
··· 283 283 #define SYS_PMSIRR_EL1_INTERVAL_MASK 0xffffffUL 284 284 285 285 /* Filtering controls */ 286 + #define SYS_PMSNEVFR_EL1 sys_reg(3, 0, 9, 9, 1) 287 + 286 288 #define SYS_PMSFCR_EL1 sys_reg(3, 0, 9, 9, 4) 287 289 #define SYS_PMSFCR_EL1_FE_SHIFT 0 288 290 #define SYS_PMSFCR_EL1_FT_SHIFT 1 ··· 334 332 #define SYS_PMBSR_EL1_BUF_BSC_FULL (0x1UL << SYS_PMBSR_EL1_BUF_BSC_SHIFT) 335 333 336 334 /*** End of Statistical Profiling Extension ***/ 335 + 336 + /* 337 + * TRBE Registers 338 + */ 339 + #define SYS_TRBLIMITR_EL1 sys_reg(3, 0, 9, 11, 0) 340 + #define SYS_TRBPTR_EL1 sys_reg(3, 0, 9, 11, 1) 341 + #define SYS_TRBBASER_EL1 sys_reg(3, 0, 9, 11, 2) 342 + #define SYS_TRBSR_EL1 sys_reg(3, 0, 9, 11, 3) 343 + #define SYS_TRBMAR_EL1 sys_reg(3, 0, 9, 11, 4) 344 + #define SYS_TRBTRG_EL1 sys_reg(3, 0, 9, 11, 6) 345 + #define SYS_TRBIDR_EL1 sys_reg(3, 0, 9, 11, 7) 346 + 347 + #define TRBLIMITR_LIMIT_MASK GENMASK_ULL(51, 0) 348 + #define TRBLIMITR_LIMIT_SHIFT 12 349 + #define TRBLIMITR_NVM BIT(5) 350 + #define TRBLIMITR_TRIG_MODE_MASK GENMASK(1, 0) 351 + #define TRBLIMITR_TRIG_MODE_SHIFT 3 352 + #define TRBLIMITR_FILL_MODE_MASK GENMASK(1, 0) 353 + #define TRBLIMITR_FILL_MODE_SHIFT 1 354 + #define TRBLIMITR_ENABLE BIT(0) 355 + #define TRBPTR_PTR_MASK GENMASK_ULL(63, 0) 356 + #define TRBPTR_PTR_SHIFT 0 357 + #define TRBBASER_BASE_MASK GENMASK_ULL(51, 0) 358 + #define TRBBASER_BASE_SHIFT 12 359 + #define TRBSR_EC_MASK GENMASK(5, 0) 360 + #define TRBSR_EC_SHIFT 26 361 + #define TRBSR_IRQ BIT(22) 362 + #define TRBSR_TRG BIT(21) 363 + #define TRBSR_WRAP BIT(20) 364 + #define TRBSR_ABORT BIT(18) 365 + #define TRBSR_STOP BIT(17) 366 + #define TRBSR_MSS_MASK GENMASK(15, 0) 367 + #define TRBSR_MSS_SHIFT 0 368 + #define TRBSR_BSC_MASK GENMASK(5, 0) 369 + #define TRBSR_BSC_SHIFT 0 370 + #define TRBSR_FSC_MASK GENMASK(5, 0) 371 + #define TRBSR_FSC_SHIFT 0 372 + #define TRBMAR_SHARE_MASK GENMASK(1, 0) 373 + #define TRBMAR_SHARE_SHIFT 8 374 + #define TRBMAR_OUTER_MASK GENMASK(3, 0) 375 
+ #define TRBMAR_OUTER_SHIFT 4 376 + #define TRBMAR_INNER_MASK GENMASK(3, 0) 377 + #define TRBMAR_INNER_SHIFT 0 378 + #define TRBTRG_TRG_MASK GENMASK(31, 0) 379 + #define TRBTRG_TRG_SHIFT 0 380 + #define TRBIDR_FLAG BIT(5) 381 + #define TRBIDR_PROG BIT(4) 382 + #define TRBIDR_ALIGN_MASK GENMASK(3, 0) 383 + #define TRBIDR_ALIGN_SHIFT 0 337 384 338 385 #define SYS_PMINTENSET_EL1 sys_reg(3, 0, 9, 14, 1) 339 386 #define SYS_PMINTENCLR_EL1 sys_reg(3, 0, 9, 14, 2) ··· 630 579 #define SCTLR_ELx_A (BIT(1)) 631 580 #define SCTLR_ELx_M (BIT(0)) 632 581 633 - #define SCTLR_ELx_FLAGS (SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | \ 634 - SCTLR_ELx_SA | SCTLR_ELx_I | SCTLR_ELx_IESB) 635 - 636 582 /* SCTLR_EL2 specific flags. */ 637 583 #define SCTLR_EL2_RES1 ((BIT(4)) | (BIT(5)) | (BIT(11)) | (BIT(16)) | \ 638 584 (BIT(18)) | (BIT(22)) | (BIT(23)) | (BIT(28)) | \ ··· 640 592 #else 641 593 #define ENDIAN_SET_EL2 0 642 594 #endif 595 + 596 + #define INIT_SCTLR_EL2_MMU_ON \ 597 + (SCTLR_ELx_M | SCTLR_ELx_C | SCTLR_ELx_SA | SCTLR_ELx_I | \ 598 + SCTLR_ELx_IESB | SCTLR_ELx_WXN | ENDIAN_SET_EL2 | SCTLR_EL2_RES1) 643 599 644 600 #define INIT_SCTLR_EL2_MMU_OFF \ 645 601 (SCTLR_EL2_RES1 | ENDIAN_SET_EL2) ··· 892 840 #define ID_AA64MMFR2_CNP_SHIFT 0 893 841 894 842 /* id_aa64dfr0 */ 843 + #define ID_AA64DFR0_TRBE_SHIFT 44 895 844 #define ID_AA64DFR0_TRACE_FILT_SHIFT 40 896 845 #define ID_AA64DFR0_DOUBLELOCK_SHIFT 36 897 846 #define ID_AA64DFR0_PMSVER_SHIFT 32
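Note the convention used by the new TRBE definitions: each `*_MASK` is defined at bit position zero (e.g. `TRBSR_EC_MASK` is `GENMASK(5, 0)` while `TRBSR_EC_SHIFT` is 26), so a field is extracted as `(reg >> SHIFT) & MASK`, not `(reg & MASK) >> SHIFT`. A small sketch, with `GENMASK_ULL()` expanded locally to stay self-contained:

```c
#include <assert.h>
#include <stdint.h>

/* Local expansion of the kernel's GENMASK_ULL() from <linux/bits.h> */
#define GENMASK_ULL(h, l) \
	((~0ULL << (l)) & (~0ULL >> (63 - (h))))

/* Pre-shifted masks, matching the sysreg.h hunk above */
#define TRBSR_EC_MASK		GENMASK_ULL(5, 0)
#define TRBSR_EC_SHIFT		26
#define TRBIDR_ALIGN_MASK	GENMASK_ULL(3, 0)
#define TRBIDR_ALIGN_SHIFT	0

/* Extract a field whose mask is defined at bit 0: shift first, then mask */
#define FIELD_AT0(reg, name) \
	(((reg) >> name##_SHIFT) & name##_MASK)

static uint64_t trbsr_ec(uint64_t trbsr)
{
	return FIELD_AT0(trbsr, TRBSR_EC);
}

static uint64_t trbidr_align(uint64_t trbidr)
{
	return FIELD_AT0(trbidr, TRBIDR_ALIGN);
}
```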
+5
arch/arm64/kernel/asm-offsets.c
··· 95 95 DEFINE(DMA_FROM_DEVICE, DMA_FROM_DEVICE); 96 96 BLANK(); 97 97 DEFINE(PREEMPT_DISABLE_OFFSET, PREEMPT_DISABLE_OFFSET); 98 + DEFINE(SOFTIRQ_SHIFT, SOFTIRQ_SHIFT); 99 + DEFINE(IRQ_CPUSTAT_SOFTIRQ_PENDING, offsetof(irq_cpustat_t, __softirq_pending)); 98 100 BLANK(); 99 101 DEFINE(CPU_BOOT_STACK, offsetof(struct secondary_data, stack)); 100 102 DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task)); ··· 122 120 DEFINE(NVHE_INIT_TPIDR_EL2, offsetof(struct kvm_nvhe_init_params, tpidr_el2)); 123 121 DEFINE(NVHE_INIT_STACK_HYP_VA, offsetof(struct kvm_nvhe_init_params, stack_hyp_va)); 124 122 DEFINE(NVHE_INIT_PGD_PA, offsetof(struct kvm_nvhe_init_params, pgd_pa)); 123 + DEFINE(NVHE_INIT_HCR_EL2, offsetof(struct kvm_nvhe_init_params, hcr_el2)); 124 + DEFINE(NVHE_INIT_VTTBR, offsetof(struct kvm_nvhe_init_params, vttbr)); 125 + DEFINE(NVHE_INIT_VTCR, offsetof(struct kvm_nvhe_init_params, vtcr)); 125 126 #endif 126 127 #ifdef CONFIG_CPU_PM 127 128 DEFINE(CPU_CTX_SP, offsetof(struct cpu_suspend_ctx, sp));
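asm-offsets.c exists because assembly (here, the nVHE init code that consumes `NVHE_INIT_VTTBR` and friends) cannot evaluate `offsetof()`; the build compiles this file and scrapes the `DEFINE()` constants out of the generated assembly. The mechanism boils down to `offsetof()`, sketched below with a stand-in struct whose field order is assumed from the `DEFINE()`s above, not the real `struct kvm_nvhe_init_params` layout:

```c
#include <assert.h>
#include <stddef.h>

/*
 * Stand-in for struct kvm_nvhe_init_params (the real layout lives in
 * asm/kvm_asm.h); only the offsetof() mechanism is illustrated here.
 */
struct nvhe_init_params {
	unsigned long tpidr_el2;
	unsigned long stack_hyp_va;
	unsigned long pgd_pa;
	unsigned long hcr_el2;
	unsigned long vttbr;
	unsigned long vtcr;
};

/* What DEFINE(NVHE_INIT_VTCR, offsetof(...)) ultimately evaluates to */
static size_t nvhe_init_vtcr_offset(void)
{
	return offsetof(struct nvhe_init_params, vtcr);
}
```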
+1 -4
arch/arm64/kernel/cpu-reset.S
··· 30 30 * flat identity mapping. 31 31 */ 32 32 SYM_CODE_START(__cpu_soft_restart) 33 - /* Clear sctlr_el1 flags. */ 34 - mrs x12, sctlr_el1 35 - mov_q x13, SCTLR_ELx_FLAGS 36 - bic x12, x12, x13 33 + mov_q x12, INIT_SCTLR_EL1_MMU_OFF 37 34 pre_disable_mmu_workaround 38 35 /* 39 36 * either disable EL1&0 translation regime or disable EL2&0 translation
+6 -4
arch/arm64/kernel/cpufeature.c
··· 808 808 reg->name, 809 809 ftrp->shift + ftrp->width - 1, 810 810 ftrp->shift, str, tmp); 811 + } else if ((ftr_mask & reg->override->val) == ftr_mask) { 812 + reg->override->val &= ~ftr_mask; 813 + pr_warn("%s[%d:%d]: impossible override, ignored\n", 814 + reg->name, 815 + ftrp->shift + ftrp->width - 1, 816 + ftrp->shift); 811 817 } 812 818 813 819 val = arm64_ftr_set_value(ftrp, val, ftr_new); ··· 1625 1619 } 1626 1620 #endif 1627 1621 1628 - #ifdef CONFIG_ARM64_VHE 1629 1622 static bool runs_at_el2(const struct arm64_cpu_capabilities *entry, int __unused) 1630 1623 { 1631 1624 return is_kernel_in_hyp_mode(); ··· 1643 1638 if (!alternative_is_applied(ARM64_HAS_VIRT_HOST_EXTN)) 1644 1639 write_sysreg(read_sysreg(tpidr_el1), tpidr_el2); 1645 1640 } 1646 - #endif 1647 1641 1648 1642 static void cpu_has_fwb(const struct arm64_cpu_capabilities *__unused) 1649 1643 { ··· 1845 1841 .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, 1846 1842 .matches = has_no_hw_prefetch, 1847 1843 }, 1848 - #ifdef CONFIG_ARM64_VHE 1849 1844 { 1850 1845 .desc = "Virtualization Host Extensions", 1851 1846 .capability = ARM64_HAS_VIRT_HOST_EXTN, ··· 1852 1849 .matches = runs_at_el2, 1853 1850 .cpu_enable = cpu_copy_el2regs, 1854 1851 }, 1855 - #endif /* CONFIG_ARM64_VHE */ 1856 1852 { 1857 1853 .desc = "32-bit EL0 Support", 1858 1854 .capability = ARM64_HAS_32BIT_EL0,
+2 -2
arch/arm64/kernel/fpsimd.c
··· 180 180 */ 181 181 static void get_cpu_fpsimd_context(void) 182 182 { 183 - preempt_disable(); 183 + local_bh_disable(); 184 184 __get_cpu_fpsimd_context(); 185 185 } 186 186 ··· 201 201 static void put_cpu_fpsimd_context(void) 202 202 { 203 203 __put_cpu_fpsimd_context(); 204 - preempt_enable(); 204 + local_bh_enable(); 205 205 } 206 206 207 207 static bool have_cpu_fpsimd_context(void)
+36 -3
arch/arm64/kernel/head.S
··· 477 477 * booted in EL1 or EL2 respectively. 478 478 */ 479 479 SYM_FUNC_START(init_kernel_el) 480 - mov_q x0, INIT_SCTLR_EL1_MMU_OFF 481 - msr sctlr_el1, x0 482 - 483 480 mrs x0, CurrentEL 484 481 cmp x0, #CurrentEL_EL2 485 482 b.eq init_el2 486 483 487 484 SYM_INNER_LABEL(init_el1, SYM_L_LOCAL) 485 + mov_q x0, INIT_SCTLR_EL1_MMU_OFF 486 + msr sctlr_el1, x0 488 487 isb 489 488 mov_q x0, INIT_PSTATE_EL1 490 489 msr spsr_el1, x0 ··· 503 504 msr vbar_el2, x0 504 505 isb 505 506 507 + /* 508 + * Fruity CPUs seem to have HCR_EL2.E2H set to RES1, 509 + * making it impossible to start in nVHE mode. Is that 510 + * compliant with the architecture? Absolutely not! 511 + */ 512 + mrs x0, hcr_el2 513 + and x0, x0, #HCR_E2H 514 + cbz x0, 1f 515 + 516 + /* Switching to VHE requires a sane SCTLR_EL1 as a start */ 517 + mov_q x0, INIT_SCTLR_EL1_MMU_OFF 518 + msr_s SYS_SCTLR_EL12, x0 519 + 520 + /* 521 + * Force an eret into a helper "function", and let it return 522 + * to our original caller... This makes sure that we have 523 + * initialised the basic PSTATE state. 524 + */ 525 + mov x0, #INIT_PSTATE_EL2 526 + msr spsr_el1, x0 527 + adr x0, __cpu_stick_to_vhe 528 + msr elr_el1, x0 529 + eret 530 + 531 + 1: 532 + mov_q x0, INIT_SCTLR_EL1_MMU_OFF 533 + msr sctlr_el1, x0 534 + 506 535 msr elr_el2, lr 507 536 mov w0, #BOOT_CPU_MODE_EL2 508 537 eret 538 + 539 + __cpu_stick_to_vhe: 540 + mov x0, #HVC_VHE_RESTART 541 + hvc #0 542 + mov x0, #BOOT_CPU_MODE_EL2 543 + ret 509 544 SYM_FUNC_END(init_kernel_el) 510 545 511 546 /*
+6 -7
arch/arm64/kernel/hyp-stub.S
··· 27 27 ventry el2_fiq_invalid // FIQ EL2t 28 28 ventry el2_error_invalid // Error EL2t 29 29 30 - ventry el2_sync_invalid // Synchronous EL2h 30 + ventry elx_sync // Synchronous EL2h 31 31 ventry el2_irq_invalid // IRQ EL2h 32 32 ventry el2_fiq_invalid // FIQ EL2h 33 33 ventry el2_error_invalid // Error EL2h 34 34 35 - ventry el1_sync // Synchronous 64-bit EL1 35 + ventry elx_sync // Synchronous 64-bit EL1 36 36 ventry el1_irq_invalid // IRQ 64-bit EL1 37 37 ventry el1_fiq_invalid // FIQ 64-bit EL1 38 38 ventry el1_error_invalid // Error 64-bit EL1 ··· 45 45 46 46 .align 11 47 47 48 - SYM_CODE_START_LOCAL(el1_sync) 48 + SYM_CODE_START_LOCAL(elx_sync) 49 49 cmp x0, #HVC_SET_VECTORS 50 50 b.ne 1f 51 51 msr vbar_el2, x1 ··· 71 71 72 72 9: mov x0, xzr 73 73 eret 74 - SYM_CODE_END(el1_sync) 74 + SYM_CODE_END(elx_sync) 75 75 76 76 // nVHE? No way! Give me the real thing! 77 77 SYM_CODE_START_LOCAL(mutate_to_vhe) ··· 115 115 mrs_s x0, SYS_VBAR_EL12 116 116 msr vbar_el1, x0 117 117 118 - // Use EL2 translations for SPE and disable access from EL1 118 + // Use EL2 translations for SPE & TRBE and disable access from EL1 119 119 mrs x0, mdcr_el2 120 120 bic x0, x0, #(MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT) 121 + bic x0, x0, #(MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT) 121 122 msr mdcr_el2, x0 122 123 123 124 // Transfer the MM state from EL1 to EL2 ··· 225 224 * Entry point to switch to VHE if deemed capable 226 225 */ 227 226 SYM_FUNC_START(switch_to_vhe) 228 - #ifdef CONFIG_ARM64_VHE 229 227 // Need to have booted at EL2 230 228 adr_l x1, __boot_cpu_mode 231 229 ldr w0, [x1] ··· 240 240 mov x0, #HVC_VHE_RESTART 241 241 hvc #0 242 242 1: 243 - #endif 244 243 ret 245 244 SYM_FUNC_END(switch_to_vhe)
+25 -1
arch/arm64/kernel/idreg-override.c
··· 25 25 struct { 26 26 char name[FTR_DESC_FIELD_LEN]; 27 27 u8 shift; 28 + bool (*filter)(u64 val); 28 29 } fields[]; 29 30 }; 31 + 32 + static bool __init mmfr1_vh_filter(u64 val) 33 + { 34 + /* 35 + * If we ever reach this point while running VHE, we're 36 + * guaranteed to be on one of these funky, VHE-stuck CPUs. If 37 + * the user was trying to force nVHE on us, proceed with 38 + * attitude adjustment. 39 + */ 40 + return !(is_kernel_in_hyp_mode() && val == 0); 41 + } 30 42 31 43 static const struct ftr_set_desc mmfr1 __initconst = { 32 44 .name = "id_aa64mmfr1", 33 45 .override = &id_aa64mmfr1_override, 34 46 .fields = { 35 - { "vh", ID_AA64MMFR1_VHE_SHIFT }, 47 + { "vh", ID_AA64MMFR1_VHE_SHIFT, mmfr1_vh_filter }, 36 48 {} 37 49 }, 38 50 }; ··· 135 123 136 124 if (find_field(cmdline, regs[i], f, &v)) 137 125 continue; 126 + 127 + /* 128 + * If an override gets filtered out, advertise 129 + * it by setting the value to 0xf, but 130 + * clearing the mask... Yes, this is fragile. 131 + */ 132 + if (regs[i]->fields[f].filter && 133 + !regs[i]->fields[f].filter(v)) { 134 + regs[i]->override->val |= mask; 135 + regs[i]->override->mask &= ~mask; 136 + continue; 137 + } 138 138 139 139 regs[i]->override->val &= ~mask; 140 140 regs[i]->override->val |= (v << shift) & mask;
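The filter dance in idreg-override.c is subtle: a rejected override sets the field's bits in `->val` but clears them in `->mask`, so the sanitised register value still wins while the bogus request stays visible (the 0xf value) for diagnostics. A sketch of the assumed val/mask semantics (the real consumer is the override handling in cpufeature.c):

```c
#include <assert.h>
#include <stdint.h>

/*
 * Assumed override semantics: bits set in @mask are taken from @val,
 * everything else from the sanitised value.
 */
static uint64_t apply_override(uint64_t sanitised, uint64_t val,
			       uint64_t mask)
{
	return (sanitised & ~mask) | (val & mask);
}

/*
 * A filtered-out override, as in the hunk above: advertise the value by
 * setting its bits in val, but clear the mask so it has no effect.
 */
static void filter_out(uint64_t *val, uint64_t *mask, uint64_t field_mask)
{
	*val |= field_mask;
	*mask &= ~field_mask;
}
```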
+32 -2
arch/arm64/kernel/image-vars.h
··· 65 65 KVM_NVHE_ALIAS(kvm_patch_vector_branch); 66 66 KVM_NVHE_ALIAS(kvm_update_va_mask); 67 67 KVM_NVHE_ALIAS(kvm_get_kimage_voffset); 68 + KVM_NVHE_ALIAS(kvm_compute_final_ctr_el0); 68 69 69 70 /* Global kernel state accessed by nVHE hyp code. */ 70 71 KVM_NVHE_ALIAS(kvm_vgic_global_state); 71 72 72 73 /* Kernel symbols used to call panic() from nVHE hyp code (via ERET). */ 73 - KVM_NVHE_ALIAS(__hyp_panic_string); 74 - KVM_NVHE_ALIAS(panic); 74 + KVM_NVHE_ALIAS(nvhe_hyp_panic_handler); 75 75 76 76 /* Vectors installed by hyp-init on reset HVC. */ 77 77 KVM_NVHE_ALIAS(__hyp_stub_vectors); ··· 103 103 104 104 /* PMU available static key */ 105 105 KVM_NVHE_ALIAS(kvm_arm_pmu_available); 106 + 107 + /* Position-independent library routines */ 108 + KVM_NVHE_ALIAS_HYP(clear_page, __pi_clear_page); 109 + KVM_NVHE_ALIAS_HYP(copy_page, __pi_copy_page); 110 + KVM_NVHE_ALIAS_HYP(memcpy, __pi_memcpy); 111 + KVM_NVHE_ALIAS_HYP(memset, __pi_memset); 112 + 113 + #ifdef CONFIG_KASAN 114 + KVM_NVHE_ALIAS_HYP(__memcpy, __pi_memcpy); 115 + KVM_NVHE_ALIAS_HYP(__memset, __pi_memset); 116 + #endif 117 + 118 + /* Kernel memory sections */ 119 + KVM_NVHE_ALIAS(__start_rodata); 120 + KVM_NVHE_ALIAS(__end_rodata); 121 + KVM_NVHE_ALIAS(__bss_start); 122 + KVM_NVHE_ALIAS(__bss_stop); 123 + 124 + /* Hyp memory sections */ 125 + KVM_NVHE_ALIAS(__hyp_idmap_text_start); 126 + KVM_NVHE_ALIAS(__hyp_idmap_text_end); 127 + KVM_NVHE_ALIAS(__hyp_text_start); 128 + KVM_NVHE_ALIAS(__hyp_text_end); 129 + KVM_NVHE_ALIAS(__hyp_bss_start); 130 + KVM_NVHE_ALIAS(__hyp_bss_end); 131 + KVM_NVHE_ALIAS(__hyp_rodata_start); 132 + KVM_NVHE_ALIAS(__hyp_rodata_end); 133 + 134 + /* pKVM static key */ 135 + KVM_NVHE_ALIAS(kvm_protected_mode_initialized); 106 136 107 137 #endif /* CONFIG_KVM */ 108 138
+43 -31
arch/arm64/kernel/vmlinux.lds.S
··· 5 5 * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz> 6 6 */ 7 7 8 - #define RO_EXCEPTION_TABLE_ALIGN 8 9 - #define RUNTIME_DISCARD_EXIT 10 - 11 - #include <asm-generic/vmlinux.lds.h> 12 - #include <asm/cache.h> 13 8 #include <asm/hyp_image.h> 14 - #include <asm/kernel-pgtable.h> 15 - #include <asm/memory.h> 16 - #include <asm/page.h> 17 - 18 - #include "image.h" 19 - 20 - OUTPUT_ARCH(aarch64) 21 - ENTRY(_text) 22 - 23 - jiffies = jiffies_64; 24 - 25 - 26 9 #ifdef CONFIG_KVM 27 10 #define HYPERVISOR_EXTABLE \ 28 11 . = ALIGN(SZ_8); \ ··· 15 32 16 33 #define HYPERVISOR_DATA_SECTIONS \ 17 34 HYP_SECTION_NAME(.rodata) : { \ 35 + . = ALIGN(PAGE_SIZE); \ 18 36 __hyp_rodata_start = .; \ 19 37 *(HYP_SECTION_NAME(.data..ro_after_init)) \ 20 38 *(HYP_SECTION_NAME(.rodata)) \ 39 + . = ALIGN(PAGE_SIZE); \ 21 40 __hyp_rodata_end = .; \ 22 41 } 23 42 ··· 36 51 __hyp_reloc_end = .; \ 37 52 } 38 53 54 + #define BSS_FIRST_SECTIONS \ 55 + __hyp_bss_start = .; \ 56 + *(HYP_SECTION_NAME(.bss)) \ 57 + . = ALIGN(PAGE_SIZE); \ 58 + __hyp_bss_end = .; 59 + 60 + /* 61 + * We require that __hyp_bss_start and __bss_start are aligned, and enforce it 62 + * with an assertion. But the BSS_SECTION macro places an empty .sbss section 63 + * between them, which can in some cases cause the linker to misalign them. To 64 + * work around the issue, force a page alignment for __bss_start. 
65 + */ 66 + #define SBSS_ALIGN PAGE_SIZE 39 67 #else /* CONFIG_KVM */ 40 68 #define HYPERVISOR_EXTABLE 41 69 #define HYPERVISOR_DATA_SECTIONS 42 70 #define HYPERVISOR_PERCPU_SECTION 43 71 #define HYPERVISOR_RELOC_SECTION 72 + #define SBSS_ALIGN 0 44 73 #endif 45 74 75 + #define RO_EXCEPTION_TABLE_ALIGN 8 76 + #define RUNTIME_DISCARD_EXIT 77 + 78 + #include <asm-generic/vmlinux.lds.h> 79 + #include <asm/cache.h> 80 + #include <asm/kernel-pgtable.h> 81 + #include <asm/memory.h> 82 + #include <asm/page.h> 83 + 84 + #include "image.h" 85 + 86 + OUTPUT_ARCH(aarch64) 87 + ENTRY(_text) 88 + 89 + jiffies = jiffies_64; 90 + 46 91 #define HYPERVISOR_TEXT \ 47 - /* \ 48 - * Align to 4 KB so that \ 49 - * a) the HYP vector table is at its minimum \ 50 - * alignment of 2048 bytes \ 51 - * b) the HYP init code will not cross a page \ 52 - * boundary if its size does not exceed \ 53 - * 4 KB (see related ASSERT() below) \ 54 - */ \ 55 - . = ALIGN(SZ_4K); \ 92 + . = ALIGN(PAGE_SIZE); \ 56 93 __hyp_idmap_text_start = .; \ 57 94 *(.hyp.idmap.text) \ 58 95 __hyp_idmap_text_end = .; \ 59 96 __hyp_text_start = .; \ 60 97 *(.hyp.text) \ 61 98 HYPERVISOR_EXTABLE \ 99 + . = ALIGN(PAGE_SIZE); \ 62 100 __hyp_text_end = .; 63 101 64 102 #define IDMAP_TEXT \ ··· 284 276 __pecoff_data_rawsize = ABSOLUTE(. - __initdata_begin); 285 277 _edata = .; 286 278 287 - BSS_SECTION(0, 0, 0) 279 + BSS_SECTION(SBSS_ALIGN, 0, 0) 288 280 289 281 . = ALIGN(PAGE_SIZE); 290 282 init_pg_dir = .; ··· 317 309 #include "image-vars.h" 318 310 319 311 /* 320 - * The HYP init code and ID map text can't be longer than a page each, 321 - * and should not cross a page boundary. 312 + * The HYP init code and ID map text can't be longer than a page each. The 313 + * former is page-aligned, but the latter may not be with 16K or 64K pages, so 314 + * it should also not cross a page boundary. 
322 315 */ 323 - ASSERT(__hyp_idmap_text_end - (__hyp_idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K, 324 - "HYP init code too big or misaligned") 316 + ASSERT(__hyp_idmap_text_end - __hyp_idmap_text_start <= PAGE_SIZE, 317 + "HYP init code too big") 325 318 ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K, 326 319 "ID map text too big or misaligned") 327 320 #ifdef CONFIG_HIBERNATION ··· 332 323 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 333 324 ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) == PAGE_SIZE, 334 325 "Entry trampoline text too big") 326 + #endif 327 + #ifdef CONFIG_KVM 328 + ASSERT(__hyp_bss_start == __bss_start, "HYP and Host BSS are misaligned") 335 329 #endif 336 330 /* 337 331 * If padding is applied before .head.text, virt<->phys conversions will fail.
+184 -32
arch/arm64/kvm/arm.c
··· 206 206 case KVM_CAP_ARM_INJECT_EXT_DABT: 207 207 case KVM_CAP_SET_GUEST_DEBUG: 208 208 case KVM_CAP_VCPU_ATTRIBUTES: 209 + case KVM_CAP_PTP_KVM: 209 210 r = 1; 210 211 break; 211 212 case KVM_CAP_SET_GUEST_DEBUG2: ··· 419 418 420 419 if (vcpu_has_ptrauth(vcpu)) 421 420 vcpu_ptrauth_disable(vcpu); 421 + kvm_arch_vcpu_load_debug_state_flags(vcpu); 422 422 } 423 423 424 424 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 425 425 { 426 + kvm_arch_vcpu_put_debug_state_flags(vcpu); 426 427 kvm_arch_vcpu_put_fp(vcpu); 427 428 if (has_vhe()) 428 429 kvm_vcpu_put_sysregs_vhe(vcpu); ··· 584 581 return -EPERM; 585 582 586 583 vcpu->arch.has_run_once = true; 584 + 585 + kvm_arm_vcpu_init_debug(vcpu); 587 586 588 587 if (likely(irqchip_in_kernel(kvm))) { 589 588 /* ··· 1357 1352 /* A lookup table holding the hypervisor VA for each vector slot */ 1358 1353 static void *hyp_spectre_vector_selector[BP_HARDEN_EL2_SLOTS]; 1359 1354 1360 - static int __kvm_vector_slot2idx(enum arm64_hyp_spectre_vector slot) 1361 - { 1362 - return slot - (slot != HYP_VECTOR_DIRECT); 1363 - } 1364 - 1365 1355 static void kvm_init_vector_slot(void *base, enum arm64_hyp_spectre_vector slot) 1366 1356 { 1367 - int idx = __kvm_vector_slot2idx(slot); 1368 - 1369 - hyp_spectre_vector_selector[slot] = base + (idx * SZ_2K); 1357 + hyp_spectre_vector_selector[slot] = __kvm_vector_slot2addr(base, slot); 1370 1358 } 1371 1359 1372 1360 static int kvm_init_vector_slots(void) ··· 1388 1390 return 0; 1389 1391 } 1390 1392 1391 - static void cpu_init_hyp_mode(void) 1393 + static void cpu_prepare_hyp_mode(int cpu) 1392 1394 { 1393 - struct kvm_nvhe_init_params *params = this_cpu_ptr_nvhe_sym(kvm_init_params); 1394 - struct arm_smccc_res res; 1395 + struct kvm_nvhe_init_params *params = per_cpu_ptr_nvhe_sym(kvm_init_params, cpu); 1395 1396 unsigned long tcr; 1396 - 1397 - /* Switch from the HYP stub to our own HYP init vector */ 1398 - __hyp_set_vectors(kvm_get_idmap_vector()); 1399 1397 1400 1398 /* 1401 1399 * 
Calculate the raw per-cpu offset without a translation from the ··· 1399 1405 * so that we can use adr_l to access per-cpu variables in EL2. 1400 1406 * Also drop the KASAN tag which gets in the way... 1401 1407 */ 1402 - params->tpidr_el2 = (unsigned long)kasan_reset_tag(this_cpu_ptr_nvhe_sym(__per_cpu_start)) - 1408 + params->tpidr_el2 = (unsigned long)kasan_reset_tag(per_cpu_ptr_nvhe_sym(__per_cpu_start, cpu)) - 1403 1409 (unsigned long)kvm_ksym_ref(CHOOSE_NVHE_SYM(__per_cpu_start)); 1404 1410 1405 1411 params->mair_el2 = read_sysreg(mair_el1); ··· 1423 1429 tcr |= (idmap_t0sz & GENMASK(TCR_TxSZ_WIDTH - 1, 0)) << TCR_T0SZ_OFFSET; 1424 1430 params->tcr_el2 = tcr; 1425 1431 1426 - params->stack_hyp_va = kern_hyp_va(__this_cpu_read(kvm_arm_hyp_stack_page) + PAGE_SIZE); 1432 + params->stack_hyp_va = kern_hyp_va(per_cpu(kvm_arm_hyp_stack_page, cpu) + PAGE_SIZE); 1427 1433 params->pgd_pa = kvm_mmu_get_httbr(); 1434 + if (is_protected_kvm_enabled()) 1435 + params->hcr_el2 = HCR_HOST_NVHE_PROTECTED_FLAGS; 1436 + else 1437 + params->hcr_el2 = HCR_HOST_NVHE_FLAGS; 1438 + params->vttbr = params->vtcr = 0; 1428 1439 1429 1440 /* 1430 1441 * Flush the init params from the data cache because the struct will 1431 1442 * be read while the MMU is off. 1432 1443 */ 1433 1444 kvm_flush_dcache_to_poc(params, sizeof(*params)); 1445 + } 1446 + 1447 + static void hyp_install_host_vector(void) 1448 + { 1449 + struct kvm_nvhe_init_params *params; 1450 + struct arm_smccc_res res; 1451 + 1452 + /* Switch from the HYP stub to our own HYP init vector */ 1453 + __hyp_set_vectors(kvm_get_idmap_vector()); 1434 1454 1435 1455 /* 1436 1456 * Call initialization code, and switch to the full blown HYP code. ··· 1453 1445 * cpus_have_const_cap() wrapper. 
1454 1446 */ 1455 1447 BUG_ON(!system_capabilities_finalized()); 1448 + params = this_cpu_ptr_nvhe_sym(kvm_init_params); 1456 1449 arm_smccc_1_1_hvc(KVM_HOST_SMCCC_FUNC(__kvm_hyp_init), virt_to_phys(params), &res); 1457 1450 WARN_ON(res.a0 != SMCCC_RET_SUCCESS); 1451 + } 1452 + 1453 + static void cpu_init_hyp_mode(void) 1454 + { 1455 + hyp_install_host_vector(); 1458 1456 1459 1457 /* 1460 1458 * Disabling SSBD on a non-VHE system requires us to enable SSBS ··· 1503 1489 struct bp_hardening_data *data = this_cpu_ptr(&bp_hardening_data); 1504 1490 void *vector = hyp_spectre_vector_selector[data->slot]; 1505 1491 1506 - *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector; 1492 + if (!is_protected_kvm_enabled()) 1493 + *this_cpu_ptr_hyp_sym(kvm_hyp_vector) = (unsigned long)vector; 1494 + else 1495 + kvm_call_hyp_nvhe(__pkvm_cpu_set_vector, data->slot); 1507 1496 } 1508 1497 1509 1498 static void cpu_hyp_reinit(void) ··· 1514 1497 kvm_init_host_cpu_context(&this_cpu_ptr_hyp_sym(kvm_host_data)->host_ctxt); 1515 1498 1516 1499 cpu_hyp_reset(); 1517 - cpu_set_hyp_vector(); 1518 1500 1519 1501 if (is_kernel_in_hyp_mode()) 1520 1502 kvm_timer_init_vhe(); 1521 1503 else 1522 1504 cpu_init_hyp_mode(); 1505 + 1506 + cpu_set_hyp_vector(); 1523 1507 1524 1508 kvm_arm_init_debug(); 1525 1509 ··· 1717 1699 } 1718 1700 } 1719 1701 1702 + static int do_pkvm_init(u32 hyp_va_bits) 1703 + { 1704 + void *per_cpu_base = kvm_ksym_ref(kvm_arm_hyp_percpu_base); 1705 + int ret; 1706 + 1707 + preempt_disable(); 1708 + hyp_install_host_vector(); 1709 + ret = kvm_call_hyp_nvhe(__pkvm_init, hyp_mem_base, hyp_mem_size, 1710 + num_possible_cpus(), kern_hyp_va(per_cpu_base), 1711 + hyp_va_bits); 1712 + preempt_enable(); 1713 + 1714 + return ret; 1715 + } 1716 + 1717 + static int kvm_hyp_init_protection(u32 hyp_va_bits) 1718 + { 1719 + void *addr = phys_to_virt(hyp_mem_base); 1720 + int ret; 1721 + 1722 + kvm_nvhe_sym(id_aa64mmfr0_el1_sys_val) = 
read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); 1723 + kvm_nvhe_sym(id_aa64mmfr1_el1_sys_val) = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); 1724 + 1725 + ret = create_hyp_mappings(addr, addr + hyp_mem_size, PAGE_HYP); 1726 + if (ret) 1727 + return ret; 1728 + 1729 + ret = do_pkvm_init(hyp_va_bits); 1730 + if (ret) 1731 + return ret; 1732 + 1733 + free_hyp_pgds(); 1734 + 1735 + return 0; 1736 + } 1737 + 1720 1738 /** 1721 1739 * Inits Hyp-mode on all online CPUs 1722 1740 */ 1723 1741 static int init_hyp_mode(void) 1724 1742 { 1743 + u32 hyp_va_bits; 1725 1744 int cpu; 1726 - int err = 0; 1745 + int err = -ENOMEM; 1746 + 1747 + /* 1748 + * The protected Hyp-mode cannot be initialized if the memory pool 1749 + * allocation has failed. 1750 + */ 1751 + if (is_protected_kvm_enabled() && !hyp_mem_base) 1752 + goto out_err; 1727 1753 1728 1754 /* 1729 1755 * Allocate Hyp PGD and setup Hyp identity mapping 1730 1756 */ 1731 - err = kvm_mmu_init(); 1757 + err = kvm_mmu_init(&hyp_va_bits); 1732 1758 if (err) 1733 1759 goto out_err; 1734 1760 ··· 1833 1771 goto out_err; 1834 1772 } 1835 1773 1836 - err = create_hyp_mappings(kvm_ksym_ref(__bss_start), 1774 + /* 1775 + * .hyp.bss is guaranteed to be placed at the beginning of the .bss 1776 + * section thanks to an assertion in the linker script. Map it RW and 1777 + * the rest of .bss RO. 
1778 + */ 1779 + err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_start), 1780 + kvm_ksym_ref(__hyp_bss_end), PAGE_HYP); 1781 + if (err) { 1782 + kvm_err("Cannot map hyp bss section: %d\n", err); 1783 + goto out_err; 1784 + } 1785 + 1786 + err = create_hyp_mappings(kvm_ksym_ref(__hyp_bss_end), 1837 1787 kvm_ksym_ref(__bss_stop), PAGE_HYP_RO); 1838 1788 if (err) { 1839 1789 kvm_err("Cannot map bss section\n"); ··· 1866 1792 } 1867 1793 } 1868 1794 1869 - /* 1870 - * Map Hyp percpu pages 1871 - */ 1872 1795 for_each_possible_cpu(cpu) { 1873 1796 char *percpu_begin = (char *)kvm_arm_hyp_percpu_base[cpu]; 1874 1797 char *percpu_end = percpu_begin + nvhe_percpu_size(); 1875 1798 1799 + /* Map Hyp percpu pages */ 1876 1800 err = create_hyp_mappings(percpu_begin, percpu_end, PAGE_HYP); 1877 - 1878 1801 if (err) { 1879 1802 kvm_err("Cannot map hyp percpu region\n"); 1880 1803 goto out_err; 1881 1804 } 1805 + 1806 + /* Prepare the CPU initialization parameters */ 1807 + cpu_prepare_hyp_mode(cpu); 1882 1808 } 1883 1809 1884 1810 if (is_protected_kvm_enabled()) { 1885 1811 init_cpu_logical_map(); 1886 1812 1887 - if (!init_psci_relay()) 1813 + if (!init_psci_relay()) { 1814 + err = -ENODEV; 1888 1815 goto out_err; 1816 + } 1817 + } 1818 + 1819 + if (is_protected_kvm_enabled()) { 1820 + err = kvm_hyp_init_protection(hyp_va_bits); 1821 + if (err) { 1822 + kvm_err("Failed to init hyp memory protection\n"); 1823 + goto out_err; 1824 + } 1889 1825 } 1890 1826 1891 1827 return 0; ··· 1904 1820 teardown_hyp_mode(); 1905 1821 kvm_err("error initializing Hyp mode: %d\n", err); 1906 1822 return err; 1823 + } 1824 + 1825 + static void _kvm_host_prot_finalize(void *discard) 1826 + { 1827 + WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize)); 1828 + } 1829 + 1830 + static inline int pkvm_mark_hyp(phys_addr_t start, phys_addr_t end) 1831 + { 1832 + return kvm_call_hyp_nvhe(__pkvm_mark_hyp, start, end); 1833 + } 1834 + 1835 + #define pkvm_mark_hyp_section(__section) \ 1836 + 
pkvm_mark_hyp(__pa_symbol(__section##_start), \ 1837 + __pa_symbol(__section##_end)) 1838 + 1839 + static int finalize_hyp_mode(void) 1840 + { 1841 + int cpu, ret; 1842 + 1843 + if (!is_protected_kvm_enabled()) 1844 + return 0; 1845 + 1846 + ret = pkvm_mark_hyp_section(__hyp_idmap_text); 1847 + if (ret) 1848 + return ret; 1849 + 1850 + ret = pkvm_mark_hyp_section(__hyp_text); 1851 + if (ret) 1852 + return ret; 1853 + 1854 + ret = pkvm_mark_hyp_section(__hyp_rodata); 1855 + if (ret) 1856 + return ret; 1857 + 1858 + ret = pkvm_mark_hyp_section(__hyp_bss); 1859 + if (ret) 1860 + return ret; 1861 + 1862 + ret = pkvm_mark_hyp(hyp_mem_base, hyp_mem_base + hyp_mem_size); 1863 + if (ret) 1864 + return ret; 1865 + 1866 + for_each_possible_cpu(cpu) { 1867 + phys_addr_t start = virt_to_phys((void *)kvm_arm_hyp_percpu_base[cpu]); 1868 + phys_addr_t end = start + (PAGE_SIZE << nvhe_percpu_order()); 1869 + 1870 + ret = pkvm_mark_hyp(start, end); 1871 + if (ret) 1872 + return ret; 1873 + 1874 + start = virt_to_phys((void *)per_cpu(kvm_arm_hyp_stack_page, cpu)); 1875 + end = start + PAGE_SIZE; 1876 + ret = pkvm_mark_hyp(start, end); 1877 + if (ret) 1878 + return ret; 1879 + } 1880 + 1881 + /* 1882 + * Flip the static key upfront as that may no longer be possible 1883 + * once the host stage 2 is installed. 
1884 + */ 1885 + static_branch_enable(&kvm_protected_mode_initialized); 1886 + on_each_cpu(_kvm_host_prot_finalize, NULL, 1); 1887 + 1888 + return 0; 1907 1889 } 1908 1890 1909 1891 static void check_kvm_target_cpu(void *ret) ··· 2046 1896 2047 1897 in_hyp_mode = is_kernel_in_hyp_mode(); 2048 1898 2049 - if (!in_hyp_mode && kvm_arch_requires_vhe()) { 2050 - kvm_pr_unimpl("CPU unsupported in non-VHE mode, not initializing\n"); 2051 - return -ENODEV; 2052 - } 2053 - 2054 1899 if (cpus_have_final_cap(ARM64_WORKAROUND_DEVICE_LOAD_ACQUIRE) || 2055 1900 cpus_have_final_cap(ARM64_WORKAROUND_1508412)) 2056 1901 kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \ ··· 2083 1938 if (err) 2084 1939 goto out_hyp; 2085 1940 1941 + if (!in_hyp_mode) { 1942 + err = finalize_hyp_mode(); 1943 + if (err) { 1944 + kvm_err("Failed to finalize Hyp protection\n"); 1945 + goto out_hyp; 1946 + } 1947 + } 1948 + 2086 1949 if (is_protected_kvm_enabled()) { 2087 - static_branch_enable(&kvm_protected_mode_initialized); 2088 1950 kvm_info("Protected nVHE mode initialized successfully\n"); 2089 1951 } else if (in_hyp_mode) { 2090 1952 kvm_info("VHE mode initialized successfully\n");
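The per-cpu loop in `finalize_hyp_mode()` marks two physical ranges per CPU: the percpu region, whose size is `PAGE_SIZE << nvhe_percpu_order()`, and the single hyp stack page. A minimal sketch of that range arithmetic (helper names hypothetical, page size assumed 4 KB):

```c
#include <assert.h>

#define PAGE_SIZE_ 4096ULL

/* End of a 2^order-page percpu region starting at 'start'. */
static unsigned long long percpu_region_end(unsigned long long start,
					    unsigned int order)
{
	return start + (PAGE_SIZE_ << order);
}

/* The hyp stack is exactly one page. */
static unsigned long long stack_region_end(unsigned long long start)
{
	return start + PAGE_SIZE_;
}
```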
+90 -28
arch/arm64/kvm/debug.c
··· 69 69 } 70 70 71 71 /** 72 + * kvm_arm_setup_mdcr_el2 - configure vcpu mdcr_el2 value 73 + * 74 + * @vcpu: the vcpu pointer 75 + * 76 + * This ensures we will trap access to: 77 + * - Performance monitors (MDCR_EL2_TPM/MDCR_EL2_TPMCR) 78 + * - Debug ROM Address (MDCR_EL2_TDRA) 79 + * - OS related registers (MDCR_EL2_TDOSA) 80 + * - Statistical profiler (MDCR_EL2_TPMS/MDCR_EL2_E2PB) 81 + * - Self-hosted Trace Filter controls (MDCR_EL2_TTRF) 82 + * - Self-hosted Trace (MDCR_EL2_TTRF/MDCR_EL2_E2TB) 83 + */ 84 + static void kvm_arm_setup_mdcr_el2(struct kvm_vcpu *vcpu) 85 + { 86 + /* 87 + * This also clears MDCR_EL2_E2PB_MASK and MDCR_EL2_E2TB_MASK 88 + * to disable guest access to the profiling and trace buffers 89 + */ 90 + vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK; 91 + vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM | 92 + MDCR_EL2_TPMS | 93 + MDCR_EL2_TTRF | 94 + MDCR_EL2_TPMCR | 95 + MDCR_EL2_TDRA | 96 + MDCR_EL2_TDOSA); 97 + 98 + /* Is the VM being debugged by userspace? */ 99 + if (vcpu->guest_debug) 100 + /* Route all software debug exceptions to EL2 */ 101 + vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE; 102 + 103 + /* 104 + * Trap debug register access when one of the following is true: 105 + * - Userspace is using the hardware to debug the guest 106 + * (KVM_GUESTDBG_USE_HW is set). 107 + * - The guest is not using debug (KVM_ARM64_DEBUG_DIRTY is clear). 108 + */ 109 + if ((vcpu->guest_debug & KVM_GUESTDBG_USE_HW) || 110 + !(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY)) 111 + vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA; 112 + 113 + trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2); 114 + } 115 + 116 + /** 117 + * kvm_arm_vcpu_init_debug - setup vcpu debug traps 118 + * 119 + * @vcpu: the vcpu pointer 120 + * 121 + * Set vcpu initial mdcr_el2 value. 
122 + */ 123 + void kvm_arm_vcpu_init_debug(struct kvm_vcpu *vcpu) 124 + { 125 + preempt_disable(); 126 + kvm_arm_setup_mdcr_el2(vcpu); 127 + preempt_enable(); 128 + } 129 + 130 + /** 72 131 * kvm_arm_reset_debug_ptr - reset the debug ptr to point to the vcpu state 73 132 */ 74 133 ··· 142 83 * @vcpu: the vcpu pointer 143 84 * 144 85 * This is called before each entry into the hypervisor to setup any 145 - * debug related registers. Currently this just ensures we will trap 146 - * access to: 147 - * - Performance monitors (MDCR_EL2_TPM/MDCR_EL2_TPMCR) 148 - * - Debug ROM Address (MDCR_EL2_TDRA) 149 - * - OS related registers (MDCR_EL2_TDOSA) 150 - * - Statistical profiler (MDCR_EL2_TPMS/MDCR_EL2_E2PB) 151 - * - Self-hosted Trace Filter controls (MDCR_EL2_TTRF) 86 + * debug related registers. 152 87 * 153 88 * Additionally, KVM only traps guest accesses to the debug registers if 154 89 * the guest is not actively using them (see the KVM_ARM64_DEBUG_DIRTY ··· 154 101 155 102 void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) 156 103 { 157 - bool trap_debug = !(vcpu->arch.flags & KVM_ARM64_DEBUG_DIRTY); 158 104 unsigned long mdscr, orig_mdcr_el2 = vcpu->arch.mdcr_el2; 159 105 160 106 trace_kvm_arm_setup_debug(vcpu, vcpu->guest_debug); 161 107 162 - /* 163 - * This also clears MDCR_EL2_E2PB_MASK to disable guest access 164 - * to the profiling buffer. 165 - */ 166 - vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK; 167 - vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM | 168 - MDCR_EL2_TPMS | 169 - MDCR_EL2_TTRF | 170 - MDCR_EL2_TPMCR | 171 - MDCR_EL2_TDRA | 172 - MDCR_EL2_TDOSA); 108 + kvm_arm_setup_mdcr_el2(vcpu); 173 109 174 110 /* Is Guest debugging in effect? 
*/ 175 111 if (vcpu->guest_debug) { 176 - /* Route all software debug exceptions to EL2 */ 177 - vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE; 178 - 179 112 /* Save guest debug state */ 180 113 save_guest_debug_regs(vcpu); 181 114 ··· 215 176 216 177 vcpu->arch.debug_ptr = &vcpu->arch.external_debug_state; 217 178 vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY; 218 - trap_debug = true; 219 179 220 180 trace_kvm_arm_set_regset("BKPTS", get_num_brps(), 221 181 &vcpu->arch.debug_ptr->dbg_bcr[0], ··· 229 191 BUG_ON(!vcpu->guest_debug && 230 192 vcpu->arch.debug_ptr != &vcpu->arch.vcpu_debug_state); 231 193 232 - /* Trap debug register access */ 233 - if (trap_debug) 234 - vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA; 235 - 236 194 /* If KDE or MDE are set, perform a full save/restore cycle. */ 237 195 if (vcpu_read_sys_reg(vcpu, MDSCR_EL1) & (DBG_MDSCR_KDE | DBG_MDSCR_MDE)) 238 196 vcpu->arch.flags |= KVM_ARM64_DEBUG_DIRTY; ··· 237 203 if (has_vhe() && orig_mdcr_el2 != vcpu->arch.mdcr_el2) 238 204 write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); 239 205 240 - trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2); 241 206 trace_kvm_arm_set_dreg32("MDSCR_EL1", vcpu_read_sys_reg(vcpu, MDSCR_EL1)); 242 207 } 243 208 ··· 263 230 &vcpu->arch.debug_ptr->dbg_wvr[0]); 264 231 } 265 232 } 233 + } 234 + 235 + void kvm_arch_vcpu_load_debug_state_flags(struct kvm_vcpu *vcpu) 236 + { 237 + u64 dfr0; 238 + 239 + /* For VHE, there is nothing to do */ 240 + if (has_vhe()) 241 + return; 242 + 243 + dfr0 = read_sysreg(id_aa64dfr0_el1); 244 + /* 245 + * If SPE is present on this CPU and is available at current EL, 246 + * we may need to check if the host state needs to be saved. 
247 + */ 248 + if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_PMSVER_SHIFT) && 249 + !(read_sysreg_s(SYS_PMBIDR_EL1) & BIT(SYS_PMBIDR_EL1_P_SHIFT))) 250 + vcpu->arch.flags |= KVM_ARM64_DEBUG_STATE_SAVE_SPE; 251 + 252 + /* Check if we have TRBE implemented and available at the host */ 253 + if (cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_TRBE_SHIFT) && 254 + !(read_sysreg_s(SYS_TRBIDR_EL1) & TRBIDR_PROG)) 255 + vcpu->arch.flags |= KVM_ARM64_DEBUG_STATE_SAVE_TRBE; 256 + } 257 + 258 + void kvm_arch_vcpu_put_debug_state_flags(struct kvm_vcpu *vcpu) 259 + { 260 + vcpu->arch.flags &= ~(KVM_ARM64_DEBUG_STATE_SAVE_SPE | 261 + KVM_ARM64_DEBUG_STATE_SAVE_TRBE); 266 262 }
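The SPE/TRBE presence checks above read 4-bit fields out of `ID_AA64DFR0_EL1` via `cpuid_feature_extract_unsigned_field()`. A sketch of the unsigned-field case (the real kernel helper also handles signed fields; the function name here is illustrative):

```c
#include <assert.h>
#include <stdint.h>

/* Extract a 4-bit unsigned ID-register field starting at 'shift'. */
static unsigned int extract_unsigned_field(uint64_t reg, unsigned int shift)
{
	return (reg >> shift) & 0xf;
}
```

A nonzero field value means the feature is implemented; the code above additionally checks the `PMBIDR_EL1.P` / `TRBIDR_EL1` programming bits before deciding the host state may need saving.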
+22 -4
arch/arm64/kvm/fpsimd.c
··· 11 11 #include <linux/kvm_host.h> 12 12 #include <asm/fpsimd.h> 13 13 #include <asm/kvm_asm.h> 14 + #include <asm/kvm_hyp.h> 14 15 #include <asm/kvm_mmu.h> 15 16 #include <asm/sysreg.h> 16 17 ··· 42 41 ret = create_hyp_mappings(fpsimd, fpsimd + 1, PAGE_HYP); 43 42 if (ret) 44 43 goto error; 44 + 45 + if (vcpu->arch.sve_state) { 46 + void *sve_end; 47 + 48 + sve_end = vcpu->arch.sve_state + vcpu_sve_state_size(vcpu); 49 + 50 + ret = create_hyp_mappings(vcpu->arch.sve_state, sve_end, 51 + PAGE_HYP); 52 + if (ret) 53 + goto error; 54 + } 45 55 46 56 vcpu->arch.host_thread_info = kern_hyp_va(ti); 47 57 vcpu->arch.host_fpsimd_state = kern_hyp_va(fpsimd); ··· 121 109 local_irq_save(flags); 122 110 123 111 if (vcpu->arch.flags & KVM_ARM64_FP_ENABLED) { 124 - fpsimd_save_and_flush_cpu_state(); 112 + if (guest_has_sve) { 113 + __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_el1(SYS_ZCR); 125 114 126 - if (guest_has_sve) 127 - __vcpu_sys_reg(vcpu, ZCR_EL1) = read_sysreg_s(SYS_ZCR_EL12); 128 - } else if (host_has_sve) { 115 + /* Restore the VL that was saved when bound to the CPU */ 116 + if (!has_vhe()) 117 + sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, 118 + SYS_ZCR_EL1); 119 + } 120 + 121 + fpsimd_save_and_flush_cpu_state(); 122 + } else if (has_vhe() && host_has_sve) { 129 123 /* 130 124 * The FPSIMD/SVE state in the CPU has not been touched, and we 131 125 * have SVE (and VHE): CPACR_EL1 (alias CPTR_EL2) has been
+3 -3
arch/arm64/kvm/guest.c
··· 299 299 300 300 memset(vqs, 0, sizeof(vqs)); 301 301 302 - max_vq = sve_vq_from_vl(vcpu->arch.sve_max_vl); 302 + max_vq = vcpu_sve_max_vq(vcpu); 303 303 for (vq = SVE_VQ_MIN; vq <= max_vq; ++vq) 304 304 if (sve_vq_available(vq)) 305 305 vqs[vq_word(vq)] |= vq_mask(vq); ··· 427 427 if (!vcpu_has_sve(vcpu) || (reg->id & SVE_REG_SLICE_MASK) > 0) 428 428 return -ENOENT; 429 429 430 - vq = sve_vq_from_vl(vcpu->arch.sve_max_vl); 430 + vq = vcpu_sve_max_vq(vcpu); 431 431 432 432 reqoffset = SVE_SIG_ZREG_OFFSET(vq, reg_num) - 433 433 SVE_SIG_REGS_OFFSET; ··· 437 437 if (!vcpu_has_sve(vcpu) || (reg->id & SVE_REG_SLICE_MASK) > 0) 438 438 return -ENOENT; 439 439 440 - vq = sve_vq_from_vl(vcpu->arch.sve_max_vl); 440 + vq = vcpu_sve_max_vq(vcpu); 441 441 442 442 reqoffset = SVE_SIG_PREG_OFFSET(vq, reg_num) - 443 443 SVE_SIG_REGS_OFFSET;
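The `vcpu_sve_max_vq()` conversions above rest on simple architectural arithmetic: a "VQ" is the vector length in 128-bit quadwords, so VQ = VL / 16 with VL in bytes, `ZCR.LEN` holds VQ - 1, and the per-VQ register slice sizes follow. A hedged sketch (trailing underscore marks the illustrative helper):

```c
#include <assert.h>

#define SVE_VQ_BYTES 16u	/* one quadword = 128 bits */

static unsigned int sve_vq_from_vl_(unsigned int vl_bytes)
{
	return vl_bytes / SVE_VQ_BYTES;
}

static unsigned int zcr_len(unsigned int vq)   { return vq - 1; }  /* ZCR.LEN */
static unsigned int zreg_size(unsigned int vq) { return vq * 16; } /* Z reg, bytes */
static unsigned int preg_size(unsigned int vq) { return vq * 2; }  /* P reg, bytes */
```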
+45
arch/arm64/kvm/handle_exit.c
··· 291 291 if (exception_index == ARM_EXCEPTION_EL1_SERROR) 292 292 kvm_handle_guest_serror(vcpu, kvm_vcpu_get_esr(vcpu)); 293 293 } 294 + 295 + void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr, 296 + u64 par, uintptr_t vcpu, 297 + u64 far, u64 hpfar) { 298 + u64 elr_in_kimg = __phys_to_kimg(__hyp_pa(elr)); 299 + u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr; 300 + u64 mode = spsr & PSR_MODE_MASK; 301 + 302 + /* 303 + * The nVHE hyp symbols are not included by kallsyms to avoid issues 304 + * with aliasing. That means that the symbols cannot be printed with the 305 + * "%pS" format specifier, so fall back to the vmlinux address if 306 + * there's no better option. 307 + */ 308 + if (mode != PSR_MODE_EL2t && mode != PSR_MODE_EL2h) { 309 + kvm_err("Invalid host exception to nVHE hyp!\n"); 310 + } else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 && 311 + (esr & ESR_ELx_BRK64_ISS_COMMENT_MASK) == BUG_BRK_IMM) { 312 + struct bug_entry *bug = find_bug(elr_in_kimg); 313 + const char *file = NULL; 314 + unsigned int line = 0; 315 + 316 + /* All hyp bugs, including warnings, are treated as fatal. */ 317 + if (bug) 318 + bug_get_file_line(bug, &file, &line); 319 + 320 + if (file) 321 + kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line); 322 + else 323 + kvm_err("nVHE hyp BUG at: %016llx!\n", elr + hyp_offset); 324 + } else { 325 + kvm_err("nVHE hyp panic at: %016llx!\n", elr + hyp_offset); 326 + } 327 + 328 + /* 329 + * Hyp has panicked and we're going to handle that by panicking the 330 + * kernel. The kernel offset will be revealed in the panic so we're 331 + * also safe to reveal the hyp offset as a debugging aid for translating 332 + * hyp VAs to vmlinux addresses. 333 + */ 334 + kvm_err("Hyp Offset: 0x%llx\n", hyp_offset); 335 + 336 + panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%016lx\n", 337 + spsr, elr, esr, far, hpfar, par, vcpu); 338 + }
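The "Hyp Offset" printed by the new panic handler is constructed so that adding it to a hyp ELR yields the KASLR-independent vmlinux address: `hyp_offset = elr_in_kimg - kaslr_offset() - elr`, hence `elr + hyp_offset == elr_in_kimg - kaslr_offset()`. A sketch of that arithmetic (all values hypothetical):

```c
#include <assert.h>
#include <stdint.h>

static uint64_t hyp_offset(uint64_t elr_in_kimg, uint64_t kaslr_off, uint64_t elr)
{
	return elr_in_kimg - kaslr_off - elr;
}
```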
+1 -1
arch/arm64/kvm/hyp/Makefile
··· 10 10 -DDISABLE_BRANCH_PROFILING \ 11 11 $(DISABLE_STACKLEAK_PLUGIN) 12 12 13 - obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o 13 + obj-$(CONFIG_KVM) += vhe/ nvhe/ pgtable.o reserved_mem.o
+10
arch/arm64/kvm/hyp/fpsimd.S
··· 19 19 fpsimd_restore x0, 1 20 20 ret 21 21 SYM_FUNC_END(__fpsimd_restore_state) 22 + 23 + SYM_FUNC_START(__sve_restore_state) 24 + __sve_load 0, x1, 2 25 + ret 26 + SYM_FUNC_END(__sve_restore_state) 27 + 28 + SYM_FUNC_START(__sve_save_state) 29 + sve_save 0, x1, 2 30 + ret 31 + SYM_FUNC_END(__sve_save_state)
+54 -53
arch/arm64/kvm/hyp/include/hyp/switch.h
··· 30 30 #include <asm/processor.h> 31 31 #include <asm/thread_info.h> 32 32 33 - extern const char __hyp_panic_string[]; 34 - 35 33 extern struct exception_table_entry __start___kvm_ex_table; 36 34 extern struct exception_table_entry __stop___kvm_ex_table; 37 35 ··· 158 160 return true; 159 161 } 160 162 161 - static inline bool __populate_fault_info(struct kvm_vcpu *vcpu) 163 + static inline bool __get_fault_info(u64 esr, struct kvm_vcpu_fault_info *fault) 162 164 { 163 - u8 ec; 164 - u64 esr; 165 165 u64 hpfar, far; 166 - 167 - esr = vcpu->arch.fault.esr_el2; 168 - ec = ESR_ELx_EC(esr); 169 - 170 - if (ec != ESR_ELx_EC_DABT_LOW && ec != ESR_ELx_EC_IABT_LOW) 171 - return true; 172 166 173 167 far = read_sysreg_el2(SYS_FAR); 174 168 ··· 184 194 hpfar = read_sysreg(hpfar_el2); 185 195 } 186 196 187 - vcpu->arch.fault.far_el2 = far; 188 - vcpu->arch.fault.hpfar_el2 = hpfar; 197 + fault->far_el2 = far; 198 + fault->hpfar_el2 = hpfar; 189 199 return true; 200 + } 201 + 202 + static inline bool __populate_fault_info(struct kvm_vcpu *vcpu) 203 + { 204 + u8 ec; 205 + u64 esr; 206 + 207 + esr = vcpu->arch.fault.esr_el2; 208 + ec = ESR_ELx_EC(esr); 209 + 210 + if (ec != ESR_ELx_EC_DABT_LOW && ec != ESR_ELx_EC_IABT_LOW) 211 + return true; 212 + 213 + return __get_fault_info(esr, &vcpu->arch.fault); 214 + } 215 + 216 + static inline void __hyp_sve_save_host(struct kvm_vcpu *vcpu) 217 + { 218 + struct thread_struct *thread; 219 + 220 + thread = container_of(vcpu->arch.host_fpsimd_state, struct thread_struct, 221 + uw.fpsimd_state); 222 + 223 + __sve_save_state(sve_pffr(thread), &vcpu->arch.host_fpsimd_state->fpsr); 224 + } 225 + 226 + static inline void __hyp_sve_restore_guest(struct kvm_vcpu *vcpu) 227 + { 228 + sve_cond_update_zcr_vq(vcpu_sve_max_vq(vcpu) - 1, SYS_ZCR_EL2); 229 + __sve_restore_state(vcpu_sve_pffr(vcpu), 230 + &vcpu->arch.ctxt.fp_regs.fpsr); 231 + write_sysreg_el1(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR); 190 232 } 191 233 192 234 /* Check for an FPSIMD/SVE 
trap and handle as appropriate */ 193 235 static inline bool __hyp_handle_fpsimd(struct kvm_vcpu *vcpu) 194 236 { 195 - bool vhe, sve_guest, sve_host; 237 + bool sve_guest, sve_host; 196 238 u8 esr_ec; 239 + u64 reg; 197 240 198 241 if (!system_supports_fpsimd()) 199 242 return false; 200 243 201 - /* 202 - * Currently system_supports_sve() currently implies has_vhe(), 203 - * so the check is redundant. However, has_vhe() can be determined 204 - * statically and helps the compiler remove dead code. 205 - */ 206 - if (has_vhe() && system_supports_sve()) { 244 + if (system_supports_sve()) { 207 245 sve_guest = vcpu_has_sve(vcpu); 208 246 sve_host = vcpu->arch.flags & KVM_ARM64_HOST_SVE_IN_USE; 209 - vhe = true; 210 247 } else { 211 248 sve_guest = false; 212 249 sve_host = false; 213 - vhe = has_vhe(); 214 250 } 215 251 216 252 esr_ec = kvm_vcpu_trap_get_class(vcpu); ··· 245 229 return false; 246 230 247 231 /* Don't handle SVE traps for non-SVE vcpus here: */ 248 - if (!sve_guest) 249 - if (esr_ec != ESR_ELx_EC_FP_ASIMD) 250 - return false; 232 + if (!sve_guest && esr_ec != ESR_ELx_EC_FP_ASIMD) 233 + return false; 251 234 252 235 /* Valid trap. Switch the context: */ 253 - 254 - if (vhe) { 255 - u64 reg = read_sysreg(cpacr_el1) | CPACR_EL1_FPEN; 256 - 236 + if (has_vhe()) { 237 + reg = CPACR_EL1_FPEN; 257 238 if (sve_guest) 258 239 reg |= CPACR_EL1_ZEN; 259 240 260 - write_sysreg(reg, cpacr_el1); 241 + sysreg_clear_set(cpacr_el1, 0, reg); 261 242 } else { 262 - write_sysreg(read_sysreg(cptr_el2) & ~(u64)CPTR_EL2_TFP, 263 - cptr_el2); 264 - } 243 + reg = CPTR_EL2_TFP; 244 + if (sve_guest) 245 + reg |= CPTR_EL2_TZ; 265 246 247 + sysreg_clear_set(cptr_el2, reg, 0); 248 + } 266 249 isb(); 267 250 268 251 if (vcpu->arch.flags & KVM_ARM64_FP_HOST) { 269 - /* 270 - * In the SVE case, VHE is assumed: it is enforced by 271 - * Kconfig and kvm_arch_init(). 
272 - */ 273 - if (sve_host) { 274 - struct thread_struct *thread = container_of( 275 - vcpu->arch.host_fpsimd_state, 276 - struct thread_struct, uw.fpsimd_state); 277 - 278 - sve_save_state(sve_pffr(thread), 279 - &vcpu->arch.host_fpsimd_state->fpsr); 280 - } else { 252 + if (sve_host) 253 + __hyp_sve_save_host(vcpu); 254 + else 281 255 __fpsimd_save_state(vcpu->arch.host_fpsimd_state); 282 - } 283 256 284 257 vcpu->arch.flags &= ~KVM_ARM64_FP_HOST; 285 258 } 286 259 287 - if (sve_guest) { 288 - sve_load_state(vcpu_sve_pffr(vcpu), 289 - &vcpu->arch.ctxt.fp_regs.fpsr, 290 - sve_vq_from_vl(vcpu->arch.sve_max_vl) - 1); 291 - write_sysreg_s(__vcpu_sys_reg(vcpu, ZCR_EL1), SYS_ZCR_EL12); 292 - } else { 260 + if (sve_guest) 261 + __hyp_sve_restore_guest(vcpu); 262 + else 293 263 __fpsimd_restore_state(&vcpu->arch.ctxt.fp_regs); 294 - } 295 264 296 265 /* Skip restoring fpexc32 for AArch64 guests */ 297 266 if (!(read_sysreg(hcr_el2) & HCR_RW))
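The trap-handling path above now uses `sysreg_clear_set()` rather than separate read/modify/write sequences: for VHE it sets `CPACR_EL1_FPEN` (plus `ZEN` for SVE guests), while for nVHE it clears `CPTR_EL2_TFP` (plus `TZ`). The helper's semantics reduce to one read-modify-write; a sketch:

```c
#include <assert.h>
#include <stdint.h>

/* Shape of sysreg_clear_set(): clear 'clr' bits, then set 'set' bits. */
static uint64_t clear_set(uint64_t old, uint64_t clr, uint64_t set)
{
	return (old & ~clr) | set;
}
```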
+14
arch/arm64/kvm/hyp/include/nvhe/early_alloc.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + #ifndef __KVM_HYP_EARLY_ALLOC_H 3 + #define __KVM_HYP_EARLY_ALLOC_H 4 + 5 + #include <asm/kvm_pgtable.h> 6 + 7 + void hyp_early_alloc_init(void *virt, unsigned long size); 8 + unsigned long hyp_early_alloc_nr_used_pages(void); 9 + void *hyp_early_alloc_page(void *arg); 10 + void *hyp_early_alloc_contig(unsigned int nr_pages); 11 + 12 + extern struct kvm_pgtable_mm_ops hyp_early_alloc_mm_ops; 13 + 14 + #endif /* __KVM_HYP_EARLY_ALLOC_H */
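The early allocator declared above is, in spirit, a page-granular bump allocator: it hands out zeroed pages from a fixed region and has no free path (hence "used pages cannot be freed" later in the gfp header). A purely illustrative standalone model, not the EL2 implementation:

```c
#include <assert.h>
#include <stdint.h>
#include <string.h>

#define EPAGE 4096ul

static uintptr_t ea_base, ea_cur, ea_end;

static void ea_init(void *virt, unsigned long size)
{
	ea_base = ea_cur = (uintptr_t)virt;
	ea_end = ea_base + size;
}

static void *ea_alloc_page(void)
{
	if (ea_cur + EPAGE > ea_end)
		return NULL;	/* pool exhausted */
	void *p = (void *)ea_cur;
	memset(p, 0, EPAGE);	/* pages are handed out zeroed */
	ea_cur += EPAGE;
	return p;
}

static unsigned long ea_used_pages(void)
{
	return (ea_cur - ea_base) / EPAGE;
}

/* Demo: carve a 4-page backing store, allocate two pages. */
static unsigned long ea_demo(void)
{
	static char backing[4 * EPAGE];
	ea_init(backing, sizeof(backing));
	ea_alloc_page();
	ea_alloc_page();
	return ea_used_pages();
}
```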
+68
arch/arm64/kvm/hyp/include/nvhe/gfp.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + #ifndef __KVM_HYP_GFP_H 3 + #define __KVM_HYP_GFP_H 4 + 5 + #include <linux/list.h> 6 + 7 + #include <nvhe/memory.h> 8 + #include <nvhe/spinlock.h> 9 + 10 + #define HYP_NO_ORDER UINT_MAX 11 + 12 + struct hyp_pool { 13 + /* 14 + * Spinlock protecting concurrent changes to the memory pool as well as 15 + * the struct hyp_page of the pool's pages until we have a proper atomic 16 + * API at EL2. 17 + */ 18 + hyp_spinlock_t lock; 19 + struct list_head free_area[MAX_ORDER]; 20 + phys_addr_t range_start; 21 + phys_addr_t range_end; 22 + unsigned int max_order; 23 + }; 24 + 25 + static inline void hyp_page_ref_inc(struct hyp_page *p) 26 + { 27 + struct hyp_pool *pool = hyp_page_to_pool(p); 28 + 29 + hyp_spin_lock(&pool->lock); 30 + p->refcount++; 31 + hyp_spin_unlock(&pool->lock); 32 + } 33 + 34 + static inline int hyp_page_ref_dec_and_test(struct hyp_page *p) 35 + { 36 + struct hyp_pool *pool = hyp_page_to_pool(p); 37 + int ret; 38 + 39 + hyp_spin_lock(&pool->lock); 40 + p->refcount--; 41 + ret = (p->refcount == 0); 42 + hyp_spin_unlock(&pool->lock); 43 + 44 + return ret; 45 + } 46 + 47 + static inline void hyp_set_page_refcounted(struct hyp_page *p) 48 + { 49 + struct hyp_pool *pool = hyp_page_to_pool(p); 50 + 51 + hyp_spin_lock(&pool->lock); 52 + if (p->refcount) { 53 + hyp_spin_unlock(&pool->lock); 54 + BUG(); 55 + } 56 + p->refcount = 1; 57 + hyp_spin_unlock(&pool->lock); 58 + } 59 + 60 + /* Allocation */ 61 + void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int order); 62 + void hyp_get_page(void *addr); 63 + void hyp_put_page(void *addr); 64 + 65 + /* Used pages cannot be freed */ 66 + int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages, 67 + unsigned int reserved_pages); 68 + #endif /* __KVM_HYP_GFP_H */
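The refcount helpers above take the pool spinlock because EL2 has no proper atomic API yet; the underlying get/put semantics are the usual "dec-and-test returns true on the last reference". A lock-elided sketch of just those semantics:

```c
#include <assert.h>

struct page_ref { unsigned int refcount; };

static void ref_inc(struct page_ref *p)
{
	p->refcount++;
}

/* Returns nonzero when the last reference was dropped. */
static int ref_dec_and_test(struct page_ref *p)
{
	p->refcount--;
	return p->refcount == 0;
}

/* Demo: 1 -> 2 -> 1 (not last) -> 0 (last). Returns 1 on expected behaviour. */
static int ref_demo(void)
{
	struct page_ref p = { .refcount = 1 };

	ref_inc(&p);
	int last = ref_dec_and_test(&p);
	return !last && ref_dec_and_test(&p);
}
```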
+36
arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + /* 3 + * Copyright (C) 2020 Google LLC 4 + * Author: Quentin Perret <qperret@google.com> 5 + */ 6 + 7 + #ifndef __KVM_NVHE_MEM_PROTECT__ 8 + #define __KVM_NVHE_MEM_PROTECT__ 9 + #include <linux/kvm_host.h> 10 + #include <asm/kvm_hyp.h> 11 + #include <asm/kvm_pgtable.h> 12 + #include <asm/virt.h> 13 + #include <nvhe/spinlock.h> 14 + 15 + struct host_kvm { 16 + struct kvm_arch arch; 17 + struct kvm_pgtable pgt; 18 + struct kvm_pgtable_mm_ops mm_ops; 19 + hyp_spinlock_t lock; 20 + }; 21 + extern struct host_kvm host_kvm; 22 + 23 + int __pkvm_prot_finalize(void); 24 + int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end); 25 + 26 + int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool); 27 + void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); 28 + 29 + static __always_inline void __load_host_stage2(void) 30 + { 31 + if (static_branch_likely(&kvm_protected_mode_initialized)) 32 + __load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr); 33 + else 34 + write_sysreg(0, vttbr_el2); 35 + } 36 + #endif /* __KVM_NVHE_MEM_PROTECT__ */
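`__load_host_stage2()` above encodes the host's two regimes: once protected mode is initialized the host runs under its own stage-2 tables, and before that `VTTBR_EL2` is simply zeroed. A trivial model of the selection, with a plain boolean standing in for the static key:

```c
#include <assert.h>
#include <stdint.h>

/* Returns the VTTBR value the host would be given (values hypothetical). */
static uint64_t load_host_stage2(int protected_initialized, uint64_t host_vttbr)
{
	return protected_initialized ? host_vttbr : 0;
}
```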
+51
arch/arm64/kvm/hyp/include/nvhe/memory.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + #ifndef __KVM_HYP_MEMORY_H 3 + #define __KVM_HYP_MEMORY_H 4 + 5 + #include <asm/kvm_mmu.h> 6 + #include <asm/page.h> 7 + 8 + #include <linux/types.h> 9 + 10 + struct hyp_pool; 11 + struct hyp_page { 12 + unsigned int refcount; 13 + unsigned int order; 14 + struct hyp_pool *pool; 15 + struct list_head node; 16 + }; 17 + 18 + extern u64 __hyp_vmemmap; 19 + #define hyp_vmemmap ((struct hyp_page *)__hyp_vmemmap) 20 + 21 + #define __hyp_va(phys) ((void *)((phys_addr_t)(phys) - hyp_physvirt_offset)) 22 + 23 + static inline void *hyp_phys_to_virt(phys_addr_t phys) 24 + { 25 + return __hyp_va(phys); 26 + } 27 + 28 + static inline phys_addr_t hyp_virt_to_phys(void *addr) 29 + { 30 + return __hyp_pa(addr); 31 + } 32 + 33 + #define hyp_phys_to_pfn(phys) ((phys) >> PAGE_SHIFT) 34 + #define hyp_pfn_to_phys(pfn) ((phys_addr_t)((pfn) << PAGE_SHIFT)) 35 + #define hyp_phys_to_page(phys) (&hyp_vmemmap[hyp_phys_to_pfn(phys)]) 36 + #define hyp_virt_to_page(virt) hyp_phys_to_page(__hyp_pa(virt)) 37 + #define hyp_virt_to_pfn(virt) hyp_phys_to_pfn(__hyp_pa(virt)) 38 + 39 + #define hyp_page_to_pfn(page) ((struct hyp_page *)(page) - hyp_vmemmap) 40 + #define hyp_page_to_phys(page) hyp_pfn_to_phys((hyp_page_to_pfn(page))) 41 + #define hyp_page_to_virt(page) __hyp_va(hyp_page_to_phys(page)) 42 + #define hyp_page_to_pool(page) (((struct hyp_page *)page)->pool) 43 + 44 + static inline int hyp_page_count(void *addr) 45 + { 46 + struct hyp_page *p = hyp_virt_to_page(addr); 47 + 48 + return p->refcount; 49 + } 50 + 51 + #endif /* __KVM_HYP_MEMORY_H */
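The translation macros above all derive from one fixed offset (`hyp_physvirt_offset`) plus shift arithmetic on `PAGE_SHIFT`. A sketch of the round-trip and pfn math, with a made-up offset value:

```c
#include <assert.h>
#include <stdint.h>

#define HPAGE_SHIFT 12

static int64_t hyp_physvirt_offset_ = 0x1000000;	/* hypothetical */

static uintptr_t hyp_va(uint64_t phys)
{
	return (uintptr_t)(phys - (uint64_t)hyp_physvirt_offset_);
}

static uint64_t hyp_pa(uintptr_t virt)
{
	return (uint64_t)virt + (uint64_t)hyp_physvirt_offset_;
}

static uint64_t phys_to_pfn(uint64_t phys)
{
	return phys >> HPAGE_SHIFT;
}
```

`hyp_phys_to_page()` then just indexes the `hyp_vmemmap` array by that pfn.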
+96
arch/arm64/kvm/hyp/include/nvhe/mm.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + #ifndef __KVM_HYP_MM_H 3 + #define __KVM_HYP_MM_H 4 + 5 + #include <asm/kvm_pgtable.h> 6 + #include <asm/spectre.h> 7 + #include <linux/memblock.h> 8 + #include <linux/types.h> 9 + 10 + #include <nvhe/memory.h> 11 + #include <nvhe/spinlock.h> 12 + 13 + #define HYP_MEMBLOCK_REGIONS 128 14 + extern struct memblock_region kvm_nvhe_sym(hyp_memory)[]; 15 + extern unsigned int kvm_nvhe_sym(hyp_memblock_nr); 16 + extern struct kvm_pgtable pkvm_pgtable; 17 + extern hyp_spinlock_t pkvm_pgd_lock; 18 + extern struct hyp_pool hpool; 19 + extern u64 __io_map_base; 20 + 21 + int hyp_create_idmap(u32 hyp_va_bits); 22 + int hyp_map_vectors(void); 23 + int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back); 24 + int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot); 25 + int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot); 26 + int __pkvm_create_mappings(unsigned long start, unsigned long size, 27 + unsigned long phys, enum kvm_pgtable_prot prot); 28 + unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size, 29 + enum kvm_pgtable_prot prot); 30 + 31 + static inline void hyp_vmemmap_range(phys_addr_t phys, unsigned long size, 32 + unsigned long *start, unsigned long *end) 33 + { 34 + unsigned long nr_pages = size >> PAGE_SHIFT; 35 + struct hyp_page *p = hyp_phys_to_page(phys); 36 + 37 + *start = (unsigned long)p; 38 + *end = *start + nr_pages * sizeof(struct hyp_page); 39 + *start = ALIGN_DOWN(*start, PAGE_SIZE); 40 + *end = ALIGN(*end, PAGE_SIZE); 41 + } 42 + 43 + static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages) 44 + { 45 + unsigned long total = 0, i; 46 + 47 + /* Provision the worst case scenario */ 48 + for (i = 0; i < KVM_PGTABLE_MAX_LEVELS; i++) { 49 + nr_pages = DIV_ROUND_UP(nr_pages, PTRS_PER_PTE); 50 + total += nr_pages; 51 + } 52 + 53 + return total; 54 + } 55 + 56 + static inline unsigned long __hyp_pgtable_total_pages(void) 57 + { 58 + unsigned long res = 0, i; 59 + 60 + /* Cover all of memory with page-granularity */ 61 + for (i = 0; i < kvm_nvhe_sym(hyp_memblock_nr); i++) { 62 + struct memblock_region *reg = &kvm_nvhe_sym(hyp_memory)[i]; 63 + res += __hyp_pgtable_max_pages(reg->size >> PAGE_SHIFT); 64 + } 65 + 66 + return res; 67 + } 68 + 69 + static inline unsigned long hyp_s1_pgtable_pages(void) 70 + { 71 + unsigned long res; 72 + 73 + res = __hyp_pgtable_total_pages(); 74 + 75 + /* Allow 1 GiB for private mappings */ 76 + res += __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT); 77 + 78 + return res; 79 + } 80 + 81 + static inline unsigned long host_s2_mem_pgtable_pages(void) 82 + { 83 + /* 84 + * Include an extra 16 pages to safely upper-bound the worst case of 85 + * concatenated pgds. 86 + */ 87 + return __hyp_pgtable_total_pages() + 16; 88 + } 89 + 90 + static inline unsigned long host_s2_dev_pgtable_pages(void) 91 + { 92 + /* Allow 1 GiB for MMIO mappings */ 93 + return __hyp_pgtable_max_pages(SZ_1G >> PAGE_SHIFT); 94 + } 95 + 96 + #endif /* __KVM_HYP_MM_H */
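The sizing helpers in mm.h provision the worst case by needing one table page per `PTRS_PER_PTE` entries at each level of the walk. A stand-alone sketch of the same computation, assuming a 4 KiB granule (512 PTEs per table) and a four-level walk — both constants are assumptions for illustration, not taken from the diff:

```c
/* Assumed constants for illustration: 4 KiB granule, 4-level walk. */
#define DEMO_LEVELS		4
#define DEMO_PTRS_PER_PTE	512ul

/* Same loop as __hyp_pgtable_max_pages(): at each level, one table
 * page is needed per DEMO_PTRS_PER_PTE entries of the level below. */
static unsigned long pgtable_max_pages(unsigned long nr_pages)
{
	unsigned long total = 0;
	int i;

	for (i = 0; i < DEMO_LEVELS; i++) {
		/* DIV_ROUND_UP(nr_pages, DEMO_PTRS_PER_PTE) */
		nr_pages = (nr_pages + DEMO_PTRS_PER_PTE - 1) / DEMO_PTRS_PER_PTE;
		total += nr_pages;
	}
	return total;
}
```

Under those assumptions, the 1 GiB private-mapping window (262144 4 KiB pages) needs 512 + 1 + 1 + 1 = 515 pages of page-table backing.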
+92
arch/arm64/kvm/hyp/include/nvhe/spinlock.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + /* 3 + * A stand-alone ticket spinlock implementation for use by the non-VHE 4 + * KVM hypervisor code running at EL2. 5 + * 6 + * Copyright (C) 2020 Google LLC 7 + * Author: Will Deacon <will@kernel.org> 8 + * 9 + * Heavily based on the implementation removed by c11090474d70 which was: 10 + * Copyright (C) 2012 ARM Ltd. 11 + */ 12 + 13 + #ifndef __ARM64_KVM_NVHE_SPINLOCK_H__ 14 + #define __ARM64_KVM_NVHE_SPINLOCK_H__ 15 + 16 + #include <asm/alternative.h> 17 + #include <asm/lse.h> 18 + 19 + typedef union hyp_spinlock { 20 + u32 __val; 21 + struct { 22 + #ifdef __AARCH64EB__ 23 + u16 next, owner; 24 + #else 25 + u16 owner, next; 26 + #endif 27 + }; 28 + } hyp_spinlock_t; 29 + 30 + #define hyp_spin_lock_init(l) \ 31 + do { \ 32 + *(l) = (hyp_spinlock_t){ .__val = 0 }; \ 33 + } while (0) 34 + 35 + static inline void hyp_spin_lock(hyp_spinlock_t *lock) 36 + { 37 + u32 tmp; 38 + hyp_spinlock_t lockval, newval; 39 + 40 + asm volatile( 41 + /* Atomically increment the next ticket. */ 42 + ARM64_LSE_ATOMIC_INSN( 43 + /* LL/SC */ 44 + " prfm pstl1strm, %3\n" 45 + "1: ldaxr %w0, %3\n" 46 + " add %w1, %w0, #(1 << 16)\n" 47 + " stxr %w2, %w1, %3\n" 48 + " cbnz %w2, 1b\n", 49 + /* LSE atomics */ 50 + " mov %w2, #(1 << 16)\n" 51 + " ldadda %w2, %w0, %3\n" 52 + __nops(3)) 53 + 54 + /* Did we get the lock? */ 55 + " eor %w1, %w0, %w0, ror #16\n" 56 + " cbz %w1, 3f\n" 57 + /* 58 + * No: spin on the owner. Send a local event to avoid missing an 59 + * unlock before the exclusive load. 60 + */ 61 + " sevl\n" 62 + "2: wfe\n" 63 + " ldaxrh %w2, %4\n" 64 + " eor %w1, %w2, %w0, lsr #16\n" 65 + " cbnz %w1, 2b\n" 66 + /* We got the lock. Critical section starts here. */ 67 + "3:" 68 + : "=&r" (lockval), "=&r" (newval), "=&r" (tmp), "+Q" (*lock) 69 + : "Q" (lock->owner) 70 + : "memory"); 71 + } 72 + 73 + static inline void hyp_spin_unlock(hyp_spinlock_t *lock) 74 + { 75 + u64 tmp; 76 + 77 + asm volatile( 78 + ARM64_LSE_ATOMIC_INSN( 79 + /* LL/SC */ 80 + " ldrh %w1, %0\n" 81 + " add %w1, %w1, #1\n" 82 + " stlrh %w1, %0", 83 + /* LSE atomics */ 84 + " mov %w1, #1\n" 85 + " staddlh %w1, %0\n" 86 + __nops(1)) 87 + : "=Q" (lock->owner), "=&r" (tmp) 88 + : 89 + : "memory"); 90 + } 91 + 92 + #endif /* __ARM64_KVM_NVHE_SPINLOCK_H__ */
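The assembly above is a classic ticket lock: `hyp_spin_lock()` atomically takes the next ticket from the high half-word and spins until the owner half-word matches it, and `hyp_spin_unlock()` bumps the owner. A hypothetical plain-C11 model of the same algorithm — not the kernel's code; the hand-written asm exists precisely because the EL2 object cannot pull in the generic locking code:

```c
#include <stdatomic.h>
#include <stdint.h>

/* Model: owner in bits [15:0], next ticket in bits [31:16], matching
 * the little-endian layout of hyp_spinlock_t. */
typedef struct {
	_Atomic uint32_t val;
} ticket_lock_t;

static void ticket_lock(ticket_lock_t *l)
{
	/* Grab the next ticket, like the ldadda of (1 << 16) above. */
	uint32_t old = atomic_fetch_add_explicit(&l->val, 1u << 16,
						 memory_order_acquire);
	uint16_t ticket = (uint16_t)(old >> 16);

	/* Spin until the owner half-word reaches our ticket. */
	while ((uint16_t)atomic_load_explicit(&l->val,
					      memory_order_acquire) != ticket)
		;
}

static void ticket_unlock(ticket_lock_t *l)
{
	/*
	 * Hand the lock to the next ticket holder. The real unlock is a
	 * half-word add (staddlh), so the owner wraps at 2^16 without
	 * carrying into the ticket counter; this 32-bit add only matches
	 * that behaviour below the wrap point.
	 */
	atomic_fetch_add_explicit(&l->val, 1, memory_order_release);
}
```

The `eor %w1, %w0, %w0, ror #16` in the asm is the same owner-vs-ticket comparison done in one instruction: it is zero exactly when the two half-words of the fetched value are equal.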
+7 -2
arch/arm64/kvm/hyp/nvhe/Makefile
··· 9 9 hostprogs := gen-hyprel 10 10 HOST_EXTRACFLAGS += -I$(objtree)/include 11 11 12 + lib-objs := clear_page.o copy_page.o memcpy.o memset.o 13 + lib-objs := $(addprefix ../../../lib/, $(lib-objs)) 14 + 12 15 obj-y := timer-sr.o sysreg-sr.o debug-sr.o switch.o tlb.o hyp-init.o host.o \ 13 - hyp-main.o hyp-smp.o psci-relay.o 16 + hyp-main.o hyp-smp.o psci-relay.o early_alloc.o stub.o page_alloc.o \ 17 + cache.o setup.o mm.o mem_protect.o 14 18 obj-y += ../vgic-v3-sr.o ../aarch32.o ../vgic-v2-cpuif-proxy.o ../entry.o \ 15 - ../fpsimd.o ../hyp-entry.o ../exception.o 19 + ../fpsimd.o ../hyp-entry.o ../exception.o ../pgtable.o 20 + obj-y += $(lib-objs) 16 21 17 22 ## 18 23 ## Build rules for compiling nVHE hyp code
+13
arch/arm64/kvm/hyp/nvhe/cache.S
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + /* 3 + * Code copied from arch/arm64/mm/cache.S. 4 + */ 5 + 6 + #include <linux/linkage.h> 7 + #include <asm/assembler.h> 8 + #include <asm/alternative.h> 9 + 10 + SYM_FUNC_START_PI(__flush_dcache_area) 11 + dcache_by_line_op civac, sy, x0, x1, x2, x3 12 + ret 13 + SYM_FUNC_END_PI(__flush_dcache_area)
+43 -13
arch/arm64/kvm/hyp/nvhe/debug-sr.c
··· 21 21 /* Clear pmscr in case of early return */ 22 22 *pmscr_el1 = 0; 23 23 24 - /* SPE present on this CPU? */ 25 - if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1), 26 - ID_AA64DFR0_PMSVER_SHIFT)) 27 - return; 28 - 29 - /* Yes; is it owned by EL3? */ 30 - reg = read_sysreg_s(SYS_PMBIDR_EL1); 31 - if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT)) 32 - return; 33 - 34 - /* No; is the host actually using the thing? */ 24 + /* 25 + * At this point, we know that this CPU implements 26 + * SPE and is available to the host. 27 + * Check if the host is actually using it ? 28 + */ 35 29 reg = read_sysreg_s(SYS_PMBLIMITR_EL1); 36 30 if (!(reg & BIT(SYS_PMBLIMITR_EL1_E_SHIFT))) 37 31 return; ··· 52 58 write_sysreg_s(pmscr_el1, SYS_PMSCR_EL1); 53 59 } 54 60 61 + static void __debug_save_trace(u64 *trfcr_el1) 62 + { 63 + *trfcr_el1 = 0; 64 + 65 + /* Check if the TRBE is enabled */ 66 + if (!(read_sysreg_s(SYS_TRBLIMITR_EL1) & TRBLIMITR_ENABLE)) 67 + return; 68 + /* 69 + * Prohibit trace generation while we are in guest. 70 + * Since access to TRFCR_EL1 is trapped, the guest can't 71 + * modify the filtering set by the host. 72 + */ 73 + *trfcr_el1 = read_sysreg_s(SYS_TRFCR_EL1); 74 + write_sysreg_s(0, SYS_TRFCR_EL1); 75 + isb(); 76 + /* Drain the trace buffer to memory */ 77 + tsb_csync(); 78 + dsb(nsh); 79 + } 80 + 81 + static void __debug_restore_trace(u64 trfcr_el1) 82 + { 83 + if (!trfcr_el1) 84 + return; 85 + 86 + /* Restore trace filter controls */ 87 + write_sysreg_s(trfcr_el1, SYS_TRFCR_EL1); 88 + } 89 + 55 90 void __debug_save_host_buffers_nvhe(struct kvm_vcpu *vcpu) 56 91 { 57 92 /* Disable and flush SPE data generation */ 58 - __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1); 93 + if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_SPE) 94 + __debug_save_spe(&vcpu->arch.host_debug_state.pmscr_el1); 95 + /* Disable and flush Self-Hosted Trace generation */ 96 + if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_TRBE) 97 + __debug_save_trace(&vcpu->arch.host_debug_state.trfcr_el1); 59 98 } 60 99 61 100 void __debug_switch_to_guest(struct kvm_vcpu *vcpu) ··· 98 71 99 72 void __debug_restore_host_buffers_nvhe(struct kvm_vcpu *vcpu) 100 73 { 101 - __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1); 74 + if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_SPE) 75 + __debug_restore_spe(vcpu->arch.host_debug_state.pmscr_el1); 76 + if (vcpu->arch.flags & KVM_ARM64_DEBUG_STATE_SAVE_TRBE) 77 + __debug_restore_trace(vcpu->arch.host_debug_state.trfcr_el1); 102 78 } 103 79 104 80 void __debug_switch_to_host(struct kvm_vcpu *vcpu)
+54
arch/arm64/kvm/hyp/nvhe/early_alloc.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (C) 2020 Google LLC 4 + * Author: Quentin Perret <qperret@google.com> 5 + */ 6 + 7 + #include <asm/kvm_pgtable.h> 8 + 9 + #include <nvhe/early_alloc.h> 10 + #include <nvhe/memory.h> 11 + 12 + struct kvm_pgtable_mm_ops hyp_early_alloc_mm_ops; 13 + s64 __ro_after_init hyp_physvirt_offset; 14 + 15 + static unsigned long base; 16 + static unsigned long end; 17 + static unsigned long cur; 18 + 19 + unsigned long hyp_early_alloc_nr_used_pages(void) 20 + { 21 + return (cur - base) >> PAGE_SHIFT; 22 + } 23 + 24 + void *hyp_early_alloc_contig(unsigned int nr_pages) 25 + { 26 + unsigned long size = (nr_pages << PAGE_SHIFT); 27 + void *ret = (void *)cur; 28 + 29 + if (!nr_pages) 30 + return NULL; 31 + 32 + if (end - cur < size) 33 + return NULL; 34 + 35 + cur += size; 36 + memset(ret, 0, size); 37 + 38 + return ret; 39 + } 40 + 41 + void *hyp_early_alloc_page(void *arg) 42 + { 43 + return hyp_early_alloc_contig(1); 44 + } 45 + 46 + void hyp_early_alloc_init(void *virt, unsigned long size) 47 + { 48 + base = cur = (unsigned long)virt; 49 + end = base + size; 50 + 51 + hyp_early_alloc_mm_ops.zalloc_page = hyp_early_alloc_page; 52 + hyp_early_alloc_mm_ops.phys_to_virt = hyp_phys_to_virt; 53 + hyp_early_alloc_mm_ops.virt_to_phys = hyp_virt_to_phys; 54 + }
+18
arch/arm64/kvm/hyp/nvhe/gen-hyprel.c
··· 50 50 #ifndef R_AARCH64_ABS64 51 51 #define R_AARCH64_ABS64 257 52 52 #endif 53 + #ifndef R_AARCH64_PREL64 54 + #define R_AARCH64_PREL64 260 55 + #endif 56 + #ifndef R_AARCH64_PREL32 57 + #define R_AARCH64_PREL32 261 58 + #endif 59 + #ifndef R_AARCH64_PREL16 60 + #define R_AARCH64_PREL16 262 61 + #endif 62 + #ifndef R_AARCH64_PLT32 63 + #define R_AARCH64_PLT32 314 64 + #endif 53 65 #ifndef R_AARCH64_LD_PREL_LO19 54 66 #define R_AARCH64_LD_PREL_LO19 273 55 67 #endif ··· 382 370 */ 383 371 case R_AARCH64_ABS64: 384 372 emit_rela_abs64(rela, sh_orig_name); 373 + break; 374 + /* Allow position-relative data relocations. */ 375 + case R_AARCH64_PREL64: 376 + case R_AARCH64_PREL32: 377 + case R_AARCH64_PREL16: 378 + case R_AARCH64_PLT32: 385 379 break; 386 380 /* Allow relocations to generate PC-relative addressing. */ 387 381 case R_AARCH64_LD_PREL_LO19:
+7 -11
arch/arm64/kvm/hyp/nvhe/host.S
··· 79 79 mov lr, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\ 80 80 PSR_MODE_EL1h) 81 81 msr spsr_el2, lr 82 - ldr lr, =panic 82 + ldr lr, =nvhe_hyp_panic_handler 83 83 hyp_kimg_va lr, x6 84 84 msr elr_el2, lr 85 85 86 86 mov x29, x0 87 87 88 - /* Load the format string into x0 and arguments into x1-7 */ 89 - ldr x0, =__hyp_panic_string 90 - hyp_kimg_va x0, x6 91 - 92 - /* Load the format arguments into x1-7. */ 93 - mov x6, x3 94 - get_vcpu_ptr x7, x3 95 - mrs x3, esr_el2 96 - mrs x4, far_el2 97 - mrs x5, hpfar_el2 88 + /* Load the panic arguments into x0-7 */ 89 + mrs x0, esr_el2 90 + get_vcpu_ptr x4, x5 91 + mrs x5, far_el2 92 + mrs x6, hpfar_el2 93 + mov x7, xzr // Unused argument 98 94 99 95 /* Enter the host, conditionally restoring the host context. */ 100 96 cbz x29, __host_enter_without_restoring
+39 -15
arch/arm64/kvm/hyp/nvhe/hyp-init.S
··· 83 83 * x0: struct kvm_nvhe_init_params PA 84 84 */ 85 85 SYM_CODE_START_LOCAL(___kvm_hyp_init) 86 - alternative_if ARM64_KVM_PROTECTED_MODE 87 - mov_q x1, HCR_HOST_NVHE_PROTECTED_FLAGS 88 - msr hcr_el2, x1 89 - alternative_else_nop_endif 90 - 91 86 ldr x1, [x0, #NVHE_INIT_TPIDR_EL2] 92 87 msr tpidr_el2, x1 93 88 ··· 91 96 92 97 ldr x1, [x0, #NVHE_INIT_MAIR_EL2] 93 98 msr mair_el2, x1 99 + 100 + ldr x1, [x0, #NVHE_INIT_HCR_EL2] 101 + msr hcr_el2, x1 102 + 103 + ldr x1, [x0, #NVHE_INIT_VTTBR] 104 + msr vttbr_el2, x1 105 + 106 + ldr x1, [x0, #NVHE_INIT_VTCR] 107 + msr vtcr_el2, x1 94 108 95 109 ldr x1, [x0, #NVHE_INIT_PGD_PA] 96 110 phys_to_ttbr x2, x1 ··· 119 115 120 116 /* Invalidate the stale TLBs from Bootloader */ 121 117 tlbi alle2 118 + tlbi vmalls12e1 122 119 dsb sy 123 120 124 - /* 125 - * Preserve all the RES1 bits while setting the default flags, 126 - * as well as the EE bit on BE. Drop the A flag since the compiler 127 - * is allowed to generate unaligned accesses. 128 - */ 129 - mov_q x0, (SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A)) 130 - CPU_BE( orr x0, x0, #SCTLR_ELx_EE) 121 + mov_q x0, INIT_SCTLR_EL2_MMU_ON 131 122 alternative_if ARM64_HAS_ADDRESS_AUTH 132 123 mov_q x1, (SCTLR_ELx_ENIA | SCTLR_ELx_ENIB | \ 133 124 SCTLR_ELx_ENDA | SCTLR_ELx_ENDB) ··· 220 221 mov x0, xzr 221 222 reset: 222 223 /* Reset kvm back to the hyp stub. */ 223 - mrs x5, sctlr_el2 224 - mov_q x6, SCTLR_ELx_FLAGS 225 - bic x5, x5, x6 // Clear SCTL_M and etc 224 + mov_q x5, INIT_SCTLR_EL2_MMU_OFF 226 225 pre_disable_mmu_workaround 227 226 msr sctlr_el2, x5 228 227 isb ··· 240 243 eret 241 244 242 245 SYM_CODE_END(__kvm_handle_stub_hvc) 246 + 247 + SYM_FUNC_START(__pkvm_init_switch_pgd) 248 + /* Turn the MMU off */ 249 + pre_disable_mmu_workaround 250 + mrs x2, sctlr_el2 251 + bic x3, x2, #SCTLR_ELx_M 252 + msr sctlr_el2, x3 253 + isb 254 + 255 + tlbi alle2 256 + 257 + /* Install the new pgtables */ 258 + ldr x3, [x0, #NVHE_INIT_PGD_PA] 259 + phys_to_ttbr x4, x3 260 + alternative_if ARM64_HAS_CNP 261 + orr x4, x4, #TTBR_CNP_BIT 262 + alternative_else_nop_endif 263 + msr ttbr0_el2, x4 264 + 265 + /* Set the new stack pointer */ 266 + ldr x0, [x0, #NVHE_INIT_STACK_HYP_VA] 267 + mov sp, x0 268 + 269 + /* And turn the MMU back on! */ 270 + set_sctlr_el2 x2 271 + ret x1 272 + SYM_FUNC_END(__pkvm_init_switch_pgd) 243 273 244 274 .popsection
+74 -1
arch/arm64/kvm/hyp/nvhe/hyp-main.c
··· 6 6 7 7 #include <hyp/switch.h> 8 8 9 + #include <asm/pgtable-types.h> 9 10 #include <asm/kvm_asm.h> 10 11 #include <asm/kvm_emulate.h> 11 12 #include <asm/kvm_host.h> 12 13 #include <asm/kvm_hyp.h> 13 14 #include <asm/kvm_mmu.h> 14 15 16 + #include <nvhe/mem_protect.h> 17 + #include <nvhe/mm.h> 15 18 #include <nvhe/trap_handler.h> 16 19 17 20 DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params); ··· 109 106 __vgic_v3_restore_aprs(kern_hyp_va(cpu_if)); 110 107 } 111 108 109 + static void handle___pkvm_init(struct kvm_cpu_context *host_ctxt) 110 + { 111 + DECLARE_REG(phys_addr_t, phys, host_ctxt, 1); 112 + DECLARE_REG(unsigned long, size, host_ctxt, 2); 113 + DECLARE_REG(unsigned long, nr_cpus, host_ctxt, 3); 114 + DECLARE_REG(unsigned long *, per_cpu_base, host_ctxt, 4); 115 + DECLARE_REG(u32, hyp_va_bits, host_ctxt, 5); 116 + 117 + /* 118 + * __pkvm_init() will return only if an error occurred, otherwise it 119 + * will tail-call in __pkvm_init_finalise() which will have to deal 120 + * with the host context directly. 121 + */ 122 + cpu_reg(host_ctxt, 1) = __pkvm_init(phys, size, nr_cpus, per_cpu_base, 123 + hyp_va_bits); 124 + } 125 + 126 + static void handle___pkvm_cpu_set_vector(struct kvm_cpu_context *host_ctxt) 127 + { 128 + DECLARE_REG(enum arm64_hyp_spectre_vector, slot, host_ctxt, 1); 129 + 130 + cpu_reg(host_ctxt, 1) = pkvm_cpu_set_vector(slot); 131 + } 132 + 133 + static void handle___pkvm_create_mappings(struct kvm_cpu_context *host_ctxt) 134 + { 135 + DECLARE_REG(unsigned long, start, host_ctxt, 1); 136 + DECLARE_REG(unsigned long, size, host_ctxt, 2); 137 + DECLARE_REG(unsigned long, phys, host_ctxt, 3); 138 + DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 4); 139 + 140 + cpu_reg(host_ctxt, 1) = __pkvm_create_mappings(start, size, phys, prot); 141 + } 142 + 143 + static void handle___pkvm_create_private_mapping(struct kvm_cpu_context *host_ctxt) 144 + { 145 + DECLARE_REG(phys_addr_t, phys, host_ctxt, 1); 146 + DECLARE_REG(size_t, size, host_ctxt, 2); 147 + DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 3); 148 + 149 + cpu_reg(host_ctxt, 1) = __pkvm_create_private_mapping(phys, size, prot); 150 + } 151 + 152 + static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt) 153 + { 154 + cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize(); 155 + } 156 + 157 + static void handle___pkvm_mark_hyp(struct kvm_cpu_context *host_ctxt) 158 + { 159 + DECLARE_REG(phys_addr_t, start, host_ctxt, 1); 160 + DECLARE_REG(phys_addr_t, end, host_ctxt, 2); 161 + 162 + cpu_reg(host_ctxt, 1) = __pkvm_mark_hyp(start, end); 163 + } 112 164 typedef void (*hcall_t)(struct kvm_cpu_context *); 113 165 114 166 #define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x ··· 183 125 HANDLE_FUNC(__kvm_get_mdcr_el2), 184 126 HANDLE_FUNC(__vgic_v3_save_aprs), 185 127 HANDLE_FUNC(__vgic_v3_restore_aprs), 128 + HANDLE_FUNC(__pkvm_init), 129 + HANDLE_FUNC(__pkvm_cpu_set_vector), 130 + HANDLE_FUNC(__pkvm_create_mappings), 131 + HANDLE_FUNC(__pkvm_create_private_mapping), 132 + HANDLE_FUNC(__pkvm_prot_finalize), 133 + HANDLE_FUNC(__pkvm_mark_hyp), 186 134 }; 187 135 188 136 static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) ··· 241 177 case ESR_ELx_EC_SMC64: 242 178 handle_host_smc(host_ctxt); 243 179 break; 180 + case ESR_ELx_EC_SVE: 181 + sysreg_clear_set(cptr_el2, CPTR_EL2_TZ, 0); 182 + isb(); 183 + sve_cond_update_zcr_vq(ZCR_ELx_LEN_MASK, SYS_ZCR_EL2); 184 + break; 185 + case ESR_ELx_EC_IABT_LOW: 186 + case ESR_ELx_EC_DABT_LOW: 187 + handle_host_mem_abort(host_ctxt); 188 + break; 244 189 default: 245 - hyp_panic(); 190 + BUG(); 246 191 } 247 192 }
+2 -4
arch/arm64/kvm/hyp/nvhe/hyp-smp.c
··· 18 18 19 19 u64 cpu_logical_map(unsigned int cpu) 20 20 { 21 - if (cpu >= ARRAY_SIZE(hyp_cpu_logical_map)) 22 - hyp_panic(); 21 + BUG_ON(cpu >= ARRAY_SIZE(hyp_cpu_logical_map)); 23 22 24 23 return hyp_cpu_logical_map[cpu]; 25 24 } ··· 29 30 unsigned long this_cpu_base; 30 31 unsigned long elf_base; 31 32 32 - if (cpu >= ARRAY_SIZE(kvm_arm_hyp_percpu_base)) 33 - hyp_panic(); 33 + BUG_ON(cpu >= ARRAY_SIZE(kvm_arm_hyp_percpu_base)); 34 34 35 35 cpu_base_array = (unsigned long *)&kvm_arm_hyp_percpu_base; 36 36 this_cpu_base = kern_hyp_va(cpu_base_array[cpu]);
+1
arch/arm64/kvm/hyp/nvhe/hyp.lds.S
··· 25 25 BEGIN_HYP_SECTION(.data..percpu) 26 26 PERCPU_INPUT(L1_CACHE_BYTES) 27 27 END_HYP_SECTION 28 + HYP_SECTION(.bss) 28 29 }
+279
arch/arm64/kvm/hyp/nvhe/mem_protect.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (C) 2020 Google LLC 4 + * Author: Quentin Perret <qperret@google.com> 5 + */ 6 + 7 + #include <linux/kvm_host.h> 8 + #include <asm/kvm_emulate.h> 9 + #include <asm/kvm_hyp.h> 10 + #include <asm/kvm_mmu.h> 11 + #include <asm/kvm_pgtable.h> 12 + #include <asm/stage2_pgtable.h> 13 + 14 + #include <hyp/switch.h> 15 + 16 + #include <nvhe/gfp.h> 17 + #include <nvhe/memory.h> 18 + #include <nvhe/mem_protect.h> 19 + #include <nvhe/mm.h> 20 + 21 + #define KVM_HOST_S2_FLAGS (KVM_PGTABLE_S2_NOFWB | KVM_PGTABLE_S2_IDMAP) 22 + 23 + extern unsigned long hyp_nr_cpus; 24 + struct host_kvm host_kvm; 25 + 26 + struct hyp_pool host_s2_mem; 27 + struct hyp_pool host_s2_dev; 28 + 29 + /* 30 + * Copies of the host's CPU features registers holding sanitized values. 31 + */ 32 + u64 id_aa64mmfr0_el1_sys_val; 33 + u64 id_aa64mmfr1_el1_sys_val; 34 + 35 + static const u8 pkvm_hyp_id = 1; 36 + 37 + static void *host_s2_zalloc_pages_exact(size_t size) 38 + { 39 + return hyp_alloc_pages(&host_s2_mem, get_order(size)); 40 + } 41 + 42 + static void *host_s2_zalloc_page(void *pool) 43 + { 44 + return hyp_alloc_pages(pool, 0); 45 + } 46 + 47 + static int prepare_s2_pools(void *mem_pgt_pool, void *dev_pgt_pool) 48 + { 49 + unsigned long nr_pages, pfn; 50 + int ret; 51 + 52 + pfn = hyp_virt_to_pfn(mem_pgt_pool); 53 + nr_pages = host_s2_mem_pgtable_pages(); 54 + ret = hyp_pool_init(&host_s2_mem, pfn, nr_pages, 0); 55 + if (ret) 56 + return ret; 57 + 58 + pfn = hyp_virt_to_pfn(dev_pgt_pool); 59 + nr_pages = host_s2_dev_pgtable_pages(); 60 + ret = hyp_pool_init(&host_s2_dev, pfn, nr_pages, 0); 61 + if (ret) 62 + return ret; 63 + 64 + host_kvm.mm_ops = (struct kvm_pgtable_mm_ops) { 65 + .zalloc_pages_exact = host_s2_zalloc_pages_exact, 66 + .zalloc_page = host_s2_zalloc_page, 67 + .phys_to_virt = hyp_phys_to_virt, 68 + .virt_to_phys = hyp_virt_to_phys, 69 + .page_count = hyp_page_count, 70 + .get_page = hyp_get_page, 71 + .put_page = hyp_put_page, 72 + }; 73 + 74 + return 0; 75 + } 76 + 77 + static void prepare_host_vtcr(void) 78 + { 79 + u32 parange, phys_shift; 80 + 81 + /* The host stage 2 is id-mapped, so use parange for T0SZ */ 82 + parange = kvm_get_parange(id_aa64mmfr0_el1_sys_val); 83 + phys_shift = id_aa64mmfr0_parange_to_phys_shift(parange); 84 + 85 + host_kvm.arch.vtcr = kvm_get_vtcr(id_aa64mmfr0_el1_sys_val, 86 + id_aa64mmfr1_el1_sys_val, phys_shift); 87 + } 88 + 89 + int kvm_host_prepare_stage2(void *mem_pgt_pool, void *dev_pgt_pool) 90 + { 91 + struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu; 92 + int ret; 93 + 94 + prepare_host_vtcr(); 95 + hyp_spin_lock_init(&host_kvm.lock); 96 + 97 + ret = prepare_s2_pools(mem_pgt_pool, dev_pgt_pool); 98 + if (ret) 99 + return ret; 100 + 101 + ret = kvm_pgtable_stage2_init_flags(&host_kvm.pgt, &host_kvm.arch, 102 + &host_kvm.mm_ops, KVM_HOST_S2_FLAGS); 103 + if (ret) 104 + return ret; 105 + 106 + mmu->pgd_phys = __hyp_pa(host_kvm.pgt.pgd); 107 + mmu->arch = &host_kvm.arch; 108 + mmu->pgt = &host_kvm.pgt; 109 + mmu->vmid.vmid_gen = 0; 110 + mmu->vmid.vmid = 0; 111 + 112 + return 0; 113 + } 114 + 115 + int __pkvm_prot_finalize(void) 116 + { 117 + struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu; 118 + struct kvm_nvhe_init_params *params = this_cpu_ptr(&kvm_init_params); 119 + 120 + params->vttbr = kvm_get_vttbr(mmu); 121 + params->vtcr = host_kvm.arch.vtcr; 122 + params->hcr_el2 |= HCR_VM; 123 + kvm_flush_dcache_to_poc(params, sizeof(*params)); 124 + 125 + write_sysreg(params->hcr_el2, hcr_el2); 126 + __load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr); 127 + 128 + /* 129 + * Make sure to have an ISB before the TLB maintenance below but only 130 + * when __load_stage2() doesn't include one already. 131 + */ 132 + asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT)); 133 + 134 + /* Invalidate stale HCR bits that may be cached in TLBs */ 135 + __tlbi(vmalls12e1); 136 + dsb(nsh); 137 + isb(); 138 + 139 + return 0; 140 + } 141 + 142 + static int host_stage2_unmap_dev_all(void) 143 + { 144 + struct kvm_pgtable *pgt = &host_kvm.pgt; 145 + struct memblock_region *reg; 146 + u64 addr = 0; 147 + int i, ret; 148 + 149 + /* Unmap all non-memory regions to recycle the pages */ 150 + for (i = 0; i < hyp_memblock_nr; i++, addr = reg->base + reg->size) { 151 + reg = &hyp_memory[i]; 152 + ret = kvm_pgtable_stage2_unmap(pgt, addr, reg->base - addr); 153 + if (ret) 154 + return ret; 155 + } 156 + return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr); 157 + } 158 + 159 + static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) 160 + { 161 + int cur, left = 0, right = hyp_memblock_nr; 162 + struct memblock_region *reg; 163 + phys_addr_t end; 164 + 165 + range->start = 0; 166 + range->end = ULONG_MAX; 167 + 168 + /* The list of memblock regions is sorted, binary search it */ 169 + while (left < right) { 170 + cur = (left + right) >> 1; 171 + reg = &hyp_memory[cur]; 172 + end = reg->base + reg->size; 173 + if (addr < reg->base) { 174 + right = cur; 175 + range->end = reg->base; 176 + } else if (addr >= end) { 177 + left = cur + 1; 178 + range->start = end; 179 + } else { 180 + range->start = reg->base; 181 + range->end = end; 182 + return true; 183 + } 184 + } 185 + 186 + return false; 187 + } 188 + 189 + static bool range_is_memory(u64 start, u64 end) 190 + { 191 + struct kvm_mem_range r1, r2; 192 + 193 + if (!find_mem_range(start, &r1) || !find_mem_range(end, &r2)) 194 + return false; 195 + if (r1.start != r2.start) 196 + return false; 197 + 198 + return true; 199 + } 200 + 201 + static inline int __host_stage2_idmap(u64 start, u64 end, 202 + enum kvm_pgtable_prot prot, 203 + struct hyp_pool *pool) 204 + { 205 + return kvm_pgtable_stage2_map(&host_kvm.pgt, start, end - start, start, 206 + prot, pool); 207 + } 208 + 209 + static int host_stage2_idmap(u64 addr) 210 + { 211 + enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W; 212 + struct kvm_mem_range range; 213 + bool is_memory = find_mem_range(addr, &range); 214 + struct hyp_pool *pool = is_memory ? &host_s2_mem : &host_s2_dev; 215 + int ret; 216 + 217 + if (is_memory) 218 + prot |= KVM_PGTABLE_PROT_X; 219 + 220 + hyp_spin_lock(&host_kvm.lock); 221 + ret = kvm_pgtable_stage2_find_range(&host_kvm.pgt, addr, prot, &range); 222 + if (ret) 223 + goto unlock; 224 + 225 + ret = __host_stage2_idmap(range.start, range.end, prot, pool); 226 + if (is_memory || ret != -ENOMEM) 227 + goto unlock; 228 + 229 + /* 230 + * host_s2_mem has been provided with enough pages to cover all of 231 + * memory with page granularity, so we should never hit the ENOMEM case. 232 + * However, it is difficult to know how much of the MMIO range we will 233 + * need to cover upfront, so we may need to 'recycle' the pages if we 234 + * run out. 235 + */ 236 + ret = host_stage2_unmap_dev_all(); 237 + if (ret) 238 + goto unlock; 239 + 240 + ret = __host_stage2_idmap(range.start, range.end, prot, pool); 241 + 242 + unlock: 243 + hyp_spin_unlock(&host_kvm.lock); 244 + 245 + return ret; 246 + } 247 + 248 + int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end) 249 + { 250 + int ret; 251 + 252 + /* 253 + * host_stage2_unmap_dev_all() currently relies on MMIO mappings being 254 + * non-persistent, so don't allow changing page ownership in MMIO range. 255 + */ 256 + if (!range_is_memory(start, end)) 257 + return -EINVAL; 258 + 259 + hyp_spin_lock(&host_kvm.lock); 260 + ret = kvm_pgtable_stage2_set_owner(&host_kvm.pgt, start, end - start, 261 + &host_s2_mem, pkvm_hyp_id); 262 + hyp_spin_unlock(&host_kvm.lock); 263 + 264 + return ret != -EAGAIN ? ret : 0; 265 + } 266 + 267 + void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) 268 + { 269 + struct kvm_vcpu_fault_info fault; 270 + u64 esr, addr; 271 + int ret = 0; 272 + 273 + esr = read_sysreg_el2(SYS_ESR); 274 + BUG_ON(!__get_fault_info(esr, &fault)); 275 + 276 + addr = (fault.hpfar_el2 & HPFAR_MASK) << 8; 277 + ret = host_stage2_idmap(addr); 278 + BUG_ON(ret && ret != -EAGAIN); 279 + }
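`find_mem_range()` in mem_protect.c binary-searches the sorted memblock array and, on a miss, reports the surrounding gap so the fault handler can id-map a whole MMIO range at once rather than one page per fault. A self-contained sketch of that search (the region values in the test are made up for illustration):

```c
#include <stdbool.h>
#include <stdint.h>

struct demo_region {
	uint64_t base, size;	/* sorted, non-overlapping */
};

/* Stand-alone version of the find_mem_range() search above. */
static bool find_range(const struct demo_region *regs, int nr, uint64_t addr,
		       uint64_t *start, uint64_t *end)
{
	int left = 0, right = nr;

	*start = 0;
	*end = UINT64_MAX;

	while (left < right) {
		int cur = (left + right) / 2;
		uint64_t rend = regs[cur].base + regs[cur].size;

		if (addr < regs[cur].base) {
			right = cur;
			*end = regs[cur].base;	/* gap bounded above here */
		} else if (addr >= rend) {
			left = cur + 1;
			*start = rend;		/* gap bounded below here */
		} else {
			*start = regs[cur].base;
			*end = rend;
			return true;		/* addr is inside a region */
		}
	}
	return false;	/* miss: [*start, *end) is the gap around addr */
}
```

Keeping track of the tightest bounds seen during the descent is what makes the miss case cheap: the gap falls out of the search for free, with no second pass over the array.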
+173
arch/arm64/kvm/hyp/nvhe/mm.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (C) 2020 Google LLC 4 + * Author: Quentin Perret <qperret@google.com> 5 + */ 6 + 7 + #include <linux/kvm_host.h> 8 + #include <asm/kvm_hyp.h> 9 + #include <asm/kvm_mmu.h> 10 + #include <asm/kvm_pgtable.h> 11 + #include <asm/spectre.h> 12 + 13 + #include <nvhe/early_alloc.h> 14 + #include <nvhe/gfp.h> 15 + #include <nvhe/memory.h> 16 + #include <nvhe/mm.h> 17 + #include <nvhe/spinlock.h> 18 + 19 + struct kvm_pgtable pkvm_pgtable; 20 + hyp_spinlock_t pkvm_pgd_lock; 21 + u64 __io_map_base; 22 + 23 + struct memblock_region hyp_memory[HYP_MEMBLOCK_REGIONS]; 24 + unsigned int hyp_memblock_nr; 25 + 26 + int __pkvm_create_mappings(unsigned long start, unsigned long size, 27 + unsigned long phys, enum kvm_pgtable_prot prot) 28 + { 29 + int err; 30 + 31 + hyp_spin_lock(&pkvm_pgd_lock); 32 + err = kvm_pgtable_hyp_map(&pkvm_pgtable, start, size, phys, prot); 33 + hyp_spin_unlock(&pkvm_pgd_lock); 34 + 35 + return err; 36 + } 37 + 38 + unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size, 39 + enum kvm_pgtable_prot prot) 40 + { 41 + unsigned long addr; 42 + int err; 43 + 44 + hyp_spin_lock(&pkvm_pgd_lock); 45 + 46 + size = PAGE_ALIGN(size + offset_in_page(phys)); 47 + addr = __io_map_base; 48 + __io_map_base += size; 49 + 50 + /* Are we overflowing on the vmemmap ? */ 51 + if (__io_map_base > __hyp_vmemmap) { 52 + __io_map_base -= size; 53 + addr = (unsigned long)ERR_PTR(-ENOMEM); 54 + goto out; 55 + } 56 + 57 + err = kvm_pgtable_hyp_map(&pkvm_pgtable, addr, size, phys, prot); 58 + if (err) { 59 + addr = (unsigned long)ERR_PTR(err); 60 + goto out; 61 + } 62 + 63 + addr = addr + offset_in_page(phys); 64 + out: 65 + hyp_spin_unlock(&pkvm_pgd_lock); 66 + 67 + return addr; 68 + } 69 + 70 + int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot) 71 + { 72 + unsigned long start = (unsigned long)from; 73 + unsigned long end = (unsigned long)to; 74 + unsigned long virt_addr; 75 + phys_addr_t phys; 76 + 77 + start = start & PAGE_MASK; 78 + end = PAGE_ALIGN(end); 79 + 80 + for (virt_addr = start; virt_addr < end; virt_addr += PAGE_SIZE) { 81 + int err; 82 + 83 + phys = hyp_virt_to_phys((void *)virt_addr); 84 + err = __pkvm_create_mappings(virt_addr, PAGE_SIZE, phys, prot); 85 + if (err) 86 + return err; 87 + } 88 + 89 + return 0; 90 + } 91 + 92 + int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back) 93 + { 94 + unsigned long start, end; 95 + 96 + hyp_vmemmap_range(phys, size, &start, &end); 97 + 98 + return __pkvm_create_mappings(start, end - start, back, PAGE_HYP); 99 + } 100 + 101 + static void *__hyp_bp_vect_base; 102 + int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot) 103 + { 104 + void *vector; 105 + 106 + switch (slot) { 107 + case HYP_VECTOR_DIRECT: { 108 + vector = __kvm_hyp_vector; 109 + break; 110 + } 111 + case HYP_VECTOR_SPECTRE_DIRECT: { 112 + vector = __bp_harden_hyp_vecs; 113 + break; 114 + } 115 + case HYP_VECTOR_INDIRECT: 116 + case HYP_VECTOR_SPECTRE_INDIRECT: { 117 + vector = (void *)__hyp_bp_vect_base; 118 + break; 119 + } 120 + default: 121 + return -EINVAL; 122 + } 123 + 124 + vector = __kvm_vector_slot2addr(vector, slot); 125 + *this_cpu_ptr(&kvm_hyp_vector) = (unsigned long)vector; 126 + 127 + return 0; 128 + } 129 + 130 + int hyp_map_vectors(void) 131 + { 132 + phys_addr_t phys; 133 + void *bp_base; 134 + 135 + if (!cpus_have_const_cap(ARM64_SPECTRE_V3A)) 136 + return 0; 137 + 138 + phys = __hyp_pa(__bp_harden_hyp_vecs); 139 + bp_base = (void *)__pkvm_create_private_mapping(phys, 140 + __BP_HARDEN_HYP_VECS_SZ, 141 + PAGE_HYP_EXEC); 142 + if (IS_ERR_OR_NULL(bp_base)) 143 + return PTR_ERR(bp_base); 144 + 145 + __hyp_bp_vect_base = bp_base; 146 + 147 + return 0; 148 + } 149 + 150 + int hyp_create_idmap(u32 hyp_va_bits) 151 + { 152 + unsigned long start, end; 153 + 154 + start = hyp_virt_to_phys((void *)__hyp_idmap_text_start); 155 + start = ALIGN_DOWN(start, PAGE_SIZE); 156 + 157 + end = hyp_virt_to_phys((void *)__hyp_idmap_text_end); 158 + end = ALIGN(end, PAGE_SIZE); 159 + 160 + /* 161 + * One half of the VA space is reserved to linearly map portions of 162 + * memory -- see va_layout.c for more details. The other half of the VA 163 + * space contains the trampoline page, and needs some care. Split that 164 + * second half in two and find the quarter of VA space not conflicting 165 + * with the idmap to place the IOs and the vmemmap. IOs use the lower 166 + * half of the quarter and the vmemmap the upper half. 167 + */ 168 + __io_map_base = start & BIT(hyp_va_bits - 2); 169 + __io_map_base ^= BIT(hyp_va_bits - 2); 170 + __hyp_vmemmap = __io_map_base | BIT(hyp_va_bits - 3); 171 + 172 + return __pkvm_create_mappings(start, end - start, start, PAGE_HYP_EXEC); 173 + }
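The bit arithmetic at the end of `hyp_create_idmap()` picks whichever quarter of the VA space does not contain the idmap: keep bit (hyp_va_bits - 2) of the idmap address, flip it, then place the vmemmap in the upper half of the chosen quarter. A hypothetical stand-alone version of just that computation (the 32-bit VA size used in the test is an assumption for illustration):

```c
#include <stdint.h>

/* Model of the quarter-picking logic in hyp_create_idmap() above. */
static void pick_quarters(uint64_t idmap_start, uint32_t va_bits,
			  uint64_t *io_base, uint64_t *vmemmap)
{
	uint64_t quarter_bit = 1ull << (va_bits - 2);

	/* Keep the idmap's quarter-selecting bit, then flip it so the
	 * result lands in the quarter the idmap does NOT occupy. */
	*io_base = idmap_start & quarter_bit;
	*io_base ^= quarter_bit;

	/* IOs get the lower half of that quarter, vmemmap the upper. */
	*vmemmap = *io_base | (1ull << (va_bits - 3));
}
```

Flipping a single high bit is enough because the idmap, being a physical address used as a VA, can land in either quarter of the upper half; the XOR guarantees the chosen window never overlaps it.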
+195
arch/arm64/kvm/hyp/nvhe/page_alloc.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (C) 2020 Google LLC 4 + * Author: Quentin Perret <qperret@google.com> 5 + */ 6 + 7 + #include <asm/kvm_hyp.h> 8 + #include <nvhe/gfp.h> 9 + 10 + u64 __hyp_vmemmap; 11 + 12 + /* 13 + * Index the hyp_vmemmap to find a potential buddy page, but make no assumption 14 + * about its current state. 15 + * 16 + * Example buddy-tree for a 4-page physically contiguous pool: 17 + * 18 + * o : Page 3 19 + * / 20 + * o-o : Page 2 21 + * / 22 + * / o : Page 1 23 + * / / 24 + * o---o-o : Page 0 25 + * Order 2 1 0 26 + * 27 + * Example of requests on this pool: 28 + * __find_buddy_nocheck(pool, page 0, order 0) => page 1 29 + * __find_buddy_nocheck(pool, page 0, order 1) => page 2 30 + * __find_buddy_nocheck(pool, page 1, order 0) => page 0 31 + * __find_buddy_nocheck(pool, page 2, order 0) => page 3 32 + */ 33 + static struct hyp_page *__find_buddy_nocheck(struct hyp_pool *pool, 34 + struct hyp_page *p, 35 + unsigned int order) 36 + { 37 + phys_addr_t addr = hyp_page_to_phys(p); 38 + 39 + addr ^= (PAGE_SIZE << order); 40 + 41 + /* 42 + * Don't return a page outside the pool range -- it belongs to 43 + * something else and may not be mapped in hyp_vmemmap. 
44 + */ 45 + if (addr < pool->range_start || addr >= pool->range_end) 46 + return NULL; 47 + 48 + return hyp_phys_to_page(addr); 49 + } 50 + 51 + /* Find a buddy page currently available for allocation */ 52 + static struct hyp_page *__find_buddy_avail(struct hyp_pool *pool, 53 + struct hyp_page *p, 54 + unsigned int order) 55 + { 56 + struct hyp_page *buddy = __find_buddy_nocheck(pool, p, order); 57 + 58 + if (!buddy || buddy->order != order || list_empty(&buddy->node)) 59 + return NULL; 60 + 61 + return buddy; 62 + 63 + } 64 + 65 + static void __hyp_attach_page(struct hyp_pool *pool, 66 + struct hyp_page *p) 67 + { 68 + unsigned int order = p->order; 69 + struct hyp_page *buddy; 70 + 71 + memset(hyp_page_to_virt(p), 0, PAGE_SIZE << p->order); 72 + 73 + /* 74 + * Only the first struct hyp_page of a high-order page (otherwise known 75 + * as the 'head') should have p->order set. The non-head pages should 76 + * have p->order = HYP_NO_ORDER. Here @p may no longer be the head 77 + * after coalescing, so make sure to mark it HYP_NO_ORDER proactively. 
78 + */ 79 + p->order = HYP_NO_ORDER; 80 + for (; (order + 1) < pool->max_order; order++) { 81 + buddy = __find_buddy_avail(pool, p, order); 82 + if (!buddy) 83 + break; 84 + 85 + /* Take the buddy out of its list, and coalesce with @p */ 86 + list_del_init(&buddy->node); 87 + buddy->order = HYP_NO_ORDER; 88 + p = min(p, buddy); 89 + } 90 + 91 + /* Mark the new head, and insert it */ 92 + p->order = order; 93 + list_add_tail(&p->node, &pool->free_area[order]); 94 + } 95 + 96 + static void hyp_attach_page(struct hyp_page *p) 97 + { 98 + struct hyp_pool *pool = hyp_page_to_pool(p); 99 + 100 + hyp_spin_lock(&pool->lock); 101 + __hyp_attach_page(pool, p); 102 + hyp_spin_unlock(&pool->lock); 103 + } 104 + 105 + static struct hyp_page *__hyp_extract_page(struct hyp_pool *pool, 106 + struct hyp_page *p, 107 + unsigned int order) 108 + { 109 + struct hyp_page *buddy; 110 + 111 + list_del_init(&p->node); 112 + while (p->order > order) { 113 + /* 114 + * The buddy of order n - 1 currently has HYP_NO_ORDER as it 115 + * is covered by a higher-level page (whose head is @p). Use 116 + * __find_buddy_nocheck() to find it and inject it in the 117 + * free_list[n - 1], effectively splitting @p in half. 
118 + */ 119 + p->order--; 120 + buddy = __find_buddy_nocheck(pool, p, p->order); 121 + buddy->order = p->order; 122 + list_add_tail(&buddy->node, &pool->free_area[buddy->order]); 123 + } 124 + 125 + return p; 126 + } 127 + 128 + void hyp_put_page(void *addr) 129 + { 130 + struct hyp_page *p = hyp_virt_to_page(addr); 131 + 132 + if (hyp_page_ref_dec_and_test(p)) 133 + hyp_attach_page(p); 134 + } 135 + 136 + void hyp_get_page(void *addr) 137 + { 138 + struct hyp_page *p = hyp_virt_to_page(addr); 139 + 140 + hyp_page_ref_inc(p); 141 + } 142 + 143 + void *hyp_alloc_pages(struct hyp_pool *pool, unsigned int order) 144 + { 145 + unsigned int i = order; 146 + struct hyp_page *p; 147 + 148 + hyp_spin_lock(&pool->lock); 149 + 150 + /* Look for a high-enough-order page */ 151 + while (i < pool->max_order && list_empty(&pool->free_area[i])) 152 + i++; 153 + if (i >= pool->max_order) { 154 + hyp_spin_unlock(&pool->lock); 155 + return NULL; 156 + } 157 + 158 + /* Extract it from the tree at the right order */ 159 + p = list_first_entry(&pool->free_area[i], struct hyp_page, node); 160 + p = __hyp_extract_page(pool, p, order); 161 + 162 + hyp_spin_unlock(&pool->lock); 163 + hyp_set_page_refcounted(p); 164 + 165 + return hyp_page_to_virt(p); 166 + } 167 + 168 + int hyp_pool_init(struct hyp_pool *pool, u64 pfn, unsigned int nr_pages, 169 + unsigned int reserved_pages) 170 + { 171 + phys_addr_t phys = hyp_pfn_to_phys(pfn); 172 + struct hyp_page *p; 173 + int i; 174 + 175 + hyp_spin_lock_init(&pool->lock); 176 + pool->max_order = min(MAX_ORDER, get_order(nr_pages << PAGE_SHIFT)); 177 + for (i = 0; i < pool->max_order; i++) 178 + INIT_LIST_HEAD(&pool->free_area[i]); 179 + pool->range_start = phys; 180 + pool->range_end = phys + (nr_pages << PAGE_SHIFT); 181 + 182 + /* Init the vmemmap portion */ 183 + p = hyp_phys_to_page(phys); 184 + memset(p, 0, sizeof(*p) * nr_pages); 185 + for (i = 0; i < nr_pages; i++) { 186 + p[i].pool = pool; 187 + INIT_LIST_HEAD(&p[i].node); 188 + } 189 + 190 
+ /* Attach the unused pages to the buddy tree */ 191 + for (i = reserved_pages; i < nr_pages; i++) 192 + __hyp_attach_page(pool, &p[i]); 193 + 194 + return 0; 195 + }
+1 -3
arch/arm64/kvm/hyp/nvhe/psci-relay.c
··· 11 11 #include <linux/kvm_host.h> 12 12 #include <uapi/linux/psci.h> 13 13 14 + #include <nvhe/memory.h> 14 15 #include <nvhe/trap_handler.h> 15 16 16 17 void kvm_hyp_cpu_entry(unsigned long r0); ··· 21 20 22 21 /* Config options set by the host. */ 23 22 struct kvm_host_psci_config __ro_after_init kvm_host_psci_config; 24 - s64 __ro_after_init hyp_physvirt_offset; 25 - 26 - #define __hyp_pa(x) ((phys_addr_t)((x)) + hyp_physvirt_offset) 27 23 28 24 #define INVALID_CPU_ID UINT_MAX 29 25
+214
arch/arm64/kvm/hyp/nvhe/setup.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Copyright (C) 2020 Google LLC 4 + * Author: Quentin Perret <qperret@google.com> 5 + */ 6 + 7 + #include <linux/kvm_host.h> 8 + #include <asm/kvm_hyp.h> 9 + #include <asm/kvm_mmu.h> 10 + #include <asm/kvm_pgtable.h> 11 + 12 + #include <nvhe/early_alloc.h> 13 + #include <nvhe/gfp.h> 14 + #include <nvhe/memory.h> 15 + #include <nvhe/mem_protect.h> 16 + #include <nvhe/mm.h> 17 + #include <nvhe/trap_handler.h> 18 + 19 + struct hyp_pool hpool; 20 + struct kvm_pgtable_mm_ops pkvm_pgtable_mm_ops; 21 + unsigned long hyp_nr_cpus; 22 + 23 + #define hyp_percpu_size ((unsigned long)__per_cpu_end - \ 24 + (unsigned long)__per_cpu_start) 25 + 26 + static void *vmemmap_base; 27 + static void *hyp_pgt_base; 28 + static void *host_s2_mem_pgt_base; 29 + static void *host_s2_dev_pgt_base; 30 + 31 + static int divide_memory_pool(void *virt, unsigned long size) 32 + { 33 + unsigned long vstart, vend, nr_pages; 34 + 35 + hyp_early_alloc_init(virt, size); 36 + 37 + hyp_vmemmap_range(__hyp_pa(virt), size, &vstart, &vend); 38 + nr_pages = (vend - vstart) >> PAGE_SHIFT; 39 + vmemmap_base = hyp_early_alloc_contig(nr_pages); 40 + if (!vmemmap_base) 41 + return -ENOMEM; 42 + 43 + nr_pages = hyp_s1_pgtable_pages(); 44 + hyp_pgt_base = hyp_early_alloc_contig(nr_pages); 45 + if (!hyp_pgt_base) 46 + return -ENOMEM; 47 + 48 + nr_pages = host_s2_mem_pgtable_pages(); 49 + host_s2_mem_pgt_base = hyp_early_alloc_contig(nr_pages); 50 + if (!host_s2_mem_pgt_base) 51 + return -ENOMEM; 52 + 53 + nr_pages = host_s2_dev_pgtable_pages(); 54 + host_s2_dev_pgt_base = hyp_early_alloc_contig(nr_pages); 55 + if (!host_s2_dev_pgt_base) 56 + return -ENOMEM; 57 + 58 + return 0; 59 + } 60 + 61 + static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, 62 + unsigned long *per_cpu_base, 63 + u32 hyp_va_bits) 64 + { 65 + void *start, *end, *virt = hyp_phys_to_virt(phys); 66 + unsigned long pgt_size = hyp_s1_pgtable_pages() << PAGE_SHIFT; 67 + int 
ret, i; 68 + 69 + /* Recreate the hyp page-table using the early page allocator */ 70 + hyp_early_alloc_init(hyp_pgt_base, pgt_size); 71 + ret = kvm_pgtable_hyp_init(&pkvm_pgtable, hyp_va_bits, 72 + &hyp_early_alloc_mm_ops); 73 + if (ret) 74 + return ret; 75 + 76 + ret = hyp_create_idmap(hyp_va_bits); 77 + if (ret) 78 + return ret; 79 + 80 + ret = hyp_map_vectors(); 81 + if (ret) 82 + return ret; 83 + 84 + ret = hyp_back_vmemmap(phys, size, hyp_virt_to_phys(vmemmap_base)); 85 + if (ret) 86 + return ret; 87 + 88 + ret = pkvm_create_mappings(__hyp_text_start, __hyp_text_end, PAGE_HYP_EXEC); 89 + if (ret) 90 + return ret; 91 + 92 + ret = pkvm_create_mappings(__start_rodata, __end_rodata, PAGE_HYP_RO); 93 + if (ret) 94 + return ret; 95 + 96 + ret = pkvm_create_mappings(__hyp_rodata_start, __hyp_rodata_end, PAGE_HYP_RO); 97 + if (ret) 98 + return ret; 99 + 100 + ret = pkvm_create_mappings(__hyp_bss_start, __hyp_bss_end, PAGE_HYP); 101 + if (ret) 102 + return ret; 103 + 104 + ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, PAGE_HYP_RO); 105 + if (ret) 106 + return ret; 107 + 108 + ret = pkvm_create_mappings(virt, virt + size, PAGE_HYP); 109 + if (ret) 110 + return ret; 111 + 112 + for (i = 0; i < hyp_nr_cpus; i++) { 113 + start = (void *)kern_hyp_va(per_cpu_base[i]); 114 + end = start + PAGE_ALIGN(hyp_percpu_size); 115 + ret = pkvm_create_mappings(start, end, PAGE_HYP); 116 + if (ret) 117 + return ret; 118 + 119 + end = (void *)per_cpu_ptr(&kvm_init_params, i)->stack_hyp_va; 120 + start = end - PAGE_SIZE; 121 + ret = pkvm_create_mappings(start, end, PAGE_HYP); 122 + if (ret) 123 + return ret; 124 + } 125 + 126 + return 0; 127 + } 128 + 129 + static void update_nvhe_init_params(void) 130 + { 131 + struct kvm_nvhe_init_params *params; 132 + unsigned long i; 133 + 134 + for (i = 0; i < hyp_nr_cpus; i++) { 135 + params = per_cpu_ptr(&kvm_init_params, i); 136 + params->pgd_pa = __hyp_pa(pkvm_pgtable.pgd); 137 + __flush_dcache_area(params, sizeof(*params)); 138 + } 139 + 
} 140 + 141 + static void *hyp_zalloc_hyp_page(void *arg) 142 + { 143 + return hyp_alloc_pages(&hpool, 0); 144 + } 145 + 146 + void __noreturn __pkvm_init_finalise(void) 147 + { 148 + struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data); 149 + struct kvm_cpu_context *host_ctxt = &host_data->host_ctxt; 150 + unsigned long nr_pages, reserved_pages, pfn; 151 + int ret; 152 + 153 + /* Now that the vmemmap is backed, install the full-fledged allocator */ 154 + pfn = hyp_virt_to_pfn(hyp_pgt_base); 155 + nr_pages = hyp_s1_pgtable_pages(); 156 + reserved_pages = hyp_early_alloc_nr_used_pages(); 157 + ret = hyp_pool_init(&hpool, pfn, nr_pages, reserved_pages); 158 + if (ret) 159 + goto out; 160 + 161 + ret = kvm_host_prepare_stage2(host_s2_mem_pgt_base, host_s2_dev_pgt_base); 162 + if (ret) 163 + goto out; 164 + 165 + pkvm_pgtable_mm_ops = (struct kvm_pgtable_mm_ops) { 166 + .zalloc_page = hyp_zalloc_hyp_page, 167 + .phys_to_virt = hyp_phys_to_virt, 168 + .virt_to_phys = hyp_virt_to_phys, 169 + .get_page = hyp_get_page, 170 + .put_page = hyp_put_page, 171 + }; 172 + pkvm_pgtable.mm_ops = &pkvm_pgtable_mm_ops; 173 + 174 + out: 175 + /* 176 + * We tail-called to here from handle___pkvm_init() and will not return, 177 + * so make sure to propagate the return value to the host. 
178 + */ 179 + cpu_reg(host_ctxt, 1) = ret; 180 + 181 + __host_enter(host_ctxt); 182 + } 183 + 184 + int __pkvm_init(phys_addr_t phys, unsigned long size, unsigned long nr_cpus, 185 + unsigned long *per_cpu_base, u32 hyp_va_bits) 186 + { 187 + struct kvm_nvhe_init_params *params; 188 + void *virt = hyp_phys_to_virt(phys); 189 + void (*fn)(phys_addr_t params_pa, void *finalize_fn_va); 190 + int ret; 191 + 192 + if (!PAGE_ALIGNED(phys) || !PAGE_ALIGNED(size)) 193 + return -EINVAL; 194 + 195 + hyp_spin_lock_init(&pkvm_pgd_lock); 196 + hyp_nr_cpus = nr_cpus; 197 + 198 + ret = divide_memory_pool(virt, size); 199 + if (ret) 200 + return ret; 201 + 202 + ret = recreate_hyp_mappings(phys, size, per_cpu_base, hyp_va_bits); 203 + if (ret) 204 + return ret; 205 + 206 + update_nvhe_init_params(); 207 + 208 + /* Jump in the idmap page to switch to the new page-tables */ 209 + params = this_cpu_ptr(&kvm_init_params); 210 + fn = (typeof(fn))__hyp_pa(__pkvm_init_switch_pgd); 211 + fn(__hyp_pa(params), __pkvm_init_finalise); 212 + 213 + unreachable(); 214 + }
+22
arch/arm64/kvm/hyp/nvhe/stub.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Stubs for out-of-line function calls caused by re-using kernel 4 + * infrastructure at EL2. 5 + * 6 + * Copyright (C) 2020 - Google LLC 7 + */ 8 + 9 + #include <linux/list.h> 10 + 11 + #ifdef CONFIG_DEBUG_LIST 12 + bool __list_add_valid(struct list_head *new, struct list_head *prev, 13 + struct list_head *next) 14 + { 15 + return true; 16 + } 17 + 18 + bool __list_del_entry_valid(struct list_head *entry) 19 + { 20 + return true; 21 + } 22 + #endif
+13 -13
arch/arm64/kvm/hyp/nvhe/switch.c
··· 28 28 #include <asm/processor.h> 29 29 #include <asm/thread_info.h> 30 30 31 + #include <nvhe/mem_protect.h> 32 + 31 33 /* Non-VHE specific context */ 32 34 DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); 33 35 DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); ··· 43 41 __activate_traps_common(vcpu); 44 42 45 43 val = CPTR_EL2_DEFAULT; 46 - val |= CPTR_EL2_TTA | CPTR_EL2_TZ | CPTR_EL2_TAM; 44 + val |= CPTR_EL2_TTA | CPTR_EL2_TAM; 47 45 if (!update_fp_enabled(vcpu)) { 48 - val |= CPTR_EL2_TFP; 46 + val |= CPTR_EL2_TFP | CPTR_EL2_TZ; 49 47 __activate_traps_fpsimd32(vcpu); 50 48 } 51 49 ··· 70 68 static void __deactivate_traps(struct kvm_vcpu *vcpu) 71 69 { 72 70 extern char __kvm_hyp_host_vector[]; 73 - u64 mdcr_el2; 71 + u64 mdcr_el2, cptr; 74 72 75 73 ___deactivate_traps(vcpu); 76 74 ··· 97 95 98 96 mdcr_el2 &= MDCR_EL2_HPMN_MASK; 99 97 mdcr_el2 |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT; 98 + mdcr_el2 |= MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT; 100 99 101 100 write_sysreg(mdcr_el2, mdcr_el2); 102 - if (is_protected_kvm_enabled()) 103 - write_sysreg(HCR_HOST_NVHE_PROTECTED_FLAGS, hcr_el2); 104 - else 105 - write_sysreg(HCR_HOST_NVHE_FLAGS, hcr_el2); 106 - write_sysreg(CPTR_EL2_DEFAULT, cptr_el2); 107 - write_sysreg(__kvm_hyp_host_vector, vbar_el2); 108 - } 101 + write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2); 109 102 110 - static void __load_host_stage2(void) 111 - { 112 - write_sysreg(0, vttbr_el2); 103 + cptr = CPTR_EL2_DEFAULT; 104 + if (vcpu_has_sve(vcpu) && (vcpu->arch.flags & KVM_ARM64_FP_ENABLED)) 105 + cptr |= CPTR_EL2_TZ; 106 + 107 + write_sysreg(cptr, cptr_el2); 108 + write_sysreg(__kvm_hyp_host_vector, vbar_el2); 113 109 } 114 110 115 111 /* Save VGICv3 state on non-VHE systems */
+3 -1
arch/arm64/kvm/hyp/nvhe/tlb.c
··· 8 8 #include <asm/kvm_mmu.h> 9 9 #include <asm/tlbflush.h> 10 10 11 + #include <nvhe/mem_protect.h> 12 + 11 13 struct tlb_inv_context { 12 14 u64 tcr; 13 15 }; ··· 45 43 46 44 static void __tlb_switch_to_host(struct tlb_inv_context *cxt) 47 45 { 48 - write_sysreg(0, vttbr_el2); 46 + __load_host_stage2(); 49 47 50 48 if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { 51 49 /* Ensure write of the host VMID */
+319 -93
arch/arm64/kvm/hyp/pgtable.c
··· 9 9 10 10 #include <linux/bitfield.h> 11 11 #include <asm/kvm_pgtable.h> 12 - 13 - #define KVM_PGTABLE_MAX_LEVELS 4U 12 + #include <asm/stage2_pgtable.h> 14 13 15 14 #define KVM_PTE_VALID BIT(0) 16 15 ··· 48 49 KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ 49 50 KVM_PTE_LEAF_ATTR_HI_S2_XN) 50 51 52 + #define KVM_PTE_LEAF_ATTR_S2_IGNORED GENMASK(58, 55) 53 + 54 + #define KVM_INVALID_PTE_OWNER_MASK GENMASK(63, 56) 55 + #define KVM_MAX_OWNER_ID 1 56 + 51 57 struct kvm_pgtable_walk_data { 52 58 struct kvm_pgtable *pgt; 53 59 struct kvm_pgtable_walker *walker; ··· 72 68 return BIT(kvm_granule_shift(level)); 73 69 } 74 70 75 - static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level) 76 - { 77 - u64 granule = kvm_granule_size(level); 71 + #define KVM_PHYS_INVALID (-1ULL) 78 72 73 + static bool kvm_phys_is_valid(u64 phys) 74 + { 75 + return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_PARANGE_MAX)); 76 + } 77 + 78 + static bool kvm_level_supports_block_mapping(u32 level) 79 + { 79 80 /* 80 81 * Reject invalid block mappings and don't bother with 4TB mappings for 81 82 * 52-bit PAs. 
82 83 */ 83 - if (level == 0 || (PAGE_SIZE != SZ_4K && level == 1)) 84 + return !(level == 0 || (PAGE_SIZE != SZ_4K && level == 1)); 85 + } 86 + 87 + static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level) 88 + { 89 + u64 granule = kvm_granule_size(level); 90 + 91 + if (!kvm_level_supports_block_mapping(level)) 84 92 return false; 85 93 86 94 if (granule > (end - addr)) 87 95 return false; 88 96 89 - return IS_ALIGNED(addr, granule) && IS_ALIGNED(phys, granule); 97 + if (kvm_phys_is_valid(phys) && !IS_ALIGNED(phys, granule)) 98 + return false; 99 + 100 + return IS_ALIGNED(addr, granule); 90 101 } 91 102 92 103 static u32 kvm_pgtable_idx(struct kvm_pgtable_walk_data *data, u32 level) ··· 171 152 return pte; 172 153 } 173 154 174 - static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte) 155 + static kvm_pte_t *kvm_pte_follow(kvm_pte_t pte, struct kvm_pgtable_mm_ops *mm_ops) 175 156 { 176 - return __va(kvm_pte_to_phys(pte)); 157 + return mm_ops->phys_to_virt(kvm_pte_to_phys(pte)); 177 158 } 178 159 179 - static void kvm_set_invalid_pte(kvm_pte_t *ptep) 160 + static void kvm_clear_pte(kvm_pte_t *ptep) 180 161 { 181 - kvm_pte_t pte = *ptep; 182 - WRITE_ONCE(*ptep, pte & ~KVM_PTE_VALID); 162 + WRITE_ONCE(*ptep, 0); 183 163 } 184 164 185 - static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp) 165 + static void kvm_set_table_pte(kvm_pte_t *ptep, kvm_pte_t *childp, 166 + struct kvm_pgtable_mm_ops *mm_ops) 186 167 { 187 - kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(__pa(childp)); 168 + kvm_pte_t old = *ptep, pte = kvm_phys_to_pte(mm_ops->virt_to_phys(childp)); 188 169 189 170 pte |= FIELD_PREP(KVM_PTE_TYPE, KVM_PTE_TYPE_TABLE); 190 171 pte |= KVM_PTE_VALID; ··· 204 185 pte |= KVM_PTE_VALID; 205 186 206 187 return pte; 188 + } 189 + 190 + static kvm_pte_t kvm_init_invalid_leaf_owner(u8 owner_id) 191 + { 192 + return FIELD_PREP(KVM_INVALID_PTE_OWNER_MASK, owner_id); 207 193 } 208 194 209 195 static int kvm_pgtable_visitor_cb(struct 
kvm_pgtable_walk_data *data, u64 addr, ··· 252 228 goto out; 253 229 } 254 230 255 - childp = kvm_pte_follow(pte); 231 + childp = kvm_pte_follow(pte, data->pgt->mm_ops); 256 232 ret = __kvm_pgtable_walk(data, childp, level + 1); 257 233 if (ret) 258 234 goto out; ··· 327 303 } 328 304 329 305 struct hyp_map_data { 330 - u64 phys; 331 - kvm_pte_t attr; 306 + u64 phys; 307 + kvm_pte_t attr; 308 + struct kvm_pgtable_mm_ops *mm_ops; 332 309 }; 333 310 334 - static int hyp_map_set_prot_attr(enum kvm_pgtable_prot prot, 335 - struct hyp_map_data *data) 311 + static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) 336 312 { 337 313 bool device = prot & KVM_PGTABLE_PROT_DEVICE; 338 314 u32 mtype = device ? MT_DEVICE_nGnRE : MT_NORMAL; ··· 357 333 attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap); 358 334 attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh); 359 335 attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF; 360 - data->attr = attr; 336 + *ptep = attr; 337 + 361 338 return 0; 362 339 } 363 340 ··· 384 359 enum kvm_pgtable_walk_flags flag, void * const arg) 385 360 { 386 361 kvm_pte_t *childp; 362 + struct hyp_map_data *data = arg; 363 + struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; 387 364 388 365 if (hyp_map_walker_try_leaf(addr, end, level, ptep, arg)) 389 366 return 0; ··· 393 366 if (WARN_ON(level == KVM_PGTABLE_MAX_LEVELS - 1)) 394 367 return -EINVAL; 395 368 396 - childp = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL); 369 + childp = (kvm_pte_t *)mm_ops->zalloc_page(NULL); 397 370 if (!childp) 398 371 return -ENOMEM; 399 372 400 - kvm_set_table_pte(ptep, childp); 373 + kvm_set_table_pte(ptep, childp, mm_ops); 401 374 return 0; 402 375 } 403 376 ··· 407 380 int ret; 408 381 struct hyp_map_data map_data = { 409 382 .phys = ALIGN_DOWN(phys, PAGE_SIZE), 383 + .mm_ops = pgt->mm_ops, 410 384 }; 411 385 struct kvm_pgtable_walker walker = { 412 386 .cb = hyp_map_walker, ··· 415 387 .arg = &map_data, 416 388 }; 417 389 418 - ret = hyp_map_set_prot_attr(prot, 
&map_data); 390 + ret = hyp_set_prot_attr(prot, &map_data.attr); 419 391 if (ret) 420 392 return ret; 421 393 ··· 425 397 return ret; 426 398 } 427 399 428 - int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits) 400 + int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, 401 + struct kvm_pgtable_mm_ops *mm_ops) 429 402 { 430 403 u64 levels = ARM64_HW_PGTABLE_LEVELS(va_bits); 431 404 432 - pgt->pgd = (kvm_pte_t *)get_zeroed_page(GFP_KERNEL); 405 + pgt->pgd = (kvm_pte_t *)mm_ops->zalloc_page(NULL); 433 406 if (!pgt->pgd) 434 407 return -ENOMEM; 435 408 436 409 pgt->ia_bits = va_bits; 437 410 pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels; 411 + pgt->mm_ops = mm_ops; 438 412 pgt->mmu = NULL; 439 413 return 0; 440 414 } ··· 444 414 static int hyp_free_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, 445 415 enum kvm_pgtable_walk_flags flag, void * const arg) 446 416 { 447 - free_page((unsigned long)kvm_pte_follow(*ptep)); 417 + struct kvm_pgtable_mm_ops *mm_ops = arg; 418 + 419 + mm_ops->put_page((void *)kvm_pte_follow(*ptep, mm_ops)); 448 420 return 0; 449 421 } 450 422 ··· 455 423 struct kvm_pgtable_walker walker = { 456 424 .cb = hyp_free_walker, 457 425 .flags = KVM_PGTABLE_WALK_TABLE_POST, 426 + .arg = pgt->mm_ops, 458 427 }; 459 428 460 429 WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); 461 - free_page((unsigned long)pgt->pgd); 430 + pgt->mm_ops->put_page(pgt->pgd); 462 431 pgt->pgd = NULL; 463 432 } 464 433 465 434 struct stage2_map_data { 466 435 u64 phys; 467 436 kvm_pte_t attr; 437 + u8 owner_id; 468 438 469 439 kvm_pte_t *anchor; 440 + kvm_pte_t *childp; 470 441 471 442 struct kvm_s2_mmu *mmu; 472 - struct kvm_mmu_memory_cache *memcache; 443 + void *memcache; 444 + 445 + struct kvm_pgtable_mm_ops *mm_ops; 473 446 }; 474 447 475 - static int stage2_map_set_prot_attr(enum kvm_pgtable_prot prot, 476 - struct stage2_map_data *data) 448 + u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) 449 + { 450 + u64 vtcr = 
VTCR_EL2_FLAGS; 451 + u8 lvls; 452 + 453 + vtcr |= kvm_get_parange(mmfr0) << VTCR_EL2_PS_SHIFT; 454 + vtcr |= VTCR_EL2_T0SZ(phys_shift); 455 + /* 456 + * Use a minimum 2 level page table to prevent splitting 457 + * host PMD huge pages at stage2. 458 + */ 459 + lvls = stage2_pgtable_levels(phys_shift); 460 + if (lvls < 2) 461 + lvls = 2; 462 + vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls); 463 + 464 + /* 465 + * Enable the Hardware Access Flag management, unconditionally 466 + * on all CPUs. The feature is RES0 on CPUs without the support 467 + * and must be ignored by the CPUs. 468 + */ 469 + vtcr |= VTCR_EL2_HA; 470 + 471 + /* Set the vmid bits */ 472 + vtcr |= (get_vmid_bits(mmfr1) == 16) ? 473 + VTCR_EL2_VS_16BIT : 474 + VTCR_EL2_VS_8BIT; 475 + 476 + return vtcr; 477 + } 478 + 479 + static bool stage2_has_fwb(struct kvm_pgtable *pgt) 480 + { 481 + if (!cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) 482 + return false; 483 + 484 + return !(pgt->flags & KVM_PGTABLE_S2_NOFWB); 485 + } 486 + 487 + #define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt)) 488 + 489 + static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot, 490 + kvm_pte_t *ptep) 477 491 { 478 492 bool device = prot & KVM_PGTABLE_PROT_DEVICE; 479 - kvm_pte_t attr = device ? PAGE_S2_MEMATTR(DEVICE_nGnRE) : 480 - PAGE_S2_MEMATTR(NORMAL); 493 + kvm_pte_t attr = device ? 
KVM_S2_MEMATTR(pgt, DEVICE_nGnRE) : 494 + KVM_S2_MEMATTR(pgt, NORMAL); 481 495 u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS; 482 496 483 497 if (!(prot & KVM_PGTABLE_PROT_X)) ··· 539 461 540 462 attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh); 541 463 attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF; 542 - data->attr = attr; 464 + *ptep = attr; 465 + 543 466 return 0; 467 + } 468 + 469 + static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new) 470 + { 471 + if (!kvm_pte_valid(old) || !kvm_pte_valid(new)) 472 + return true; 473 + 474 + return ((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS)); 475 + } 476 + 477 + static bool stage2_pte_is_counted(kvm_pte_t pte) 478 + { 479 + /* 480 + * The refcount tracks valid entries as well as invalid entries if they 481 + * encode ownership of a page to another entity than the page-table 482 + * owner, whose id is 0. 483 + */ 484 + return !!pte; 485 + } 486 + 487 + static void stage2_put_pte(kvm_pte_t *ptep, struct kvm_s2_mmu *mmu, u64 addr, 488 + u32 level, struct kvm_pgtable_mm_ops *mm_ops) 489 + { 490 + /* 491 + * Clear the existing PTE, and perform break-before-make with 492 + * TLB maintenance if it was valid. 
493 + */ 494 + if (kvm_pte_valid(*ptep)) { 495 + kvm_clear_pte(ptep); 496 + kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level); 497 + } 498 + 499 + mm_ops->put_page(ptep); 544 500 } 545 501 546 502 static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, ··· 583 471 { 584 472 kvm_pte_t new, old = *ptep; 585 473 u64 granule = kvm_granule_size(level), phys = data->phys; 586 - struct page *page = virt_to_page(ptep); 474 + struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; 587 475 588 476 if (!kvm_block_mapping_supported(addr, end, phys, level)) 589 477 return -E2BIG; 590 478 591 - new = kvm_init_valid_leaf_pte(phys, data->attr, level); 592 - if (kvm_pte_valid(old)) { 479 + if (kvm_phys_is_valid(phys)) 480 + new = kvm_init_valid_leaf_pte(phys, data->attr, level); 481 + else 482 + new = kvm_init_invalid_leaf_owner(data->owner_id); 483 + 484 + if (stage2_pte_is_counted(old)) { 593 485 /* 594 486 * Skip updating the PTE if we are trying to recreate the exact 595 487 * same mapping or only change the access permissions. Instead, 596 488 * the vCPU will exit one more time from guest if still needed 597 489 * and then go through the path of relaxing permissions. 598 490 */ 599 - if (!((old ^ new) & (~KVM_PTE_LEAF_ATTR_S2_PERMS))) 491 + if (!stage2_pte_needs_update(old, new)) 600 492 return -EAGAIN; 601 493 602 - /* 603 - * There's an existing different valid leaf entry, so perform 604 - * break-before-make. 
605 - */ 606 - kvm_set_invalid_pte(ptep); 607 - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level); 608 - put_page(page); 494 + stage2_put_pte(ptep, data->mmu, addr, level, mm_ops); 609 495 } 610 496 611 497 smp_store_release(ptep, new); 612 - get_page(page); 613 - data->phys += granule; 498 + if (stage2_pte_is_counted(new)) 499 + mm_ops->get_page(ptep); 500 + if (kvm_phys_is_valid(phys)) 501 + data->phys += granule; 614 502 return 0; 615 503 } 616 504 ··· 624 512 if (!kvm_block_mapping_supported(addr, end, data->phys, level)) 625 513 return 0; 626 514 627 - kvm_set_invalid_pte(ptep); 515 + data->childp = kvm_pte_follow(*ptep, data->mm_ops); 516 + kvm_clear_pte(ptep); 628 517 629 518 /* 630 519 * Invalidate the whole stage-2, as we may have numerous leaf ··· 640 527 static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, 641 528 struct stage2_map_data *data) 642 529 { 643 - int ret; 530 + struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; 644 531 kvm_pte_t *childp, pte = *ptep; 645 - struct page *page = virt_to_page(ptep); 532 + int ret; 646 533 647 534 if (data->anchor) { 648 - if (kvm_pte_valid(pte)) 649 - put_page(page); 535 + if (stage2_pte_is_counted(pte)) 536 + mm_ops->put_page(ptep); 650 537 651 538 return 0; 652 539 } ··· 661 548 if (!data->memcache) 662 549 return -ENOMEM; 663 550 664 - childp = kvm_mmu_memory_cache_alloc(data->memcache); 551 + childp = mm_ops->zalloc_page(data->memcache); 665 552 if (!childp) 666 553 return -ENOMEM; 667 554 ··· 670 557 * a table. Accesses beyond 'end' that fall within the new table 671 558 * will be mapped lazily. 
672 559 */ 673 - if (kvm_pte_valid(pte)) { 674 - kvm_set_invalid_pte(ptep); 675 - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, data->mmu, addr, level); 676 - put_page(page); 677 - } 560 + if (stage2_pte_is_counted(pte)) 561 + stage2_put_pte(ptep, data->mmu, addr, level, mm_ops); 678 562 679 - kvm_set_table_pte(ptep, childp); 680 - get_page(page); 563 + kvm_set_table_pte(ptep, childp, mm_ops); 564 + mm_ops->get_page(ptep); 681 565 682 566 return 0; 683 567 } ··· 683 573 kvm_pte_t *ptep, 684 574 struct stage2_map_data *data) 685 575 { 576 + struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; 577 + kvm_pte_t *childp; 686 578 int ret = 0; 687 579 688 580 if (!data->anchor) 689 581 return 0; 690 582 691 - free_page((unsigned long)kvm_pte_follow(*ptep)); 692 - put_page(virt_to_page(ptep)); 693 - 694 583 if (data->anchor == ptep) { 584 + childp = data->childp; 695 585 data->anchor = NULL; 586 + data->childp = NULL; 696 587 ret = stage2_map_walk_leaf(addr, end, level, ptep, data); 588 + } else { 589 + childp = kvm_pte_follow(*ptep, mm_ops); 697 590 } 591 + 592 + mm_ops->put_page(childp); 593 + mm_ops->put_page(ptep); 698 594 699 595 return ret; 700 596 } ··· 743 627 744 628 int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, 745 629 u64 phys, enum kvm_pgtable_prot prot, 746 - struct kvm_mmu_memory_cache *mc) 630 + void *mc) 747 631 { 748 632 int ret; 749 633 struct stage2_map_data map_data = { 750 634 .phys = ALIGN_DOWN(phys, PAGE_SIZE), 751 635 .mmu = pgt->mmu, 752 636 .memcache = mc, 637 + .mm_ops = pgt->mm_ops, 753 638 }; 754 639 struct kvm_pgtable_walker walker = { 755 640 .cb = stage2_map_walker, ··· 760 643 .arg = &map_data, 761 644 }; 762 645 763 - ret = stage2_map_set_prot_attr(prot, &map_data); 646 + if (WARN_ON((pgt->flags & KVM_PGTABLE_S2_IDMAP) && (addr != phys))) 647 + return -EINVAL; 648 + 649 + ret = stage2_set_prot_attr(pgt, prot, &map_data.attr); 764 650 if (ret) 765 651 return ret; 766 652 ··· 772 652 return ret; 773 653 } 774 654 775 - static 
void stage2_flush_dcache(void *addr, u64 size) 655 + int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, 656 + void *mc, u8 owner_id) 776 657 { 777 - if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) 778 - return; 658 + int ret; 659 + struct stage2_map_data map_data = { 660 + .phys = KVM_PHYS_INVALID, 661 + .mmu = pgt->mmu, 662 + .memcache = mc, 663 + .mm_ops = pgt->mm_ops, 664 + .owner_id = owner_id, 665 + }; 666 + struct kvm_pgtable_walker walker = { 667 + .cb = stage2_map_walker, 668 + .flags = KVM_PGTABLE_WALK_TABLE_PRE | 669 + KVM_PGTABLE_WALK_LEAF | 670 + KVM_PGTABLE_WALK_TABLE_POST, 671 + .arg = &map_data, 672 + }; 779 673 780 - __flush_dcache_area(addr, size); 674 + if (owner_id > KVM_MAX_OWNER_ID) 675 + return -EINVAL; 676 + 677 + ret = kvm_pgtable_walk(pgt, addr, size, &walker); 678 + return ret; 781 679 } 782 680 783 - static bool stage2_pte_cacheable(kvm_pte_t pte) 681 + static bool stage2_pte_cacheable(struct kvm_pgtable *pgt, kvm_pte_t pte) 784 682 { 785 683 u64 memattr = pte & KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR; 786 - return memattr == PAGE_S2_MEMATTR(NORMAL); 684 + return memattr == KVM_S2_MEMATTR(pgt, NORMAL); 787 685 } 788 686 789 687 static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, 790 688 enum kvm_pgtable_walk_flags flag, 791 689 void * const arg) 792 690 { 793 - struct kvm_s2_mmu *mmu = arg; 691 + struct kvm_pgtable *pgt = arg; 692 + struct kvm_s2_mmu *mmu = pgt->mmu; 693 + struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; 794 694 kvm_pte_t pte = *ptep, *childp = NULL; 795 695 bool need_flush = false; 796 696 797 - if (!kvm_pte_valid(pte)) 697 + if (!kvm_pte_valid(pte)) { 698 + if (stage2_pte_is_counted(pte)) { 699 + kvm_clear_pte(ptep); 700 + mm_ops->put_page(ptep); 701 + } 798 702 return 0; 703 + } 799 704 800 705 if (kvm_pte_table(pte, level)) { 801 - childp = kvm_pte_follow(pte); 706 + childp = kvm_pte_follow(pte, mm_ops); 802 707 803 - if (page_count(virt_to_page(childp)) != 1) 708 + if 
(mm_ops->page_count(childp) != 1) 804 709 return 0; 805 - } else if (stage2_pte_cacheable(pte)) { 806 - need_flush = true; 710 + } else if (stage2_pte_cacheable(pgt, pte)) { 711 + need_flush = !stage2_has_fwb(pgt); 807 712 } 808 713 809 714 /* ··· 836 691 * block entry and rely on the remaining portions being faulted 837 692 * back lazily. 838 693 */ 839 - kvm_set_invalid_pte(ptep); 840 - kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, mmu, addr, level); 841 - put_page(virt_to_page(ptep)); 694 + stage2_put_pte(ptep, mmu, addr, level, mm_ops); 842 695 843 696 if (need_flush) { 844 - stage2_flush_dcache(kvm_pte_follow(pte), 697 + __flush_dcache_area(kvm_pte_follow(pte, mm_ops), 845 698 kvm_granule_size(level)); 846 699 } 847 700 848 701 if (childp) 849 - free_page((unsigned long)childp); 702 + mm_ops->put_page(childp); 850 703 851 704 return 0; 852 705 } ··· 853 710 { 854 711 struct kvm_pgtable_walker walker = { 855 712 .cb = stage2_unmap_walker, 856 - .arg = pgt->mmu, 713 + .arg = pgt, 857 714 .flags = KVM_PGTABLE_WALK_LEAF | KVM_PGTABLE_WALK_TABLE_POST, 858 715 }; 859 716 ··· 985 842 enum kvm_pgtable_walk_flags flag, 986 843 void * const arg) 987 844 { 845 + struct kvm_pgtable *pgt = arg; 846 + struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; 988 847 kvm_pte_t pte = *ptep; 989 848 990 - if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pte)) 849 + if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pgt, pte)) 991 850 return 0; 992 851 993 - stage2_flush_dcache(kvm_pte_follow(pte), kvm_granule_size(level)); 852 + __flush_dcache_area(kvm_pte_follow(pte, mm_ops), kvm_granule_size(level)); 994 853 return 0; 995 854 } 996 855 ··· 1001 856 struct kvm_pgtable_walker walker = { 1002 857 .cb = stage2_flush_walker, 1003 858 .flags = KVM_PGTABLE_WALK_LEAF, 859 + .arg = pgt, 1004 860 }; 1005 861 1006 - if (cpus_have_const_cap(ARM64_HAS_STAGE2_FWB)) 862 + if (stage2_has_fwb(pgt)) 1007 863 return 0; 1008 864 1009 865 return kvm_pgtable_walk(pgt, addr, size, &walker); 1010 866 } 1011 867 1012 
- int kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm *kvm) 868 + int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch, 869 + struct kvm_pgtable_mm_ops *mm_ops, 870 + enum kvm_pgtable_stage2_flags flags) 1013 871 { 1014 872 size_t pgd_sz; 1015 - u64 vtcr = kvm->arch.vtcr; 873 + u64 vtcr = arch->vtcr; 1016 874 u32 ia_bits = VTCR_EL2_IPA(vtcr); 1017 875 u32 sl0 = FIELD_GET(VTCR_EL2_SL0_MASK, vtcr); 1018 876 u32 start_level = VTCR_EL2_TGRAN_SL0_BASE - sl0; 1019 877 1020 878 pgd_sz = kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE; 1021 - pgt->pgd = alloc_pages_exact(pgd_sz, GFP_KERNEL_ACCOUNT | __GFP_ZERO); 879 + pgt->pgd = mm_ops->zalloc_pages_exact(pgd_sz); 1022 880 if (!pgt->pgd) 1023 881 return -ENOMEM; 1024 882 1025 883 pgt->ia_bits = ia_bits; 1026 884 pgt->start_level = start_level; 1027 - pgt->mmu = &kvm->arch.mmu; 885 + pgt->mm_ops = mm_ops; 886 + pgt->mmu = &arch->mmu; 887 + pgt->flags = flags; 1028 888 1029 889 /* Ensure zeroed PGD pages are visible to the hardware walker */ 1030 890 dsb(ishst); ··· 1040 890 enum kvm_pgtable_walk_flags flag, 1041 891 void * const arg) 1042 892 { 893 + struct kvm_pgtable_mm_ops *mm_ops = arg; 1043 894 kvm_pte_t pte = *ptep; 1044 895 1045 - if (!kvm_pte_valid(pte)) 896 + if (!stage2_pte_is_counted(pte)) 1046 897 return 0; 1047 898 1048 - put_page(virt_to_page(ptep)); 899 + mm_ops->put_page(ptep); 1049 900 1050 901 if (kvm_pte_table(pte, level)) 1051 - free_page((unsigned long)kvm_pte_follow(pte)); 902 + mm_ops->put_page(kvm_pte_follow(pte, mm_ops)); 1052 903 1053 904 return 0; 1054 905 } ··· 1061 910 .cb = stage2_free_walker, 1062 911 .flags = KVM_PGTABLE_WALK_LEAF | 1063 912 KVM_PGTABLE_WALK_TABLE_POST, 913 + .arg = pgt->mm_ops, 1064 914 }; 1065 915 1066 916 WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker)); 1067 917 pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE; 1068 - free_pages_exact(pgt->pgd, pgd_sz); 918 + pgt->mm_ops->free_pages_exact(pgt->pgd, 
pgd_sz); 1069 919 pgt->pgd = NULL; 920 + } 921 + 922 + #define KVM_PTE_LEAF_S2_COMPAT_MASK (KVM_PTE_LEAF_ATTR_S2_PERMS | \ 923 + KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR | \ 924 + KVM_PTE_LEAF_ATTR_S2_IGNORED) 925 + 926 + static int stage2_check_permission_walker(u64 addr, u64 end, u32 level, 927 + kvm_pte_t *ptep, 928 + enum kvm_pgtable_walk_flags flag, 929 + void * const arg) 930 + { 931 + kvm_pte_t old_attr, pte = *ptep, *new_attr = arg; 932 + 933 + /* 934 + * Compatible mappings are either invalid and owned by the page-table 935 + * owner (whose id is 0), or valid with matching permission attributes. 936 + */ 937 + if (kvm_pte_valid(pte)) { 938 + old_attr = pte & KVM_PTE_LEAF_S2_COMPAT_MASK; 939 + if (old_attr != *new_attr) 940 + return -EEXIST; 941 + } else if (pte) { 942 + return -EEXIST; 943 + } 944 + 945 + return 0; 946 + } 947 + 948 + int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr, 949 + enum kvm_pgtable_prot prot, 950 + struct kvm_mem_range *range) 951 + { 952 + kvm_pte_t attr; 953 + struct kvm_pgtable_walker check_perm_walker = { 954 + .cb = stage2_check_permission_walker, 955 + .flags = KVM_PGTABLE_WALK_LEAF, 956 + .arg = &attr, 957 + }; 958 + u64 granule, start, end; 959 + u32 level; 960 + int ret; 961 + 962 + ret = stage2_set_prot_attr(pgt, prot, &attr); 963 + if (ret) 964 + return ret; 965 + attr &= KVM_PTE_LEAF_S2_COMPAT_MASK; 966 + 967 + for (level = pgt->start_level; level < KVM_PGTABLE_MAX_LEVELS; level++) { 968 + granule = kvm_granule_size(level); 969 + start = ALIGN_DOWN(addr, granule); 970 + end = start + granule; 971 + 972 + if (!kvm_level_supports_block_mapping(level)) 973 + continue; 974 + 975 + if (start < range->start || range->end < end) 976 + continue; 977 + 978 + /* 979 + * Check the presence of existing mappings with incompatible 980 + * permissions within the current block range, and try one level 981 + * deeper if one is found. 
982 + */ 983 + ret = kvm_pgtable_walk(pgt, start, granule, &check_perm_walker); 984 + if (ret != -EEXIST) 985 + break; 986 + } 987 + 988 + if (!ret) { 989 + range->start = start; 990 + range->end = end; 991 + } 992 + 993 + return ret; 1070 994 }
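The `kvm_pgtable_stage2_find_range()` hunk above searches for the largest block mapping that fits: for each level, the candidate block is the granule-aligned region around `addr`, and the walk drops one level deeper whenever that block sticks out of the permitted range (or, in the kernel, holds incompatible mappings). A minimal standalone sketch of just the alignment arithmetic, assuming a 4KiB granule and levels 0-3 (`granule_size` and `find_block_level` are hypothetical names, not kernel API, and the permission walk is omitted):

```c
#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define MAX_LEVELS 4

/* Bytes covered by one entry at a given level (4KiB granule, levels 0..3). */
static uint64_t granule_size(uint32_t level)
{
	return 1ULL << (PAGE_SHIFT + 9 * (3 - level));
}

/*
 * Walk from the coarsest level down, returning the first level whose
 * granule-aligned block around addr fits entirely inside [r_start, r_end).
 */
static uint32_t find_block_level(uint64_t addr, uint64_t r_start,
				 uint64_t r_end, uint32_t start_level)
{
	for (uint32_t level = start_level; level < MAX_LEVELS; level++) {
		uint64_t granule = granule_size(level);
		uint64_t start = addr & ~(granule - 1);	/* ALIGN_DOWN */
		uint64_t end = start + granule;

		if (start < r_start || r_end < end)
			continue;	/* block sticks out; go one level deeper */
		return level;
	}
	return MAX_LEVELS - 1;	/* a PAGE_SIZE mapping always fits */
}
```

With a 2MiB-aligned, 2MiB-wide window the search settles on level 2; shrink the window to one page and it falls through to level 3, mirroring the "try one level deeper" comment in the diff.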
+113
arch/arm64/kvm/hyp/reserved_mem.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * Copyright (C) 2020 - Google LLC 4 + * Author: Quentin Perret <qperret@google.com> 5 + */ 6 + 7 + #include <linux/kvm_host.h> 8 + #include <linux/memblock.h> 9 + #include <linux/sort.h> 10 + 11 + #include <asm/kvm_host.h> 12 + 13 + #include <nvhe/memory.h> 14 + #include <nvhe/mm.h> 15 + 16 + static struct memblock_region *hyp_memory = kvm_nvhe_sym(hyp_memory); 17 + static unsigned int *hyp_memblock_nr_ptr = &kvm_nvhe_sym(hyp_memblock_nr); 18 + 19 + phys_addr_t hyp_mem_base; 20 + phys_addr_t hyp_mem_size; 21 + 22 + static int cmp_hyp_memblock(const void *p1, const void *p2) 23 + { 24 + const struct memblock_region *r1 = p1; 25 + const struct memblock_region *r2 = p2; 26 + 27 + return r1->base < r2->base ? -1 : (r1->base > r2->base); 28 + } 29 + 30 + static void __init sort_memblock_regions(void) 31 + { 32 + sort(hyp_memory, 33 + *hyp_memblock_nr_ptr, 34 + sizeof(struct memblock_region), 35 + cmp_hyp_memblock, 36 + NULL); 37 + } 38 + 39 + static int __init register_memblock_regions(void) 40 + { 41 + struct memblock_region *reg; 42 + 43 + for_each_mem_region(reg) { 44 + if (*hyp_memblock_nr_ptr >= HYP_MEMBLOCK_REGIONS) 45 + return -ENOMEM; 46 + 47 + hyp_memory[*hyp_memblock_nr_ptr] = *reg; 48 + (*hyp_memblock_nr_ptr)++; 49 + } 50 + sort_memblock_regions(); 51 + 52 + return 0; 53 + } 54 + 55 + void __init kvm_hyp_reserve(void) 56 + { 57 + u64 nr_pages, prev, hyp_mem_pages = 0; 58 + int ret; 59 + 60 + if (!is_hyp_mode_available() || is_kernel_in_hyp_mode()) 61 + return; 62 + 63 + if (kvm_get_mode() != KVM_MODE_PROTECTED) 64 + return; 65 + 66 + ret = register_memblock_regions(); 67 + if (ret) { 68 + *hyp_memblock_nr_ptr = 0; 69 + kvm_err("Failed to register hyp memblocks: %d\n", ret); 70 + return; 71 + } 72 + 73 + hyp_mem_pages += hyp_s1_pgtable_pages(); 74 + hyp_mem_pages += host_s2_mem_pgtable_pages(); 75 + hyp_mem_pages += host_s2_dev_pgtable_pages(); 76 + 77 + /* 78 + * The hyp_vmemmap needs to be backed by 
pages, but these pages 79 + * themselves need to be present in the vmemmap, so compute the number 80 + * of pages needed by looking for a fixed point. 81 + */ 82 + nr_pages = 0; 83 + do { 84 + prev = nr_pages; 85 + nr_pages = hyp_mem_pages + prev; 86 + nr_pages = DIV_ROUND_UP(nr_pages * sizeof(struct hyp_page), PAGE_SIZE); 87 + nr_pages += __hyp_pgtable_max_pages(nr_pages); 88 + } while (nr_pages != prev); 89 + hyp_mem_pages += nr_pages; 90 + 91 + /* 92 + * Try to allocate a PMD-aligned region to reduce TLB pressure once 93 + * this is unmapped from the host stage-2, and fallback to PAGE_SIZE. 94 + */ 95 + hyp_mem_size = hyp_mem_pages << PAGE_SHIFT; 96 + hyp_mem_base = memblock_find_in_range(0, memblock_end_of_DRAM(), 97 + ALIGN(hyp_mem_size, PMD_SIZE), 98 + PMD_SIZE); 99 + if (!hyp_mem_base) 100 + hyp_mem_base = memblock_find_in_range(0, memblock_end_of_DRAM(), 101 + hyp_mem_size, PAGE_SIZE); 102 + else 103 + hyp_mem_size = ALIGN(hyp_mem_size, PMD_SIZE); 104 + 105 + if (!hyp_mem_base) { 106 + kvm_err("Failed to reserve hyp memory\n"); 107 + return; 108 + } 109 + memblock_reserve(hyp_mem_base, hyp_mem_size); 110 + 111 + kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20, 112 + hyp_mem_base); 113 + }
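The `kvm_hyp_reserve()` comment above describes a fixed point: the hyp vmemmap must describe its own backing pages, so the page count is iterated until it stops growing. A self-contained sketch of that iteration, assuming a hypothetical 4-byte per-page metadata struct and dropping the kernel's extra `__hyp_pgtable_max_pages()` term:

```c
#include <assert.h>
#include <stdint.h>

#define PAGE_SIZE 4096ULL
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Hypothetical per-page metadata size; the real struct hyp_page differs. */
#define HYP_PAGE_META_SZ 4ULL

/*
 * Iterate until the number of vmemmap pages needed to describe
 * (hyp_mem_pages + vmemmap pages) stops changing between rounds.
 */
static uint64_t vmemmap_fixed_point(uint64_t hyp_mem_pages)
{
	uint64_t nr_pages = 0, prev;

	do {
		prev = nr_pages;
		nr_pages = hyp_mem_pages + prev;
		nr_pages = DIV_ROUND_UP(nr_pages * HYP_PAGE_META_SZ, PAGE_SIZE);
	} while (nr_pages != prev);

	return nr_pages;
}
```

For 1024 pages of hyp memory this converges in three rounds (0 → 1 → 2 → 2): one metadata page is not quite enough once the metadata page itself needs describing, so the loop settles on two.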
+1 -3
arch/arm64/kvm/hyp/vhe/switch.c
··· 27 27 #include <asm/processor.h> 28 28 #include <asm/thread_info.h> 29 29 30 - const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n"; 31 - 32 30 /* VHE specific context */ 33 31 DEFINE_PER_CPU(struct kvm_host_data, kvm_host_data); 34 32 DEFINE_PER_CPU(struct kvm_cpu_context, kvm_hyp_ctxt); ··· 205 207 __deactivate_traps(vcpu); 206 208 sysreg_restore_host_state_vhe(host_ctxt); 207 209 208 - panic(__hyp_panic_string, 210 + panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n", 209 211 spsr, elr, 210 212 read_sysreg_el2(SYS_ESR), read_sysreg_el2(SYS_FAR), 211 213 read_sysreg(hpfar_el2), par, vcpu);
+71 -9
arch/arm64/kvm/hypercalls.c
··· 9 9 #include <kvm/arm_hypercalls.h> 10 10 #include <kvm/arm_psci.h> 11 11 12 + static void kvm_ptp_get_time(struct kvm_vcpu *vcpu, u64 *val) 13 + { 14 + struct system_time_snapshot systime_snapshot; 15 + u64 cycles = ~0UL; 16 + u32 feature; 17 + 18 + /* 19 + * system time and counter value must be captured at the same 20 + * time to keep consistency and precision. 21 + */ 22 + ktime_get_snapshot(&systime_snapshot); 23 + 24 + /* 25 + * This is only valid if the current clocksource is the 26 + * architected counter, as this is the only one the guest 27 + * can see. 28 + */ 29 + if (systime_snapshot.cs_id != CSID_ARM_ARCH_COUNTER) 30 + return; 31 + 32 + /* 33 + * The guest selects one of the two reference counters 34 + * (virtual or physical) with the first argument of the SMCCC 35 + * call. In case the identifier is not supported, error out. 36 + */ 37 + feature = smccc_get_arg1(vcpu); 38 + switch (feature) { 39 + case KVM_PTP_VIRT_COUNTER: 40 + cycles = systime_snapshot.cycles - vcpu_read_sys_reg(vcpu, CNTVOFF_EL2); 41 + break; 42 + case KVM_PTP_PHYS_COUNTER: 43 + cycles = systime_snapshot.cycles; 44 + break; 45 + default: 46 + return; 47 + } 48 + 49 + /* 50 + * This relies on the top bit of val[0] never being set for 51 + * valid values of system time, because that is *really* far 52 + * in the future (about 292 years from 1970, and at that stage 53 + * nobody will give a damn about it).
54 + */ 55 + val[0] = upper_32_bits(systime_snapshot.real); 56 + val[1] = lower_32_bits(systime_snapshot.real); 57 + val[2] = upper_32_bits(cycles); 58 + val[3] = lower_32_bits(cycles); 59 + } 60 + 12 61 int kvm_hvc_call_handler(struct kvm_vcpu *vcpu) 13 62 { 14 63 u32 func_id = smccc_get_function(vcpu); 15 - long val = SMCCC_RET_NOT_SUPPORTED; 64 + u64 val[4] = {SMCCC_RET_NOT_SUPPORTED}; 16 65 u32 feature; 17 66 gpa_t gpa; 18 67 19 68 switch (func_id) { 20 69 case ARM_SMCCC_VERSION_FUNC_ID: 21 - val = ARM_SMCCC_VERSION_1_1; 70 + val[0] = ARM_SMCCC_VERSION_1_1; 22 71 break; 23 72 case ARM_SMCCC_ARCH_FEATURES_FUNC_ID: 24 73 feature = smccc_get_arg1(vcpu); ··· 77 28 case SPECTRE_VULNERABLE: 78 29 break; 79 30 case SPECTRE_MITIGATED: 80 - val = SMCCC_RET_SUCCESS; 31 + val[0] = SMCCC_RET_SUCCESS; 81 32 break; 82 33 case SPECTRE_UNAFFECTED: 83 - val = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED; 34 + val[0] = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED; 84 35 break; 85 36 } 86 37 break; ··· 103 54 break; 104 55 fallthrough; 105 56 case SPECTRE_UNAFFECTED: 106 - val = SMCCC_RET_NOT_REQUIRED; 57 + val[0] = SMCCC_RET_NOT_REQUIRED; 107 58 break; 108 59 } 109 60 break; 110 61 case ARM_SMCCC_HV_PV_TIME_FEATURES: 111 - val = SMCCC_RET_SUCCESS; 62 + val[0] = SMCCC_RET_SUCCESS; 112 63 break; 113 64 } 114 65 break; 115 66 case ARM_SMCCC_HV_PV_TIME_FEATURES: 116 - val = kvm_hypercall_pv_features(vcpu); 67 + val[0] = kvm_hypercall_pv_features(vcpu); 117 68 break; 118 69 case ARM_SMCCC_HV_PV_TIME_ST: 119 70 gpa = kvm_init_stolen_time(vcpu); 120 71 if (gpa != GPA_INVALID) 121 - val = gpa; 72 + val[0] = gpa; 73 + break; 74 + case ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID: 75 + val[0] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0; 76 + val[1] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1; 77 + val[2] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2; 78 + val[3] = ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3; 79 + break; 80 + case ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID: 81 + val[0] = BIT(ARM_SMCCC_KVM_FUNC_FEATURES); 82 + val[0] |= 
BIT(ARM_SMCCC_KVM_FUNC_PTP); 83 + break; 84 + case ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID: 85 + kvm_ptp_get_time(vcpu, val); 122 86 break; 123 87 case ARM_SMCCC_TRNG_VERSION: 124 88 case ARM_SMCCC_TRNG_FEATURES: ··· 143 81 return kvm_psci_call(vcpu); 144 82 } 145 83 146 - smccc_set_retval(vcpu, val, 0, 0, 0); 84 + smccc_set_retval(vcpu, val[0], val[1], val[2], val[3]); 147 85 return 1; 148 86 }
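The PTP hypercall above returns two 64-bit quantities (wall-clock time and counter cycles) split into 32-bit halves across the four SMCCC return registers via `upper_32_bits()`/`lower_32_bits()`. A standalone sketch of the packing and the guest-side reassembly (the kernel keeps `val` as `u64[4]`; `ptp_pack`/`ptp_unpack` are hypothetical helper names):

```c
#include <assert.h>
#include <stdint.h>

static uint32_t upper_32_bits(uint64_t v) { return (uint32_t)(v >> 32); }
static uint32_t lower_32_bits(uint64_t v) { return (uint32_t)v; }

/*
 * Split a system-time/counter pair across four 32-bit return slots,
 * mirroring how kvm_ptp_get_time() fills val[0..3].
 */
static void ptp_pack(uint64_t real_ns, uint64_t cycles, uint32_t val[4])
{
	val[0] = upper_32_bits(real_ns);
	val[1] = lower_32_bits(real_ns);
	val[2] = upper_32_bits(cycles);
	val[3] = lower_32_bits(cycles);
}

/* Guest-side reassembly of one 64-bit quantity from its two halves. */
static uint64_t ptp_unpack(uint32_t hi, uint32_t lo)
{
	return ((uint64_t)hi << 32) | lo;
}
```

This is also why the comment about the top bit of `val[0]` matters: a realistic nanosecond timestamp keeps bit 63 clear, so `val[0]` can never be confused with a negative SMCCC error code.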
+119 -17
arch/arm64/kvm/mmu.c
··· 88 88 return !pfn_valid(pfn); 89 89 } 90 90 91 + static void *stage2_memcache_zalloc_page(void *arg) 92 + { 93 + struct kvm_mmu_memory_cache *mc = arg; 94 + 95 + /* Allocated with __GFP_ZERO, so no need to zero */ 96 + return kvm_mmu_memory_cache_alloc(mc); 97 + } 98 + 99 + static void *kvm_host_zalloc_pages_exact(size_t size) 100 + { 101 + return alloc_pages_exact(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); 102 + } 103 + 104 + static void kvm_host_get_page(void *addr) 105 + { 106 + get_page(virt_to_page(addr)); 107 + } 108 + 109 + static void kvm_host_put_page(void *addr) 110 + { 111 + put_page(virt_to_page(addr)); 112 + } 113 + 114 + static int kvm_host_page_count(void *addr) 115 + { 116 + return page_count(virt_to_page(addr)); 117 + } 118 + 119 + static phys_addr_t kvm_host_pa(void *addr) 120 + { 121 + return __pa(addr); 122 + } 123 + 124 + static void *kvm_host_va(phys_addr_t phys) 125 + { 126 + return __va(phys); 127 + } 128 + 91 129 /* 92 130 * Unmapping vs dcache management: 93 131 * ··· 165 127 static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size, 166 128 bool may_block) 167 129 { 168 - struct kvm *kvm = mmu->kvm; 130 + struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); 169 131 phys_addr_t end = start + size; 170 132 171 133 assert_spin_locked(&kvm->mmu_lock); ··· 221 183 if (hyp_pgtable) { 222 184 kvm_pgtable_hyp_destroy(hyp_pgtable); 223 185 kfree(hyp_pgtable); 186 + hyp_pgtable = NULL; 224 187 } 225 188 mutex_unlock(&kvm_hyp_pgd_mutex); 189 + } 190 + 191 + static bool kvm_host_owns_hyp_mappings(void) 192 + { 193 + if (static_branch_likely(&kvm_protected_mode_initialized)) 194 + return false; 195 + 196 + /* 197 + * This can happen at boot time when __create_hyp_mappings() is called 198 + * after the hyp protection has been enabled, but the static key has 199 + * not been flipped yet. 
200 + */ 201 + if (!hyp_pgtable && is_protected_kvm_enabled()) 202 + return false; 203 + 204 + WARN_ON(!hyp_pgtable); 205 + 206 + return true; 226 207 } 227 208 228 209 static int __create_hyp_mappings(unsigned long start, unsigned long size, 229 210 unsigned long phys, enum kvm_pgtable_prot prot) 230 211 { 231 212 int err; 213 + 214 + if (!kvm_host_owns_hyp_mappings()) { 215 + return kvm_call_hyp_nvhe(__pkvm_create_mappings, 216 + start, size, phys, prot); 217 + } 232 218 233 219 mutex_lock(&kvm_hyp_pgd_mutex); 234 220 err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot); ··· 314 252 { 315 253 unsigned long base; 316 254 int ret = 0; 255 + 256 + if (!kvm_host_owns_hyp_mappings()) { 257 + base = kvm_call_hyp_nvhe(__pkvm_create_private_mapping, 258 + phys_addr, size, prot); 259 + if (IS_ERR_OR_NULL((void *)base)) 260 + return PTR_ERR((void *)base); 261 + *haddr = base; 262 + 263 + return 0; 264 + } 317 265 318 266 mutex_lock(&kvm_hyp_pgd_mutex); 319 267 ··· 423 351 return 0; 424 352 } 425 353 354 + static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { 355 + .zalloc_page = stage2_memcache_zalloc_page, 356 + .zalloc_pages_exact = kvm_host_zalloc_pages_exact, 357 + .free_pages_exact = free_pages_exact, 358 + .get_page = kvm_host_get_page, 359 + .put_page = kvm_host_put_page, 360 + .page_count = kvm_host_page_count, 361 + .phys_to_virt = kvm_host_va, 362 + .virt_to_phys = kvm_host_pa, 363 + }; 364 + 426 365 /** 427 366 * kvm_init_stage2_mmu - Initialise a S2 MMU strucrure 428 367 * @kvm: The pointer to the KVM structure ··· 457 374 if (!pgt) 458 375 return -ENOMEM; 459 376 460 - err = kvm_pgtable_stage2_init(pgt, kvm); 377 + err = kvm_pgtable_stage2_init(pgt, &kvm->arch, &kvm_s2_mm_ops); 461 378 if (err) 462 379 goto out_free_pgtable; 463 380 ··· 470 387 for_each_possible_cpu(cpu) 471 388 *per_cpu_ptr(mmu->last_vcpu_ran, cpu) = -1; 472 389 473 - mmu->kvm = kvm; 390 + mmu->arch = &kvm->arch; 474 391 mmu->pgt = pgt; 475 392 mmu->pgd_phys = __pa(pgt->pgd); 476 393 
mmu->vmid.vmid_gen = 0; ··· 504 421 * +--------------------------------------------+ 505 422 */ 506 423 do { 507 - struct vm_area_struct *vma = find_vma(current->mm, hva); 424 + struct vm_area_struct *vma; 508 425 hva_t vm_start, vm_end; 509 426 510 - if (!vma || vma->vm_start >= reg_end) 427 + vma = find_vma_intersection(current->mm, hva, reg_end); 428 + if (!vma) 511 429 break; 512 430 513 431 /* ··· 553 469 554 470 void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu) 555 471 { 556 - struct kvm *kvm = mmu->kvm; 472 + struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); 557 473 struct kvm_pgtable *pgt = NULL; 558 474 559 475 spin_lock(&kvm->mmu_lock); ··· 622 538 */ 623 539 static void stage2_wp_range(struct kvm_s2_mmu *mmu, phys_addr_t addr, phys_addr_t end) 624 540 { 625 - struct kvm *kvm = mmu->kvm; 541 + struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu); 626 542 stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_wrprotect); 627 543 } 628 544 ··· 639 555 * Acquires kvm_mmu_lock. Called with kvm->slots_lock mutex acquired, 640 556 * serializing operations for VM memory regions. 641 557 */ 642 - void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) 558 + static void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot) 643 559 { 644 560 struct kvm_memslots *slots = kvm_memslots(kvm); 645 561 struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); ··· 926 842 * unmapped afterwards, the call to kvm_unmap_gfn will take it away 927 843 * from us again properly. This smp_rmb() interacts with the smp_wmb() 928 844 * in kvm_mmu_notifier_invalidate_<page|range_end>. 845 + * 846 + * Besides, __gfn_to_pfn_memslot() instead of gfn_to_pfn_prot() is 847 + * used to avoid unnecessary overhead introduced to locate the memory 848 + * slot because it's always fixed even @gfn is adjusted for huge pages. 
929 849 */ 930 850 smp_rmb(); 931 851 932 - pfn = gfn_to_pfn_prot(kvm, gfn, write_fault, &writable); 852 + pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL, 853 + write_fault, &writable, NULL); 933 854 if (pfn == KVM_PFN_ERR_HWPOISON) { 934 855 kvm_send_hwpoison_signal(hva, vma_shift); 935 856 return 0; ··· 1000 911 /* Mark the page dirty only if the fault is handled successfully */ 1001 912 if (writable && !ret) { 1002 913 kvm_set_pfn_dirty(pfn); 1003 - mark_page_dirty(kvm, gfn); 914 + mark_page_dirty_in_slot(kvm, memslot, gfn); 1004 915 } 1005 916 1006 917 out_unlock: ··· 1241 1152 return err; 1242 1153 } 1243 1154 1244 - int kvm_mmu_init(void) 1155 + static void *kvm_hyp_zalloc_page(void *arg) 1156 + { 1157 + return (void *)get_zeroed_page(GFP_KERNEL); 1158 + } 1159 + 1160 + static struct kvm_pgtable_mm_ops kvm_hyp_mm_ops = { 1161 + .zalloc_page = kvm_hyp_zalloc_page, 1162 + .get_page = kvm_host_get_page, 1163 + .put_page = kvm_host_put_page, 1164 + .phys_to_virt = kvm_host_va, 1165 + .virt_to_phys = kvm_host_pa, 1166 + }; 1167 + 1168 + int kvm_mmu_init(u32 *hyp_va_bits) 1245 1169 { 1246 1170 int err; 1247 - u32 hyp_va_bits; 1248 1171 1249 1172 hyp_idmap_start = __pa_symbol(__hyp_idmap_text_start); 1250 1173 hyp_idmap_start = ALIGN_DOWN(hyp_idmap_start, PAGE_SIZE); ··· 1270 1169 */ 1271 1170 BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK); 1272 1171 1273 - hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET); 1274 - kvm_debug("Using %u-bit virtual addresses at EL2\n", hyp_va_bits); 1172 + *hyp_va_bits = 64 - ((idmap_t0sz & TCR_T0SZ_MASK) >> TCR_T0SZ_OFFSET); 1173 + kvm_debug("Using %u-bit virtual addresses at EL2\n", *hyp_va_bits); 1275 1174 kvm_debug("IDMAP page: %lx\n", hyp_idmap_start); 1276 1175 kvm_debug("HYP VA range: %lx:%lx\n", 1277 1176 kern_hyp_va(PAGE_OFFSET), ··· 1296 1195 goto out; 1297 1196 } 1298 1197 1299 - err = kvm_pgtable_hyp_init(hyp_pgtable, hyp_va_bits); 1198 + err = kvm_pgtable_hyp_init(hyp_pgtable, 
*hyp_va_bits, &kvm_hyp_mm_ops); 1300 1199 if (err) 1301 1200 goto out_free_pgtable; 1302 1201 ··· 1374 1273 * +--------------------------------------------+ 1375 1274 */ 1376 1275 do { 1377 - struct vm_area_struct *vma = find_vma(current->mm, hva); 1276 + struct vm_area_struct *vma; 1378 1277 hva_t vm_start, vm_end; 1379 1278 1380 - if (!vma || vma->vm_start >= reg_end) 1279 + vma = find_vma_intersection(current->mm, hva, reg_end); 1280 + if (!vma) 1381 1281 break; 1382 1282 1383 1283 /*
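The mmu.c changes above route every page allocation and refcount operation through a `kvm_pgtable_mm_ops` table, so the same page-table code can run against kernel allocators or the EL2 early allocator in protected mode. A toy illustration of the indirection, assuming a made-up `struct mm_ops` with a refcount header in front of each page (none of these names are the kernel's):

```c
#include <assert.h>
#include <stddef.h>
#include <stdlib.h>

/* Minimal stand-in for kvm_pgtable_mm_ops: callers only ever touch
 * pages through these hooks, never a concrete allocator. */
struct mm_ops {
	void *(*zalloc_page)(void *arg);
	void (*get_page)(void *addr);
	void (*put_page)(void *addr);
};

/* Toy "host" backend: a page with an explicit refcount header. */
struct page {
	int refcount;
	unsigned char data[4096];
};

static void *host_zalloc_page(void *arg)
{
	(void)arg;
	struct page *p = calloc(1, sizeof(*p));	/* zeroed, like __GFP_ZERO */

	if (!p)
		return NULL;
	p->refcount = 1;
	return p->data;
}

/* Recover the header from the data pointer handed to callers. */
static struct page *to_page(void *addr)
{
	return (struct page *)((char *)addr - offsetof(struct page, data));
}

static void host_get_page(void *addr) { to_page(addr)->refcount++; }

static void host_put_page(void *addr)
{
	struct page *p = to_page(addr);

	if (--p->refcount == 0)
		free(p);
}

static const struct mm_ops host_mm_ops = {
	.zalloc_page = host_zalloc_page,
	.get_page = host_get_page,
	.put_page = host_put_page,
};
```

Swapping in a different `struct mm_ops` instance changes the backing allocator without touching any walker code, which is exactly the property the nVHE stage-2 isolation work needs.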
+1 -6
arch/arm64/kvm/perf.c
··· 50 50 51 51 int kvm_perf_init(void) 52 52 { 53 - /* 54 - * Check if HW_PERF_EVENTS are supported by checking the number of 55 - * hardware performance counters. This could ensure the presence of 56 - * a physical PMU and CONFIG_PERF_EVENT is selected. 57 - */ 58 - if (IS_ENABLED(CONFIG_ARM_PMU) && perf_num_counters() > 0) 53 + if (kvm_pmu_probe_pmuver() != 0xf && !is_protected_kvm_enabled()) 59 54 static_branch_enable(&kvm_arm_pmu_available); 60 55 61 56 return perf_register_guest_info_callbacks(&kvm_guest_cbs);
+1 -1
arch/arm64/kvm/pmu-emul.c
··· 739 739 kvm_pmu_create_perf_event(vcpu, select_idx); 740 740 } 741 741 742 - static int kvm_pmu_probe_pmuver(void) 742 + int kvm_pmu_probe_pmuver(void) 743 743 { 744 744 struct perf_event_attr attr = { }; 745 745 struct perf_event *event;
+4 -4
arch/arm64/kvm/pmu.c
··· 33 33 { 34 34 struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data); 35 35 36 - if (!ctx || !kvm_pmu_switch_needed(attr)) 36 + if (!kvm_arm_support_pmu_v3() || !ctx || !kvm_pmu_switch_needed(attr)) 37 37 return; 38 38 39 39 if (!attr->exclude_host) ··· 49 49 { 50 50 struct kvm_host_data *ctx = this_cpu_ptr_hyp_sym(kvm_host_data); 51 51 52 - if (!ctx) 52 + if (!kvm_arm_support_pmu_v3() || !ctx) 53 53 return; 54 54 55 55 ctx->pmu_events.events_host &= ~clr; ··· 172 172 struct kvm_host_data *host; 173 173 u32 events_guest, events_host; 174 174 175 - if (!has_vhe()) 175 + if (!kvm_arm_support_pmu_v3() || !has_vhe()) 176 176 return; 177 177 178 178 preempt_disable(); ··· 193 193 struct kvm_host_data *host; 194 194 u32 events_guest, events_host; 195 195 196 - if (!has_vhe()) 196 + if (!kvm_arm_support_pmu_v3() || !has_vhe()) 197 197 return; 198 198 199 199 host = this_cpu_ptr_hyp_sym(kvm_host_data);
+9 -42
arch/arm64/kvm/reset.c
··· 74 74 if (!system_supports_sve()) 75 75 return -EINVAL; 76 76 77 - /* Verify that KVM startup enforced this when SVE was detected: */ 78 - if (WARN_ON(!has_vhe())) 79 - return -EINVAL; 80 - 81 77 vcpu->arch.sve_max_vl = kvm_sve_max_vl; 82 78 83 79 /* ··· 238 242 239 243 /* Reset core registers */ 240 244 memset(vcpu_gp_regs(vcpu), 0, sizeof(*vcpu_gp_regs(vcpu))); 245 + memset(&vcpu->arch.ctxt.fp_regs, 0, sizeof(vcpu->arch.ctxt.fp_regs)); 246 + vcpu->arch.ctxt.spsr_abt = 0; 247 + vcpu->arch.ctxt.spsr_und = 0; 248 + vcpu->arch.ctxt.spsr_irq = 0; 249 + vcpu->arch.ctxt.spsr_fiq = 0; 241 250 vcpu_gp_regs(vcpu)->pstate = pstate; 242 251 243 252 /* Reset system registers */ ··· 334 333 return 0; 335 334 } 336 335 337 - /* 338 - * Configure the VTCR_EL2 for this VM. The VTCR value is common 339 - * across all the physical CPUs on the system. We use system wide 340 - * sanitised values to fill in different fields, except for Hardware 341 - * Management of Access Flags. HA Flag is set unconditionally on 342 - * all CPUs, as it is safe to run with or without the feature and 343 - * the bit is RES0 on CPUs that don't support it. 
344 - */ 345 336 int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) 346 337 { 347 - u64 vtcr = VTCR_EL2_FLAGS, mmfr0; 348 - u32 parange, phys_shift; 349 - u8 lvls; 338 + u64 mmfr0, mmfr1; 339 + u32 phys_shift; 350 340 351 341 if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK) 352 342 return -EINVAL; ··· 357 365 } 358 366 359 367 mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); 360 - parange = cpuid_feature_extract_unsigned_field(mmfr0, 361 - ID_AA64MMFR0_PARANGE_SHIFT); 362 - if (parange > ID_AA64MMFR0_PARANGE_MAX) 363 - parange = ID_AA64MMFR0_PARANGE_MAX; 364 - vtcr |= parange << VTCR_EL2_PS_SHIFT; 368 + mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1); 369 + kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift); 365 370 366 - vtcr |= VTCR_EL2_T0SZ(phys_shift); 367 - /* 368 - * Use a minimum 2 level page table to prevent splitting 369 - * host PMD huge pages at stage2. 370 - */ 371 - lvls = stage2_pgtable_levels(phys_shift); 372 - if (lvls < 2) 373 - lvls = 2; 374 - vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls); 375 - 376 - /* 377 - * Enable the Hardware Access Flag management, unconditionally 378 - * on all CPUs. The features is RES0 on CPUs without the support 379 - * and must be ignored by the CPUs. 380 - */ 381 - vtcr |= VTCR_EL2_HA; 382 - 383 - /* Set the vmid bits */ 384 - vtcr |= (kvm_get_vmid_bits() == 16) ? 385 - VTCR_EL2_VS_16BIT : 386 - VTCR_EL2_VS_8BIT; 387 - kvm->arch.vtcr = vtcr; 388 371 return 0; 389 372 }
+16
arch/arm64/kvm/sys_regs.c
··· 1063 1063 val = cpuid_feature_cap_perfmon_field(val, 1064 1064 ID_AA64DFR0_PMUVER_SHIFT, 1065 1065 kvm_vcpu_has_pmu(vcpu) ? ID_AA64DFR0_PMUVER_8_4 : 0); 1066 + /* Hide SPE from guests */ 1067 + val &= ~FEATURE(ID_AA64DFR0_PMSVER); 1066 1068 break; 1067 1069 case SYS_ID_DFR0_EL1: 1068 1070 /* Limit guests to PMUv3 for ARMv8.4 */ ··· 1474 1472 { SYS_DESC(SYS_GCR_EL1), undef_access }, 1475 1473 1476 1474 { SYS_DESC(SYS_ZCR_EL1), NULL, reset_val, ZCR_EL1, 0, .visibility = sve_visibility }, 1475 + { SYS_DESC(SYS_TRFCR_EL1), undef_access }, 1477 1476 { SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 }, 1478 1477 { SYS_DESC(SYS_TTBR1_EL1), access_vm_reg, reset_unknown, TTBR1_EL1 }, 1479 1478 { SYS_DESC(SYS_TCR_EL1), access_vm_reg, reset_val, TCR_EL1, 0 }, ··· 1503 1500 1504 1501 { SYS_DESC(SYS_FAR_EL1), access_vm_reg, reset_unknown, FAR_EL1 }, 1505 1502 { SYS_DESC(SYS_PAR_EL1), NULL, reset_unknown, PAR_EL1 }, 1503 + 1504 + { SYS_DESC(SYS_PMSCR_EL1), undef_access }, 1505 + { SYS_DESC(SYS_PMSNEVFR_EL1), undef_access }, 1506 + { SYS_DESC(SYS_PMSICR_EL1), undef_access }, 1507 + { SYS_DESC(SYS_PMSIRR_EL1), undef_access }, 1508 + { SYS_DESC(SYS_PMSFCR_EL1), undef_access }, 1509 + { SYS_DESC(SYS_PMSEVFR_EL1), undef_access }, 1510 + { SYS_DESC(SYS_PMSLATFR_EL1), undef_access }, 1511 + { SYS_DESC(SYS_PMSIDR_EL1), undef_access }, 1512 + { SYS_DESC(SYS_PMBLIMITR_EL1), undef_access }, 1513 + { SYS_DESC(SYS_PMBPTR_EL1), undef_access }, 1514 + { SYS_DESC(SYS_PMBSR_EL1), undef_access }, 1515 + /* PMBIDR_EL1 is not trapped */ 1506 1516 1507 1517 { PMU_SYS_REG(SYS_PMINTENSET_EL1), 1508 1518 .access = access_pminten, .reg = PMINTENSET_EL1 },
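The "Hide SPE from guests" hunk above clears the `PMSVer` field out of the `ID_AA64DFR0_EL1` value the guest reads, the standard way KVM makes a feature invisible: zero its 4-bit ID-register field. A standalone sketch of that masking, with a `GENMASK`-style helper (the field shift used here is illustrative, taken from the public register layout, and `hide_field` is a hypothetical name):

```c
#include <assert.h>
#include <stdint.h>

/* GENMASK-style helper: set bits [h:l], for 0 <= l <= h <= 63. */
static uint64_t genmask(unsigned int h, unsigned int l)
{
	return ((~0ULL) >> (63 - h)) & ~((1ULL << l) - 1);
}

#define ID_AA64DFR0_PMSVER_SHIFT 32	/* PMSVer field, bits [35:32] */

/* Force a 4-bit ID-register feature field to zero, hiding it from the
 * guest the way read_id_reg() masks out PMSVer (SPE). */
static uint64_t hide_field(uint64_t val, unsigned int shift)
{
	return val & ~genmask(shift + 3, shift);
}
```

A zeroed `PMSVer` tells the guest SPE is absent, and the companion `PMS*_EL1` entries added to the sys_reg table turn any stray access into an UNDEF rather than a trap leak.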
+7
arch/arm64/kvm/va_layout.c
··· 288 288 { 289 289 generate_mov_q(kimage_voffset, origptr, updptr, nr_inst); 290 290 } 291 + 292 + void kvm_compute_final_ctr_el0(struct alt_instr *alt, 293 + __le32 *origptr, __le32 *updptr, int nr_inst) 294 + { 295 + generate_mov_q(read_sanitised_ftr_reg(SYS_CTR_EL0), 296 + origptr, updptr, nr_inst); 297 + }
+7 -5
arch/arm64/kvm/vgic/vgic-init.c
··· 335 335 kfree(dist->spis); 336 336 dist->spis = NULL; 337 337 dist->nr_spis = 0; 338 + dist->vgic_dist_base = VGIC_ADDR_UNDEF; 338 339 339 - if (kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { 340 - list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) { 341 - list_del(&rdreg->list); 342 - kfree(rdreg); 343 - } 340 + if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) { 341 + list_for_each_entry_safe(rdreg, next, &dist->rd_regions, list) 342 + vgic_v3_free_redist_region(rdreg); 344 343 INIT_LIST_HEAD(&dist->rd_regions); 344 + } else { 345 + dist->vgic_cpu_base = VGIC_ADDR_UNDEF; 345 346 } 346 347 347 348 if (vgic_has_its(kvm)) ··· 363 362 vgic_flush_pending_lpis(vcpu); 364 363 365 364 INIT_LIST_HEAD(&vgic_cpu->ap_list_head); 365 + vgic_cpu->rd_iodev.base_addr = VGIC_ADDR_UNDEF; 366 366 } 367 367 368 368 /* To be called with kvm->lock held */
+3 -3
arch/arm64/kvm/vgic/vgic-its.c
··· 2218 2218 /* 2219 2219 * If an LPI carries the HW bit, this means that this 2220 2220 * interrupt is controlled by GICv4, and we do not 2221 - * have direct access to that state. Let's simply fail 2222 - * the save operation... 2221 + * have direct access to that state without GICv4.1. 2222 + * Let's simply fail the save operation... 2223 2223 */ 2224 - if (ite->irq->hw) 2224 + if (ite->irq->hw && !kvm_vgic_global_state.has_gicv4_1) 2225 2225 return -EACCES; 2226 2226 2227 2227 ret = vgic_its_save_ite(its, device, ite, gpa, ite_esz);
+5 -2
arch/arm64/kvm/vgic/vgic-kvm-device.c
··· 87 87 r = vgic_v3_set_redist_base(kvm, 0, *addr, 0); 88 88 goto out; 89 89 } 90 - rdreg = list_first_entry(&vgic->rd_regions, 91 - struct vgic_redist_region, list); 90 + rdreg = list_first_entry_or_null(&vgic->rd_regions, 91 + struct vgic_redist_region, list); 92 92 if (!rdreg) 93 93 addr_ptr = &undef_value; 94 94 else ··· 225 225 u64 __user *uaddr = (u64 __user *)(long)attr->addr; 226 226 u64 addr; 227 227 unsigned long type = (unsigned long)attr->attr; 228 + 229 + if (copy_from_user(&addr, uaddr, sizeof(addr))) 230 + return -EFAULT; 228 231 229 232 r = kvm_vgic_addr(dev->kvm, type, &addr, false); 230 233 if (r)
+56 -37
arch/arm64/kvm/vgic/vgic-mmio-v3.c
··· 251 251 vgic_enable_lpis(vcpu); 252 252 } 253 253 254 + static bool vgic_mmio_vcpu_rdist_is_last(struct kvm_vcpu *vcpu) 255 + { 256 + struct vgic_dist *vgic = &vcpu->kvm->arch.vgic; 257 + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; 258 + struct vgic_redist_region *iter, *rdreg = vgic_cpu->rdreg; 259 + 260 + if (!rdreg) 261 + return false; 262 + 263 + if (vgic_cpu->rdreg_index < rdreg->free_index - 1) { 264 + return false; 265 + } else if (rdreg->count && vgic_cpu->rdreg_index == (rdreg->count - 1)) { 266 + struct list_head *rd_regions = &vgic->rd_regions; 267 + gpa_t end = rdreg->base + rdreg->count * KVM_VGIC_V3_REDIST_SIZE; 268 + 269 + /* 270 + * the rdist is the last one of the redist region, 271 + * check whether there is no other contiguous rdist region 272 + */ 273 + list_for_each_entry(iter, rd_regions, list) { 274 + if (iter->base == end && iter->free_index > 0) 275 + return false; 276 + } 277 + } 278 + return true; 279 + } 280 + 254 281 static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu, 255 282 gpa_t addr, unsigned int len) 256 283 { 257 284 unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu); 258 - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; 259 - struct vgic_redist_region *rdreg = vgic_cpu->rdreg; 260 285 int target_vcpu_id = vcpu->vcpu_id; 261 - gpa_t last_rdist_typer = rdreg->base + GICR_TYPER + 262 - (rdreg->free_index - 1) * KVM_VGIC_V3_REDIST_SIZE; 263 286 u64 value; 264 287 265 288 value = (u64)(mpidr & GENMASK(23, 0)) << 32; 266 289 value |= ((target_vcpu_id & 0xffff) << 8); 267 290 268 - if (addr == last_rdist_typer) 291 + if (vgic_has_its(vcpu->kvm)) 292 + value |= GICR_TYPER_PLPIS; 293 + 294 + if (vgic_mmio_vcpu_rdist_is_last(vcpu)) 269 295 value |= GICR_TYPER_LAST; 270 - if (vgic_has_its(vcpu->kvm)) 271 - value |= GICR_TYPER_PLPIS; 272 296 273 - return extract_bytes(value, addr & 7, len); 274 - } 275 - 276 - static unsigned long vgic_uaccess_read_v3r_typer(struct kvm_vcpu *vcpu, 277 - gpa_t addr, unsigned int 
len) 278 - { 279 - unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu); 280 - int target_vcpu_id = vcpu->vcpu_id; 281 - u64 value; 282 - 283 - value = (u64)(mpidr & GENMASK(23, 0)) << 32; 284 - value |= ((target_vcpu_id & 0xffff) << 8); 285 - 286 - if (vgic_has_its(vcpu->kvm)) 287 - value |= GICR_TYPER_PLPIS; 288 - 289 - /* reporting of the Last bit is not supported for userspace */ 290 297 return extract_bytes(value, addr & 7, len); 291 298 } 292 299 ··· 619 612 VGIC_ACCESS_32bit), 620 613 REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_TYPER, 621 614 vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 622 - vgic_uaccess_read_v3r_typer, vgic_mmio_uaccess_write_wi, 8, 615 + NULL, vgic_mmio_uaccess_write_wi, 8, 623 616 VGIC_ACCESS_64bit | VGIC_ACCESS_32bit), 624 617 REGISTER_DESC_WITH_LENGTH(GICR_WAKER, 625 618 vgic_mmio_read_raz, vgic_mmio_write_wi, 4, ··· 721 714 return -EINVAL; 722 715 723 716 vgic_cpu->rdreg = rdreg; 717 + vgic_cpu->rdreg_index = rdreg->free_index; 724 718 725 719 rd_base = rdreg->base + rdreg->free_index * KVM_VGIC_V3_REDIST_SIZE; 726 720 ··· 776 768 } 777 769 778 770 /** 779 - * vgic_v3_insert_redist_region - Insert a new redistributor region 771 + * vgic_v3_alloc_redist_region - Allocate a new redistributor region 780 772 * 781 773 * Performs various checks before inserting the rdist region in the list. 
782 774 * Those tests depend on whether the size of the rdist region is known ··· 790 782 * 791 783 * Return 0 on success, < 0 otherwise 792 784 */ 793 - static int vgic_v3_insert_redist_region(struct kvm *kvm, uint32_t index, 794 - gpa_t base, uint32_t count) 785 + static int vgic_v3_alloc_redist_region(struct kvm *kvm, uint32_t index, 786 + gpa_t base, uint32_t count) 795 787 { 796 788 struct vgic_dist *d = &kvm->arch.vgic; 797 789 struct vgic_redist_region *rdreg; 798 790 struct list_head *rd_regions = &d->rd_regions; 799 791 size_t size = count * KVM_VGIC_V3_REDIST_SIZE; 800 792 int ret; 801 - 802 - /* single rdist region already set ?*/ 803 - if (!count && !list_empty(rd_regions)) 804 - return -EINVAL; 805 793 806 794 /* cross the end of memory ? */ 807 795 if (base + size < base) ··· 809 805 } else { 810 806 rdreg = list_last_entry(rd_regions, 811 807 struct vgic_redist_region, list); 812 - if (index != rdreg->index + 1) 808 + 809 + /* Don't mix single region and discrete redist regions */ 810 + if (!count && rdreg->count) 813 811 return -EINVAL; 814 812 815 - /* Cannot add an explicitly sized regions after legacy region */ 816 - if (!rdreg->count) 813 + if (!count) 814 + return -EEXIST; 815 + 816 + if (index != rdreg->index + 1) 817 817 return -EINVAL; 818 818 } 819 819 ··· 856 848 return ret; 857 849 } 858 850 851 + void vgic_v3_free_redist_region(struct vgic_redist_region *rdreg) 852 + { 853 + list_del(&rdreg->list); 854 + kfree(rdreg); 855 + } 856 + 859 857 int vgic_v3_set_redist_base(struct kvm *kvm, u32 index, u64 addr, u32 count) 860 858 { 861 859 int ret; 862 860 863 - ret = vgic_v3_insert_redist_region(kvm, index, addr, count); 861 + ret = vgic_v3_alloc_redist_region(kvm, index, addr, count); 864 862 if (ret) 865 863 return ret; 866 864 ··· 875 861 * afterwards will register the iodevs when needed. 
876 862 */ 877 863 ret = vgic_register_all_redist_iodevs(kvm); 878 - if (ret) 864 + if (ret) { 865 + struct vgic_redist_region *rdreg; 866 + 867 + rdreg = vgic_v3_rdist_region_from_index(kvm, index); 868 + vgic_v3_free_redist_region(rdreg); 879 869 return ret; 870 + } 880 871 881 872 return 0; 882 873 }
+4 -6
arch/arm64/kvm/vgic/vgic-mmio.c
··· 938 938 return region; 939 939 } 940 940 941 - static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, 941 + static int vgic_uaccess_read(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, 942 942 gpa_t addr, u32 *val) 943 943 { 944 - struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); 945 944 const struct vgic_register_region *region; 946 945 struct kvm_vcpu *r_vcpu; 947 946 ··· 959 960 return 0; 960 961 } 961 962 962 - static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, 963 + static int vgic_uaccess_write(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev, 963 964 gpa_t addr, const u32 *val) 964 965 { 965 - struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev); 966 966 const struct vgic_register_region *region; 967 967 struct kvm_vcpu *r_vcpu; 968 968 ··· 984 986 bool is_write, int offset, u32 *val) 985 987 { 986 988 if (is_write) 987 - return vgic_uaccess_write(vcpu, &dev->dev, offset, val); 989 + return vgic_uaccess_write(vcpu, dev, offset, val); 988 990 else 989 - return vgic_uaccess_read(vcpu, &dev->dev, offset, val); 991 + return vgic_uaccess_read(vcpu, dev, offset, val); 990 992 } 991 993 992 994 static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
+60 -6
arch/arm64/kvm/vgic/vgic-v3.c
··· 1 1 // SPDX-License-Identifier: GPL-2.0-only 2 2 3 3 #include <linux/irqchip/arm-gic-v3.h> 4 + #include <linux/irq.h> 5 + #include <linux/irqdomain.h> 4 6 #include <linux/kvm.h> 5 7 #include <linux/kvm_host.h> 6 8 #include <kvm/arm_vgic.h> ··· 358 356 return 0; 359 357 } 360 358 359 + /* 360 + * The deactivation of the doorbell interrupt will trigger the 361 + * unmapping of the associated vPE. 362 + */ 363 + static void unmap_all_vpes(struct vgic_dist *dist) 364 + { 365 + struct irq_desc *desc; 366 + int i; 367 + 368 + for (i = 0; i < dist->its_vm.nr_vpes; i++) { 369 + desc = irq_to_desc(dist->its_vm.vpes[i]->irq); 370 + irq_domain_deactivate_irq(irq_desc_get_irq_data(desc)); 371 + } 372 + } 373 + 374 + static void map_all_vpes(struct vgic_dist *dist) 375 + { 376 + struct irq_desc *desc; 377 + int i; 378 + 379 + for (i = 0; i < dist->its_vm.nr_vpes; i++) { 380 + desc = irq_to_desc(dist->its_vm.vpes[i]->irq); 381 + irq_domain_activate_irq(irq_desc_get_irq_data(desc), false); 382 + } 383 + } 384 + 361 385 /** 362 386 * vgic_v3_save_pending_tables - Save the pending tables into guest RAM 363 387 * kvm lock and all vcpu lock must be held ··· 393 365 struct vgic_dist *dist = &kvm->arch.vgic; 394 366 struct vgic_irq *irq; 395 367 gpa_t last_ptr = ~(gpa_t)0; 396 - int ret; 368 + bool vlpi_avail = false; 369 + int ret = 0; 397 370 u8 val; 371 + 372 + if (unlikely(!vgic_initialized(kvm))) 373 + return -ENXIO; 374 + 375 + /* 376 + * A preparation for getting any VLPI states. 377 + * The above vgic initialized check also ensures that the allocation 378 + * and enabling of the doorbells have already been done. 
379 + */ 380 + if (kvm_vgic_global_state.has_gicv4_1) { 381 + unmap_all_vpes(dist); 382 + vlpi_avail = true; 383 + } 398 384 399 385 list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) { 400 386 int byte_offset, bit_nr; 401 387 struct kvm_vcpu *vcpu; 402 388 gpa_t pendbase, ptr; 389 + bool is_pending; 403 390 bool stored; 404 391 405 392 vcpu = irq->target_vcpu; ··· 430 387 if (ptr != last_ptr) { 431 388 ret = kvm_read_guest_lock(kvm, ptr, &val, 1); 432 389 if (ret) 433 - return ret; 390 + goto out; 434 391 last_ptr = ptr; 435 392 } 436 393 437 394 stored = val & (1U << bit_nr); 438 - if (stored == irq->pending_latch) 395 + 396 + is_pending = irq->pending_latch; 397 + 398 + if (irq->hw && vlpi_avail) 399 + vgic_v4_get_vlpi_state(irq, &is_pending); 400 + 401 + if (stored == is_pending) 439 402 continue; 440 403 441 - if (irq->pending_latch) 404 + if (is_pending) 442 405 val |= 1 << bit_nr; 443 406 else 444 407 val &= ~(1 << bit_nr); 445 408 446 409 ret = kvm_write_guest_lock(kvm, ptr, &val, 1); 447 410 if (ret) 448 - return ret; 411 + goto out; 449 412 } 450 - return 0; 413 + 414 + out: 415 + if (vlpi_avail) 416 + map_all_vpes(dist); 417 + 418 + return ret; 451 419 } 452 420 453 421 /**
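`vgic_v3_save_pending_tables()` packs one pending bit per INTID into guest memory (`byte_offset = intid / 8`, `bit_nr = intid % 8`) and only rewrites a byte when the stored bit disagrees with the live state, which with GICv4.1 may come from the VPT rather than `pending_latch`. Hypothetical pure helpers modelling just that arithmetic (the function names are illustrative, not kernel API):

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Byte in the pending table that holds this INTID's bit. */
static uint32_t pending_byte_offset(uint32_t intid)
{
	return intid / 8;
}

/* Return the byte as it should be written back for this INTID. */
static uint8_t pending_byte_update(uint8_t stored, uint32_t intid,
				   bool is_pending)
{
	uint8_t mask = 1U << (intid % 8);

	return is_pending ? (stored | mask) : (stored & ~mask);
}

/* True when the guest copy already matches and no write is needed. */
static bool pending_byte_clean(uint8_t stored, uint32_t intid,
			       bool is_pending)
{
	return !!(stored & (1U << (intid % 8))) == is_pending;
}
```

For the first LPI INTIDs (LPIs start at 8192), INTID 8193 lands in byte 1024, bit 1; the "clean" test is what lets the loop above skip the `kvm_write_guest_lock()` for unchanged bytes.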
+38
arch/arm64/kvm/vgic/vgic-v4.c
··· 203 203 kvm_arm_resume_guest(kvm); 204 204 } 205 205 206 + /* 207 + * Must be called with GICv4.1 and the vPE unmapped, which 208 + * indicates the invalidation of any VPT caches associated 209 + * with the vPE, thus we can get the VLPI state by peeking 210 + * at the VPT. 211 + */ 212 + void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val) 213 + { 214 + struct its_vpe *vpe = &irq->target_vcpu->arch.vgic_cpu.vgic_v3.its_vpe; 215 + int mask = BIT(irq->intid % BITS_PER_BYTE); 216 + void *va; 217 + u8 *ptr; 218 + 219 + va = page_address(vpe->vpt_page); 220 + ptr = va + irq->intid / BITS_PER_BYTE; 221 + 222 + *val = !!(*ptr & mask); 223 + } 224 + 206 225 /** 207 226 * vgic_v4_init - Initialize the GICv4 data structures 208 227 * @kvm: Pointer to the VM being initialized ··· 404 385 struct vgic_its *its; 405 386 struct vgic_irq *irq; 406 387 struct its_vlpi_map map; 388 + unsigned long flags; 407 389 int ret; 408 390 409 391 if (!vgic_supports_direct_msis(kvm)) ··· 449 429 irq->hw = true; 450 430 irq->host_irq = virq; 451 431 atomic_inc(&map.vpe->vlpi_count); 432 + 433 + /* Transfer pending state */ 434 + raw_spin_lock_irqsave(&irq->irq_lock, flags); 435 + if (irq->pending_latch) { 436 + ret = irq_set_irqchip_state(irq->host_irq, 437 + IRQCHIP_STATE_PENDING, 438 + irq->pending_latch); 439 + WARN_RATELIMIT(ret, "IRQ %d", irq->host_irq); 440 + 441 + /* 442 + * Clear pending_latch and communicate this state 443 + * change via vgic_queue_irq_unlock. 444 + */ 445 + irq->pending_latch = false; 446 + vgic_queue_irq_unlock(kvm, irq, flags); 447 + } else { 448 + raw_spin_unlock_irqrestore(&irq->irq_lock, flags); 449 + } 452 450 453 451 out: 454 452 mutex_unlock(&its->its_lock);
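With the vPE unmapped (so any VPT caches are invalidated), `vgic_v4_get_vlpi_state()` reads a vLPI's pending bit straight out of the Virtual Pending Table: one bit per INTID, least-significant bit first within each byte. A stand-in that takes the VPT as a plain byte buffer (`vlpi_pending` is a hypothetical name; the real code indexes the VPT page by the raw INTID, small INTIDs are used here only to keep the example buffer tiny):

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define BITS_PER_BYTE 8

/* Peek the pending bit for 'intid' in a VPT image. */
static bool vlpi_pending(const uint8_t *vpt, uint32_t intid)
{
	uint8_t mask = 1U << (intid % BITS_PER_BYTE);

	return !!(vpt[intid / BITS_PER_BYTE] & mask);
}
```

This is the read-side twin of the pending-table packing used by the save path: same `intid / 8` byte index, same `intid % 8` bit mask.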
+2
arch/arm64/kvm/vgic/vgic.h
··· 293 293 294 294 struct vgic_redist_region *vgic_v3_rdist_region_from_index(struct kvm *kvm, 295 295 u32 index); 296 + void vgic_v3_free_redist_region(struct vgic_redist_region *rdreg); 296 297 297 298 bool vgic_v3_rdist_overlap(struct kvm *kvm, gpa_t base, size_t size); 298 299 ··· 318 317 int vgic_v4_init(struct kvm *kvm); 319 318 void vgic_v4_teardown(struct kvm *kvm); 320 319 void vgic_v4_configure_vsgis(struct kvm *kvm); 320 + void vgic_v4_get_vlpi_state(struct vgic_irq *irq, bool *val); 321 321 322 322 #endif
+2 -2
arch/arm64/lib/clear_page.S
··· 14 14 * Parameters: 15 15 * x0 - dest 16 16 */ 17 - SYM_FUNC_START(clear_page) 17 + SYM_FUNC_START_PI(clear_page) 18 18 mrs x1, dczid_el0 19 19 and w1, w1, #0xf 20 20 mov x2, #4 ··· 25 25 tst x0, #(PAGE_SIZE - 1) 26 26 b.ne 1b 27 27 ret 28 - SYM_FUNC_END(clear_page) 28 + SYM_FUNC_END_PI(clear_page) 29 29 EXPORT_SYMBOL(clear_page)
+2 -2
arch/arm64/lib/copy_page.S
··· 17 17 * x0 - dest 18 18 * x1 - src 19 19 */ 20 - SYM_FUNC_START(copy_page) 20 + SYM_FUNC_START_PI(copy_page) 21 21 alternative_if ARM64_HAS_NO_HW_PREFETCH 22 22 // Prefetch three cache lines ahead. 23 23 prfm pldl1strm, [x1, #128] ··· 75 75 stnp x16, x17, [x0, #112 - 256] 76 76 77 77 ret 78 - SYM_FUNC_END(copy_page) 78 + SYM_FUNC_END_PI(copy_page) 79 79 EXPORT_SYMBOL(copy_page)
+3
arch/arm64/mm/init.c
··· 35 35 #include <asm/fixmap.h> 36 36 #include <asm/kasan.h> 37 37 #include <asm/kernel-pgtable.h> 38 + #include <asm/kvm_host.h> 38 39 #include <asm/memory.h> 39 40 #include <asm/numa.h> 40 41 #include <asm/sections.h> ··· 452 451 #endif 453 452 454 453 dma_pernuma_cma_reserve(); 454 + 455 + kvm_hyp_reserve(); 455 456 456 457 /* 457 458 * sparse_init() tries to allocate memory from memblock, so must be
-21
arch/s390/kernel/perf_event.c
··· 23 23 #include <asm/sysinfo.h> 24 24 #include <asm/unwind.h> 25 25 26 - const char *perf_pmu_name(void) 27 - { 28 - if (cpum_cf_avail() || cpum_sf_avail()) 29 - return "CPU-Measurement Facilities (CPU-MF)"; 30 - return "pmu"; 31 - } 32 - EXPORT_SYMBOL(perf_pmu_name); 33 - 34 - int perf_num_counters(void) 35 - { 36 - int num = 0; 37 - 38 - if (cpum_cf_avail()) 39 - num += PERF_CPUM_CF_MAX_CTR; 40 - if (cpum_sf_avail()) 41 - num += PERF_CPUM_SF_MAX_CTR; 42 - 43 - return num; 44 - } 45 - EXPORT_SYMBOL(perf_num_counters); 46 - 47 26 static struct kvm_s390_sie_block *sie_block(struct pt_regs *regs) 48 27 { 49 28 struct stack_frame *stack = (struct stack_frame *) regs->gprs[15];
-18
arch/sh/kernel/perf_event.c
··· 57 57 return !!sh_pmu; 58 58 } 59 59 60 - const char *perf_pmu_name(void) 61 - { 62 - if (!sh_pmu) 63 - return NULL; 64 - 65 - return sh_pmu->name; 66 - } 67 - EXPORT_SYMBOL_GPL(perf_pmu_name); 68 - 69 - int perf_num_counters(void) 70 - { 71 - if (!sh_pmu) 72 - return 0; 73 - 74 - return sh_pmu->num_events; 75 - } 76 - EXPORT_SYMBOL_GPL(perf_num_counters); 77 - 78 60 /* 79 61 * Release the PMU if this is the last perf_event. 80 62 */
+36
drivers/clocksource/arm_arch_timer.c
··· 16 16 #include <linux/cpu_pm.h> 17 17 #include <linux/clockchips.h> 18 18 #include <linux/clocksource.h> 19 + #include <linux/clocksource_ids.h> 19 20 #include <linux/interrupt.h> 20 21 #include <linux/of_irq.h> 21 22 #include <linux/of_address.h> ··· 25 24 #include <linux/sched/clock.h> 26 25 #include <linux/sched_clock.h> 27 26 #include <linux/acpi.h> 27 + #include <linux/arm-smccc.h> 28 + #include <linux/ptp_kvm.h> 28 29 29 30 #include <asm/arch_timer.h> 30 31 #include <asm/virt.h> ··· 194 191 195 192 static struct clocksource clocksource_counter = { 196 193 .name = "arch_sys_counter", 194 + .id = CSID_ARM_ARCH_COUNTER, 197 195 .rating = 400, 198 196 .read = arch_counter_read, 199 197 .mask = CLOCKSOURCE_MASK(56), ··· 1661 1657 } 1662 1658 TIMER_ACPI_DECLARE(arch_timer, ACPI_SIG_GTDT, arch_timer_acpi_init); 1663 1659 #endif 1660 + 1661 + int kvm_arch_ptp_get_crosststamp(u64 *cycle, struct timespec64 *ts, 1662 + struct clocksource **cs) 1663 + { 1664 + struct arm_smccc_res hvc_res; 1665 + u32 ptp_counter; 1666 + ktime_t ktime; 1667 + 1668 + if (!IS_ENABLED(CONFIG_HAVE_ARM_SMCCC_DISCOVERY)) 1669 + return -EOPNOTSUPP; 1670 + 1671 + if (arch_timer_uses_ppi == ARCH_TIMER_VIRT_PPI) 1672 + ptp_counter = KVM_PTP_VIRT_COUNTER; 1673 + else 1674 + ptp_counter = KVM_PTP_PHYS_COUNTER; 1675 + 1676 + arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID, 1677 + ptp_counter, &hvc_res); 1678 + 1679 + if ((int)(hvc_res.a0) < 0) 1680 + return -EOPNOTSUPP; 1681 + 1682 + ktime = (u64)hvc_res.a0 << 32 | hvc_res.a1; 1683 + *ts = ktime_to_timespec64(ktime); 1684 + if (cycle) 1685 + *cycle = (u64)hvc_res.a2 << 32 | hvc_res.a3; 1686 + if (cs) 1687 + *cs = &clocksource_counter; 1688 + 1689 + return 0; 1690 + } 1691 + EXPORT_SYMBOL_GPL(kvm_arch_ptp_get_crosststamp);
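The PTP_KVM hypercall returns 64-bit quantities split across pairs of SMCCC result registers (`a0`/`a1` carry the wall-clock ktime, `a2`/`a3` the counter cycles) and signals failure as a negative value in the low 32 bits of `a0`. Hypothetical helpers for the unpacking done in `kvm_arch_ptp_get_crosststamp()` above (the names are illustrative; like the kernel code, `smccc_pack64` assumes the halves fit in 32 bits and does not mask):

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Reassemble a 64-bit value from a high/low SMCCC register pair. */
static uint64_t smccc_pack64(uint64_t hi, uint64_t lo)
{
	return hi << 32 | lo;
}

/* Matches the (int)(hvc_res.a0) < 0 error test in the hunk above. */
static bool smccc_res_failed(uint64_t a0)
{
	return (int32_t)(uint32_t)a0 < 0;
}
```

So a successful call yields `ktime = smccc_pack64(a0, a1)` and `cycle = smccc_pack64(a2, a3)`, while an error return such as `SMCCC_RET_NOT_SUPPORTED` (-1) trips the sign check first.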
+2
drivers/firmware/psci/psci.c
··· 23 23 24 24 #include <asm/cpuidle.h> 25 25 #include <asm/cputype.h> 26 + #include <asm/hypervisor.h> 26 27 #include <asm/system_misc.h> 27 28 #include <asm/smp_plat.h> 28 29 #include <asm/suspend.h> ··· 499 498 psci_init_cpu_suspend(); 500 499 psci_init_system_suspend(); 501 500 psci_init_system_reset2(); 501 + kvm_init_hyp_services(); 502 502 } 503 503 504 504 return 0;
+1 -1
drivers/firmware/smccc/Makefile
··· 1 1 # SPDX-License-Identifier: GPL-2.0 2 2 # 3 - obj-$(CONFIG_HAVE_ARM_SMCCC_DISCOVERY) += smccc.o 3 + obj-$(CONFIG_HAVE_ARM_SMCCC_DISCOVERY) += smccc.o kvm_guest.o 4 4 obj-$(CONFIG_ARM_SMCCC_SOC_ID) += soc_id.o
+50
drivers/firmware/smccc/kvm_guest.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + 3 + #define pr_fmt(fmt) "smccc: KVM: " fmt 4 + 5 + #include <linux/arm-smccc.h> 6 + #include <linux/bitmap.h> 7 + #include <linux/kernel.h> 8 + #include <linux/string.h> 9 + 10 + #include <asm/hypervisor.h> 11 + 12 + static DECLARE_BITMAP(__kvm_arm_hyp_services, ARM_SMCCC_KVM_NUM_FUNCS) __ro_after_init = { }; 13 + 14 + void __init kvm_init_hyp_services(void) 15 + { 16 + struct arm_smccc_res res; 17 + u32 val[4]; 18 + 19 + if (arm_smccc_1_1_get_conduit() != SMCCC_CONDUIT_HVC) 20 + return; 21 + 22 + arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID, &res); 23 + if (res.a0 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 || 24 + res.a1 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 || 25 + res.a2 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 || 26 + res.a3 != ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3) 27 + return; 28 + 29 + memset(&res, 0, sizeof(res)); 30 + arm_smccc_1_1_invoke(ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID, &res); 31 + 32 + val[0] = lower_32_bits(res.a0); 33 + val[1] = lower_32_bits(res.a1); 34 + val[2] = lower_32_bits(res.a2); 35 + val[3] = lower_32_bits(res.a3); 36 + 37 + bitmap_from_arr32(__kvm_arm_hyp_services, val, ARM_SMCCC_KVM_NUM_FUNCS); 38 + 39 + pr_info("hypervisor services detected (0x%08lx 0x%08lx 0x%08lx 0x%08lx)\n", 40 + res.a3, res.a2, res.a1, res.a0); 41 + } 42 + 43 + bool kvm_arm_hyp_service_available(u32 func_id) 44 + { 45 + if (func_id >= ARM_SMCCC_KVM_NUM_FUNCS) 46 + return false; 47 + 48 + return test_bit(func_id, __kvm_arm_hyp_services); 49 + } 50 + EXPORT_SYMBOL_GPL(kvm_arm_hyp_service_available);
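`kvm_init_hyp_services()` verifies the KVM vendor UID, then folds the four 32-bit feature words into a bitmap (`bitmap_from_arr32()`) that `kvm_arm_hyp_service_available()` later tests. The same lookup over plain 32-bit words, as a hypothetical stand-in (the function name is illustrative):

```c
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/*
 * Test whether bit 'func_id' is set in a feature bitmap stored as an
 * array of 32-bit words, least-significant word and bit first.
 */
static bool hyp_service_available(const uint32_t *words, uint32_t nr_funcs,
				  uint32_t func_id)
{
	if (func_id >= nr_funcs)
		return false;

	return (words[func_id / 32] >> (func_id % 32)) & 1;
}
```

The out-of-range guard mirrors the `func_id >= ARM_SMCCC_KVM_NUM_FUNCS` check in `kvm_arm_hyp_service_available()`, so an unknown function ID simply reports "not available".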
+1
drivers/firmware/smccc/smccc.c
··· 8 8 #include <linux/cache.h> 9 9 #include <linux/init.h> 10 10 #include <linux/arm-smccc.h> 11 + #include <linux/kernel.h> 11 12 #include <asm/archrandom.h> 12 13 13 14 static u32 smccc_version = ARM_SMCCC_VERSION_1_0;
+19 -5
drivers/hwtracing/coresight/Kconfig
··· 97 97 module will be called coresight-etm3x. 98 98 99 99 config CORESIGHT_SOURCE_ETM4X 100 - tristate "CoreSight Embedded Trace Macrocell 4.x driver" 100 + tristate "CoreSight ETMv4.x / ETE driver" 101 101 depends on ARM64 102 102 select CORESIGHT_LINKS_AND_SINKS 103 103 select PID_IN_CONTEXTIDR 104 104 help 105 - This driver provides support for the ETM4.x tracer module, tracing the 106 - instructions that a processor is executing. This is primarily useful 107 - for instruction level tracing. Depending on the implemented version 108 - data tracing may also be available. 105 + This driver provides support for the CoreSight Embedded Trace Macrocell 106 + version 4.x and the Embedded Trace Extensions (ETE). Both are CPU tracer 107 + modules, tracing the instructions that a processor is executing. This is 108 + primarily useful for instruction level tracing. 109 109 110 110 To compile this driver as a module, choose M here: the 111 111 module will be called coresight-etm4x. ··· 173 173 CTI trigger connections between this and other devices.These 174 174 registers are not used in normal operation and can leave devices in 175 175 an inconsistent state. 176 + 177 + config CORESIGHT_TRBE 178 + tristate "Trace Buffer Extension (TRBE) driver" 179 + depends on ARM64 && CORESIGHT_SOURCE_ETM4X 180 + help 181 + This driver provides support for percpu Trace Buffer Extension (TRBE). 182 + TRBE always needs to be used along with it's corresponding percpu ETE 183 + component. ETE generates trace data which is then captured with TRBE. 184 + Unlike traditional sink devices, TRBE is a CPU feature accessible via 185 + system registers. But it's explicit dependency with trace unit (ETE) 186 + requires it to be plugged in as a coresight sink device. 187 + 188 + To compile this driver as a module, choose M here: the module will be 189 + called coresight-trbe. 176 190 endif
··· 97 97 module will be called coresight-etm3x. 98 98 99 99 config CORESIGHT_SOURCE_ETM4X 100 - tristate "CoreSight Embedded Trace Macrocell 4.x driver" 100 + tristate "CoreSight ETMv4.x / ETE driver" 101 101 depends on ARM64 102 102 select CORESIGHT_LINKS_AND_SINKS 103 103 select PID_IN_CONTEXTIDR 104 104 help 105 - This driver provides support for the ETM4.x tracer module, tracing the 106 - instructions that a processor is executing. This is primarily useful 107 - for instruction level tracing. Depending on the implemented version 108 - data tracing may also be available. 105 + This driver provides support for the CoreSight Embedded Trace Macrocell 106 + version 4.x and the Embedded Trace Extensions (ETE). Both are CPU tracer 107 + modules, tracing the instructions that a processor is executing. This is 108 + primarily useful for instruction level tracing. 109 109 110 110 To compile this driver as a module, choose M here: the 111 111 module will be called coresight-etm4x. ··· 173 173 CTI trigger connections between this and other devices.These 174 174 registers are not used in normal operation and can leave devices in 175 175 an inconsistent state. 176 + 177 + config CORESIGHT_TRBE 178 + tristate "Trace Buffer Extension (TRBE) driver" 179 + depends on ARM64 && CORESIGHT_SOURCE_ETM4X 180 + help 181 + This driver provides support for percpu Trace Buffer Extension (TRBE). 182 + TRBE always needs to be used along with its corresponding percpu ETE 183 + component. ETE generates trace data which is then captured with TRBE. 184 + Unlike traditional sink devices, TRBE is a CPU feature accessible via 185 + system registers. But its explicit dependency on the trace unit (ETE) 186 + requires it to be plugged in as a coresight sink device. 187 + 188 + To compile this driver as a module, choose M here: the module will be 189 + called coresight-trbe. 176 190 endif
+1
drivers/hwtracing/coresight/Makefile
··· 21 21 obj-$(CONFIG_CORESIGHT_CPU_DEBUG) += coresight-cpu-debug.o 22 22 obj-$(CONFIG_CORESIGHT_CATU) += coresight-catu.o 23 23 obj-$(CONFIG_CORESIGHT_CTI) += coresight-cti.o 24 + obj-$(CONFIG_CORESIGHT_TRBE) += coresight-trbe.o 24 25 coresight-cti-y := coresight-cti-core.o coresight-cti-platform.o \ 25 26 coresight-cti-sysfs.o
+27 -2
drivers/hwtracing/coresight/coresight-core.c
··· 23 23 #include "coresight-priv.h" 24 24 25 25 static DEFINE_MUTEX(coresight_mutex); 26 + static DEFINE_PER_CPU(struct coresight_device *, csdev_sink); 26 27 27 28 /** 28 29 * struct coresight_node - elements of a path, from source to sink ··· 70 69 cti_assoc_ops = NULL; 71 70 } 72 71 EXPORT_SYMBOL_GPL(coresight_remove_cti_ops); 72 + 73 + void coresight_set_percpu_sink(int cpu, struct coresight_device *csdev) 74 + { 75 + per_cpu(csdev_sink, cpu) = csdev; 76 + } 77 + EXPORT_SYMBOL_GPL(coresight_set_percpu_sink); 78 + 79 + struct coresight_device *coresight_get_percpu_sink(int cpu) 80 + { 81 + return per_cpu(csdev_sink, cpu); 82 + } 83 + EXPORT_SYMBOL_GPL(coresight_get_percpu_sink); 73 84 74 85 static int coresight_id_match(struct device *dev, void *data) 75 86 { ··· 797 784 if (csdev == sink) 798 785 goto out; 799 786 787 + if (coresight_is_percpu_source(csdev) && coresight_is_percpu_sink(sink) && 788 + sink == per_cpu(csdev_sink, source_ops(csdev)->cpu_id(csdev))) { 789 + if (_coresight_build_path(sink, sink, path) == 0) { 790 + found = true; 791 + goto out; 792 + } 793 + } 794 + 800 795 /* Not a sink - recursively explore each port found on this element */ 801 796 for (i = 0; i < csdev->pdata->nr_outport; i++) { 802 797 struct coresight_device *child_dev; ··· 1020 999 int depth = 0; 1021 1000 1022 1001 /* look for a default sink if we have not found for this device */ 1023 - if (!csdev->def_sink) 1024 - csdev->def_sink = coresight_find_sink(csdev, &depth); 1002 + if (!csdev->def_sink) { 1003 + if (coresight_is_percpu_source(csdev)) 1004 + csdev->def_sink = per_cpu(csdev_sink, source_ops(csdev)->cpu_id(csdev)); 1005 + if (!csdev->def_sink) 1006 + csdev->def_sink = coresight_find_sink(csdev, &depth); 1007 + } 1025 1008 return csdev->def_sink; 1026 1009 } 1027 1010
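The core change above teaches sink selection about 1:1 per-CPU topologies: `coresight_find_default_sink()` now prefers a per-CPU sink (e.g. a TRBE paired with its CPU's ETE) registered via `coresight_set_percpu_sink()`, falling back to the usual topology walk for a shared N:1 sink. A toy model of that precedence, using -1 for "no sink" (the `default_sink` helper and integer sink IDs are purely illustrative):

```c
#include <assert.h>

/* Prefer the CPU-private sink; otherwise use the shared topology sink. */
static int default_sink(const int *percpu_sink, int cpu, int topology_sink)
{
	if (percpu_sink[cpu] >= 0)
		return percpu_sink[cpu]; /* 1:1 per-CPU sink wins */

	return topology_sink; /* legacy N:1 topology fallback */
}
```

CPUs with a registered per-CPU sink get it unconditionally; CPUs without one still share the topology-discovered sink, matching the ordering added to `coresight_find_default_sink()`.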
+106 -13
drivers/hwtracing/coresight/coresight-etm-perf.c
··· 24 24 static struct pmu etm_pmu; 25 25 static bool etm_perf_up; 26 26 27 - static DEFINE_PER_CPU(struct perf_output_handle, ctx_handle); 27 + /* 28 + * An ETM context for a running event includes the perf aux handle 29 + * and aux_data. For ETM, the aux_data (etm_event_data), consists of 30 + * the trace path and the sink configuration. The event data is accessible 31 + * via perf_get_aux(handle). However, a sink could "end" a perf output 32 + * handle via the IRQ handler. And if the "sink" encounters a failure 33 + * to "begin" another session (e.g due to lack of space in the buffer), 34 + * the handle will be cleared. Thus, the event_data may not be accessible 35 + * from the handle when we get to the etm_event_stop(), which is required 36 + * for stopping the trace path. The event_data is guaranteed to stay alive 37 + * until "free_aux()", which cannot happen as long as the event is active on 38 + * the ETM. Thus the event_data for the session must be part of the ETM context 39 + * to make sure we can disable the trace path. 40 + */ 41 + struct etm_ctxt { 42 + struct perf_output_handle handle; 43 + struct etm_event_data *event_data; 44 + }; 45 + 46 + static DEFINE_PER_CPU(struct etm_ctxt, etm_ctxt); 28 47 static DEFINE_PER_CPU(struct coresight_device *, csdev_src); 29 48 30 49 /* ··· 251 232 schedule_work(&event_data->work); 252 233 } 253 234 235 + /* 236 + * Check if two given sinks are compatible with each other, 237 + * so that they can use the same sink buffers, when an event 238 + * moves around. 239 + */ 240 + static bool sinks_compatible(struct coresight_device *a, 241 + struct coresight_device *b) 242 + { 243 + if (!a || !b) 244 + return false; 245 + /* 246 + * If the sinks are of the same subtype and driven 247 + * by the same driver, we can use the same buffer 248 + * on these sinks. 
249 + */ 250 + return (a->subtype.sink_subtype == b->subtype.sink_subtype) && 251 + (sink_ops(a) == sink_ops(b)); 252 + } 253 + 254 254 static void *etm_setup_aux(struct perf_event *event, void **pages, 255 255 int nr_pages, bool overwrite) 256 256 { ··· 277 239 int cpu = event->cpu; 278 240 cpumask_t *mask; 279 241 struct coresight_device *sink = NULL; 242 + struct coresight_device *user_sink = NULL, *last_sink = NULL; 280 243 struct etm_event_data *event_data = NULL; 281 244 282 245 event_data = alloc_event_data(cpu); ··· 288 249 /* First get the selected sink from user space. */ 289 250 if (event->attr.config2) { 290 251 id = (u32)event->attr.config2; 291 - sink = coresight_get_sink_by_id(id); 252 + sink = user_sink = coresight_get_sink_by_id(id); 292 253 } 293 254 294 255 mask = &event_data->mask; ··· 316 277 } 317 278 318 279 /* 319 - * No sink provided - look for a default sink for one of the 320 - * devices. At present we only support topology where all CPUs 321 - * use the same sink [N:1], so only need to find one sink. The 322 - * coresight_build_path later will remove any CPU that does not 323 - * attach to the sink, or if we have not found a sink. 280 + * No sink provided - look for a default sink for all the ETMs, 281 + * where this event can be scheduled. 282 + * We allocate the sink specific buffers only once for this 283 + * event. If the ETMs have different default sink devices, we 284 + * can only use a single "type" of sink as the event can carry 285 + * only one sink specific buffer. Thus we have to make sure 286 + * that the sinks are of the same type and driven by the same 287 + * driver, as the one we allocate the buffer for. As such 288 + * we choose the first sink and check if the remaining ETMs 289 + * have a compatible default sink. We don't trace on a CPU 290 + * if the sink is not compatible. 
324 291 */ 325 - if (!sink) 292 + if (!user_sink) { 293 + /* Find the default sink for this ETM */ 326 294 sink = coresight_find_default_sink(csdev); 295 + if (!sink) { 296 + cpumask_clear_cpu(cpu, mask); 297 + continue; 298 + } 299 + 300 + /* Check if this sink compatible with the last sink */ 301 + if (last_sink && !sinks_compatible(last_sink, sink)) { 302 + cpumask_clear_cpu(cpu, mask); 303 + continue; 304 + } 305 + last_sink = sink; 306 + } 327 307 328 308 /* 329 309 * Building a path doesn't enable it, it simply builds a ··· 370 312 if (!sink_ops(sink)->alloc_buffer || !sink_ops(sink)->free_buffer) 371 313 goto err; 372 314 373 - /* Allocate the sink buffer for this session */ 315 + /* 316 + * Allocate the sink buffer for this session. All the sinks 317 + * where this event can be scheduled are ensured to be of the 318 + * same type. Thus the same sink configuration is used by the 319 + * sinks. 320 + */ 374 321 event_data->snk_config = 375 322 sink_ops(sink)->alloc_buffer(sink, event, pages, 376 323 nr_pages, overwrite); ··· 395 332 { 396 333 int cpu = smp_processor_id(); 397 334 struct etm_event_data *event_data; 398 - struct perf_output_handle *handle = this_cpu_ptr(&ctx_handle); 335 + struct etm_ctxt *ctxt = this_cpu_ptr(&etm_ctxt); 336 + struct perf_output_handle *handle = &ctxt->handle; 399 337 struct coresight_device *sink, *csdev = per_cpu(csdev_src, cpu); 400 338 struct list_head *path; 401 339 402 340 if (!csdev) 341 + goto fail; 342 + 343 + /* Have we messed up our tracking ? 
*/ 344 + if (WARN_ON(ctxt->event_data)) 403 345 goto fail; 404 346 405 347 /* ··· 442 374 if (source_ops(csdev)->enable(csdev, event, CS_MODE_PERF)) 443 375 goto fail_disable_path; 444 376 377 + /* Save the event_data for this ETM */ 378 + ctxt->event_data = event_data; 445 379 out: 446 380 return; 447 381 ··· 462 392 int cpu = smp_processor_id(); 463 393 unsigned long size; 464 394 struct coresight_device *sink, *csdev = per_cpu(csdev_src, cpu); 465 - struct perf_output_handle *handle = this_cpu_ptr(&ctx_handle); 466 - struct etm_event_data *event_data = perf_get_aux(handle); 395 + struct etm_ctxt *ctxt = this_cpu_ptr(&etm_ctxt); 396 + struct perf_output_handle *handle = &ctxt->handle; 397 + struct etm_event_data *event_data; 467 398 struct list_head *path; 468 399 400 + /* 401 + * If we still have access to the event_data via handle, 402 + * confirm that we haven't messed up the tracking. 403 + */ 404 + if (handle->event && 405 + WARN_ON(perf_get_aux(handle) != ctxt->event_data)) 406 + return; 407 + 408 + event_data = ctxt->event_data; 409 + /* Clear the event_data as this ETM is stopping the trace. */ 410 + ctxt->event_data = NULL; 411 + 469 412 if (event->hw.state == PERF_HES_STOPPED) 413 + return; 414 + 415 + /* We must have a valid event_data for a running event */ 416 + if (WARN_ON(!event_data)) 470 417 return; 471 418 472 419 if (!csdev) ··· 503 416 /* tell the core */ 504 417 event->hw.state = PERF_HES_STOPPED; 505 418 506 - if (mode & PERF_EF_UPDATE) { 419 + /* 420 + * If the handle is not bound to an event anymore 421 + * (e.g, the sink driver was unable to restart the 422 + * handle due to lack of buffer space), we don't 423 + * have to do anything here. 424 + */ 425 + if (handle->event && (mode & PERF_EF_UPDATE)) { 507 426 if (WARN_ON_ONCE(handle->event != event)) 508 427 return; 509 428
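When no sink is named by the user, `etm_setup_aux()` now tolerates differing per-CPU default sinks only if they can share one sink buffer: same sink subtype and same driver ops. A stand-in mirroring `sinks_compatible()` with the two discriminating fields pulled out into a hypothetical struct (the real check compares `csdev->subtype.sink_subtype` and `sink_ops(csdev)`):

```c
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

struct sink {
	int subtype;     /* stands in for csdev->subtype.sink_subtype */
	const void *ops; /* stands in for sink_ops(csdev) */
};

/* Two sinks can share one event buffer iff subtype and driver match. */
static bool sinks_compatible(const struct sink *a, const struct sink *b)
{
	if (!a || !b)
		return false;

	return a->subtype == b->subtype && a->ops == b->ops;
}
```

Any CPU whose default sink fails this test against the first one chosen is simply cleared from the event's cpumask rather than failing the whole session.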
+137 -24
drivers/hwtracing/coresight/coresight-etm4x-core.c
··· 31 31 #include <linux/pm_runtime.h> 32 32 #include <linux/property.h> 33 33 34 + #include <asm/barrier.h> 34 35 #include <asm/sections.h> 35 36 #include <asm/sysreg.h> 36 37 #include <asm/local.h> ··· 115 114 } 116 115 } 117 116 118 - static void etm4_os_unlock_csa(struct etmv4_drvdata *drvdata, struct csdev_access *csa) 117 + static u64 ete_sysreg_read(u32 offset, bool _relaxed, bool _64bit) 119 118 { 120 - /* Writing 0 to TRCOSLAR unlocks the trace registers */ 121 - etm4x_relaxed_write32(csa, 0x0, TRCOSLAR); 122 - drvdata->os_unlock = true; 119 + u64 res = 0; 120 + 121 + switch (offset) { 122 + ETE_READ_CASES(res) 123 + default : 124 + pr_warn_ratelimited("ete: trying to read unsupported register @%x\n", 125 + offset); 126 + } 127 + 128 + if (!_relaxed) 129 + __iormb(res); /* Imitate the !relaxed I/O helpers */ 130 + 131 + return res; 132 + } 133 + 134 + static void ete_sysreg_write(u64 val, u32 offset, bool _relaxed, bool _64bit) 135 + { 136 + if (!_relaxed) 137 + __iowmb(); /* Imitate the !relaxed I/O helpers */ 138 + if (!_64bit) 139 + val &= GENMASK(31, 0); 140 + 141 + switch (offset) { 142 + ETE_WRITE_CASES(val) 143 + default : 144 + pr_warn_ratelimited("ete: trying to write to unsupported register @%x\n", 145 + offset); 146 + } 147 + } 148 + 149 + static void etm_detect_os_lock(struct etmv4_drvdata *drvdata, 150 + struct csdev_access *csa) 151 + { 152 + u32 oslsr = etm4x_relaxed_read32(csa, TRCOSLSR); 153 + 154 + drvdata->os_lock_model = ETM_OSLSR_OSLM(oslsr); 155 + } 156 + 157 + static void etm_write_os_lock(struct etmv4_drvdata *drvdata, 158 + struct csdev_access *csa, u32 val) 159 + { 160 + val = !!val; 161 + 162 + switch (drvdata->os_lock_model) { 163 + case ETM_OSLOCK_PRESENT: 164 + etm4x_relaxed_write32(csa, val, TRCOSLAR); 165 + break; 166 + case ETM_OSLOCK_PE: 167 + write_sysreg_s(val, SYS_OSLAR_EL1); 168 + break; 169 + default: 170 + pr_warn_once("CPU%d: Unsupported Trace OSLock model: %x\n", 171 + smp_processor_id(), drvdata->os_lock_model); 
172 + fallthrough; 173 + case ETM_OSLOCK_NI: 174 + return; 175 + } 123 176 isb(); 177 + } 178 + 179 + static inline void etm4_os_unlock_csa(struct etmv4_drvdata *drvdata, 180 + struct csdev_access *csa) 181 + { 182 + WARN_ON(drvdata->cpu != smp_processor_id()); 183 + 184 + /* Writing 0 to OS Lock unlocks the trace unit registers */ 185 + etm_write_os_lock(drvdata, csa, 0x0); 186 + drvdata->os_unlock = true; 124 187 } 125 188 126 189 static void etm4_os_unlock(struct etmv4_drvdata *drvdata) 127 190 { 128 191 if (!WARN_ON(!drvdata->csdev)) 129 192 etm4_os_unlock_csa(drvdata, &drvdata->csdev->access); 130 - 131 193 } 132 194 133 195 static void etm4_os_lock(struct etmv4_drvdata *drvdata) 134 196 { 135 197 if (WARN_ON(!drvdata->csdev)) 136 198 return; 137 - 138 - /* Writing 0x1 to TRCOSLAR locks the trace registers */ 139 - etm4x_relaxed_write32(&drvdata->csdev->access, 0x1, TRCOSLAR); 199 + /* Writing 0x1 to OS Lock locks the trace registers */ 200 + etm_write_os_lock(drvdata, &drvdata->csdev->access, 0x1); 140 201 drvdata->os_unlock = false; 141 - isb(); 142 202 } 143 203 144 204 static void etm4_cs_lock(struct etmv4_drvdata *drvdata, ··· 432 370 */ 433 371 etm4x_relaxed_write32(csa, trcpdcr | TRCPDCR_PU, TRCPDCR); 434 372 } 373 + 374 + /* 375 + * ETE mandates that the TRCRSR is written to before 376 + * enabling it. 377 + */ 378 + if (etm4x_is_ete(drvdata)) 379 + etm4x_relaxed_write32(csa, TRCRSR_TA, TRCRSR); 435 380 436 381 /* Enable the trace unit */ 437 382 etm4x_relaxed_write32(csa, 1, TRCPRGCTLR); ··· 723 654 static void etm4_disable_hw(void *info) 724 655 { 725 656 u32 control; 657 + u64 trfcr; 726 658 struct etmv4_drvdata *drvdata = info; 727 659 struct etmv4_config *config = &drvdata->config; 728 660 struct coresight_device *csdev = drvdata->csdev; ··· 747 677 control &= ~0x1; 748 678 749 679 /* 680 + * If the CPU supports v8.4 Trace filter Control, 681 + * set the ETM to trace prohibited region. 
682 + */ 683 + if (drvdata->trfc) { 684 + trfcr = read_sysreg_s(SYS_TRFCR_EL1); 685 + write_sysreg_s(trfcr & ~(TRFCR_ELx_ExTRE | TRFCR_ELx_E0TRE), 686 + SYS_TRFCR_EL1); 687 + isb(); 688 + } 689 + /* 750 690 * Make sure everything completes before disabling, as recommended 751 691 * by section 7.3.77 ("TRCVICTLR, ViewInst Main Control Register, 752 692 * SSTATUS") of ARM IHI 0064D 753 693 */ 754 694 dsb(sy); 755 695 isb(); 696 + /* Trace synchronization barrier, is a nop if not supported */ 697 + tsb_csync(); 756 698 etm4x_relaxed_write32(csa, control, TRCPRGCTLR); 757 699 758 700 /* wait for TRCSTATR.PMSTABLE to go to '1' */ 759 701 if (coresight_timeout(csa, TRCSTATR, TRCSTATR_PMSTABLE_BIT, 1)) 760 702 dev_err(etm_dev, 761 703 "timeout while waiting for PM stable Trace Status\n"); 704 + if (drvdata->trfc) 705 + write_sysreg_s(trfcr, SYS_TRFCR_EL1); 762 706 763 707 /* read the status of the single shot comparators */ 764 708 for (i = 0; i < drvdata->nr_ss_cmp; i++) { ··· 901 817 * ETMs implementing sysreg access must implement TRCDEVARCH. 
902 818 */ 903 819 devarch = read_etm4x_sysreg_const_offset(TRCDEVARCH); 904 - if ((devarch & ETM_DEVARCH_ID_MASK) != ETM_DEVARCH_ETMv4x_ARCH) 820 + switch (devarch & ETM_DEVARCH_ID_MASK) { 821 + case ETM_DEVARCH_ETMv4x_ARCH: 822 + *csa = (struct csdev_access) { 823 + .io_mem = false, 824 + .read = etm4x_sysreg_read, 825 + .write = etm4x_sysreg_write, 826 + }; 827 + break; 828 + case ETM_DEVARCH_ETE_ARCH: 829 + *csa = (struct csdev_access) { 830 + .io_mem = false, 831 + .read = ete_sysreg_read, 832 + .write = ete_sysreg_write, 833 + }; 834 + break; 835 + default: 905 836 return false; 906 - *csa = (struct csdev_access) { 907 - .io_mem = false, 908 - .read = etm4x_sysreg_read, 909 - .write = etm4x_sysreg_write, 910 - }; 837 + } 911 838 912 839 drvdata->arch = etm_devarch_to_arch(devarch); 913 840 return true; ··· 968 873 return false; 969 874 } 970 875 971 - static void cpu_enable_tracing(void) 876 + static void cpu_enable_tracing(struct etmv4_drvdata *drvdata) 972 877 { 973 878 u64 dfr0 = read_sysreg(id_aa64dfr0_el1); 974 879 u64 trfcr; ··· 976 881 if (!cpuid_feature_extract_unsigned_field(dfr0, ID_AA64DFR0_TRACE_FILT_SHIFT)) 977 882 return; 978 883 884 + drvdata->trfc = true; 979 885 /* 980 886 * If the CPU supports v8.4 SelfHosted Tracing, enable 981 887 * tracing at the kernel EL and EL0, forcing to use the ··· 1015 919 */ 1016 920 if (!etm4_init_csdev_access(drvdata, csa)) 1017 921 return; 922 + 923 + /* Detect the support for OS Lock before we actually use it */ 924 + etm_detect_os_lock(drvdata, csa); 1018 925 1019 926 /* Make sure all registers are accessible */ 1020 927 etm4_os_unlock_csa(drvdata, csa); ··· 1181 1082 /* NUMCNTR, bits[30:28] number of counters available for tracing */ 1182 1083 drvdata->nr_cntr = BMVAL(etmidr5, 28, 30); 1183 1084 etm4_cs_lock(drvdata, csa); 1184 - cpu_enable_tracing(); 1085 + cpu_enable_tracing(drvdata); 1185 1086 } 1186 1087 1187 1088 static inline u32 etm4_get_victlr_access_type(struct etmv4_config *config) ··· 1859 1760 
struct etmv4_drvdata *drvdata; 1860 1761 struct coresight_desc desc = { 0 }; 1861 1762 struct etm4_init_arg init_arg = { 0 }; 1763 + u8 major, minor; 1764 + char *type_name; 1862 1765 1863 1766 drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL); 1864 1767 if (!drvdata) ··· 1887 1786 if (drvdata->cpu < 0) 1888 1787 return drvdata->cpu; 1889 1788 1890 - desc.name = devm_kasprintf(dev, GFP_KERNEL, "etm%d", drvdata->cpu); 1891 - if (!desc.name) 1892 - return -ENOMEM; 1893 - 1894 1789 init_arg.drvdata = drvdata; 1895 1790 init_arg.csa = &desc.access; 1896 1791 init_arg.pid = etm_pid; ··· 1902 1805 if (!desc.access.io_mem || 1903 1806 fwnode_property_present(dev_fwnode(dev), "qcom,skip-power-up")) 1904 1807 drvdata->skip_power_up = true; 1808 + 1809 + major = ETM_ARCH_MAJOR_VERSION(drvdata->arch); 1810 + minor = ETM_ARCH_MINOR_VERSION(drvdata->arch); 1811 + 1812 + if (etm4x_is_ete(drvdata)) { 1813 + type_name = "ete"; 1814 + /* ETE v1 has major version == 0b101. Adjust this for logging.*/ 1815 + major -= 4; 1816 + } else { 1817 + type_name = "etm"; 1818 + } 1819 + 1820 + desc.name = devm_kasprintf(dev, GFP_KERNEL, 1821 + "%s%d", type_name, drvdata->cpu); 1822 + if (!desc.name) 1823 + return -ENOMEM; 1905 1824 1906 1825 etm4_init_trace_id(drvdata); 1907 1826 etm4_set_default(&drvdata->config); ··· 1946 1833 1947 1834 etmdrvdata[drvdata->cpu] = drvdata; 1948 1835 1949 - dev_info(&drvdata->csdev->dev, "CPU%d: ETM v%d.%d initialized\n", 1950 - drvdata->cpu, ETM_ARCH_MAJOR_VERSION(drvdata->arch), 1951 - ETM_ARCH_MINOR_VERSION(drvdata->arch)); 1836 + dev_info(&drvdata->csdev->dev, "CPU%d: %s v%d.%d initialized\n", 1837 + drvdata->cpu, type_name, major, minor); 1952 1838 1953 1839 if (boot_enable) { 1954 1840 coresight_enable(drvdata->csdev); ··· 2090 1978 2091 1979 static const struct of_device_id etm4_sysreg_match[] = { 2092 1980 { .compatible = "arm,coresight-etm4x-sysreg" }, 1981 + { .compatible = "arm,embedded-trace-extension" }, 2093 1982 {} 2094 1983 }; 2095 1984
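The TRCDEVARCH switch introduced in the core driver above can be sketched in plain C. This is a minimal userspace sketch: the field layout follows the TRCDEVARCH description in coresight-etm4x.h (ARCHITECT[31:21], PRESENT[20], ARCHID = ARCHVER | part 0xA13), but the macro and enum names here are illustrative, not the driver's.

```c
#include <assert.h>
#include <stdint.h>

/* TRCDEVARCH fields (sketch): ARCHITECT[31:21] = Arm (0x23B),
 * PRESENT[20], ARCHID[15:0] = (ARCHVER << 12) | 0xA13. */
#define DEVARCH_ARCHITECT_ARM	(0x23BU << 21)
#define DEVARCH_PRESENT		(1U << 20)
#define DEVARCH_ARCHID(ver)	(((ver) << 12) | 0xA13U)
#define DEVARCH_ID_MASK		((0x7FFU << 21) | DEVARCH_PRESENT | 0xFFFFU)

enum trace_unit { TRACE_NONE, TRACE_ETM4X, TRACE_ETE };

/* Mirror of the devarch switch: mask out the ID fields, then
 * classify the unit as ETMv4x (ARCHVER 4), ETE (ARCHVER 5), or
 * unsupported, just as etm4_init_csdev_access() picks the
 * matching csdev_access callbacks. */
static enum trace_unit classify_devarch(uint32_t devarch)
{
	switch (devarch & DEVARCH_ID_MASK) {
	case DEVARCH_ARCHITECT_ARM | DEVARCH_PRESENT | DEVARCH_ARCHID(0x4):
		return TRACE_ETM4X;
	case DEVARCH_ARCHITECT_ARM | DEVARCH_PRESENT | DEVARCH_ARCHID(0x5):
		return TRACE_ETE;
	default:
		return TRACE_NONE;
	}
}
```

In the driver the same classification also selects the register access callbacks (etm4x_sysreg_read/write vs ete_sysreg_read/write); the sketch keeps only the decision itself.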
+16 -3
drivers/hwtracing/coresight/coresight-etm4x-sysfs.c
··· 2374 2374 etm4x_register_implemented(struct etmv4_drvdata *drvdata, u32 offset) 2375 2375 { 2376 2376 switch (offset) { 2377 - ETM4x_SYSREG_LIST_CASES 2377 + ETM_COMMON_SYSREG_LIST_CASES 2378 2378 /* 2379 - * Registers accessible via system instructions are always 2380 - * implemented. 2379 + * Common registers to ETE & ETM4x accessible via system 2380 + * instructions are always implemented. 2381 2381 */ 2382 2382 return true; 2383 + 2384 + ETM4x_ONLY_SYSREG_LIST_CASES 2385 + /* 2386 + * We only support etm4x and ete. So if the device is not 2387 + * ETE, it must be ETMv4x. 2388 + */ 2389 + return !etm4x_is_ete(drvdata); 2390 + 2383 2391 ETM4x_MMAP_LIST_CASES 2384 2392 /* 2385 2393 * Registers accessible only via memory-mapped registers ··· 2397 2389 * coresight_register() and the csdev is not initialized 2398 2390 * until that is done. So rely on the drvdata->base to 2399 2391 * detect if we have a memory mapped access. 2392 + * Also ETE doesn't implement memory mapped access, thus 2393 + * it is sufficient to check that we are using mmio. 2400 2394 */ 2401 2395 return !!drvdata->base; 2396 + 2397 + ETE_ONLY_SYSREG_LIST_CASES 2398 + return etm4x_is_ete(drvdata); 2402 2399 } 2403 2400 2404 2401 return false;
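The etm4x_register_implemented() split above boils down to a four-way decision. A hedged sketch, with a hypothetical reg_class enum standing in for the CASE-list macros:

```c
#include <assert.h>
#include <stdbool.h>

/* Hypothetical register classes mirroring the header's CASE lists */
enum reg_class { REG_COMMON_SYSREG, REG_ETM4X_ONLY, REG_ETE_ONLY, REG_MMAP_ONLY };

/* Sketch of the dispatch: registers common to ETE and ETM4x are always
 * implemented; ETM4x-only and ETE-only ones depend on the device flavour
 * (only those two flavours are supported); memory-mapped-only registers
 * need an MMIO base, which ETE never has. */
static bool register_implemented(enum reg_class cls, bool is_ete, bool has_mmio_base)
{
	switch (cls) {
	case REG_COMMON_SYSREG:
		return true;
	case REG_ETM4X_ONLY:
		return !is_ete;
	case REG_ETE_ONLY:
		return is_ete;
	case REG_MMAP_ONLY:
		return has_mmio_base;
	}
	return false;
}
```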
+74 -9
drivers/hwtracing/coresight/coresight-etm4x.h
··· 29 29 #define TRCAUXCTLR 0x018 30 30 #define TRCEVENTCTL0R 0x020 31 31 #define TRCEVENTCTL1R 0x024 32 + #define TRCRSR 0x028 32 33 #define TRCSTALLCTLR 0x02C 33 34 #define TRCTSCTLR 0x030 34 35 #define TRCSYNCPR 0x034 ··· 50 49 #define TRCSEQRSTEVR 0x118 51 50 #define TRCSEQSTR 0x11C 52 51 #define TRCEXTINSELR 0x120 52 + #define TRCEXTINSELRn(n) (0x120 + (n * 4)) /* n = 0-3 */ 53 53 #define TRCCNTRLDVRn(n) (0x140 + (n * 4)) /* n = 0-3 */ 54 54 #define TRCCNTCTLRn(n) (0x150 + (n * 4)) /* n = 0-3 */ 55 55 #define TRCCNTVRn(n) (0x160 + (n * 4)) /* n = 0-3 */ ··· 128 126 #define TRCCIDR2 0xFF8 129 127 #define TRCCIDR3 0xFFC 130 128 129 + #define TRCRSR_TA BIT(12) 130 + 131 131 /* 132 132 * System instructions to access ETM registers. 133 133 * See ETMv4.4 spec ARM IHI0064F section 4.3.6 System instructions ··· 164 160 #define CASE_NOP(__unused, x) \ 165 161 case (x): /* fall through */ 166 162 163 + #define ETE_ONLY_SYSREG_LIST(op, val) \ 164 + CASE_##op((val), TRCRSR) \ 165 + CASE_##op((val), TRCEXTINSELRn(1)) \ 166 + CASE_##op((val), TRCEXTINSELRn(2)) \ 167 + CASE_##op((val), TRCEXTINSELRn(3)) 168 + 167 169 /* List of registers accessible via System instructions */ 168 - #define ETM_SYSREG_LIST(op, val) \ 169 - CASE_##op((val), TRCPRGCTLR) \ 170 + #define ETM4x_ONLY_SYSREG_LIST(op, val) \ 170 171 CASE_##op((val), TRCPROCSELR) \ 172 + CASE_##op((val), TRCVDCTLR) \ 173 + CASE_##op((val), TRCVDSACCTLR) \ 174 + CASE_##op((val), TRCVDARCCTLR) \ 175 + CASE_##op((val), TRCOSLAR) 176 + 177 + #define ETM_COMMON_SYSREG_LIST(op, val) \ 178 + CASE_##op((val), TRCPRGCTLR) \ 171 179 CASE_##op((val), TRCSTATR) \ 172 180 CASE_##op((val), TRCCONFIGR) \ 173 181 CASE_##op((val), TRCAUXCTLR) \ ··· 196 180 CASE_##op((val), TRCVIIECTLR) \ 197 181 CASE_##op((val), TRCVISSCTLR) \ 198 182 CASE_##op((val), TRCVIPCSSCTLR) \ 199 - CASE_##op((val), TRCVDCTLR) \ 200 - CASE_##op((val), TRCVDSACCTLR) \ 201 - CASE_##op((val), TRCVDARCCTLR) \ 202 183 CASE_##op((val), TRCSEQEVRn(0)) \ 203 184 
CASE_##op((val), TRCSEQEVRn(1)) \ 204 185 CASE_##op((val), TRCSEQEVRn(2)) \ ··· 290 277 CASE_##op((val), TRCSSPCICRn(5)) \ 291 278 CASE_##op((val), TRCSSPCICRn(6)) \ 292 279 CASE_##op((val), TRCSSPCICRn(7)) \ 293 - CASE_##op((val), TRCOSLAR) \ 294 280 CASE_##op((val), TRCOSLSR) \ 295 281 CASE_##op((val), TRCACVRn(0)) \ 296 282 CASE_##op((val), TRCACVRn(1)) \ ··· 381 369 CASE_##op((val), TRCPIDR2) \ 382 370 CASE_##op((val), TRCPIDR3) 383 371 384 - #define ETM4x_READ_SYSREG_CASES(res) ETM_SYSREG_LIST(READ, (res)) 385 - #define ETM4x_WRITE_SYSREG_CASES(val) ETM_SYSREG_LIST(WRITE, (val)) 372 + #define ETM4x_READ_SYSREG_CASES(res) \ 373 + ETM_COMMON_SYSREG_LIST(READ, (res)) \ 374 + ETM4x_ONLY_SYSREG_LIST(READ, (res)) 386 375 387 - #define ETM4x_SYSREG_LIST_CASES ETM_SYSREG_LIST(NOP, __unused) 376 + #define ETM4x_WRITE_SYSREG_CASES(val) \ 377 + ETM_COMMON_SYSREG_LIST(WRITE, (val)) \ 378 + ETM4x_ONLY_SYSREG_LIST(WRITE, (val)) 379 + 380 + #define ETM_COMMON_SYSREG_LIST_CASES \ 381 + ETM_COMMON_SYSREG_LIST(NOP, __unused) 382 + 383 + #define ETM4x_ONLY_SYSREG_LIST_CASES \ 384 + ETM4x_ONLY_SYSREG_LIST(NOP, __unused) 385 + 386 + #define ETM4x_SYSREG_LIST_CASES \ 387 + ETM_COMMON_SYSREG_LIST_CASES \ 388 + ETM4x_ONLY_SYSREG_LIST(NOP, __unused) 389 + 388 390 #define ETM4x_MMAP_LIST_CASES ETM_MMAP_LIST(NOP, __unused) 391 + 392 + /* ETE only supports system register access */ 393 + #define ETE_READ_CASES(res) \ 394 + ETM_COMMON_SYSREG_LIST(READ, (res)) \ 395 + ETE_ONLY_SYSREG_LIST(READ, (res)) 396 + 397 + #define ETE_WRITE_CASES(val) \ 398 + ETM_COMMON_SYSREG_LIST(WRITE, (val)) \ 399 + ETE_ONLY_SYSREG_LIST(WRITE, (val)) 400 + 401 + #define ETE_ONLY_SYSREG_LIST_CASES \ 402 + ETE_ONLY_SYSREG_LIST(NOP, __unused) 389 403 390 404 #define read_etm4x_sysreg_offset(offset, _64bit) \ 391 405 ({ \ ··· 544 506 ETM_MODE_EXCL_USER) 545 507 546 508 /* 509 + * TRCOSLSR.OSLM advertises the OS Lock model. 510 + * OSLM[2:0] = TRCOSLSR[4:3,0] 511 + * 512 + * 0b000 - Trace OS Lock is not implemented. 
513 + * 0b010 - Trace OS Lock is implemented. 514 + * 0b100 - Trace OS Lock is not implemented, unit is controlled by PE OS Lock. 515 + */ 516 + #define ETM_OSLOCK_NI 0b000 517 + #define ETM_OSLOCK_PRESENT 0b010 518 + #define ETM_OSLOCK_PE 0b100 519 + 520 + #define ETM_OSLSR_OSLM(oslsr) ((((oslsr) & GENMASK(4, 3)) >> 2) | (oslsr & 0x1)) 521 + 522 + /* 547 523 * TRCDEVARCH Bit field definitions 548 524 * Bits[31:21] - ARCHITECT = Always Arm Ltd. 549 525 * * Bits[31:28] = 0x4 ··· 593 541 ((ETM_DEVARCH_MAKE_ARCHID_ARCH_VER(major)) | ETM_DEVARCH_ARCHID_ARCH_PART(0xA13)) 594 542 595 543 #define ETM_DEVARCH_ARCHID_ETMv4x ETM_DEVARCH_MAKE_ARCHID(0x4) 544 + #define ETM_DEVARCH_ARCHID_ETE ETM_DEVARCH_MAKE_ARCHID(0x5) 596 545 597 546 #define ETM_DEVARCH_ID_MASK \ 598 547 (ETM_DEVARCH_ARCHITECT_MASK | ETM_DEVARCH_ARCHID_MASK | ETM_DEVARCH_PRESENT) 599 548 #define ETM_DEVARCH_ETMv4x_ARCH \ 600 549 (ETM_DEVARCH_ARCHITECT_ARM | ETM_DEVARCH_ARCHID_ETMv4x | ETM_DEVARCH_PRESENT) 550 + #define ETM_DEVARCH_ETE_ARCH \ 551 + (ETM_DEVARCH_ARCHITECT_ARM | ETM_DEVARCH_ARCHID_ETE | ETM_DEVARCH_PRESENT) 601 552 602 553 #define TRCSTATR_IDLE_BIT 0 603 554 #define TRCSTATR_PMSTABLE_BIT 1 ··· 690 635 #define ETM_ARCH_MINOR_VERSION(arch) ((arch) & 0xfU) 691 636 692 637 #define ETM_ARCH_V4 ETM_ARCH_VERSION(4, 0) 638 + #define ETM_ARCH_ETE ETM_ARCH_VERSION(5, 0) 639 + 693 640 /* Interpretation of resource numbers change at ETM v4.3 architecture */ 694 641 #define ETM_ARCH_V4_3 ETM_ARCH_VERSION(4, 3) 695 642 ··· 919 862 * @nooverflow: Indicate if overflow prevention is supported. 920 863 * @atbtrig: If the implementation can support ATB triggers 921 864 * @lpoverride: If the implementation can support low-power state over. 865 + * @trfc: If the implementation supports Arm v8.4 trace filter controls. 922 866 * @config: structure holding configuration parameters. 
923 867 * @save_state: State to be preserved across power loss 924 868 * @state_needs_restore: True when there is context to restore after PM exit ··· 955 897 u8 s_ex_level; 956 898 u8 ns_ex_level; 957 899 u8 q_support; 900 + u8 os_lock_model; 958 901 bool sticky_enable; 959 902 bool boot_enable; 960 903 bool os_unlock; ··· 971 912 bool nooverflow; 972 913 bool atbtrig; 973 914 bool lpoverride; 915 + bool trfc; 974 916 struct etmv4_config config; 975 917 struct etmv4_save_state *save_state; 976 918 bool state_needs_restore; ··· 1000 940 1001 941 u64 etm4x_sysreg_read(u32 offset, bool _relaxed, bool _64bit); 1002 942 void etm4x_sysreg_write(u64 val, u32 offset, bool _relaxed, bool _64bit); 943 + 944 + static inline bool etm4x_is_ete(struct etmv4_drvdata *drvdata) 945 + { 946 + return drvdata->arch >= ETM_ARCH_ETE; 947 + } 1003 948 #endif
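The ETM_OSLSR_OSLM() bit-gathering in the header above is easy to misread, since OSLM[2:0] is scattered across TRCOSLSR bits [4:3] and [0]. A standalone sketch of the same arithmetic (names here are local, not the kernel macros):

```c
#include <assert.h>
#include <stdint.h>

/* OSLM[2:0] = TRCOSLSR[4:3,0]: bits 4:3 land in OSLM[2:1], bit 0 in
 * OSLM[0]. 0x18 is GENMASK(4, 3). */
#define OSLSR_OSLM(oslsr)	((((oslsr) & 0x18U) >> 2) | ((oslsr) & 0x1U))

#define OSLOCK_NI	0x0	/* 0b000: Trace OS Lock not implemented */
#define OSLOCK_PRESENT	0x2	/* 0b010: Trace OS Lock implemented */
#define OSLOCK_PE	0x4	/* 0b100: unit controlled by PE OS Lock */
```

For instance, a TRCOSLSR value with only bit 3 set decodes to 0b010, the "Trace OS Lock implemented" model that etm_detect_os_lock() checks for.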
+6
drivers/hwtracing/coresight/coresight-platform.c
··· 90 90 struct of_endpoint endpoint; 91 91 int in = 0, out = 0; 92 92 93 + /* 94 + * Avoid warnings in of_graph_get_next_endpoint() 95 + * if the device doesn't have any graph connections 96 + */ 97 + if (!of_graph_is_present(node)) 98 + return; 93 99 do { 94 100 ep = of_graph_get_next_endpoint(node, ep); 95 101 if (!ep)
+3
drivers/hwtracing/coresight/coresight-priv.h
··· 232 232 void coresight_set_assoc_ectdev_mutex(struct coresight_device *csdev, 233 233 struct coresight_device *ect_csdev); 234 234 235 + void coresight_set_percpu_sink(int cpu, struct coresight_device *csdev); 236 + struct coresight_device *coresight_get_percpu_sink(int cpu); 237 + 235 238 #endif
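The coresight_set_percpu_sink()/coresight_get_percpu_sink() pair declared above lets a per-cpu source (ETE) find the sink (TRBE) on its own CPU without walking the connection graph. A minimal sketch of the idea, using a plain array in place of the kernel's per-cpu storage (all names and the fixed CPU count are assumptions for illustration):

```c
#include <assert.h>
#include <stddef.h>

#define NR_CPUS_SKETCH 8

/* Opaque stand-in for struct coresight_device */
struct coresight_device;

/* One sink slot per CPU; the TRBE probe registers itself here and the
 * ETE source looks its local sink up by CPU number. */
static struct coresight_device *percpu_sink[NR_CPUS_SKETCH];

static void set_percpu_sink(int cpu, struct coresight_device *csdev)
{
	if (cpu >= 0 && cpu < NR_CPUS_SKETCH)
		percpu_sink[cpu] = csdev;
}

static struct coresight_device *get_percpu_sink(int cpu)
{
	if (cpu < 0 || cpu >= NR_CPUS_SKETCH)
		return NULL;
	return percpu_sink[cpu];
}
```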
+1157
drivers/hwtracing/coresight/coresight-trbe.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * This driver enables the Trace Buffer Extension (TRBE) as a per-cpu coresight 4 + * sink device, which can then pair with an appropriate per-cpu coresight source 5 + * device (ETE), thus generating the required trace data. Trace can be enabled 6 + * via the perf framework. 7 + * 8 + * The AUX buffer handling is inspired by the Arm SPE PMU driver. 9 + * 10 + * Copyright (C) 2020 ARM Ltd. 11 + * 12 + * Author: Anshuman Khandual <anshuman.khandual@arm.com> 13 + */ 14 + #define DRVNAME "arm_trbe" 15 + 16 + #define pr_fmt(fmt) DRVNAME ": " fmt 17 + 18 + #include <asm/barrier.h> 19 + #include "coresight-trbe.h" 20 + 21 + #define PERF_IDX2OFF(idx, buf) ((idx) % ((buf)->nr_pages << PAGE_SHIFT)) 22 + 23 + /* 24 + * A padding packet that will help the user space tools 25 + * in skipping relevant sections in the captured trace 26 + * data which could not be decoded. TRBE doesn't support 27 + * formatting the trace data, unlike the legacy CoreSight 28 + * sinks and thus we use ETE trace packets to pad the 29 + * sections of the buffer. 30 + */ 31 + #define ETE_IGNORE_PACKET 0x70 32 + 33 + /* 34 + * The minimum amount of meaningful trace will contain: 35 + * A-Sync, Trace Info, Trace On, Address, Atom. 36 + * This is about 44 bytes of ETE trace. To be on 37 + * the safe side, we assume 64 bytes is the minimum 38 + * space required for a meaningful session, before 39 + * we hit a "WRAP" event. 40 + */ 41 + #define TRBE_TRACE_MIN_BUF_SIZE 64 42 + 43 + enum trbe_fault_action { 44 + TRBE_FAULT_ACT_WRAP, 45 + TRBE_FAULT_ACT_SPURIOUS, 46 + TRBE_FAULT_ACT_FATAL, 47 + }; 48 + 49 + struct trbe_buf { 50 + /* 51 + * Even though trbe_base represents the vmap() 52 + * mapped allocated buffer's start address, 53 + * it is kept as an unsigned long for various 54 + * arithmetic and comparison operations and 55 + * also to be consistent with the trbe_write and 56 + * trbe_limit sibling pointers.
57 + */ 58 + unsigned long trbe_base; 59 + unsigned long trbe_limit; 60 + unsigned long trbe_write; 61 + int nr_pages; 62 + void **pages; 63 + bool snapshot; 64 + struct trbe_cpudata *cpudata; 65 + }; 66 + 67 + struct trbe_cpudata { 68 + bool trbe_flag; 69 + u64 trbe_align; 70 + int cpu; 71 + enum cs_mode mode; 72 + struct trbe_buf *buf; 73 + struct trbe_drvdata *drvdata; 74 + }; 75 + 76 + struct trbe_drvdata { 77 + struct trbe_cpudata __percpu *cpudata; 78 + struct perf_output_handle * __percpu *handle; 79 + struct hlist_node hotplug_node; 80 + int irq; 81 + cpumask_t supported_cpus; 82 + enum cpuhp_state trbe_online; 83 + struct platform_device *pdev; 84 + }; 85 + 86 + static int trbe_alloc_node(struct perf_event *event) 87 + { 88 + if (event->cpu == -1) 89 + return NUMA_NO_NODE; 90 + return cpu_to_node(event->cpu); 91 + } 92 + 93 + static void trbe_drain_buffer(void) 94 + { 95 + tsb_csync(); 96 + dsb(nsh); 97 + } 98 + 99 + static void trbe_drain_and_disable_local(void) 100 + { 101 + u64 trblimitr = read_sysreg_s(SYS_TRBLIMITR_EL1); 102 + 103 + trbe_drain_buffer(); 104 + 105 + /* 106 + * Disable the TRBE without clearing LIMITPTR which 107 + * might be required for fetching the buffer limits. 108 + */ 109 + trblimitr &= ~TRBLIMITR_ENABLE; 110 + write_sysreg_s(trblimitr, SYS_TRBLIMITR_EL1); 111 + isb(); 112 + } 113 + 114 + static void trbe_reset_local(void) 115 + { 116 + trbe_drain_and_disable_local(); 117 + write_sysreg_s(0, SYS_TRBLIMITR_EL1); 118 + write_sysreg_s(0, SYS_TRBPTR_EL1); 119 + write_sysreg_s(0, SYS_TRBBASER_EL1); 120 + write_sysreg_s(0, SYS_TRBSR_EL1); 121 + } 122 + 123 + static void trbe_stop_and_truncate_event(struct perf_output_handle *handle) 124 + { 125 + struct trbe_buf *buf = etm_perf_sink_config(handle); 126 + 127 + /* 128 + * We cannot proceed with the buffer collection and we 129 + * do not have any data for the current session. The 130 + * etm_perf driver expects to close out the aux_buffer 131 + * at event_stop(). 
So disable the TRBE here and leave 132 + * the update_buffer() to return a 0 size. 133 + */ 134 + trbe_drain_and_disable_local(); 135 + perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED); 136 + *this_cpu_ptr(buf->cpudata->drvdata->handle) = NULL; 137 + } 138 + 139 + /* 140 + * TRBE Buffer Management 141 + * 142 + * The TRBE buffer spans from the base pointer till the limit pointer. When enabled, 143 + * it starts writing trace data from the write pointer onward till the limit pointer. 144 + * When the write pointer reaches the address just before the limit pointer, it gets 145 + * wrapped around again to the base pointer. This is called a TRBE wrap event, which 146 + * generates a maintenance interrupt when operated in WRAP or FILL mode. This driver 147 + * uses FILL mode, where the TRBE stops the trace collection at a wrap event. The IRQ 148 + * handler updates the AUX buffer and re-enables the TRBE with updated WRITE and 149 + * LIMIT pointers. 150 + * 151 + * Wrap around with an IRQ 152 + * ------ < ------ < ------- < ----- < ----- 153 + * | | 154 + * ------ > ------ > ------- > ----- > ----- 155 + * 156 + * +---------------+-----------------------+ 157 + * | | | 158 + * +---------------+-----------------------+ 159 + * Base Pointer Write Pointer Limit Pointer 160 + * 161 + * The base and limit pointers always need to be PAGE_SIZE aligned. But the write 162 + * pointer can be aligned to the implementation defined TRBE trace buffer alignment 163 + * as captured in trbe_cpudata->trbe_align. 164 + * 165 + * 166 + * head tail wakeup 167 + * +---------------------------------------+----- ~ ~ ------ 168 + * |$$$$$$$|################|$$$$$$$$$$$$$$| | 169 + * +---------------------------------------+----- ~ ~ ------ 170 + * Base Pointer Write Pointer Limit Pointer 171 + * 172 + * The perf_output_handle indices (head, tail, wakeup) are monotonically increasing 173 + * values which track all the driver writes and user reads from the perf auxiliary 174 + * buffer.
Generally [head..tail] is the area the driver can write into, unless 175 + * the wakeup is behind the tail. The enabled TRBE buffer span needs to be adjusted and 176 + * configured depending on the perf_output_handle indices, so that the driver does 177 + * not overwrite areas of the perf auxiliary buffer which are being, or are yet to be, 178 + * consumed by the user space. The enabled TRBE buffer area is a moving subset of 179 + * the allocated perf auxiliary buffer. 180 + */ 181 + static void trbe_pad_buf(struct perf_output_handle *handle, int len) 182 + { 183 + struct trbe_buf *buf = etm_perf_sink_config(handle); 184 + u64 head = PERF_IDX2OFF(handle->head, buf); 185 + 186 + memset((void *)buf->trbe_base + head, ETE_IGNORE_PACKET, len); 187 + if (!buf->snapshot) 188 + perf_aux_output_skip(handle, len); 189 + } 190 + 191 + static unsigned long trbe_snapshot_offset(struct perf_output_handle *handle) 192 + { 193 + struct trbe_buf *buf = etm_perf_sink_config(handle); 194 + 195 + /* 196 + * The ETE trace has alignment synchronization packets allowing 197 + * the decoder to reset in case of an overflow or corruption. 198 + * So we can use the entire buffer for the snapshot mode. 199 + */ 200 + return buf->nr_pages * PAGE_SIZE; 201 + } 202 + 203 + /* 204 + * TRBE Limit Calculation 205 + * 206 + * The following markers are used to illustrate various TRBE buffer situations.
207 + * 208 + * $$$$ - Data area, unconsumed captured trace data, not to be overridden 209 + * #### - Free area, enabled, trace will be written 210 + * %%%% - Free area, disabled, trace will not be written 211 + * ==== - Free area, padded with ETE_IGNORE_PACKET, trace will be skipped 212 + */ 213 + static unsigned long __trbe_normal_offset(struct perf_output_handle *handle) 214 + { 215 + struct trbe_buf *buf = etm_perf_sink_config(handle); 216 + struct trbe_cpudata *cpudata = buf->cpudata; 217 + const u64 bufsize = buf->nr_pages * PAGE_SIZE; 218 + u64 limit = bufsize; 219 + u64 head, tail, wakeup; 220 + 221 + head = PERF_IDX2OFF(handle->head, buf); 222 + 223 + /* 224 + * head 225 + * ------->| 226 + * | 227 + * head TRBE align tail 228 + * +----|-------|---------------|-------+ 229 + * |$$$$|=======|###############|$$$$$$$| 230 + * +----|-------|---------------|-------+ 231 + * trbe_base trbe_base + nr_pages 232 + * 233 + * Perf aux buffer output head position can be misaligned depending on 234 + * various factors including user space reads. In case misaligned, head 235 + * needs to be aligned before TRBE can be configured. Pad the alignment 236 + * gap with ETE_IGNORE_PACKET bytes that will be ignored by user tools 237 + * and skip this section thus advancing the head. 238 + */ 239 + if (!IS_ALIGNED(head, cpudata->trbe_align)) { 240 + unsigned long delta = roundup(head, cpudata->trbe_align) - head; 241 + 242 + delta = min(delta, handle->size); 243 + trbe_pad_buf(handle, delta); 244 + head = PERF_IDX2OFF(handle->head, buf); 245 + } 246 + 247 + /* 248 + * head = tail (size = 0) 249 + * +----|-------------------------------+ 250 + * |$$$$|$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ | 251 + * +----|-------------------------------+ 252 + * trbe_base trbe_base + nr_pages 253 + * 254 + * Perf aux buffer does not have any space for the driver to write into. 255 + * Just communicate trace truncation event to the user space by marking 256 + * it with PERF_AUX_FLAG_TRUNCATED. 
257 + */ 258 + if (!handle->size) { 259 + perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED); 260 + return 0; 261 + } 262 + 263 + /* Compute the tail and wakeup indices now that we've aligned head */ 264 + tail = PERF_IDX2OFF(handle->head + handle->size, buf); 265 + wakeup = PERF_IDX2OFF(handle->wakeup, buf); 266 + 267 + /* 268 + * Let's calculate the buffer area which TRBE could write into. There 269 + * are three possible scenarios here. Limit needs to be aligned with 270 + * PAGE_SIZE per the TRBE requirement. Always avoid clobbering the 271 + * unconsumed data. 272 + * 273 + * 1) head < tail 274 + * 275 + * head tail 276 + * +----|-----------------------|-------+ 277 + * |$$$$|#######################|$$$$$$$| 278 + * +----|-----------------------|-------+ 279 + * trbe_base limit trbe_base + nr_pages 280 + * 281 + * TRBE could write into [head..tail] area. Unless the tail is right at 282 + * the end of the buffer, neither a wrap around nor an IRQ is expected 283 + * while being enabled. 284 + * 285 + * 2) head == tail 286 + * 287 + * head = tail (size > 0) 288 + * +----|-------------------------------+ 289 + * |%%%%|###############################| 290 + * +----|-------------------------------+ 291 + * trbe_base limit = trbe_base + nr_pages 292 + * 293 + * TRBE should just write into [head..base + nr_pages] area even though 294 + * the entire buffer is empty. Reason being, when the trace reaches the 295 + * end of the buffer, it will just wrap around with an IRQ giving an 296 + * opportunity to reconfigure the buffer. 297 + * 298 + * 3) tail < head 299 + * 300 + * tail head 301 + * +----|-----------------------|-------+ 302 + * |%%%%|$$$$$$$$$$$$$$$$$$$$$$$|#######| 303 + * +----|-----------------------|-------+ 304 + * trbe_base limit = trbe_base + nr_pages 305 + * 306 + * TRBE should just write into [head..base + nr_pages] area even though 307 + * the [trbe_base..tail] is also empty.
Reason being, when the trace 308 + * reaches the end of the buffer, it will just wrap around with an IRQ 309 + * giving an opportunity to reconfigure the buffer. 310 + */ 311 + if (head < tail) 312 + limit = round_down(tail, PAGE_SIZE); 313 + 314 + /* 315 + * Wakeup may be arbitrarily far into the future. If it's not in the 316 + * current generation, either we'll wrap before hitting it, or it's 317 + * in the past and has been handled already. 318 + * 319 + * If there's a wakeup before we wrap, arrange to be woken up by the 320 + * page boundary following it. Keep the tail boundary if that's lower. 321 + * 322 + * head wakeup tail 323 + * +----|---------------|-------|-------+ 324 + * |$$$$|###############|%%%%%%%|$$$$$$$| 325 + * +----|---------------|-------|-------+ 326 + * trbe_base limit trbe_base + nr_pages 327 + */ 328 + if (handle->wakeup < (handle->head + handle->size) && head <= wakeup) 329 + limit = min(limit, round_up(wakeup, PAGE_SIZE)); 330 + 331 + /* 332 + * There are two situations when this can happen, i.e. the limit is before 333 + * the head and hence the TRBE cannot be configured. 334 + * 335 + * 1) head < tail (aligned down with PAGE_SIZE) and also they are both 336 + * within the same PAGE size range. 337 + * 338 + * PAGE_SIZE 339 + * |----------------------| 340 + * 341 + * limit head tail 342 + * +------------|------|--------|-------+ 343 + * |$$$$$$$$$$$$$$$$$$$|========|$$$$$$$| 344 + * +------------|------|--------|-------+ 345 + * trbe_base trbe_base + nr_pages 346 + * 347 + * 2) head < wakeup (aligned up with PAGE_SIZE) < tail and also both 348 + * head and wakeup are within the same PAGE size range.
349 + * 350 + * PAGE_SIZE 351 + * |----------------------| 352 + * 353 + * limit head wakeup tail 354 + * +----|------|-------|--------|-------+ 355 + * |$$$$$$$$$$$|=======|========|$$$$$$$| 356 + * +----|------|-------|--------|-------+ 357 + * trbe_base trbe_base + nr_pages 358 + */ 359 + if (limit > head) 360 + return limit; 361 + 362 + trbe_pad_buf(handle, handle->size); 363 + perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED); 364 + return 0; 365 + } 366 + 367 + static unsigned long trbe_normal_offset(struct perf_output_handle *handle) 368 + { 369 + struct trbe_buf *buf = perf_get_aux(handle); 370 + u64 limit = __trbe_normal_offset(handle); 371 + u64 head = PERF_IDX2OFF(handle->head, buf); 372 + 373 + /* 374 + * If the head is too close to the limit and we don't 375 + * have space for a meaningful run, we rather pad it 376 + * and start fresh. 377 + */ 378 + if (limit && (limit - head < TRBE_TRACE_MIN_BUF_SIZE)) { 379 + trbe_pad_buf(handle, limit - head); 380 + limit = __trbe_normal_offset(handle); 381 + } 382 + return limit; 383 + } 384 + 385 + static unsigned long compute_trbe_buffer_limit(struct perf_output_handle *handle) 386 + { 387 + struct trbe_buf *buf = etm_perf_sink_config(handle); 388 + unsigned long offset; 389 + 390 + if (buf->snapshot) 391 + offset = trbe_snapshot_offset(handle); 392 + else 393 + offset = trbe_normal_offset(handle); 394 + return buf->trbe_base + offset; 395 + } 396 + 397 + static void clr_trbe_status(void) 398 + { 399 + u64 trbsr = read_sysreg_s(SYS_TRBSR_EL1); 400 + 401 + WARN_ON(is_trbe_enabled()); 402 + trbsr &= ~TRBSR_IRQ; 403 + trbsr &= ~TRBSR_TRG; 404 + trbsr &= ~TRBSR_WRAP; 405 + trbsr &= ~(TRBSR_EC_MASK << TRBSR_EC_SHIFT); 406 + trbsr &= ~(TRBSR_BSC_MASK << TRBSR_BSC_SHIFT); 407 + trbsr &= ~TRBSR_STOP; 408 + write_sysreg_s(trbsr, SYS_TRBSR_EL1); 409 + } 410 + 411 + static void set_trbe_limit_pointer_enabled(unsigned long addr) 412 + { 413 + u64 trblimitr = read_sysreg_s(SYS_TRBLIMITR_EL1); 414 + 415 + 
WARN_ON(!IS_ALIGNED(addr, (1UL << TRBLIMITR_LIMIT_SHIFT))); 416 + WARN_ON(!IS_ALIGNED(addr, PAGE_SIZE)); 417 + 418 + trblimitr &= ~TRBLIMITR_NVM; 419 + trblimitr &= ~(TRBLIMITR_FILL_MODE_MASK << TRBLIMITR_FILL_MODE_SHIFT); 420 + trblimitr &= ~(TRBLIMITR_TRIG_MODE_MASK << TRBLIMITR_TRIG_MODE_SHIFT); 421 + trblimitr &= ~(TRBLIMITR_LIMIT_MASK << TRBLIMITR_LIMIT_SHIFT); 422 + 423 + /* 424 + * Fill trace buffer mode is used here while configuring the 425 + * TRBE for trace capture. In this particular mode, the trace 426 + * collection is stopped and a maintenance interrupt is raised 427 + * when the current write pointer wraps. This pause in trace 428 + * collection gives the software an opportunity to capture the 429 + * trace data in the interrupt handler, before reconfiguring 430 + * the TRBE. 431 + */ 432 + trblimitr |= (TRBE_FILL_MODE_FILL & TRBLIMITR_FILL_MODE_MASK) << TRBLIMITR_FILL_MODE_SHIFT; 433 + 434 + /* 435 + * Trigger mode is not used here while configuring the TRBE for 436 + * the trace capture. Hence just keep this in the ignore mode. 437 + */ 438 + trblimitr |= (TRBE_TRIG_MODE_IGNORE & TRBLIMITR_TRIG_MODE_MASK) << 439 + TRBLIMITR_TRIG_MODE_SHIFT; 440 + trblimitr |= (addr & PAGE_MASK); 441 + 442 + trblimitr |= TRBLIMITR_ENABLE; 443 + write_sysreg_s(trblimitr, SYS_TRBLIMITR_EL1); 444 + 445 + /* Synchronize the TRBE enable event */ 446 + isb(); 447 + } 448 + 449 + static void trbe_enable_hw(struct trbe_buf *buf) 450 + { 451 + WARN_ON(buf->trbe_write < buf->trbe_base); 452 + WARN_ON(buf->trbe_write >= buf->trbe_limit); 453 + set_trbe_disabled(); 454 + isb(); 455 + clr_trbe_status(); 456 + set_trbe_base_pointer(buf->trbe_base); 457 + set_trbe_write_pointer(buf->trbe_write); 458 + 459 + /* 460 + * Synchronize all the register updates 461 + * till now before enabling the TRBE. 
462 + */ 463 + isb(); 464 + set_trbe_limit_pointer_enabled(buf->trbe_limit); 465 + } 466 + 467 + static enum trbe_fault_action trbe_get_fault_act(u64 trbsr) 468 + { 469 + int ec = get_trbe_ec(trbsr); 470 + int bsc = get_trbe_bsc(trbsr); 471 + 472 + WARN_ON(is_trbe_running(trbsr)); 473 + if (is_trbe_trg(trbsr) || is_trbe_abort(trbsr)) 474 + return TRBE_FAULT_ACT_FATAL; 475 + 476 + if ((ec == TRBE_EC_STAGE1_ABORT) || (ec == TRBE_EC_STAGE2_ABORT)) 477 + return TRBE_FAULT_ACT_FATAL; 478 + 479 + if (is_trbe_wrap(trbsr) && (ec == TRBE_EC_OTHERS) && (bsc == TRBE_BSC_FILLED)) { 480 + if (get_trbe_write_pointer() == get_trbe_base_pointer()) 481 + return TRBE_FAULT_ACT_WRAP; 482 + } 483 + return TRBE_FAULT_ACT_SPURIOUS; 484 + } 485 + 486 + static void *arm_trbe_alloc_buffer(struct coresight_device *csdev, 487 + struct perf_event *event, void **pages, 488 + int nr_pages, bool snapshot) 489 + { 490 + struct trbe_buf *buf; 491 + struct page **pglist; 492 + int i; 493 + 494 + /* 495 + * TRBE LIMIT and TRBE WRITE pointers must be page aligned. But with 496 + * just a single page, there would not be any room left while writing 497 + * into a partially filled TRBE buffer after the page size alignment. 498 + * Hence restrict the minimum buffer size as two pages. 
499 + */ 500 + if (nr_pages < 2) 501 + return NULL; 502 + 503 + buf = kzalloc_node(sizeof(*buf), GFP_KERNEL, trbe_alloc_node(event)); 504 + if (!buf) 505 + return ERR_PTR(-ENOMEM); 506 + 507 + pglist = kcalloc(nr_pages, sizeof(*pglist), GFP_KERNEL); 508 + if (!pglist) { 509 + kfree(buf); 510 + return ERR_PTR(-ENOMEM); 511 + } 512 + 513 + for (i = 0; i < nr_pages; i++) 514 + pglist[i] = virt_to_page(pages[i]); 515 + 516 + buf->trbe_base = (unsigned long)vmap(pglist, nr_pages, VM_MAP, PAGE_KERNEL); 517 + if (!buf->trbe_base) { 518 + kfree(pglist); 519 + kfree(buf); 520 + return ERR_PTR(-ENOMEM); 521 + } 522 + buf->trbe_limit = buf->trbe_base + nr_pages * PAGE_SIZE; 523 + buf->trbe_write = buf->trbe_base; 524 + buf->snapshot = snapshot; 525 + buf->nr_pages = nr_pages; 526 + buf->pages = pages; 527 + kfree(pglist); 528 + return buf; 529 + } 530 + 531 + static void arm_trbe_free_buffer(void *config) 532 + { 533 + struct trbe_buf *buf = config; 534 + 535 + vunmap((void *)buf->trbe_base); 536 + kfree(buf); 537 + } 538 + 539 + static unsigned long arm_trbe_update_buffer(struct coresight_device *csdev, 540 + struct perf_output_handle *handle, 541 + void *config) 542 + { 543 + struct trbe_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent); 544 + struct trbe_cpudata *cpudata = dev_get_drvdata(&csdev->dev); 545 + struct trbe_buf *buf = config; 546 + enum trbe_fault_action act; 547 + unsigned long size, offset; 548 + unsigned long write, base, status; 549 + unsigned long flags; 550 + 551 + WARN_ON(buf->cpudata != cpudata); 552 + WARN_ON(cpudata->cpu != smp_processor_id()); 553 + WARN_ON(cpudata->drvdata != drvdata); 554 + if (cpudata->mode != CS_MODE_PERF) 555 + return 0; 556 + 557 + perf_aux_output_flag(handle, PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW); 558 + 559 + /* 560 + * We are about to disable the TRBE. And this could in turn 561 + * fill up the buffer, triggering an IRQ.
This could be consumed 562 + * by the PE asynchronously, causing a race here against 563 + * the IRQ handler in closing out the handle. So, let us 564 + * make sure the IRQ can't trigger while we are collecting 565 + * the buffer. We also make sure that a WRAP event is handled 566 + * accordingly. 567 + */ 568 + local_irq_save(flags); 569 + 570 + /* 571 + * If the TRBE was disabled due to lack of space in the AUX buffer or a 572 + * spurious fault, the driver leaves it disabled, truncating the buffer. 573 + * Since the etm_perf driver expects to close out the AUX buffer, the 574 + * driver skips it. Thus, just pass in 0 size here to indicate that the 575 + * buffer was truncated. 576 + */ 577 + if (!is_trbe_enabled()) { 578 + size = 0; 579 + goto done; 580 + } 581 + /* 582 + * The perf handle structure needs to be shared with the TRBE IRQ handler for 583 + * capturing trace data and restarting the handle. There is a chance 584 + * of a use-after-free crash when an etm event is being stopped 585 + * while a TRBE IRQ is also being processed. This happens due to the release 586 + * of the perf handle via perf_aux_output_end() in etm_event_stop(). Stopping 587 + * the TRBE here will ensure that no IRQ could be generated when the perf 588 + * handle gets freed in etm_event_stop(). 589 + */ 590 + trbe_drain_and_disable_local(); 591 + write = get_trbe_write_pointer(); 592 + base = get_trbe_base_pointer(); 593 + 594 + /* Check if there is a pending interrupt and handle it here */ 595 + status = read_sysreg_s(SYS_TRBSR_EL1); 596 + if (is_trbe_irq(status)) { 597 + 598 + /* 599 + * Now that we are handling the IRQ here, clear the IRQ 600 + * from the status, to let the irq handler know that it 601 + * is taken care of. 602 + */ 603 + clr_trbe_irq(); 604 + isb(); 605 + 606 + act = trbe_get_fault_act(status); 607 + /* 608 + * If this was not due to a WRAP event, we have hit an 609 + * error and, as such, the buffer is empty. 
610 + */ 611 + if (act != TRBE_FAULT_ACT_WRAP) { 612 + size = 0; 613 + goto done; 614 + } 615 + 616 + /* 617 + * Otherwise, the buffer is full and the write pointer 618 + * has reached base. Adjust this back to the Limit pointer 619 + * for correct size. Also, mark the buffer truncated. 620 + */ 621 + write = get_trbe_limit_pointer(); 622 + perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED); 623 + } 624 + 625 + offset = write - base; 626 + if (WARN_ON_ONCE(offset < PERF_IDX2OFF(handle->head, buf))) 627 + size = 0; 628 + else 629 + size = offset - PERF_IDX2OFF(handle->head, buf); 630 + 631 + done: 632 + local_irq_restore(flags); 633 + 634 + if (buf->snapshot) 635 + handle->head += size; 636 + return size; 637 + } 638 + 639 + static int arm_trbe_enable(struct coresight_device *csdev, u32 mode, void *data) 640 + { 641 + struct trbe_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent); 642 + struct trbe_cpudata *cpudata = dev_get_drvdata(&csdev->dev); 643 + struct perf_output_handle *handle = data; 644 + struct trbe_buf *buf = etm_perf_sink_config(handle); 645 + 646 + WARN_ON(cpudata->cpu != smp_processor_id()); 647 + WARN_ON(cpudata->drvdata != drvdata); 648 + if (mode != CS_MODE_PERF) 649 + return -EINVAL; 650 + 651 + *this_cpu_ptr(drvdata->handle) = handle; 652 + cpudata->buf = buf; 653 + cpudata->mode = mode; 654 + buf->cpudata = cpudata; 655 + buf->trbe_limit = compute_trbe_buffer_limit(handle); 656 + buf->trbe_write = buf->trbe_base + PERF_IDX2OFF(handle->head, buf); 657 + if (buf->trbe_limit == buf->trbe_base) { 658 + trbe_stop_and_truncate_event(handle); 659 + return 0; 660 + } 661 + trbe_enable_hw(buf); 662 + return 0; 663 + } 664 + 665 + static int arm_trbe_disable(struct coresight_device *csdev) 666 + { 667 + struct trbe_drvdata *drvdata = dev_get_drvdata(csdev->dev.parent); 668 + struct trbe_cpudata *cpudata = dev_get_drvdata(&csdev->dev); 669 + struct trbe_buf *buf = cpudata->buf; 670 + 671 + WARN_ON(buf->cpudata != cpudata); 672 + WARN_ON(cpudata->cpu 
!= smp_processor_id()); 673 + WARN_ON(cpudata->drvdata != drvdata); 674 + if (cpudata->mode != CS_MODE_PERF) 675 + return -EINVAL; 676 + 677 + trbe_drain_and_disable_local(); 678 + buf->cpudata = NULL; 679 + cpudata->buf = NULL; 680 + cpudata->mode = CS_MODE_DISABLED; 681 + return 0; 682 + } 683 + 684 + static void trbe_handle_spurious(struct perf_output_handle *handle) 685 + { 686 + struct trbe_buf *buf = etm_perf_sink_config(handle); 687 + 688 + buf->trbe_limit = compute_trbe_buffer_limit(handle); 689 + buf->trbe_write = buf->trbe_base + PERF_IDX2OFF(handle->head, buf); 690 + if (buf->trbe_limit == buf->trbe_base) { 691 + trbe_drain_and_disable_local(); 692 + return; 693 + } 694 + trbe_enable_hw(buf); 695 + } 696 + 697 + static void trbe_handle_overflow(struct perf_output_handle *handle) 698 + { 699 + struct perf_event *event = handle->event; 700 + struct trbe_buf *buf = etm_perf_sink_config(handle); 701 + unsigned long offset, size; 702 + struct etm_event_data *event_data; 703 + 704 + offset = get_trbe_limit_pointer() - get_trbe_base_pointer(); 705 + size = offset - PERF_IDX2OFF(handle->head, buf); 706 + if (buf->snapshot) 707 + handle->head += size; 708 + 709 + /* 710 + * Mark the buffer as truncated, as we have stopped the trace 711 + * collection upon the WRAP event, without stopping the source. 712 + */ 713 + perf_aux_output_flag(handle, PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW | 714 + PERF_AUX_FLAG_TRUNCATED); 715 + perf_aux_output_end(handle, size); 716 + event_data = perf_aux_output_begin(handle, event); 717 + if (!event_data) { 718 + /* 719 + * We are unable to restart the trace collection, 720 + * thus leave the TRBE disabled. The etm-perf driver 721 + * is able to detect this with a disconnected handle 722 + * (handle->event = NULL). 
723 + */ 724 + trbe_drain_and_disable_local(); 725 + *this_cpu_ptr(buf->cpudata->drvdata->handle) = NULL; 726 + return; 727 + } 728 + buf->trbe_limit = compute_trbe_buffer_limit(handle); 729 + buf->trbe_write = buf->trbe_base + PERF_IDX2OFF(handle->head, buf); 730 + if (buf->trbe_limit == buf->trbe_base) { 731 + trbe_stop_and_truncate_event(handle); 732 + return; 733 + } 734 + *this_cpu_ptr(buf->cpudata->drvdata->handle) = handle; 735 + trbe_enable_hw(buf); 736 + } 737 + 738 + static bool is_perf_trbe(struct perf_output_handle *handle) 739 + { 740 + struct trbe_buf *buf = etm_perf_sink_config(handle); 741 + struct trbe_cpudata *cpudata = buf->cpudata; 742 + struct trbe_drvdata *drvdata = cpudata->drvdata; 743 + int cpu = smp_processor_id(); 744 + 745 + WARN_ON(buf->trbe_base != get_trbe_base_pointer()); 746 + WARN_ON(buf->trbe_limit != get_trbe_limit_pointer()); 747 + 748 + if (cpudata->mode != CS_MODE_PERF) 749 + return false; 750 + 751 + if (cpudata->cpu != cpu) 752 + return false; 753 + 754 + if (!cpumask_test_cpu(cpu, &drvdata->supported_cpus)) 755 + return false; 756 + 757 + return true; 758 + } 759 + 760 + static irqreturn_t arm_trbe_irq_handler(int irq, void *dev) 761 + { 762 + struct perf_output_handle **handle_ptr = dev; 763 + struct perf_output_handle *handle = *handle_ptr; 764 + enum trbe_fault_action act; 765 + u64 status; 766 + 767 + /* 768 + * Ensure the trace is visible to the CPUs and 769 + * any external aborts have been resolved. 770 + */ 771 + trbe_drain_and_disable_local(); 772 + 773 + status = read_sysreg_s(SYS_TRBSR_EL1); 774 + /* 775 + * If the pending IRQ was handled by update_buffer callback 776 + * we have nothing to do here. 
777 + */ 778 + if (!is_trbe_irq(status)) 779 + return IRQ_NONE; 780 + 781 + clr_trbe_irq(); 782 + isb(); 783 + 784 + if (WARN_ON_ONCE(!handle) || !perf_get_aux(handle)) 785 + return IRQ_NONE; 786 + 787 + if (!is_perf_trbe(handle)) 788 + return IRQ_NONE; 789 + 790 + /* 791 + * Ensure perf callbacks have completed, which may disable 792 + * the trace buffer in response to a TRUNCATION flag. 793 + */ 794 + irq_work_run(); 795 + 796 + act = trbe_get_fault_act(status); 797 + switch (act) { 798 + case TRBE_FAULT_ACT_WRAP: 799 + trbe_handle_overflow(handle); 800 + break; 801 + case TRBE_FAULT_ACT_SPURIOUS: 802 + trbe_handle_spurious(handle); 803 + break; 804 + case TRBE_FAULT_ACT_FATAL: 805 + trbe_stop_and_truncate_event(handle); 806 + break; 807 + } 808 + return IRQ_HANDLED; 809 + } 810 + 811 + static const struct coresight_ops_sink arm_trbe_sink_ops = { 812 + .enable = arm_trbe_enable, 813 + .disable = arm_trbe_disable, 814 + .alloc_buffer = arm_trbe_alloc_buffer, 815 + .free_buffer = arm_trbe_free_buffer, 816 + .update_buffer = arm_trbe_update_buffer, 817 + }; 818 + 819 + static const struct coresight_ops arm_trbe_cs_ops = { 820 + .sink_ops = &arm_trbe_sink_ops, 821 + }; 822 + 823 + static ssize_t align_show(struct device *dev, struct device_attribute *attr, char *buf) 824 + { 825 + struct trbe_cpudata *cpudata = dev_get_drvdata(dev); 826 + 827 + return sprintf(buf, "%llx\n", cpudata->trbe_align); 828 + } 829 + static DEVICE_ATTR_RO(align); 830 + 831 + static ssize_t flag_show(struct device *dev, struct device_attribute *attr, char *buf) 832 + { 833 + struct trbe_cpudata *cpudata = dev_get_drvdata(dev); 834 + 835 + return sprintf(buf, "%d\n", cpudata->trbe_flag); 836 + } 837 + static DEVICE_ATTR_RO(flag); 838 + 839 + static struct attribute *arm_trbe_attrs[] = { 840 + &dev_attr_align.attr, 841 + &dev_attr_flag.attr, 842 + NULL, 843 + }; 844 + 845 + static const struct attribute_group arm_trbe_group = { 846 + .attrs = arm_trbe_attrs, 847 + }; 848 + 849 + static const 
struct attribute_group *arm_trbe_groups[] = { 850 + &arm_trbe_group, 851 + NULL, 852 + }; 853 + 854 + static void arm_trbe_enable_cpu(void *info) 855 + { 856 + struct trbe_drvdata *drvdata = info; 857 + 858 + trbe_reset_local(); 859 + enable_percpu_irq(drvdata->irq, IRQ_TYPE_NONE); 860 + } 861 + 862 + static void arm_trbe_register_coresight_cpu(struct trbe_drvdata *drvdata, int cpu) 863 + { 864 + struct trbe_cpudata *cpudata = per_cpu_ptr(drvdata->cpudata, cpu); 865 + struct coresight_device *trbe_csdev = coresight_get_percpu_sink(cpu); 866 + struct coresight_desc desc = { 0 }; 867 + struct device *dev; 868 + 869 + if (WARN_ON(trbe_csdev)) 870 + return; 871 + 872 + dev = &cpudata->drvdata->pdev->dev; 873 + desc.name = devm_kasprintf(dev, GFP_KERNEL, "trbe%d", cpu); 874 + if (!desc.name) 875 + goto cpu_clear; 876 + 877 + desc.type = CORESIGHT_DEV_TYPE_SINK; 878 + desc.subtype.sink_subtype = CORESIGHT_DEV_SUBTYPE_SINK_PERCPU_SYSMEM; 879 + desc.ops = &arm_trbe_cs_ops; 880 + desc.pdata = dev_get_platdata(dev); 881 + desc.groups = arm_trbe_groups; 882 + desc.dev = dev; 883 + trbe_csdev = coresight_register(&desc); 884 + if (IS_ERR(trbe_csdev)) 885 + goto cpu_clear; 886 + 887 + dev_set_drvdata(&trbe_csdev->dev, cpudata); 888 + coresight_set_percpu_sink(cpu, trbe_csdev); 889 + return; 890 + cpu_clear: 891 + cpumask_clear_cpu(cpu, &drvdata->supported_cpus); 892 + } 893 + 894 + static void arm_trbe_probe_cpu(void *info) 895 + { 896 + struct trbe_drvdata *drvdata = info; 897 + int cpu = smp_processor_id(); 898 + struct trbe_cpudata *cpudata = per_cpu_ptr(drvdata->cpudata, cpu); 899 + u64 trbidr; 900 + 901 + if (WARN_ON(!cpudata)) 902 + goto cpu_clear; 903 + 904 + if (!is_trbe_available()) { 905 + pr_err("TRBE is not implemented on cpu %d\n", cpu); 906 + goto cpu_clear; 907 + } 908 + 909 + trbidr = read_sysreg_s(SYS_TRBIDR_EL1); 910 + if (!is_trbe_programmable(trbidr)) { 911 + pr_err("TRBE is owned in higher exception level on cpu %d\n", cpu); 912 + goto cpu_clear; 913 + } 
914 + 915 + cpudata->trbe_align = 1ULL << get_trbe_address_align(trbidr); 916 + if (cpudata->trbe_align > SZ_2K) { 917 + pr_err("Unsupported alignment on cpu %d\n", cpu); 918 + goto cpu_clear; 919 + } 920 + cpudata->trbe_flag = get_trbe_flag_update(trbidr); 921 + cpudata->cpu = cpu; 922 + cpudata->drvdata = drvdata; 923 + return; 924 + cpu_clear: 925 + cpumask_clear_cpu(cpu, &drvdata->supported_cpus); 926 + } 927 + 928 + static void arm_trbe_remove_coresight_cpu(void *info) 929 + { 930 + int cpu = smp_processor_id(); 931 + struct trbe_drvdata *drvdata = info; 932 + struct trbe_cpudata *cpudata = per_cpu_ptr(drvdata->cpudata, cpu); 933 + struct coresight_device *trbe_csdev = coresight_get_percpu_sink(cpu); 934 + 935 + disable_percpu_irq(drvdata->irq); 936 + trbe_reset_local(); 937 + if (trbe_csdev) { 938 + coresight_unregister(trbe_csdev); 939 + cpudata->drvdata = NULL; 940 + coresight_set_percpu_sink(cpu, NULL); 941 + } 942 + } 943 + 944 + static int arm_trbe_probe_coresight(struct trbe_drvdata *drvdata) 945 + { 946 + int cpu; 947 + 948 + drvdata->cpudata = alloc_percpu(typeof(*drvdata->cpudata)); 949 + if (!drvdata->cpudata) 950 + return -ENOMEM; 951 + 952 + for_each_cpu(cpu, &drvdata->supported_cpus) { 953 + smp_call_function_single(cpu, arm_trbe_probe_cpu, drvdata, 1); 954 + if (cpumask_test_cpu(cpu, &drvdata->supported_cpus)) 955 + arm_trbe_register_coresight_cpu(drvdata, cpu); 956 + if (cpumask_test_cpu(cpu, &drvdata->supported_cpus)) 957 + smp_call_function_single(cpu, arm_trbe_enable_cpu, drvdata, 1); 958 + } 959 + return 0; 960 + } 961 + 962 + static int arm_trbe_remove_coresight(struct trbe_drvdata *drvdata) 963 + { 964 + int cpu; 965 + 966 + for_each_cpu(cpu, &drvdata->supported_cpus) 967 + smp_call_function_single(cpu, arm_trbe_remove_coresight_cpu, drvdata, 1); 968 + free_percpu(drvdata->cpudata); 969 + return 0; 970 + } 971 + 972 + static int arm_trbe_cpu_startup(unsigned int cpu, struct hlist_node *node) 973 + { 974 + struct trbe_drvdata *drvdata = 
hlist_entry_safe(node, struct trbe_drvdata, hotplug_node); 975 + 976 + if (cpumask_test_cpu(cpu, &drvdata->supported_cpus)) { 977 + 978 + /* 979 + * If this CPU was not probed for TRBE, 980 + * initialize it now. 981 + */ 982 + if (!coresight_get_percpu_sink(cpu)) { 983 + arm_trbe_probe_cpu(drvdata); 984 + if (cpumask_test_cpu(cpu, &drvdata->supported_cpus)) 985 + arm_trbe_register_coresight_cpu(drvdata, cpu); 986 + if (cpumask_test_cpu(cpu, &drvdata->supported_cpus)) 987 + arm_trbe_enable_cpu(drvdata); 988 + } else { 989 + arm_trbe_enable_cpu(drvdata); 990 + } 991 + } 992 + return 0; 993 + } 994 + 995 + static int arm_trbe_cpu_teardown(unsigned int cpu, struct hlist_node *node) 996 + { 997 + struct trbe_drvdata *drvdata = hlist_entry_safe(node, struct trbe_drvdata, hotplug_node); 998 + 999 + if (cpumask_test_cpu(cpu, &drvdata->supported_cpus)) { 1000 + disable_percpu_irq(drvdata->irq); 1001 + trbe_reset_local(); 1002 + } 1003 + return 0; 1004 + } 1005 + 1006 + static int arm_trbe_probe_cpuhp(struct trbe_drvdata *drvdata) 1007 + { 1008 + enum cpuhp_state trbe_online; 1009 + int ret; 1010 + 1011 + trbe_online = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, DRVNAME, 1012 + arm_trbe_cpu_startup, arm_trbe_cpu_teardown); 1013 + if (trbe_online < 0) 1014 + return trbe_online; 1015 + 1016 + ret = cpuhp_state_add_instance(trbe_online, &drvdata->hotplug_node); 1017 + if (ret) { 1018 + cpuhp_remove_multi_state(trbe_online); 1019 + return ret; 1020 + } 1021 + drvdata->trbe_online = trbe_online; 1022 + return 0; 1023 + } 1024 + 1025 + static void arm_trbe_remove_cpuhp(struct trbe_drvdata *drvdata) 1026 + { 1027 + cpuhp_remove_multi_state(drvdata->trbe_online); 1028 + } 1029 + 1030 + static int arm_trbe_probe_irq(struct platform_device *pdev, 1031 + struct trbe_drvdata *drvdata) 1032 + { 1033 + int ret; 1034 + 1035 + drvdata->irq = platform_get_irq(pdev, 0); 1036 + if (drvdata->irq < 0) { 1037 + pr_err("IRQ not found for the platform device\n"); 1038 + return drvdata->irq; 1039 
+ } 1040 + 1041 + if (!irq_is_percpu(drvdata->irq)) { 1042 + pr_err("IRQ is not a PPI\n"); 1043 + return -EINVAL; 1044 + } 1045 + 1046 + if (irq_get_percpu_devid_partition(drvdata->irq, &drvdata->supported_cpus)) 1047 + return -EINVAL; 1048 + 1049 + drvdata->handle = alloc_percpu(struct perf_output_handle *); 1050 + if (!drvdata->handle) 1051 + return -ENOMEM; 1052 + 1053 + ret = request_percpu_irq(drvdata->irq, arm_trbe_irq_handler, DRVNAME, drvdata->handle); 1054 + if (ret) { 1055 + free_percpu(drvdata->handle); 1056 + return ret; 1057 + } 1058 + return 0; 1059 + } 1060 + 1061 + static void arm_trbe_remove_irq(struct trbe_drvdata *drvdata) 1062 + { 1063 + free_percpu_irq(drvdata->irq, drvdata->handle); 1064 + free_percpu(drvdata->handle); 1065 + } 1066 + 1067 + static int arm_trbe_device_probe(struct platform_device *pdev) 1068 + { 1069 + struct coresight_platform_data *pdata; 1070 + struct trbe_drvdata *drvdata; 1071 + struct device *dev = &pdev->dev; 1072 + int ret; 1073 + 1074 + drvdata = devm_kzalloc(dev, sizeof(*drvdata), GFP_KERNEL); 1075 + if (!drvdata) 1076 + return -ENOMEM; 1077 + 1078 + pdata = coresight_get_platform_data(dev); 1079 + if (IS_ERR(pdata)) 1080 + return PTR_ERR(pdata); 1081 + 1082 + dev_set_drvdata(dev, drvdata); 1083 + dev->platform_data = pdata; 1084 + drvdata->pdev = pdev; 1085 + ret = arm_trbe_probe_irq(pdev, drvdata); 1086 + if (ret) 1087 + return ret; 1088 + 1089 + ret = arm_trbe_probe_coresight(drvdata); 1090 + if (ret) 1091 + goto probe_failed; 1092 + 1093 + ret = arm_trbe_probe_cpuhp(drvdata); 1094 + if (ret) 1095 + goto cpuhp_failed; 1096 + 1097 + return 0; 1098 + cpuhp_failed: 1099 + arm_trbe_remove_coresight(drvdata); 1100 + probe_failed: 1101 + arm_trbe_remove_irq(drvdata); 1102 + return ret; 1103 + } 1104 + 1105 + static int arm_trbe_device_remove(struct platform_device *pdev) 1106 + { 1107 + struct trbe_drvdata *drvdata = platform_get_drvdata(pdev); 1108 + 1109 + arm_trbe_remove_cpuhp(drvdata); 1110 + 
arm_trbe_remove_coresight(drvdata); 1111 + arm_trbe_remove_irq(drvdata); 1112 + return 0; 1113 + } 1114 + 1115 + static const struct of_device_id arm_trbe_of_match[] = { 1116 + { .compatible = "arm,trace-buffer-extension"}, 1117 + {}, 1118 + }; 1119 + MODULE_DEVICE_TABLE(of, arm_trbe_of_match); 1120 + 1121 + static struct platform_driver arm_trbe_driver = { 1122 + .driver = { 1123 + .name = DRVNAME, 1124 + .of_match_table = of_match_ptr(arm_trbe_of_match), 1125 + .suppress_bind_attrs = true, 1126 + }, 1127 + .probe = arm_trbe_device_probe, 1128 + .remove = arm_trbe_device_remove, 1129 + }; 1130 + 1131 + static int __init arm_trbe_init(void) 1132 + { 1133 + int ret; 1134 + 1135 + if (arm64_kernel_unmapped_at_el0()) { 1136 + pr_err("TRBE wouldn't work if kernel gets unmapped at EL0\n"); 1137 + return -EOPNOTSUPP; 1138 + } 1139 + 1140 + ret = platform_driver_register(&arm_trbe_driver); 1141 + if (!ret) 1142 + return 0; 1143 + 1144 + pr_err("Error registering %s platform driver\n", DRVNAME); 1145 + return ret; 1146 + } 1147 + 1148 + static void __exit arm_trbe_exit(void) 1149 + { 1150 + platform_driver_unregister(&arm_trbe_driver); 1151 + } 1152 + module_init(arm_trbe_init); 1153 + module_exit(arm_trbe_exit); 1154 + 1155 + MODULE_AUTHOR("Anshuman Khandual <anshuman.khandual@arm.com>"); 1156 + MODULE_DESCRIPTION("Arm Trace Buffer Extension (TRBE) driver"); 1157 + MODULE_LICENSE("GPL v2");
+152
drivers/hwtracing/coresight/coresight-trbe.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + /* 3 + * This contains all required hardware related helper functions for 4 + * Trace Buffer Extension (TRBE) driver in the coresight framework. 5 + * 6 + * Copyright (C) 2020 ARM Ltd. 7 + * 8 + * Author: Anshuman Khandual <anshuman.khandual@arm.com> 9 + */ 10 + #include <linux/coresight.h> 11 + #include <linux/device.h> 12 + #include <linux/irq.h> 13 + #include <linux/kernel.h> 14 + #include <linux/of.h> 15 + #include <linux/platform_device.h> 16 + #include <linux/smp.h> 17 + 18 + #include "coresight-etm-perf.h" 19 + 20 + static inline bool is_trbe_available(void) 21 + { 22 + u64 aa64dfr0 = read_sysreg_s(SYS_ID_AA64DFR0_EL1); 23 + unsigned int trbe = cpuid_feature_extract_unsigned_field(aa64dfr0, ID_AA64DFR0_TRBE_SHIFT); 24 + 25 + return trbe >= 0b0001; 26 + } 27 + 28 + static inline bool is_trbe_enabled(void) 29 + { 30 + u64 trblimitr = read_sysreg_s(SYS_TRBLIMITR_EL1); 31 + 32 + return trblimitr & TRBLIMITR_ENABLE; 33 + } 34 + 35 + #define TRBE_EC_OTHERS 0 36 + #define TRBE_EC_STAGE1_ABORT 36 37 + #define TRBE_EC_STAGE2_ABORT 37 38 + 39 + static inline int get_trbe_ec(u64 trbsr) 40 + { 41 + return (trbsr >> TRBSR_EC_SHIFT) & TRBSR_EC_MASK; 42 + } 43 + 44 + #define TRBE_BSC_NOT_STOPPED 0 45 + #define TRBE_BSC_FILLED 1 46 + #define TRBE_BSC_TRIGGERED 2 47 + 48 + static inline int get_trbe_bsc(u64 trbsr) 49 + { 50 + return (trbsr >> TRBSR_BSC_SHIFT) & TRBSR_BSC_MASK; 51 + } 52 + 53 + static inline void clr_trbe_irq(void) 54 + { 55 + u64 trbsr = read_sysreg_s(SYS_TRBSR_EL1); 56 + 57 + trbsr &= ~TRBSR_IRQ; 58 + write_sysreg_s(trbsr, SYS_TRBSR_EL1); 59 + } 60 + 61 + static inline bool is_trbe_irq(u64 trbsr) 62 + { 63 + return trbsr & TRBSR_IRQ; 64 + } 65 + 66 + static inline bool is_trbe_trg(u64 trbsr) 67 + { 68 + return trbsr & TRBSR_TRG; 69 + } 70 + 71 + static inline bool is_trbe_wrap(u64 trbsr) 72 + { 73 + return trbsr & TRBSR_WRAP; 74 + } 75 + 76 + static inline bool is_trbe_abort(u64 trbsr) 77 + { 78 + return 
trbsr & TRBSR_ABORT; 79 + } 80 + 81 + static inline bool is_trbe_running(u64 trbsr) 82 + { 83 + return !(trbsr & TRBSR_STOP); 84 + } 85 + 86 + #define TRBE_TRIG_MODE_STOP 0 87 + #define TRBE_TRIG_MODE_IRQ 1 88 + #define TRBE_TRIG_MODE_IGNORE 3 89 + 90 + #define TRBE_FILL_MODE_FILL 0 91 + #define TRBE_FILL_MODE_WRAP 1 92 + #define TRBE_FILL_MODE_CIRCULAR_BUFFER 3 93 + 94 + static inline void set_trbe_disabled(void) 95 + { 96 + u64 trblimitr = read_sysreg_s(SYS_TRBLIMITR_EL1); 97 + 98 + trblimitr &= ~TRBLIMITR_ENABLE; 99 + write_sysreg_s(trblimitr, SYS_TRBLIMITR_EL1); 100 + } 101 + 102 + static inline bool get_trbe_flag_update(u64 trbidr) 103 + { 104 + return trbidr & TRBIDR_FLAG; 105 + } 106 + 107 + static inline bool is_trbe_programmable(u64 trbidr) 108 + { 109 + return !(trbidr & TRBIDR_PROG); 110 + } 111 + 112 + static inline int get_trbe_address_align(u64 trbidr) 113 + { 114 + return (trbidr >> TRBIDR_ALIGN_SHIFT) & TRBIDR_ALIGN_MASK; 115 + } 116 + 117 + static inline unsigned long get_trbe_write_pointer(void) 118 + { 119 + return read_sysreg_s(SYS_TRBPTR_EL1); 120 + } 121 + 122 + static inline void set_trbe_write_pointer(unsigned long addr) 123 + { 124 + WARN_ON(is_trbe_enabled()); 125 + write_sysreg_s(addr, SYS_TRBPTR_EL1); 126 + } 127 + 128 + static inline unsigned long get_trbe_limit_pointer(void) 129 + { 130 + u64 trblimitr = read_sysreg_s(SYS_TRBLIMITR_EL1); 131 + unsigned long addr = trblimitr & (TRBLIMITR_LIMIT_MASK << TRBLIMITR_LIMIT_SHIFT); 132 + 133 + WARN_ON(!IS_ALIGNED(addr, PAGE_SIZE)); 134 + return addr; 135 + } 136 + 137 + static inline unsigned long get_trbe_base_pointer(void) 138 + { 139 + u64 trbbaser = read_sysreg_s(SYS_TRBBASER_EL1); 140 + unsigned long addr = trbbaser & (TRBBASER_BASE_MASK << TRBBASER_BASE_SHIFT); 141 + 142 + WARN_ON(!IS_ALIGNED(addr, PAGE_SIZE)); 143 + return addr; 144 + } 145 + 146 + static inline void set_trbe_base_pointer(unsigned long addr) 147 + { 148 + WARN_ON(is_trbe_enabled()); 149 + WARN_ON(!IS_ALIGNED(addr, (1UL 
<< TRBBASER_BASE_SHIFT))); 150 + WARN_ON(!IS_ALIGNED(addr, PAGE_SIZE)); 151 + write_sysreg_s(addr, SYS_TRBBASER_EL1); 152 + }
+16 -2
drivers/irqchip/irq-gic-v3-its.c
··· 794 794 795 795 its_encode_alloc(cmd, alloc); 796 796 797 - /* We can only signal PTZ when alloc==1. Why do we have two bits? */ 798 - its_encode_ptz(cmd, alloc); 797 + /* 798 + * GICv4.1 provides a way to get the VLPI state, which needs the vPE 799 + * to be unmapped first, and in this case, we may remap the vPE 800 + * back while the VPT is not empty. So we can't assume that the 801 + * VPT is empty on map. This is why we never advertise PTZ. 802 + */ 803 + its_encode_ptz(cmd, false); 799 804 its_encode_vconf_addr(cmd, vconf_addr); 800 805 its_encode_vmapp_default_db(cmd, desc->its_vmapp_cmd.vpe->vpe_db_lpi); 801 806 ··· 4559 4554 4560 4555 its_send_vmapp(its, vpe, false); 4561 4556 } 4557 + 4558 + /* 4559 + * There may be a direct read to the VPT after unmapping the 4560 + * vPE, to guarantee the validity of this, we make the VPT 4561 + * memory coherent with the CPU caches here. 4562 + */ 4563 + if (find_4_1_its() && !atomic_read(&vpe->vmapp_count)) 4564 + gic_flush_dcache_to_poc(page_address(vpe->vpt_page), 4565 + LPI_PENDBASE_SZ); 4562 4566 } 4563 4567 4564 4568 static const struct irq_domain_ops its_vpe_domain_ops = {
-30
drivers/perf/arm_pmu.c
··· 581 581 .attrs = armpmu_common_attrs, 582 582 }; 583 583 584 - /* Set at runtime when we know what CPU type we are. */ 585 - static struct arm_pmu *__oprofile_cpu_pmu; 586 - 587 - /* 588 - * Despite the names, these two functions are CPU-specific and are used 589 - * by the OProfile/perf code. 590 - */ 591 - const char *perf_pmu_name(void) 592 - { 593 - if (!__oprofile_cpu_pmu) 594 - return NULL; 595 - 596 - return __oprofile_cpu_pmu->name; 597 - } 598 - EXPORT_SYMBOL_GPL(perf_pmu_name); 599 - 600 - int perf_num_counters(void) 601 - { 602 - int max_events = 0; 603 - 604 - if (__oprofile_cpu_pmu != NULL) 605 - max_events = __oprofile_cpu_pmu->num_events; 606 - 607 - return max_events; 608 - } 609 - EXPORT_SYMBOL_GPL(perf_num_counters); 610 - 611 584 static int armpmu_count_irq_users(const int irq) 612 585 { 613 586 int cpu, count = 0; ··· 951 978 ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); 952 979 if (ret) 953 980 goto out_destroy; 954 - 955 - if (!__oprofile_cpu_pmu) 956 - __oprofile_cpu_pmu = pmu; 957 981 958 982 pr_info("enabled with %s PMU driver, %d counters available%s\n", 959 983 pmu->name, pmu->num_events,
+1 -1
drivers/ptp/Kconfig
··· 108 108 config PTP_1588_CLOCK_KVM 109 109 tristate "KVM virtual PTP clock" 110 110 depends on PTP_1588_CLOCK 111 - depends on KVM_GUEST && X86 111 + depends on (KVM_GUEST && X86) || (HAVE_ARM_SMCCC_DISCOVERY && ARM_ARCH_TIMER) 112 112 default y 113 113 help 114 114 This driver adds support for using kvm infrastructure as a PTP
+2
drivers/ptp/Makefile
··· 4 4 # 5 5 6 6 ptp-y := ptp_clock.o ptp_chardev.o ptp_sysfs.o 7 + ptp_kvm-$(CONFIG_X86) := ptp_kvm_x86.o ptp_kvm_common.o 8 + ptp_kvm-$(CONFIG_HAVE_ARM_SMCCC) := ptp_kvm_arm.o ptp_kvm_common.o 7 9 obj-$(CONFIG_PTP_1588_CLOCK) += ptp.o 8 10 obj-$(CONFIG_PTP_1588_CLOCK_DTE) += ptp_dte.o 9 11 obj-$(CONFIG_PTP_1588_CLOCK_INES) += ptp_ines.o
+23 -62
drivers/ptp/ptp_kvm.c drivers/ptp/ptp_kvm_common.c
··· 8 8 #include <linux/err.h> 9 9 #include <linux/init.h> 10 10 #include <linux/kernel.h> 11 + #include <linux/slab.h> 11 12 #include <linux/module.h> 13 + #include <linux/ptp_kvm.h> 12 14 #include <uapi/linux/kvm_para.h> 13 15 #include <asm/kvm_para.h> 14 - #include <asm/pvclock.h> 15 - #include <asm/kvmclock.h> 16 16 #include <uapi/asm/kvm_para.h> 17 17 18 18 #include <linux/ptp_clock_kernel.h> ··· 24 24 25 25 static DEFINE_SPINLOCK(kvm_ptp_lock); 26 26 27 - static struct pvclock_vsyscall_time_info *hv_clock; 28 - 29 - static struct kvm_clock_pairing clock_pair; 30 - static phys_addr_t clock_pair_gpa; 31 - 32 27 static int ptp_kvm_get_time_fn(ktime_t *device_time, 33 28 struct system_counterval_t *system_counter, 34 29 void *ctx) 35 30 { 36 - unsigned long ret; 31 + long ret; 32 + u64 cycle; 37 33 struct timespec64 tspec; 38 - unsigned version; 39 - int cpu; 40 - struct pvclock_vcpu_time_info *src; 34 + struct clocksource *cs; 41 35 42 36 spin_lock(&kvm_ptp_lock); 43 37 44 38 preempt_disable_notrace(); 45 - cpu = smp_processor_id(); 46 - src = &hv_clock[cpu].pvti; 47 - 48 - do { 49 - /* 50 - * We are using a TSC value read in the hosts 51 - * kvm_hc_clock_pairing handling. 52 - * So any changes to tsc_to_system_mul 53 - * and tsc_shift or any other pvclock 54 - * data invalidate that measurement. 
55 - */ 56 - version = pvclock_read_begin(src); 57 - 58 - ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, 59 - clock_pair_gpa, 60 - KVM_CLOCK_PAIRING_WALLCLOCK); 61 - if (ret != 0) { 62 - pr_err_ratelimited("clock pairing hypercall ret %lu\n", ret); 63 - spin_unlock(&kvm_ptp_lock); 64 - preempt_enable_notrace(); 65 - return -EOPNOTSUPP; 66 - } 67 - 68 - tspec.tv_sec = clock_pair.sec; 69 - tspec.tv_nsec = clock_pair.nsec; 70 - ret = __pvclock_read_cycles(src, clock_pair.tsc); 71 - } while (pvclock_read_retry(src, version)); 39 + ret = kvm_arch_ptp_get_crosststamp(&cycle, &tspec, &cs); 40 + if (ret) { 41 + spin_unlock(&kvm_ptp_lock); 42 + preempt_enable_notrace(); 43 + return ret; 44 + } 72 45 73 46 preempt_enable_notrace(); 74 47 75 - system_counter->cycles = ret; 76 - system_counter->cs = &kvm_clock; 48 + system_counter->cycles = cycle; 49 + system_counter->cs = cs; 77 50 78 51 *device_time = timespec64_to_ktime(tspec); 79 52 ··· 84 111 85 112 static int ptp_kvm_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts) 86 113 { 87 - unsigned long ret; 114 + long ret; 88 115 struct timespec64 tspec; 89 116 90 117 spin_lock(&kvm_ptp_lock); 91 118 92 - ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, 93 - clock_pair_gpa, 94 - KVM_CLOCK_PAIRING_WALLCLOCK); 95 - if (ret != 0) { 96 - pr_err_ratelimited("clock offset hypercall ret %lu\n", ret); 119 + ret = kvm_arch_ptp_get_clock(&tspec); 120 + if (ret) { 97 121 spin_unlock(&kvm_ptp_lock); 98 - return -EOPNOTSUPP; 122 + return ret; 99 123 } 100 124 101 - tspec.tv_sec = clock_pair.sec; 102 - tspec.tv_nsec = clock_pair.nsec; 103 125 spin_unlock(&kvm_ptp_lock); 104 126 105 127 memcpy(ts, &tspec, sizeof(struct timespec64)); ··· 136 168 { 137 169 long ret; 138 170 139 - if (!kvm_para_available()) 140 - return -ENODEV; 141 - 142 - clock_pair_gpa = slow_virt_to_phys(&clock_pair); 143 - hv_clock = pvclock_get_pvti_cpu0_va(); 144 - 145 - if (!hv_clock) 146 - return -ENODEV; 147 - 148 - ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, 
clock_pair_gpa, 149 - KVM_CLOCK_PAIRING_WALLCLOCK); 150 - if (ret == -KVM_ENOSYS || ret == -KVM_EOPNOTSUPP) 151 - return -ENODEV; 171 + ret = kvm_arch_ptp_init(); 172 + if (ret) { 173 + if (ret != -EOPNOTSUPP) 174 + pr_err("fail to initialize ptp_kvm"); 175 + return ret; 176 + } 152 177 153 178 kvm_ptp_clock.caps = ptp_kvm_caps; 154 179
+28
drivers/ptp/ptp_kvm_arm.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * Virtual PTP 1588 clock for use with KVM guests 4 + * Copyright (C) 2019 ARM Ltd. 5 + * All Rights Reserved 6 + */ 7 + 8 + #include <linux/arm-smccc.h> 9 + #include <linux/ptp_kvm.h> 10 + 11 + #include <asm/arch_timer.h> 12 + #include <asm/hypervisor.h> 13 + 14 + int kvm_arch_ptp_init(void) 15 + { 16 + int ret; 17 + 18 + ret = kvm_arm_hyp_service_available(ARM_SMCCC_KVM_FUNC_PTP); 19 + if (ret <= 0) 20 + return -EOPNOTSUPP; 21 + 22 + return 0; 23 + } 24 + 25 + int kvm_arch_ptp_get_clock(struct timespec64 *ts) 26 + { 27 + return kvm_arch_ptp_get_crosststamp(NULL, ts, NULL); 28 + }
+97
drivers/ptp/ptp_kvm_x86.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-or-later 2 + /* 3 + * Virtual PTP 1588 clock for use with KVM guests 4 + * 5 + * Copyright (C) 2017 Red Hat Inc. 6 + */ 7 + 8 + #include <linux/device.h> 9 + #include <linux/kernel.h> 10 + #include <asm/pvclock.h> 11 + #include <asm/kvmclock.h> 12 + #include <linux/module.h> 13 + #include <uapi/asm/kvm_para.h> 14 + #include <uapi/linux/kvm_para.h> 15 + #include <linux/ptp_clock_kernel.h> 16 + #include <linux/ptp_kvm.h> 17 + 18 + struct pvclock_vsyscall_time_info *hv_clock; 19 + 20 + static phys_addr_t clock_pair_gpa; 21 + static struct kvm_clock_pairing clock_pair; 22 + 23 + int kvm_arch_ptp_init(void) 24 + { 25 + long ret; 26 + 27 + if (!kvm_para_available()) 28 + return -ENODEV; 29 + 30 + clock_pair_gpa = slow_virt_to_phys(&clock_pair); 31 + hv_clock = pvclock_get_pvti_cpu0_va(); 32 + if (!hv_clock) 33 + return -ENODEV; 34 + 35 + ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, clock_pair_gpa, 36 + KVM_CLOCK_PAIRING_WALLCLOCK); 37 + if (ret == -KVM_ENOSYS || ret == -KVM_EOPNOTSUPP) 38 + return -ENODEV; 39 + 40 + return 0; 41 + } 42 + 43 + int kvm_arch_ptp_get_clock(struct timespec64 *ts) 44 + { 45 + long ret; 46 + 47 + ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, 48 + clock_pair_gpa, 49 + KVM_CLOCK_PAIRING_WALLCLOCK); 50 + if (ret != 0) { 51 + pr_err_ratelimited("clock offset hypercall ret %lu\n", ret); 52 + return -EOPNOTSUPP; 53 + } 54 + 55 + ts->tv_sec = clock_pair.sec; 56 + ts->tv_nsec = clock_pair.nsec; 57 + 58 + return 0; 59 + } 60 + 61 + int kvm_arch_ptp_get_crosststamp(u64 *cycle, struct timespec64 *tspec, 62 + struct clocksource **cs) 63 + { 64 + struct pvclock_vcpu_time_info *src; 65 + unsigned int version; 66 + long ret; 67 + int cpu; 68 + 69 + cpu = smp_processor_id(); 70 + src = &hv_clock[cpu].pvti; 71 + 72 + do { 73 + /* 74 + * We are using a TSC value read in the hosts 75 + * kvm_hc_clock_pairing handling. 
76 + * So any changes to tsc_to_system_mul 77 + * and tsc_shift or any other pvclock 78 + * data invalidate that measurement. 79 + */ 80 + version = pvclock_read_begin(src); 81 + 82 + ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING, 83 + clock_pair_gpa, 84 + KVM_CLOCK_PAIRING_WALLCLOCK); 85 + if (ret != 0) { 86 + pr_err_ratelimited("clock pairing hypercall ret %ld\n", ret); 87 + return -EOPNOTSUPP; 88 + } 89 + tspec->tv_sec = clock_pair.sec; 90 + tspec->tv_nsec = clock_pair.nsec; 91 + *cycle = __pvclock_read_cycles(src, clock_pair.tsc); 92 + } while (pvclock_read_retry(src, version)); 93 + 94 + *cs = &kvm_clock; 95 + 96 + return 0; 97 + }
+4
include/kvm/arm_pmu.h
··· 61 61 int kvm_arm_pmu_v3_has_attr(struct kvm_vcpu *vcpu, 62 62 struct kvm_device_attr *attr); 63 63 int kvm_arm_pmu_v3_enable(struct kvm_vcpu *vcpu); 64 + int kvm_pmu_probe_pmuver(void); 64 65 #else 65 66 struct kvm_pmu { 66 67 }; ··· 117 116 { 118 117 return 0; 119 118 } 119 + 120 + static inline int kvm_pmu_probe_pmuver(void) { return 0xf; } 121 + 120 122 #endif 121 123 122 124 #endif
+1
include/kvm/arm_vgic.h
··· 322 322 */ 323 323 struct vgic_io_device rd_iodev; 324 324 struct vgic_redist_region *rdreg; 325 + u32 rdreg_index; 325 326 326 327 /* Contains the attributes and gpa of the LPI pending tables. */ 327 328 u64 pendbaser;
+41
include/linux/arm-smccc.h
··· 55 55 #define ARM_SMCCC_OWNER_TRUSTED_OS 50 56 56 #define ARM_SMCCC_OWNER_TRUSTED_OS_END 63 57 57 58 + #define ARM_SMCCC_FUNC_QUERY_CALL_UID 0xff01 59 + 58 60 #define ARM_SMCCC_QUIRK_NONE 0 59 61 #define ARM_SMCCC_QUIRK_QCOM_A6 1 /* Save/restore register a6 */ 60 62 ··· 89 87 ARM_SMCCC_SMC_32, \ 90 88 0, 0x7fff) 91 89 90 + #define ARM_SMCCC_VENDOR_HYP_CALL_UID_FUNC_ID \ 91 + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ 92 + ARM_SMCCC_SMC_32, \ 93 + ARM_SMCCC_OWNER_VENDOR_HYP, \ 94 + ARM_SMCCC_FUNC_QUERY_CALL_UID) 95 + 96 + /* KVM UID value: 28b46fb6-2ec5-11e9-a9ca-4b564d003a74 */ 97 + #define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_0 0xb66fb428U 98 + #define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_1 0xe911c52eU 99 + #define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_2 0x564bcaa9U 100 + #define ARM_SMCCC_VENDOR_HYP_UID_KVM_REG_3 0x743a004dU 101 + 102 + /* KVM "vendor specific" services */ 103 + #define ARM_SMCCC_KVM_FUNC_FEATURES 0 104 + #define ARM_SMCCC_KVM_FUNC_PTP 1 105 + #define ARM_SMCCC_KVM_FUNC_FEATURES_2 127 106 + #define ARM_SMCCC_KVM_NUM_FUNCS 128 107 + 108 + #define ARM_SMCCC_VENDOR_HYP_KVM_FEATURES_FUNC_ID \ 109 + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ 110 + ARM_SMCCC_SMC_32, \ 111 + ARM_SMCCC_OWNER_VENDOR_HYP, \ 112 + ARM_SMCCC_KVM_FUNC_FEATURES) 113 + 92 114 #define SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED 1 115 + 116 + /* 117 + * ptp_kvm is a feature used for time sync between vm and host. 118 + * ptp_kvm module in guest kernel will get service from host using 119 + * this hypercall ID. 120 + */ 121 + #define ARM_SMCCC_VENDOR_HYP_KVM_PTP_FUNC_ID \ 122 + ARM_SMCCC_CALL_VAL(ARM_SMCCC_FAST_CALL, \ 123 + ARM_SMCCC_SMC_32, \ 124 + ARM_SMCCC_OWNER_VENDOR_HYP, \ 125 + ARM_SMCCC_KVM_FUNC_PTP) 126 + 127 + /* ptp_kvm counter type ID */ 128 + #define KVM_PTP_VIRT_COUNTER 0 129 + #define KVM_PTP_PHYS_COUNTER 1 93 130 94 131 /* Paravirtualised time calls (defined by ARM DEN0057A) */ 95 132 #define ARM_SMCCC_HV_PV_TIME_FEATURES \
+10
include/linux/bug.h
··· 36 36 return bug->flags & BUGFLAG_WARNING; 37 37 } 38 38 39 + void bug_get_file_line(struct bug_entry *bug, const char **file, 40 + unsigned int *line); 41 + 39 42 struct bug_entry *find_bug(unsigned long bugaddr); 40 43 41 44 enum bug_trap_type report_bug(unsigned long bug_addr, struct pt_regs *regs); ··· 61 58 return BUG_TRAP_TYPE_BUG; 62 59 } 63 60 61 + struct bug_entry; 62 + static inline void bug_get_file_line(struct bug_entry *bug, const char **file, 63 + unsigned int *line) 64 + { 65 + *file = NULL; 66 + *line = 0; 67 + } 64 68 65 69 static inline void generic_bug_clear_once(void) {} 66 70
+6
include/linux/clocksource.h
··· 17 17 #include <linux/timer.h> 18 18 #include <linux/init.h> 19 19 #include <linux/of.h> 20 + #include <linux/clocksource_ids.h> 20 21 #include <asm/div64.h> 21 22 #include <asm/io.h> 22 23 ··· 63 62 * 400-499: Perfect 64 63 * The ideal clocksource. A must-use where 65 64 * available. 65 + * @id: Defaults to CSID_GENERIC. The id value is captured 66 + * in certain snapshot functions to allow callers to 67 + * validate the clocksource from which the snapshot was 68 + * taken. 66 69 * @flags: Flags describing special properties 67 70 * @enable: Optional function to enable the clocksource 68 71 * @disable: Optional function to disable the clocksource ··· 105 100 const char *name; 106 101 struct list_head list; 107 102 int rating; 103 + enum clocksource_ids id; 108 104 enum vdso_clock_mode vdso_clock_mode; 109 105 unsigned long flags; 110 106
+12
include/linux/clocksource_ids.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0 */ 2 + #ifndef _LINUX_CLOCKSOURCE_IDS_H 3 + #define _LINUX_CLOCKSOURCE_IDS_H 4 + 5 + /* Enum to give clocksources a unique identifier */ 6 + enum clocksource_ids { 7 + CSID_GENERIC = 0, 8 + CSID_ARM_ARCH_COUNTER, 9 + CSID_MAX, 10 + }; 11 + 12 + #endif
+13
include/linux/coresight.h
··· 50 50 CORESIGHT_DEV_SUBTYPE_SINK_PORT, 51 51 CORESIGHT_DEV_SUBTYPE_SINK_BUFFER, 52 52 CORESIGHT_DEV_SUBTYPE_SINK_SYSMEM, 53 + CORESIGHT_DEV_SUBTYPE_SINK_PERCPU_SYSMEM, 53 54 }; 54 55 55 56 enum coresight_dev_subtype_link { ··· 455 454 WARN_ON(1); 456 455 } 457 456 #endif /* CONFIG_64BIT */ 457 + 458 + static inline bool coresight_is_percpu_source(struct coresight_device *csdev) 459 + { 460 + return csdev && (csdev->type == CORESIGHT_DEV_TYPE_SOURCE) && 461 + (csdev->subtype.source_subtype == CORESIGHT_DEV_SUBTYPE_SOURCE_PROC); 462 + } 463 + 464 + static inline bool coresight_is_percpu_sink(struct coresight_device *csdev) 465 + { 466 + return csdev && (csdev->type == CORESIGHT_DEV_TYPE_SINK) && 467 + (csdev->subtype.sink_subtype == CORESIGHT_DEV_SUBTYPE_SINK_PERCPU_SYSMEM); 468 + } 458 469 459 470 extern struct coresight_device * 460 471 coresight_register(struct coresight_desc *desc);
-2
include/linux/perf_event.h
··· 951 951 extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); 952 952 extern void perf_pmu_unregister(struct pmu *pmu); 953 953 954 - extern int perf_num_counters(void); 955 - extern const char *perf_pmu_name(void); 956 954 extern void __perf_event_task_sched_in(struct task_struct *prev, 957 955 struct task_struct *task); 958 956 extern void __perf_event_task_sched_out(struct task_struct *prev,
+19
include/linux/ptp_kvm.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 + /* 3 + * Virtual PTP 1588 clock for use with KVM guests 4 + * 5 + * Copyright (C) 2017 Red Hat Inc. 6 + */ 7 + 8 + #ifndef _PTP_KVM_H_ 9 + #define _PTP_KVM_H_ 10 + 11 + struct timespec64; 12 + struct clocksource; 13 + 14 + int kvm_arch_ptp_init(void); 15 + int kvm_arch_ptp_get_clock(struct timespec64 *ts); 16 + int kvm_arch_ptp_get_crosststamp(u64 *cycle, 17 + struct timespec64 *tspec, struct clocksource **cs); 18 + 19 + #endif /* _PTP_KVM_H_ */
+7 -5
include/linux/timekeeping.h
··· 3 3 #define _LINUX_TIMEKEEPING_H 4 4 5 5 #include <linux/errno.h> 6 + #include <linux/clocksource_ids.h> 6 7 7 8 /* Included from linux/ktime.h */ 8 9 ··· 244 243 * @cs_was_changed_seq: The sequence number of clocksource change events 245 244 */ 246 245 struct system_time_snapshot { 247 - u64 cycles; 248 - ktime_t real; 249 - ktime_t raw; 250 - unsigned int clock_was_set_seq; 251 - u8 cs_was_changed_seq; 246 + u64 cycles; 247 + ktime_t real; 248 + ktime_t raw; 249 + enum clocksource_ids cs_id; 250 + unsigned int clock_was_set_seq; 251 + u8 cs_was_changed_seq; 252 252 }; 253 253 254 254 /**
+1
include/uapi/linux/kvm.h
··· 1081 1081 #define KVM_CAP_SET_GUEST_DEBUG2 195 1082 1082 #define KVM_CAP_SGX_ATTRIBUTE 196 1083 1083 #define KVM_CAP_VM_COPY_ENC_CONTEXT_FROM 197 1084 + #define KVM_CAP_PTP_KVM 198 1084 1085 1085 1086 #ifdef KVM_CAP_IRQ_ROUTING 1086 1087
+9 -4
include/uapi/linux/perf_event.h
··· 1156 1156 /** 1157 1157 * PERF_RECORD_AUX::flags bits 1158 1158 */ 1159 - #define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ 1160 - #define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ 1161 - #define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */ 1162 - #define PERF_AUX_FLAG_COLLISION 0x08 /* sample collided with another */ 1159 + #define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ 1160 + #define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ 1161 + #define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */ 1162 + #define PERF_AUX_FLAG_COLLISION 0x08 /* sample collided with another */ 1163 + #define PERF_AUX_FLAG_PMU_FORMAT_TYPE_MASK 0xff00 /* PMU specific trace format type */ 1164 + 1165 + /* CoreSight PMU AUX buffer formats */ 1166 + #define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT 0x0000 /* Default for backward compatibility */ 1167 + #define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW 0x0100 /* Raw format of the source */ 1163 1168 1164 1169 #define PERF_FLAG_FD_NO_GROUP (1UL << 0) 1165 1170 #define PERF_FLAG_FD_OUTPUT (1UL << 1)
-5
kernel/events/core.c
··· 580 580 581 581 void __weak perf_event_print_debug(void) { } 582 582 583 - extern __weak const char *perf_pmu_name(void) 584 - { 585 - return "pmu"; 586 - } 587 - 588 583 static inline u64 perf_clock(void) 589 584 { 590 585 return local_clock();
+2
kernel/time/clocksource.c
··· 920 920 921 921 clocksource_arch_init(cs); 922 922 923 + if (WARN_ON_ONCE((unsigned int)cs->id >= CSID_MAX)) 924 + cs->id = CSID_GENERIC; 923 925 if (cs->vdso_clock_mode < 0 || 924 926 cs->vdso_clock_mode >= VDSO_CLOCKMODE_MAX) { 925 927 pr_warn("clocksource %s registered with invalid VDSO mode %d. Disabling VDSO support.\n",
+1
kernel/time/timekeeping.c
··· 1048 1048 do { 1049 1049 seq = read_seqcount_begin(&tk_core.seq); 1050 1050 now = tk_clock_read(&tk->tkr_mono); 1051 + systime_snapshot->cs_id = tk->tkr_mono.clock->id; 1051 1052 systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; 1052 1053 systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; 1053 1054 base_real = ktime_add(tk->tkr_mono.base,
+27 -23
lib/bug.c
··· 127 127 } 128 128 #endif 129 129 130 + void bug_get_file_line(struct bug_entry *bug, const char **file, 131 + unsigned int *line) 132 + { 133 + #ifdef CONFIG_DEBUG_BUGVERBOSE 134 + #ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS 135 + *file = bug->file; 136 + #else 137 + *file = (const char *)bug + bug->file_disp; 138 + #endif 139 + *line = bug->line; 140 + #else 141 + *file = NULL; 142 + *line = 0; 143 + #endif 144 + } 145 + 130 146 struct bug_entry *find_bug(unsigned long bugaddr) 131 147 { 132 148 struct bug_entry *bug; ··· 169 153 170 154 disable_trace_on_warning(); 171 155 172 - file = NULL; 173 - line = 0; 174 - warning = 0; 156 + bug_get_file_line(bug, &file, &line); 175 157 176 - if (bug) { 177 - #ifdef CONFIG_DEBUG_BUGVERBOSE 178 - #ifndef CONFIG_GENERIC_BUG_RELATIVE_POINTERS 179 - file = bug->file; 180 - #else 181 - file = (const char *)bug + bug->file_disp; 182 - #endif 183 - line = bug->line; 184 - #endif 185 - warning = (bug->flags & BUGFLAG_WARNING) != 0; 186 - once = (bug->flags & BUGFLAG_ONCE) != 0; 187 - done = (bug->flags & BUGFLAG_DONE) != 0; 158 + warning = (bug->flags & BUGFLAG_WARNING) != 0; 159 + once = (bug->flags & BUGFLAG_ONCE) != 0; 160 + done = (bug->flags & BUGFLAG_DONE) != 0; 188 161 189 - if (warning && once) { 190 - if (done) 191 - return BUG_TRAP_TYPE_WARN; 162 + if (warning && once) { 163 + if (done) 164 + return BUG_TRAP_TYPE_WARN; 192 165 193 - /* 194 - * Since this is the only store, concurrency is not an issue. 195 - */ 196 - bug->flags |= BUGFLAG_DONE; 197 - } 166 + /* 167 + * Since this is the only store, concurrency is not an issue. 168 + */ 169 + bug->flags |= BUGFLAG_DONE; 198 170 } 199 171 200 172 /*
+1
tools/testing/selftests/kvm/.gitignore
··· 1 1 # SPDX-License-Identifier: GPL-2.0-only 2 2 /aarch64/get-reg-list 3 3 /aarch64/get-reg-list-sve 4 + /aarch64/vgic_init 4 5 /s390x/memop 5 6 /s390x/resets 6 7 /s390x/sync_regs_test
+1
tools/testing/selftests/kvm/Makefile
··· 79 79 80 80 TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list 81 81 TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list-sve 82 + TEST_GEN_PROGS_aarch64 += aarch64/vgic_init 82 83 TEST_GEN_PROGS_aarch64 += demand_paging_test 83 84 TEST_GEN_PROGS_aarch64 += dirty_log_test 84 85 TEST_GEN_PROGS_aarch64 += dirty_log_perf_test
+551
tools/testing/selftests/kvm/aarch64/vgic_init.c
··· 1 + // SPDX-License-Identifier: GPL-2.0 2 + /* 3 + * vgic init sequence tests 4 + * 5 + * Copyright (C) 2020, Red Hat, Inc. 6 + */ 7 + #define _GNU_SOURCE 8 + #include <linux/kernel.h> 9 + #include <sys/syscall.h> 10 + #include <asm/kvm.h> 11 + #include <asm/kvm_para.h> 12 + 13 + #include "test_util.h" 14 + #include "kvm_util.h" 15 + #include "processor.h" 16 + 17 + #define NR_VCPUS 4 18 + 19 + #define REDIST_REGION_ATTR_ADDR(count, base, flags, index) (((uint64_t)(count) << 52) | \ 20 + ((uint64_t)((base) >> 16) << 16) | ((uint64_t)(flags) << 12) | index) 21 + #define REG_OFFSET(vcpu, offset) (((uint64_t)vcpu << 32) | offset) 22 + 23 + #define GICR_TYPER 0x8 24 + 25 + struct vm_gic { 26 + struct kvm_vm *vm; 27 + int gic_fd; 28 + }; 29 + 30 + static int max_ipa_bits; 31 + 32 + /* helper to access a redistributor register */ 33 + static int access_redist_reg(int gicv3_fd, int vcpu, int offset, 34 + uint32_t *val, bool write) 35 + { 36 + uint64_t attr = REG_OFFSET(vcpu, offset); 37 + 38 + return _kvm_device_access(gicv3_fd, KVM_DEV_ARM_VGIC_GRP_REDIST_REGS, 39 + attr, val, write); 40 + } 41 + 42 + /* dummy guest code */ 43 + static void guest_code(void) 44 + { 45 + GUEST_SYNC(0); 46 + GUEST_SYNC(1); 47 + GUEST_SYNC(2); 48 + GUEST_DONE(); 49 + } 50 + 51 + /* we don't want to assert on run execution, hence that helper */ 52 + static int run_vcpu(struct kvm_vm *vm, uint32_t vcpuid) 53 + { 54 + ucall_init(vm, NULL); 55 + int ret = _vcpu_ioctl(vm, vcpuid, KVM_RUN, NULL); 56 + if (ret) 57 + return -errno; 58 + return 0; 59 + } 60 + 61 + static struct vm_gic vm_gic_create(void) 62 + { 63 + struct vm_gic v; 64 + 65 + v.vm = vm_create_default_with_vcpus(NR_VCPUS, 0, 0, guest_code, NULL); 66 + v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); 67 + 68 + return v; 69 + } 70 + 71 + static void vm_gic_destroy(struct vm_gic *v) 72 + { 73 + close(v->gic_fd); 74 + kvm_vm_free(v->vm); 75 + } 76 + 77 + /** 78 + * Helper routine that performs KVM device tests in 
general and 79 + * especially ARM_VGIC_V3 ones. Eventually the ARM_VGIC_V3 80 + * device gets created, a legacy RDIST region is set at @0x0 81 + * and a DIST region is set @0x60000 82 + */ 83 + static void subtest_dist_rdist(struct vm_gic *v) 84 + { 85 + int ret; 86 + uint64_t addr; 87 + 88 + /* Check existing group/attributes */ 89 + kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 90 + KVM_VGIC_V3_ADDR_TYPE_DIST); 91 + 92 + kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 93 + KVM_VGIC_V3_ADDR_TYPE_REDIST); 94 + 95 + /* check non existing attribute */ 96 + ret = _kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 0); 97 + TEST_ASSERT(ret && errno == ENXIO, "attribute not supported"); 98 + 99 + /* misaligned DIST and REDIST address settings */ 100 + addr = 0x1000; 101 + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 102 + KVM_VGIC_V3_ADDR_TYPE_DIST, &addr, true); 103 + TEST_ASSERT(ret && errno == EINVAL, "GICv3 dist base not 64kB aligned"); 104 + 105 + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 106 + KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true); 107 + TEST_ASSERT(ret && errno == EINVAL, "GICv3 redist base not 64kB aligned"); 108 + 109 + /* out of range address */ 110 + if (max_ipa_bits) { 111 + addr = 1ULL << max_ipa_bits; 112 + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 113 + KVM_VGIC_V3_ADDR_TYPE_DIST, &addr, true); 114 + TEST_ASSERT(ret && errno == E2BIG, "dist address beyond IPA limit"); 115 + 116 + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 117 + KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true); 118 + TEST_ASSERT(ret && errno == E2BIG, "redist address beyond IPA limit"); 119 + } 120 + 121 + /* set REDIST base address @0x0*/ 122 + addr = 0x00000; 123 + kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 124 + KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true); 125 + 126 + /* Attempt to create a second legacy redistributor region */ 127 + addr = 0xE0000; 128 + ret = 
_kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 129 + KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true); 130 + TEST_ASSERT(ret && errno == EEXIST, "GICv3 redist base set again"); 131 + 132 + /* Attempt to mix legacy and new redistributor regions */ 133 + addr = REDIST_REGION_ATTR_ADDR(NR_VCPUS, 0x100000, 0, 0); 134 + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 135 + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); 136 + TEST_ASSERT(ret && errno == EINVAL, "attempt to mix GICv3 REDIST and REDIST_REGION"); 137 + 138 + /* 139 + * Set overlapping DIST / REDIST, cannot be detected here. Will be detected 140 + * on first vcpu run instead. 141 + */ 142 + addr = 3 * 2 * 0x10000; 143 + kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, KVM_VGIC_V3_ADDR_TYPE_DIST, 144 + &addr, true); 145 + } 146 + 147 + /* Test the new REDIST region API */ 148 + static void subtest_redist_regions(struct vm_gic *v) 149 + { 150 + uint64_t addr, expected_addr; 151 + int ret; 152 + 153 + ret = kvm_device_check_attr(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 154 + KVM_VGIC_V3_ADDR_TYPE_REDIST); 155 + TEST_ASSERT(!ret, "Multiple redist regions advertised"); 156 + 157 + addr = REDIST_REGION_ATTR_ADDR(NR_VCPUS, 0x100000, 2, 0); 158 + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 159 + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); 160 + TEST_ASSERT(ret && errno == EINVAL, "redist region attr value with flags != 0"); 161 + 162 + addr = REDIST_REGION_ATTR_ADDR(0, 0x100000, 0, 0); 163 + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 164 + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); 165 + TEST_ASSERT(ret && errno == EINVAL, "redist region attr value with count== 0"); 166 + 167 + addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 1); 168 + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 169 + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); 170 + TEST_ASSERT(ret && errno == EINVAL, 171 + "attempt to register the first rdist 
region with index != 0"); 172 + 173 + addr = REDIST_REGION_ATTR_ADDR(2, 0x201000, 0, 1); 174 + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 175 + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); 176 + TEST_ASSERT(ret && errno == EINVAL, "rdist region with misaligned address"); 177 + 178 + addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0); 179 + kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 180 + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); 181 + 182 + addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 1); 183 + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 184 + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); 185 + TEST_ASSERT(ret && errno == EINVAL, "register an rdist region with already used index"); 186 + 187 + addr = REDIST_REGION_ATTR_ADDR(1, 0x210000, 0, 2); 188 + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 189 + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); 190 + TEST_ASSERT(ret && errno == EINVAL, 191 + "register an rdist region overlapping with another one"); 192 + 193 + addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 2); 194 + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 195 + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); 196 + TEST_ASSERT(ret && errno == EINVAL, "register redist region with index not +1"); 197 + 198 + addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 1); 199 + kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 200 + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); 201 + 202 + addr = REDIST_REGION_ATTR_ADDR(1, 1ULL << max_ipa_bits, 0, 2); 203 + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 204 + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); 205 + TEST_ASSERT(ret && errno == E2BIG, 206 + "register redist region with base address beyond IPA range"); 207 + 208 + addr = 0x260000; 209 + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 210 + KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true); 211 + TEST_ASSERT(ret && 
errno == EINVAL, 212 + "Mix KVM_VGIC_V3_ADDR_TYPE_REDIST and REDIST_REGION"); 213 + 214 + /* 215 + * Now there are 2 redist regions: 216 + * region 0 @ 0x200000 2 redists 217 + * region 1 @ 0x240000 1 redist 218 + * Attempt to read their characteristics 219 + */ 220 + 221 + addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 0); 222 + expected_addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0); 223 + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 224 + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, false); 225 + TEST_ASSERT(!ret && addr == expected_addr, "read characteristics of region #0"); 226 + 227 + addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 1); 228 + expected_addr = REDIST_REGION_ATTR_ADDR(1, 0x240000, 0, 1); 229 + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 230 + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, false); 231 + TEST_ASSERT(!ret && addr == expected_addr, "read characteristics of region #1"); 232 + 233 + addr = REDIST_REGION_ATTR_ADDR(0, 0, 0, 2); 234 + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 235 + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, false); 236 + TEST_ASSERT(ret && errno == ENOENT, "read characteristics of non existing region"); 237 + 238 + addr = 0x260000; 239 + kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 240 + KVM_VGIC_V3_ADDR_TYPE_DIST, &addr, true); 241 + 242 + addr = REDIST_REGION_ATTR_ADDR(1, 0x260000, 0, 2); 243 + ret = _kvm_device_access(v->gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 244 + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); 245 + TEST_ASSERT(ret && errno == EINVAL, "register redist region colliding with dist"); 246 + } 247 + 248 + /* 249 + * VGIC KVM device is created and initialized before the secondary CPUs 250 + * get created 251 + */ 252 + static void test_vgic_then_vcpus(void) 253 + { 254 + struct vm_gic v; 255 + int ret, i; 256 + 257 + v.vm = vm_create_default(0, 0, guest_code); 258 + v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); 259 + 260 + 
subtest_dist_rdist(&v); 261 + 262 + /* Add the rest of the VCPUs */ 263 + for (i = 1; i < NR_VCPUS; ++i) 264 + vm_vcpu_add_default(v.vm, i, guest_code); 265 + 266 + ret = run_vcpu(v.vm, 3); 267 + TEST_ASSERT(ret == -EINVAL, "dist/rdist overlap detected on 1st vcpu run"); 268 + 269 + vm_gic_destroy(&v); 270 + } 271 + 272 + /* All the VCPUs are created before the VGIC KVM device gets initialized */ 273 + static void test_vcpus_then_vgic(void) 274 + { 275 + struct vm_gic v; 276 + int ret; 277 + 278 + v = vm_gic_create(); 279 + 280 + subtest_dist_rdist(&v); 281 + 282 + ret = run_vcpu(v.vm, 3); 283 + TEST_ASSERT(ret == -EINVAL, "dist/rdist overlap detected on 1st vcpu run"); 284 + 285 + vm_gic_destroy(&v); 286 + } 287 + 288 + static void test_new_redist_regions(void) 289 + { 290 + void *dummy = NULL; 291 + struct vm_gic v; 292 + uint64_t addr; 293 + int ret; 294 + 295 + v = vm_gic_create(); 296 + subtest_redist_regions(&v); 297 + kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, 298 + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); 299 + 300 + ret = run_vcpu(v.vm, 3); 301 + TEST_ASSERT(ret == -ENXIO, "running without sufficient number of rdists"); 302 + vm_gic_destroy(&v); 303 + 304 + /* step 2 */ 305 + 306 + v = vm_gic_create(); 307 + subtest_redist_regions(&v); 308 + 309 + addr = REDIST_REGION_ATTR_ADDR(1, 0x280000, 0, 2); 310 + kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 311 + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); 312 + 313 + ret = run_vcpu(v.vm, 3); 314 + TEST_ASSERT(ret == -EBUSY, "running without vgic explicit init"); 315 + 316 + vm_gic_destroy(&v); 317 + 318 + /* step 3 */ 319 + 320 + v = vm_gic_create(); 321 + subtest_redist_regions(&v); 322 + 323 + ret = _kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 324 + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, dummy, true); 325 + TEST_ASSERT(ret && errno == EFAULT, 326 + "register a third region allowing to cover the 4 vcpus"); 327 + 328 + addr = REDIST_REGION_ATTR_ADDR(1, 0x280000, 0, 2); 329 + 
kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 330 + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); 331 + 332 + kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, 333 + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); 334 + 335 + ret = run_vcpu(v.vm, 3); 336 + TEST_ASSERT(!ret, "vcpu run"); 337 + 338 + vm_gic_destroy(&v); 339 + } 340 + 341 + static void test_typer_accesses(void) 342 + { 343 + struct vm_gic v; 344 + uint64_t addr; 345 + uint32_t val; 346 + int ret, i; 347 + 348 + v.vm = vm_create_default(0, 0, guest_code); 349 + 350 + v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); 351 + 352 + vm_vcpu_add_default(v.vm, 3, guest_code); 353 + 354 + ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false); 355 + TEST_ASSERT(ret && errno == EINVAL, "attempting to read GICR_TYPER of non created vcpu"); 356 + 357 + vm_vcpu_add_default(v.vm, 1, guest_code); 358 + 359 + ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false); 360 + TEST_ASSERT(ret && errno == EBUSY, "read GICR_TYPER before GIC initialized"); 361 + 362 + vm_vcpu_add_default(v.vm, 2, guest_code); 363 + 364 + kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, 365 + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); 366 + 367 + for (i = 0; i < NR_VCPUS ; i++) { 368 + ret = access_redist_reg(v.gic_fd, 0, GICR_TYPER, &val, false); 369 + TEST_ASSERT(!ret && !val, "read GICR_TYPER before rdist region setting"); 370 + } 371 + 372 + addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 0); 373 + kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 374 + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); 375 + 376 + /* The 2 first rdists should be put there (vcpu 0 and 3) */ 377 + ret = access_redist_reg(v.gic_fd, 0, GICR_TYPER, &val, false); 378 + TEST_ASSERT(!ret && !val, "read typer of rdist #0"); 379 + 380 + ret = access_redist_reg(v.gic_fd, 3, GICR_TYPER, &val, false); 381 + TEST_ASSERT(!ret && val == 0x310, "read typer of rdist #1"); 382 + 383 + addr = REDIST_REGION_ATTR_ADDR(10, 
0x100000, 0, 1); 384 + ret = _kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 385 + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); 386 + TEST_ASSERT(ret && errno == EINVAL, "collision with previous rdist region"); 387 + 388 + ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false); 389 + TEST_ASSERT(!ret && val == 0x100, 390 + "no redist region attached to vcpu #1 yet, last cannot be returned"); 391 + 392 + ret = access_redist_reg(v.gic_fd, 2, GICR_TYPER, &val, false); 393 + TEST_ASSERT(!ret && val == 0x200, 394 + "no redist region attached to vcpu #2, last cannot be returned"); 395 + 396 + addr = REDIST_REGION_ATTR_ADDR(10, 0x20000, 0, 1); 397 + kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 398 + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); 399 + 400 + ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false); 401 + TEST_ASSERT(!ret && val == 0x100, "read typer of rdist #1"); 402 + 403 + ret = access_redist_reg(v.gic_fd, 2, GICR_TYPER, &val, false); 404 + TEST_ASSERT(!ret && val == 0x210, 405 + "read typer of rdist #1, last properly returned"); 406 + 407 + vm_gic_destroy(&v); 408 + } 409 + 410 + /** 411 + * Test GICR_TYPER last bit with new redist regions 412 + * rdist regions #1 and #2 are contiguous 413 + * rdist region #0 @0x100000 2 rdist capacity 414 + * rdists: 0, 3 (Last) 415 + * rdist region #1 @0x240000 2 rdist capacity 416 + * rdists: 5, 4 (Last) 417 + * rdist region #2 @0x200000 2 rdist capacity 418 + * rdists: 1, 2 419 + */ 420 + static void test_last_bit_redist_regions(void) 421 + { 422 + uint32_t vcpuids[] = { 0, 3, 5, 4, 1, 2 }; 423 + struct vm_gic v; 424 + uint64_t addr; 425 + uint32_t val; 426 + int ret; 427 + 428 + v.vm = vm_create_default_with_vcpus(6, 0, 0, guest_code, vcpuids); 429 + 430 + v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); 431 + 432 + kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, 433 + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); 434 + 435 + addr = 
REDIST_REGION_ATTR_ADDR(2, 0x100000, 0, 0); 436 + kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 437 + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); 438 + 439 + addr = REDIST_REGION_ATTR_ADDR(2, 0x240000, 0, 1); 440 + kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 441 + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); 442 + 443 + addr = REDIST_REGION_ATTR_ADDR(2, 0x200000, 0, 2); 444 + kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_ADDR, 445 + KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION, &addr, true); 446 + 447 + ret = access_redist_reg(v.gic_fd, 0, GICR_TYPER, &val, false); 448 + TEST_ASSERT(!ret && val == 0x000, "read typer of rdist #0"); 449 + 450 + ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false); 451 + TEST_ASSERT(!ret && val == 0x100, "read typer of rdist #1"); 452 + 453 + ret = access_redist_reg(v.gic_fd, 2, GICR_TYPER, &val, false); 454 + TEST_ASSERT(!ret && val == 0x200, "read typer of rdist #2"); 455 + 456 + ret = access_redist_reg(v.gic_fd, 3, GICR_TYPER, &val, false); 457 + TEST_ASSERT(!ret && val == 0x310, "read typer of rdist #3"); 458 + 459 + ret = access_redist_reg(v.gic_fd, 5, GICR_TYPER, &val, false); 460 + TEST_ASSERT(!ret && val == 0x500, "read typer of rdist #5"); 461 + 462 + ret = access_redist_reg(v.gic_fd, 4, GICR_TYPER, &val, false); 463 + TEST_ASSERT(!ret && val == 0x410, "read typer of rdist #4"); 464 + 465 + vm_gic_destroy(&v); 466 + } 467 + 468 + /* Test last bit with legacy region */ 469 + static void test_last_bit_single_rdist(void) 470 + { 471 + uint32_t vcpuids[] = { 0, 3, 5, 4, 1, 2 }; 472 + struct vm_gic v; 473 + uint64_t addr; 474 + uint32_t val; 475 + int ret; 476 + 477 + v.vm = vm_create_default_with_vcpus(6, 0, 0, guest_code, vcpuids); 478 + 479 + v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false); 480 + 481 + kvm_device_access(v.gic_fd, KVM_DEV_ARM_VGIC_GRP_CTRL, 482 + KVM_DEV_ARM_VGIC_CTRL_INIT, NULL, true); 483 + 484 + addr = 0x10000; 485 + kvm_device_access(v.gic_fd, 
KVM_DEV_ARM_VGIC_GRP_ADDR,
486 + KVM_VGIC_V3_ADDR_TYPE_REDIST, &addr, true);
487 +
488 + ret = access_redist_reg(v.gic_fd, 0, GICR_TYPER, &val, false);
489 + TEST_ASSERT(!ret && val == 0x000, "read typer of rdist #0");
490 +
491 + ret = access_redist_reg(v.gic_fd, 3, GICR_TYPER, &val, false);
492 + TEST_ASSERT(!ret && val == 0x300, "read typer of rdist #1");
493 +
494 + ret = access_redist_reg(v.gic_fd, 5, GICR_TYPER, &val, false);
495 + TEST_ASSERT(!ret && val == 0x500, "read typer of rdist #2");
496 +
497 + ret = access_redist_reg(v.gic_fd, 1, GICR_TYPER, &val, false);
498 + TEST_ASSERT(!ret && val == 0x100, "read typer of rdist #3");
499 +
500 + ret = access_redist_reg(v.gic_fd, 2, GICR_TYPER, &val, false);
501 + TEST_ASSERT(!ret && val == 0x210, "read typer of rdist #4");
502 +
503 + vm_gic_destroy(&v);
504 + }
505 +
506 + void test_kvm_device(void)
507 + {
508 + struct vm_gic v;
509 + int ret, fd;
510 +
511 + v.vm = vm_create_default_with_vcpus(NR_VCPUS, 0, 0, guest_code, NULL);
512 +
513 + /* try to create a non-existing KVM device */
514 + ret = _kvm_create_device(v.vm, 0, true, &fd);
515 + TEST_ASSERT(ret && errno == ENODEV, "unsupported device");
516 +
517 + /* trial mode with VGIC_V3 device */
518 + ret = _kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, true, &fd);
519 + if (ret) {
520 + print_skip("GICv3 not supported");
521 + exit(KSFT_SKIP);
522 + }
523 + v.gic_fd = kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false);
524 +
525 + ret = _kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, false, &fd);
526 + TEST_ASSERT(ret && errno == EEXIST, "create GICv3 device twice");
527 +
528 + kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V3, true);
529 +
530 + if (!_kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V2, true, &fd)) {
531 + ret = _kvm_create_device(v.vm, KVM_DEV_TYPE_ARM_VGIC_V2, false, &fd);
532 + TEST_ASSERT(ret && errno == EINVAL, "create GICv2 while v3 exists");
533 + }
534 +
535 + vm_gic_destroy(&v);
536 + }
537 +
538 + int main(int ac, char **av)
539 + {
540 + max_ipa_bits = kvm_check_cap(KVM_CAP_ARM_VM_IPA_SIZE);
541 +
542 + test_kvm_device();
543 + test_vcpus_then_vgic();
544 + test_vgic_then_vcpus();
545 + test_new_redist_regions();
546 + test_typer_accesses();
547 + test_last_bit_redist_regions();
548 + test_last_bit_single_rdist();
549 +
550 + return 0;
551 + }
+9
tools/testing/selftests/kvm/include/kvm_util.h
··· 223 223 #endif
224 224 void *vcpu_map_dirty_ring(struct kvm_vm *vm, uint32_t vcpuid);
225 225
226 + int _kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr);
227 + int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr);
228 + int _kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test, int *fd);
229 + int kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test);
230 + int _kvm_device_access(int dev_fd, uint32_t group, uint64_t attr,
231 + void *val, bool write);
232 + int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr,
233 + void *val, bool write);
234 +
226 235 const char *exit_reason_str(unsigned int exit_reason);
227 236
228 237 void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot);
+75
tools/testing/selftests/kvm/lib/kvm_util.c
··· 1731 1731 }
1732 1732
1733 1733 /*
1734 + * Device Ioctl
1735 + */
1736 +
1737 + int _kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
1738 + {
1739 + struct kvm_device_attr attribute = {
1740 + .group = group,
1741 + .attr = attr,
1742 + .flags = 0,
1743 + };
1744 +
1745 + return ioctl(dev_fd, KVM_HAS_DEVICE_ATTR, &attribute);
1746 + }
1747 +
1748 + int kvm_device_check_attr(int dev_fd, uint32_t group, uint64_t attr)
1749 + {
1750 + int ret = _kvm_device_check_attr(dev_fd, group, attr);
1751 +
1752 + TEST_ASSERT(ret >= 0, "KVM_HAS_DEVICE_ATTR failed, rc: %i errno: %i", ret, errno);
1753 + return ret;
1754 + }
1755 +
1756 + int _kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test, int *fd)
1757 + {
1758 + struct kvm_create_device create_dev;
1759 + int ret;
1760 +
1761 + create_dev.type = type;
1762 + create_dev.fd = -1;
1763 + create_dev.flags = test ? KVM_CREATE_DEVICE_TEST : 0;
1764 + ret = ioctl(vm_get_fd(vm), KVM_CREATE_DEVICE, &create_dev);
1765 + *fd = create_dev.fd;
1766 + return ret;
1767 + }
1768 +
1769 + int kvm_create_device(struct kvm_vm *vm, uint64_t type, bool test)
1770 + {
1771 + int fd, ret;
1772 +
1773 + ret = _kvm_create_device(vm, type, test, &fd);
1774 +
1775 + if (!test) {
1776 + TEST_ASSERT(ret >= 0,
1777 + "KVM_CREATE_DEVICE IOCTL failed, rc: %i errno: %i", ret, errno);
1778 + return fd;
1779 + }
1780 + return ret;
1781 + }
1782 +
1783 + int _kvm_device_access(int dev_fd, uint32_t group, uint64_t attr,
1784 + void *val, bool write)
1785 + {
1786 + struct kvm_device_attr kvmattr = {
1787 + .group = group,
1788 + .attr = attr,
1789 + .flags = 0,
1790 + .addr = (uintptr_t)val,
1791 + };
1792 + int ret;
1793 +
1794 + ret = ioctl(dev_fd, write ? KVM_SET_DEVICE_ATTR : KVM_GET_DEVICE_ATTR,
1795 + &kvmattr);
1796 + return ret;
1797 + }
1798 +
1799 + int kvm_device_access(int dev_fd, uint32_t group, uint64_t attr,
1800 + void *val, bool write)
1801 + {
1802 + int ret = _kvm_device_access(dev_fd, group, attr, val, write);
1803 +
1804 + TEST_ASSERT(ret >= 0, "KVM_SET|GET_DEVICE_ATTR IOCTL failed, rc: %i errno: %i", ret, errno);
1805 + return ret;
1806 + }
1807 +
1808 + /*
1734 1809 * VM Dump
1735 1810 *
1736 1811 * Input Args: