Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git

Merge tag 'iommu-updates-v6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/iommu/linux

Pull iommu updates from Joerg Roedel:

- Introduction of the generic IO page-table framework with support for
Intel and AMD IOMMU formats from Jason.

This has good potential for unifying more IO page-table
implementations and making future enhancements easier. But it also
required quite a few fixes during development. All known issues
have been fixed, but my feeling is that there is a higher potential
than usual that more might be needed.

- Intel VT-d updates:
- Use right invalidation hint in qi_desc_iotlb()
- Reduce the scope of INTEL_IOMMU_FLOPPY_WA

- ARM-SMMU updates:
- Qualcomm device-tree binding updates for Kaanapali and Glymur SoCs
and a new clock for the TBU.
- Fix error handling if level 1 CD table allocation fails.
- Permit more than the architectural maximum number of SMRs for
funky Qualcomm mis-implementations of SMMUv2.

- Mediatek driver:
- MT8189 iommu support

- Move ARM IO-pgtable selftests to kunit

- Device leak fixes for a couple of drivers

- Random smaller fixes and improvements

* tag 'iommu-updates-v6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/iommu/linux: (81 commits)
iommupt/vtd: Support mgaw's less than a 4 level walk for first stage
iommupt/vtd: Allow VT-d to have a larger table top than the vasz requires
powerpc/pseries/svm: Make mem_encrypt.h self contained
genpt: Make GENERIC_PT invisible
iommupt: Avoid a compiler bug with sw_bit
iommu/arm-smmu-qcom: Enable use of all SMR groups when running bare-metal
iommupt: Fix unlikely flows in increase_top()
iommu/amd: Propagate the error code returned by __modify_irte_ga() in modify_irte_ga()
MAINTAINERS: Update my email address
iommu/arm-smmu-v3: Fix error check in arm_smmu_alloc_cd_tables
dt-bindings: iommu: qcom_iommu: Allow 'tbu' clock
iommu/vt-d: Restore previous domain::aperture_end calculation
iommu/vt-d: Fix unused invalidation hint in qi_desc_iotlb
iommu/vt-d: Set INTEL_IOMMU_FLOPPY_WA depend on BLK_DEV_FD
iommu/tegra: fix device leak on probe_device()
iommu/sun50i: fix device leak on of_xlate()
iommu/omap: simplify probe_device() error handling
iommu/omap: fix device leaks on probe_device()
iommu/mediatek-v1: add missing larb count sanity check
iommu/mediatek-v1: fix device leaks on probe()
...

+8301 -2884
+1
.clang-format
···
      - 'for_each_prop_dlc_cpus'
      - 'for_each_prop_dlc_platforms'
      - 'for_each_property_of_node'
+     - 'for_each_pt_level_entry'
      - 'for_each_rdt_resource'
      - 'for_each_reg'
      - 'for_each_reg_filtered'
+2 -1
.mailmap
···
  Jayachandran C <c.jayachandran@gmail.com> <jchandra@broadcom.com>
  Jayachandran C <c.jayachandran@gmail.com> <jchandra@digeo.com>
  Jayachandran C <c.jayachandran@gmail.com> <jnair@caviumnetworks.com>
- <jean-philippe@linaro.org> <jean-philippe.brucker@arm.com>
+ Jean-Philippe Brucker <jpb@kernel.org> <jean-philippe.brucker@arm.com>
+ Jean-Philippe Brucker <jpb@kernel.org> <jean-philippe@linaro.org>
  Jean-Michel Hautbois <jeanmichel.hautbois@yoseli.org> <jeanmichel.hautbois@ideasonboard.com>
  Jean Tourrilhes <jt@hpl.hp.com>
  Jeevan Shriram <quic_jshriram@quicinc.com> <jshriram@codeaurora.org>
+2
Documentation/devicetree/bindings/iommu/arm,smmu.yaml
···
        - description: Qcom SoCs implementing "qcom,smmu-500" and "arm,mmu-500"
          items:
            - enum:
+               - qcom,glymur-smmu-500
+               - qcom,kaanapali-smmu-500
                - qcom,milos-smmu-500
                - qcom,qcm2290-smmu-500
                - qcom,qcs615-smmu-500
+8
Documentation/devicetree/bindings/iommu/mediatek,iommu.yaml
···
          - mediatek,mt8188-iommu-vdo    # generation two
          - mediatek,mt8188-iommu-vpp    # generation two
          - mediatek,mt8188-iommu-infra  # generation two
+         - mediatek,mt8189-iommu-apu    # generation two
+         - mediatek,mt8189-iommu-infra  # generation two
+         - mediatek,mt8189-iommu-mm     # generation two
          - mediatek,mt8192-m4u          # generation two
          - mediatek,mt8195-iommu-vdo    # generation two
          - mediatek,mt8195-iommu-vpp    # generation two
···
        This is the mtk_m4u_id according to the HW. Specifies the mtk_m4u_id as
        defined in
        dt-binding/memory/mediatek,mt8188-memory-port.h for mt8188,
+       dt-binding/memory/mediatek,mt8189-memory-port.h for mt8189,
        dt-binding/memory/mt2701-larb-port.h for mt2701 and mt7623,
        dt-binding/memory/mt2712-larb-port.h for mt2712,
        dt-binding/memory/mt6779-larb-port.h for mt6779,
···
              - mediatek,mt8186-iommu-mm
              - mediatek,mt8188-iommu-vdo
              - mediatek,mt8188-iommu-vpp
+             - mediatek,mt8189-iommu-mm
              - mediatek,mt8192-m4u
              - mediatek,mt8195-iommu-vdo
              - mediatek,mt8195-iommu-vpp
···
              - mediatek,mt8186-iommu-mm
              - mediatek,mt8188-iommu-vdo
              - mediatek,mt8188-iommu-vpp
+             - mediatek,mt8189-iommu-mm
              - mediatek,mt8192-m4u
              - mediatek,mt8195-iommu-vdo
              - mediatek,mt8195-iommu-vpp
···
            contains:
              enum:
                - mediatek,mt8188-iommu-infra
+               - mediatek,mt8189-iommu-apu
+               - mediatek,mt8189-iommu-infra
                - mediatek,mt8195-iommu-infra

      then:
+4
Documentation/devicetree/bindings/iommu/qcom,iommu.yaml
···
        - const: qcom,msm-iommu-v2

    clocks:
+     minItems: 2
      items:
        - description: Clock required for IOMMU register group access
        - description: Clock required for underlying bus access
+       - description: Clock required for Translation Buffer Unit access

    clock-names:
+     minItems: 2
      items:
        - const: iface
        - const: bus
+       - const: tbu

    power-domains:
      maxItems: 1
+137
Documentation/driver-api/generic_pt.rst
.. SPDX-License-Identifier: GPL-2.0

========================
Generic Radix Page Table
========================

.. kernel-doc:: include/linux/generic_pt/common.h
   :doc: Generic Radix Page Table

.. kernel-doc:: drivers/iommu/generic_pt/pt_defs.h
   :doc: Generic Page Table Language

Usage
=====

Generic PT is structured as a multi-compilation system. Since each format
provides an API using a common set of names, there can be only one format
active within a compilation unit. This design avoids function pointers around
the low-level API.

Instead the function pointers can end up at the higher level API (i.e.
map/unmap, etc.) and the per-format code can be directly inlined into the
per-format compilation unit. For something like IOMMU each format will be
compiled into a per-format IOMMU operations kernel module.

For this to work the .c file for each compilation unit will include both the
format headers and the generic code for the implementation. For instance in an
implementation compilation unit the headers would normally be included as
follows:

generic_pt/fmt/iommu_amdv1.c::

  #include <linux/generic_pt/common.h>
  #include "defs_amdv1.h"
  #include "../pt_defs.h"
  #include "amdv1.h"
  #include "../pt_common.h"
  #include "../pt_iter.h"
  #include "../iommu_pt.h"   /* The IOMMU implementation */

iommu_pt.h includes definitions that will generate the operations functions
for map/unmap/etc. using the definitions provided by AMDv1. The resulting
module will have exported symbols named like pt_iommu_amdv1_init().

Refer to drivers/iommu/generic_pt/fmt/iommu_template.h for an example of how
the IOMMU implementation uses multi-compilation to generate per-format ops
struct pointers.

The format code is written so that the common names arise from #defines to
distinct format-specific names. This is intended to aid debuggability by
avoiding symbol clashes across all the different formats.

Exported symbols and other global names are mangled using a per-format string
via the NS() helper macro.

The format uses struct pt_common as the top-level struct for the table, and
each format will have its own struct pt_xxx which embeds it to store
format-specific information.

The implementation will further wrap struct pt_common in its own top-level
struct, such as struct pt_iommu_amdv1.

Format functions at the struct pt_common level
----------------------------------------------

.. kernel-doc:: include/linux/generic_pt/common.h
   :identifiers:
.. kernel-doc:: drivers/iommu/generic_pt/pt_common.h

Iteration Helpers
-----------------

.. kernel-doc:: drivers/iommu/generic_pt/pt_iter.h

Writing a Format
----------------

It is best to start from a simple format that is similar to the target. x86_64
is usually a good reference for something simple, and AMDv1 is something
fairly complete.

The required inline functions need to be implemented in the format header.
These should all follow the standard pattern of::

  static inline pt_oaddr_t amdv1pt_entry_oa(const struct pt_state *pts)
  {
        [..]
  }
  #define pt_entry_oa amdv1pt_entry_oa

where a uniquely named per-format inline function provides the implementation
and a define maps it to the generic name. This is intended to make debug
symbols work better. Inline functions should always be used, as the prototypes
in pt_common.h will cause the compiler to validate the function signature and
prevent errors.

Review pt_fmt_defaults.h to understand some of the optional inlines.

Once the format compiles it should be run through the generic page table kunit
test in kunit_generic_pt.h using kunit. For example::

  $ tools/testing/kunit/kunit.py run --build_dir build_kunit_x86_64 --arch x86_64 --kunitconfig ./drivers/iommu/generic_pt/.kunitconfig amdv1_fmt_test.*
  [...]
  [11:15:08] Testing complete. Ran 9 tests: passed: 9
  [11:15:09] Elapsed time: 3.137s total, 0.001s configuring, 2.368s building, 0.311s running

The generic tests are intended to prove out the format functions and give
clearer failures to speed up finding the problems. Once those pass, the entire
kunit suite should be run.

IOMMU Invalidation Features
---------------------------

Invalidation is how the page table algorithms synchronize with a HW cache of
the page table memory, typically called the TLB (or IOTLB in IOMMU cases).

The TLB can store present PTEs, non-present PTEs and table pointers, depending
on its design. Every HW has its own approach for describing what has changed
so that the changed items are removed from the TLB.

PT_FEAT_FLUSH_RANGE
~~~~~~~~~~~~~~~~~~~

PT_FEAT_FLUSH_RANGE is the easiest scheme to understand. It tries to generate
a single range invalidation for each operation, over-invalidating if there are
gaps of VA that don't need invalidation. This trades off impacted VA for the
number of invalidation operations. It does not keep track of what is being
invalidated; however, if pages have to be freed then page table pointers have
to be cleaned from the walk cache. The range can start/end at any page
boundary.

PT_FEAT_FLUSH_RANGE_NO_GAPS
~~~~~~~~~~~~~~~~~~~~~~~~~~~

PT_FEAT_FLUSH_RANGE_NO_GAPS is similar to PT_FEAT_FLUSH_RANGE; however, it
tries to minimize the amount of impacted VA by issuing extra flush operations.
This is useful if the cost of processing VA is very high, for instance because
a hypervisor is processing the page table with a shadowing algorithm.
+1
Documentation/driver-api/index.rst
···
      frame-buffer
      aperture
      generic-counter
+     generic_pt
      gpio/index
      hsi
      hte/index
+3 -3
MAINTAINERS
···
  F:	drivers/acpi/*thermal*

  ACPI VIOT DRIVER
- M:	Jean-Philippe Brucker <jean-philippe@linaro.org>
+ M:	Jean-Philippe Brucker <jpb@kernel.org>
  L:	linux-acpi@vger.kernel.org
  L:	iommu@lists.linux.dev
  S:	Maintained
···
  F:	drivers/iommu/io-pgtable-arm*

  ARM SMMU SVA SUPPORT
- R:	Jean-Philippe Brucker <jean-philippe@linaro.org>
+ R:	Jean-Philippe Brucker <jpb@kernel.org>
  F:	drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c

  ARM SUB-ARCHITECTURES
···
  F:	include/uapi/linux/virtio_input.h

  VIRTIO IOMMU DRIVER
- M:	Jean-Philippe Brucker <jean-philippe@linaro.org>
+ M:	Jean-Philippe Brucker <jpb@kernel.org>
  L:	virtualization@lists.linux.dev
  S:	Maintained
  F:	drivers/iommu/virtio-iommu.c
+3
arch/powerpc/include/asm/mem_encrypt.h
···
  #define _ASM_POWERPC_MEM_ENCRYPT_H

  #include <asm/svm.h>
+ #include <linux/types.h>
+
+ struct device;

  static inline bool force_dma_unencrypted(struct device *dev)
  {
+3 -2
arch/powerpc/kernel/iommu.c
···
   */
  static int
  spapr_tce_platform_iommu_attach_dev(struct iommu_domain *platform_domain,
-                                     struct device *dev)
+                                     struct device *dev,
+                                     struct iommu_domain *old)
  {
        struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
        struct iommu_table_group *table_group;
···
  static int
  spapr_tce_blocked_iommu_attach_dev(struct iommu_domain *platform_domain,
-                                    struct device *dev)
+                                    struct device *dev, struct iommu_domain *old)
  {
        struct iommu_group *grp = iommu_group_get(dev);
        struct iommu_table_group *table_group;
+1 -1
drivers/amba/Kconfig
···
  if ARM_AMBA

  config TEGRA_AHB
-       bool
+       bool "Enable AHB driver for NVIDIA Tegra SoCs" if COMPILE_TEST
        default y if ARCH_TEGRA
        help
          Adds AHB configuration functionality for NVIDIA Tegra SoCs,
+9 -6
drivers/iommu/Kconfig
···
          sizes at both stage-1 and stage-2, as well as address spaces
          up to 48-bits in size.

- config IOMMU_IO_PGTABLE_LPAE_SELFTEST
-       bool "LPAE selftests"
-       depends on IOMMU_IO_PGTABLE_LPAE
+ config IOMMU_IO_PGTABLE_LPAE_KUNIT_TEST
+       tristate "KUnit tests for LPAE"
+       depends on IOMMU_IO_PGTABLE_LPAE && KUNIT
+       default KUNIT_ALL_TESTS
        help
-         Enable self-tests for LPAE page table allocator. This performs
-         a series of page-table consistency checks during boot.
+         Enable kunit tests for LPAE page table allocator. This performs
+         a series of page-table consistency checks.

          If unsure, say N here.
···
  config TEGRA_IOMMU_SMMU
        bool "NVIDIA Tegra SMMU Support"
-       depends on ARCH_TEGRA
+       depends on ARCH_TEGRA || COMPILE_TEST
        depends on TEGRA_AHB
        depends on TEGRA_MC
        select IOMMU_API
···
          Say Y here if you want to use the multimedia devices listed above.

  endif # IOMMU_SUPPORT
+
+ source "drivers/iommu/generic_pt/Kconfig"
+2
drivers/iommu/Makefile
···
  obj-$(CONFIG_AMD_IOMMU) += amd/
  obj-$(CONFIG_INTEL_IOMMU) += intel/
  obj-$(CONFIG_RISCV_IOMMU) += riscv/
+ obj-$(CONFIG_GENERIC_PT) += generic_pt/fmt/
  obj-$(CONFIG_IOMMU_API) += iommu.o
  obj-$(CONFIG_IOMMU_SUPPORT) += iommu-pages.o
  obj-$(CONFIG_IOMMU_API) += iommu-traces.o
···
  obj-$(CONFIG_IOMMU_IO_PGTABLE) += io-pgtable.o
  obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o
  obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o
+ obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE_KUNIT_TEST) += io-pgtable-arm-selftests.o
  obj-$(CONFIG_IOMMU_IO_PGTABLE_DART) += io-pgtable-dart.o
  obj-$(CONFIG_IOMMU_IOVA) += iova.o
  obj-$(CONFIG_OF_IOMMU) += of_iommu.o
+4 -1
drivers/iommu/amd/Kconfig
···
        select MMU_NOTIFIER
        select IOMMU_API
        select IOMMU_IOVA
-       select IOMMU_IO_PGTABLE
        select IOMMU_SVA
        select IOMMU_IOPF
        select IOMMUFD_DRIVER if IOMMUFD
+       select GENERIC_PT
+       select IOMMU_PT
+       select IOMMU_PT_AMDV1
+       select IOMMU_PT_X86_64
        depends on X86_64 && PCI && ACPI && HAVE_CMPXCHG_DOUBLE
        help
          With this option you can enable support for AMD IOMMU hardware in
+1 -1
drivers/iommu/amd/Makefile
···
  # SPDX-License-Identifier: GPL-2.0-only
- obj-y += iommu.o init.o quirks.o io_pgtable.o io_pgtable_v2.o ppr.o pasid.o
+ obj-y += iommu.o init.o quirks.o ppr.o pasid.o
  obj-$(CONFIG_AMD_IOMMU_DEBUGFS) += debugfs.o
-1
drivers/iommu/amd/amd_iommu.h
···
   * the IOMMU used by this driver.
   */
  void amd_iommu_flush_all_caches(struct amd_iommu *iommu);
- void amd_iommu_update_and_flush_device_table(struct protection_domain *domain);
  void amd_iommu_domain_flush_pages(struct protection_domain *domain,
                                    u64 address, size_t size);
  void amd_iommu_dev_flush_pasid_pages(struct iommu_dev_data *dev_data,
+14 -100
drivers/iommu/amd/amd_iommu_types.h
···
  #include <linux/spinlock.h>
  #include <linux/pci.h>
  #include <linux/irqreturn.h>
- #include <linux/io-pgtable.h>
+ #include <linux/generic_pt/iommu.h>

  /*
   * Maximum number of IOMMUs supported
···
  #define CMD_BUFFER_ENTRIES 512
  #define MMIO_CMD_SIZE_SHIFT 56
  #define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT)
+ #define MMIO_CMD_HEAD_MASK GENMASK_ULL(18, 4)	/* Command buffer head ptr field [18:4] */
+ #define MMIO_CMD_BUFFER_HEAD(x) FIELD_GET(MMIO_CMD_HEAD_MASK, (x))
+ #define MMIO_CMD_TAIL_MASK GENMASK_ULL(18, 4)	/* Command buffer tail ptr field [18:4] */
+ #define MMIO_CMD_BUFFER_TAIL(x) FIELD_GET(MMIO_CMD_TAIL_MASK, (x))

  /* constants for event buffer handling */
  #define EVT_BUFFER_SIZE 8192 /* 512 entries */
···
  #define GUEST_PGTABLE_4_LEVEL 0x00
  #define GUEST_PGTABLE_5_LEVEL 0x01

- #define PM_LEVEL_SHIFT(x) (12 + ((x) * 9))
- #define PM_LEVEL_SIZE(x) (((x) < 6) ? \
-       ((1ULL << PM_LEVEL_SHIFT((x))) - 1) : \
-       (0xffffffffffffffffULL))
- #define PM_LEVEL_INDEX(x, a) (((a) >> PM_LEVEL_SHIFT((x))) & 0x1ffULL)
- #define PM_LEVEL_ENC(x) (((x) << 9) & 0xe00ULL)
- #define PM_LEVEL_PDE(x, a) ((a) | PM_LEVEL_ENC((x)) | \
-       IOMMU_PTE_PR | IOMMU_PTE_IR | IOMMU_PTE_IW)
- #define PM_PTE_LEVEL(pte) (((pte) >> 9) & 0x7ULL)
-
- #define PM_MAP_4k 0
  #define PM_ADDR_MASK 0x000ffffffffff000ULL
- #define PM_MAP_MASK(lvl) (PM_ADDR_MASK & \
-       (~((1ULL << (12 + ((lvl) * 9))) - 1)))
- #define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr))
-
- /*
-  * Returns the page table level to use for a given page size
-  * Pagesize is expected to be a power-of-two
-  */
- #define PAGE_SIZE_LEVEL(pagesize) \
-       ((__ffs(pagesize) - 12) / 9)
- /*
-  * Returns the number of ptes to use for a given page size
-  * Pagesize is expected to be a power-of-two
-  */
- #define PAGE_SIZE_PTE_COUNT(pagesize) \
-       (1ULL << ((__ffs(pagesize) - 12) % 9))
-
- /*
-  * Aligns a given io-virtual address to a given page size
-  * Pagesize is expected to be a power-of-two
-  */
- #define PAGE_SIZE_ALIGN(address, pagesize) \
-       ((address) & ~((pagesize) - 1))
- /*
-  * Creates an IOMMU PTE for an address and a given pagesize
-  * The PTE has no permission bits set
-  * Pagesize is expected to be a power-of-two larger than 4096
-  */
- #define PAGE_SIZE_PTE(address, pagesize) \
-       (((address) | ((pagesize) - 1)) & \
-        (~(pagesize >> 1)) & PM_ADDR_MASK)
-
- /*
-  * Takes a PTE value with mode=0x07 and returns the page size it maps
-  */
- #define PTE_PAGE_SIZE(pte) \
-       (1ULL << (1 + ffz(((pte) | 0xfffULL))))
-
- /*
-  * Takes a page-table level and returns the default page-size for this level
-  */
- #define PTE_LEVEL_PAGE_SIZE(level) \
-       (1ULL << (12 + (9 * (level))))
-
- /*
-  * The IOPTE dirty bit
-  */
- #define IOMMU_PTE_HD_BIT (6)
-
- /*
-  * Bit value definition for I/O PTE fields
-  */
- #define IOMMU_PTE_PR BIT_ULL(0)
- #define IOMMU_PTE_HD BIT_ULL(IOMMU_PTE_HD_BIT)
- #define IOMMU_PTE_U BIT_ULL(59)
- #define IOMMU_PTE_FC BIT_ULL(60)
- #define IOMMU_PTE_IR BIT_ULL(61)
- #define IOMMU_PTE_IW BIT_ULL(62)

  /*
   * Bit value definition for DTE fields
···
  /* DTE[128:179] | DTE[184:191] */
  #define DTE_DATA2_INTR_MASK ~GENMASK_ULL(55, 52)
-
- #define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
- #define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_PR)
- #define IOMMU_PTE_DIRTY(pte) ((pte) & IOMMU_PTE_HD)
- #define IOMMU_PTE_PAGE(pte) (iommu_phys_to_virt((pte) & IOMMU_PAGE_MASK))
- #define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)

  #define IOMMU_PROT_MASK 0x03
  #define IOMMU_PROT_IR 0x01
···
  #define AMD_IOMMU_FLAG_TRANS_PRE_ENABLED (1 << 0)

- #define io_pgtable_to_data(x) \
-       container_of((x), struct amd_io_pgtable, pgtbl)
-
- #define io_pgtable_ops_to_data(x) \
-       io_pgtable_to_data(io_pgtable_ops_to_pgtable(x))
-
- #define io_pgtable_ops_to_domain(x) \
-       container_of(io_pgtable_ops_to_data(x), \
-                    struct protection_domain, iop)
-
- #define io_pgtable_cfg_to_data(x) \
-       container_of((x), struct amd_io_pgtable, pgtbl.cfg)
-
  struct gcr3_tbl_info {
        u64 *gcr3_tbl;  /* Guest CR3 table */
        int glx;        /* Number of levels for GCR3 table */
        u32 pasid_cnt;  /* Track attached PASIDs */
        u16 domid;      /* Per device domain ID */
  };
-
- struct amd_io_pgtable {
-       seqcount_t seqcount;    /* Protects root/mode update */
-       struct io_pgtable pgtbl;
-       int mode;
-       u64 *root;
-       u64 *pgd;               /* v2 pgtable pgd pointer */
- };

  enum protection_domain_mode {
···
   * independent of their use.
   */
  struct protection_domain {
+       union {
+               struct iommu_domain domain;
+               struct pt_iommu iommu;
+               struct pt_iommu_amdv1 amdv1;
+               struct pt_iommu_x86_64 amdv2;
+       };
        struct list_head dev_list; /* List of all devices in this domain */
-       struct iommu_domain domain; /* generic domain handle used by
-                                      iommu core code */
-       struct amd_io_pgtable iop;
        spinlock_t lock;        /* mostly used to lock the page table*/
        u16 id;                 /* the domain id written to the device table */
        enum protection_domain_mode pd_mode; /* Track page table type */
···
        struct mmu_notifier mn; /* mmu notifier for the SVA domain */
        struct list_head dev_data_list; /* List of pdom_dev_data */
  };
+ PT_IOMMU_CHECK_DOMAIN(struct protection_domain, iommu, domain);
+ PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv1.iommu, domain);
+ PT_IOMMU_CHECK_DOMAIN(struct protection_domain, amdv2.iommu, domain);

  /*
   * This structure contains information about one PCI segment in the system.
+1 -1
drivers/iommu/amd/debugfs.c
···
        if (ret)
                return ret;

-       if (iommu->dbg_mmio_offset > iommu->mmio_phys_end - 4) {
+       if (iommu->dbg_mmio_offset > iommu->mmio_phys_end - sizeof(u64)) {
                iommu->dbg_mmio_offset = -1;
                return -EINVAL;
        }
+12 -3
drivers/iommu/amd/init.c
···
        list_add_tail(&pci_seg->list, &amd_iommu_pci_seg_list);

        if (alloc_dev_table(pci_seg))
-               return NULL;
+               goto err_free_pci_seg;
        if (alloc_alias_table(pci_seg))
-               return NULL;
+               goto err_free_dev_table;
        if (alloc_rlookup_table(pci_seg))
-               return NULL;
+               goto err_free_alias_table;

        return pci_seg;
+
+ err_free_alias_table:
+       free_alias_table(pci_seg);
+ err_free_dev_table:
+       free_dev_table(pci_seg);
+ err_free_pci_seg:
+       list_del(&pci_seg->list);
+       kfree(pci_seg);
+       return NULL;
  }

  static struct amd_iommu_pci_seg *__init get_pci_segment(u16 id,
-577
drivers/iommu/amd/io_pgtable.c
- // SPDX-License-Identifier: GPL-2.0-only
- /*
-  * CPU-agnostic AMD IO page table allocator.
-  *
-  * Copyright (C) 2020 Advanced Micro Devices, Inc.
-  * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
-  */
-
- #define pr_fmt(fmt)     "AMD-Vi: " fmt
- #define dev_fmt(fmt)    pr_fmt(fmt)
-
- #include <linux/atomic.h>
- #include <linux/bitops.h>
- #include <linux/io-pgtable.h>
- #include <linux/kernel.h>
- #include <linux/sizes.h>
- #include <linux/slab.h>
- #include <linux/types.h>
- #include <linux/dma-mapping.h>
- #include <linux/seqlock.h>
-
- #include <asm/barrier.h>
-
- #include "amd_iommu_types.h"
- #include "amd_iommu.h"
- #include "../iommu-pages.h"
-
- /*
-  * Helper function to get the first pte of a large mapping
-  */
- static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
-                          unsigned long *count)
- {
-       unsigned long pte_mask, pg_size, cnt;
-       u64 *fpte;
-
-       pg_size = PTE_PAGE_SIZE(*pte);
-       cnt = PAGE_SIZE_PTE_COUNT(pg_size);
-       pte_mask = ~((cnt << 3) - 1);
-       fpte = (u64 *)(((unsigned long)pte) & pte_mask);
-
-       if (page_size)
-               *page_size = pg_size;
-
-       if (count)
-               *count = cnt;
-
-       return fpte;
- }
-
- static void free_pt_lvl(u64 *pt, struct iommu_pages_list *freelist, int lvl)
- {
-       u64 *p;
-       int i;
-
-       for (i = 0; i < 512; ++i) {
-               /* PTE present? */
-               if (!IOMMU_PTE_PRESENT(pt[i]))
-                       continue;
-
-               /* Large PTE? */
-               if (PM_PTE_LEVEL(pt[i]) == 0 ||
-                   PM_PTE_LEVEL(pt[i]) == 7)
-                       continue;
-
-               /*
-                * Free the next level. No need to look at l1 tables here since
-                * they can only contain leaf PTEs; just free them directly.
-                */
-               p = IOMMU_PTE_PAGE(pt[i]);
-               if (lvl > 2)
-                       free_pt_lvl(p, freelist, lvl - 1);
-               else
-                       iommu_pages_list_add(freelist, p);
-       }
-
-       iommu_pages_list_add(freelist, pt);
- }
-
- static void free_sub_pt(u64 *root, int mode, struct iommu_pages_list *freelist)
- {
-       switch (mode) {
-       case PAGE_MODE_NONE:
-       case PAGE_MODE_7_LEVEL:
-               break;
-       case PAGE_MODE_1_LEVEL:
-               iommu_pages_list_add(freelist, root);
-               break;
-       case PAGE_MODE_2_LEVEL:
-       case PAGE_MODE_3_LEVEL:
-       case PAGE_MODE_4_LEVEL:
-       case PAGE_MODE_5_LEVEL:
-       case PAGE_MODE_6_LEVEL:
-               free_pt_lvl(root, freelist, mode);
-               break;
-       default:
-               BUG();
-       }
- }
-
- /*
-  * This function is used to add another level to an IO page table. Adding
-  * another level increases the size of the address space by 9 bits to a size up
-  * to 64 bits.
-  */
- static bool increase_address_space(struct amd_io_pgtable *pgtable,
-                                    unsigned long address,
-                                    unsigned int page_size_level,
-                                    gfp_t gfp)
- {
-       struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
-       struct protection_domain *domain =
-               container_of(pgtable, struct protection_domain, iop);
-       unsigned long flags;
-       bool ret = true;
-       u64 *pte;
-
-       pte = iommu_alloc_pages_node_sz(cfg->amd.nid, gfp, SZ_4K);
-       if (!pte)
-               return false;
-
-       spin_lock_irqsave(&domain->lock, flags);
-
-       if (address <= PM_LEVEL_SIZE(pgtable->mode) &&
-           pgtable->mode - 1 >= page_size_level)
-               goto out;
-
-       ret = false;
-       if (WARN_ON_ONCE(pgtable->mode == amd_iommu_hpt_level))
-               goto out;
-
-       *pte = PM_LEVEL_PDE(pgtable->mode, iommu_virt_to_phys(pgtable->root));
-
-       write_seqcount_begin(&pgtable->seqcount);
-       pgtable->root = pte;
-       pgtable->mode += 1;
-       write_seqcount_end(&pgtable->seqcount);
-
-       amd_iommu_update_and_flush_device_table(domain);
-
-       pte = NULL;
-       ret = true;
-
- out:
-       spin_unlock_irqrestore(&domain->lock, flags);
-       iommu_free_pages(pte);
-
-       return ret;
- }
-
- static u64 *alloc_pte(struct amd_io_pgtable *pgtable,
-                       unsigned long address,
-                       unsigned long page_size,
-                       u64 **pte_page,
-                       gfp_t gfp,
-                       bool *updated)
- {
-       unsigned long last_addr = address + (page_size - 1);
-       struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg;
-       unsigned int seqcount;
-       int level, end_lvl;
-       u64 *pte, *page;
-
-       BUG_ON(!is_power_of_2(page_size));
-
-       while (last_addr > PM_LEVEL_SIZE(pgtable->mode) ||
-              pgtable->mode - 1 < PAGE_SIZE_LEVEL(page_size)) {
-               /*
-                * Return an error if there is no memory to update the
-                * page-table.
-                */
-               if (!increase_address_space(pgtable, last_addr,
-                                           PAGE_SIZE_LEVEL(page_size), gfp))
-                       return NULL;
-       }
-
-       do {
-               seqcount = read_seqcount_begin(&pgtable->seqcount);
-
-               level = pgtable->mode - 1;
-               pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
-       } while (read_seqcount_retry(&pgtable->seqcount, seqcount));
-
-       address = PAGE_SIZE_ALIGN(address, page_size);
-       end_lvl = PAGE_SIZE_LEVEL(page_size);
-
-       while (level > end_lvl) {
-               u64 __pte, __npte;
-               int pte_level;
-
-               __pte = *pte;
-               pte_level = PM_PTE_LEVEL(__pte);
-
-               /*
-                * If we replace a series of large PTEs, we need
-                * to tear down all of them.
-                */
-               if (IOMMU_PTE_PRESENT(__pte) &&
-                   pte_level == PAGE_MODE_7_LEVEL) {
-                       unsigned long count, i;
-                       u64 *lpte;
-
-                       lpte = first_pte_l7(pte, NULL, &count);
-
-                       /*
-                        * Unmap the replicated PTEs that still match the
-                        * original large mapping
-                        */
-                       for (i = 0; i < count; ++i)
-                               cmpxchg64(&lpte[i], __pte, 0ULL);
-
-                       *updated = true;
-                       continue;
-               }
-
-               if (!IOMMU_PTE_PRESENT(__pte) ||
-                   pte_level == PAGE_MODE_NONE) {
-                       page = iommu_alloc_pages_node_sz(cfg->amd.nid, gfp,
-                                                        SZ_4K);
-
-                       if (!page)
-                               return NULL;
-
-                       __npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));
-
-                       /* pte could have been changed somewhere. */
-                       if (!try_cmpxchg64(pte, &__pte, __npte))
-                               iommu_free_pages(page);
-                       else if (IOMMU_PTE_PRESENT(__pte))
-                               *updated = true;
-
-                       continue;
-               }
-
-               /* No level skipping support yet */
-               if (pte_level != level)
-                       return NULL;
-
-               level -= 1;
-
-               pte = IOMMU_PTE_PAGE(__pte);
-
-               if (pte_page && level == end_lvl)
-                       *pte_page = pte;
-
-               pte = &pte[PM_LEVEL_INDEX(level, address)];
-       }
-
-       return pte;
- }
-
- /*
-  * This function checks if there is a PTE for a given dma address. If
-  * there is one, it returns the pointer to it.
-  */
- static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
-                       unsigned long address,
-                       unsigned long *page_size)
- {
-       int level;
-       unsigned int seqcount;
-       u64 *pte;
-
-       *page_size = 0;
-
-       if (address > PM_LEVEL_SIZE(pgtable->mode))
-               return NULL;
-
-       do {
-               seqcount = read_seqcount_begin(&pgtable->seqcount);
-               level = pgtable->mode - 1;
-               pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
-       } while (read_seqcount_retry(&pgtable->seqcount, seqcount));
-
-       *page_size = PTE_LEVEL_PAGE_SIZE(level);
-
-       while (level > 0) {
-
-               /* Not Present */
-               if (!IOMMU_PTE_PRESENT(*pte))
-                       return NULL;
-
-               /* Large PTE */
-               if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL ||
-                   PM_PTE_LEVEL(*pte) == PAGE_MODE_NONE)
-                       break;
-
-               /* No level skipping support yet */
-               if (PM_PTE_LEVEL(*pte) != level)
-                       return NULL;
-
-               level -= 1;
-
-               /* Walk to the next level */
-               pte = IOMMU_PTE_PAGE(*pte);
-               pte = &pte[PM_LEVEL_INDEX(level, address)];
-               *page_size = PTE_LEVEL_PAGE_SIZE(level);
-       }
-
-       /*
-        * If we have a series of large PTEs, make
-        * sure to return a pointer to the first one.
-        */
-       if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
-               pte = first_pte_l7(pte, page_size, NULL);
-
-       return pte;
- }
-
- static void free_clear_pte(u64 *pte, u64 pteval,
-                            struct iommu_pages_list *freelist)
- {
-       u64 *pt;
-       int mode;
-
-       while (!try_cmpxchg64(pte, &pteval, 0))
-               pr_warn("AMD-Vi: IOMMU pte changed since we read it\n");
-
-       if (!IOMMU_PTE_PRESENT(pteval))
-               return;
-
-       pt = IOMMU_PTE_PAGE(pteval);
-       mode = IOMMU_PTE_MODE(pteval);
-
-       free_sub_pt(pt, mode, freelist);
- }
-
- /*
-  * Generic mapping functions. It maps a physical address into a DMA
-  * address space. It allocates the page table pages if necessary.
-  * In the future it can be extended to a generic mapping function
-  * supporting all features of AMD IOMMU page tables like level skipping
-  * and full 64 bit address spaces.
-  */
- static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
-                               phys_addr_t paddr, size_t pgsize, size_t pgcount,
-                               int prot, gfp_t gfp, size_t *mapped)
- {
-       struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
-       struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist);
-       bool updated = false;
-       u64 __pte, *pte;
-       int ret, i, count;
-       size_t size = pgcount << __ffs(pgsize);
-       unsigned long o_iova = iova;
-
-       BUG_ON(!IS_ALIGNED(iova, pgsize));
-       BUG_ON(!IS_ALIGNED(paddr, pgsize));
-
-       ret = -EINVAL;
-       if (!(prot & IOMMU_PROT_MASK))
-               goto out;
-
-       while (pgcount > 0) {
-               count = PAGE_SIZE_PTE_COUNT(pgsize);
-               pte = alloc_pte(pgtable, iova, pgsize, NULL, gfp, &updated);
-
-               ret = -ENOMEM;
-               if (!pte)
-                       goto out;
-
-               for (i = 0; i < count; ++i)
-                       free_clear_pte(&pte[i], pte[i], &freelist);
-
-               if (!iommu_pages_list_empty(&freelist))
-                       updated = true;
-
-               if (count > 1) {
-                       __pte = PAGE_SIZE_PTE(__sme_set(paddr), pgsize);
-                       __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC;
-               } else
-                       __pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC;
-
-               if (prot & IOMMU_PROT_IR)
-                       __pte |= IOMMU_PTE_IR;
-               if (prot & IOMMU_PROT_IW)
-                       __pte |= IOMMU_PTE_IW;
-
-               for (i = 0; i < count; ++i)
-                       pte[i] = __pte;
-
-               iova += pgsize;
-               paddr += pgsize;
-               pgcount--;
-               if (mapped)
-                       *mapped += pgsize;
-       }
-
-       ret = 0;
-
- out:
-       if (updated) {
-               struct protection_domain *dom = io_pgtable_ops_to_domain(ops);
-               unsigned long flags;
-
-               spin_lock_irqsave(&dom->lock, flags);
-               /*
-                * Flush domain TLB(s) and wait for completion. Any Device-Table
-                * Updates and flushing already happened in
-                * increase_address_space().
-                */
-               amd_iommu_domain_flush_pages(dom, o_iova, size);
-               spin_unlock_irqrestore(&dom->lock, flags);
-       }
-
-       /* Everything flushed out, free pages now */
-       iommu_put_pages_list(&freelist);
-
-       return ret;
- }
-
- static unsigned long iommu_v1_unmap_pages(struct io_pgtable_ops *ops,
-                                           unsigned long iova,
-                                           size_t pgsize, size_t pgcount,
-                                           struct iommu_iotlb_gather *gather)
- {
-       struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
-       unsigned long long unmapped;
-       unsigned long unmap_size;
-       u64 *pte;
-       size_t size = pgcount << __ffs(pgsize);
-
-       BUG_ON(!is_power_of_2(pgsize));
-
-       unmapped = 0;
-
-       while (unmapped < size) {
-               pte = fetch_pte(pgtable, iova, &unmap_size);
-               if (pte) {
-                       int i, count;
-
-                       count = PAGE_SIZE_PTE_COUNT(unmap_size);
-                       for (i = 0; i < count; i++)
-                               pte[i] = 0ULL;
-               } else {
-                       return unmapped;
-               }
-
-               iova = (iova & ~(unmap_size - 1)) + unmap_size;
-               unmapped += unmap_size;
-       }
-
-       return unmapped;
- }
-
- static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
- {
-       struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
-       unsigned long offset_mask, pte_pgsize;
-       u64 *pte, __pte;
-
-       pte = fetch_pte(pgtable, iova, &pte_pgsize);
-
-       if (!pte || !IOMMU_PTE_PRESENT(*pte))
-               return 0;
-
-       offset_mask = pte_pgsize - 1;
-       __pte = __sme_clr(*pte & PM_ADDR_MASK);
-
-       return (__pte & ~offset_mask) | (iova & offset_mask);
- }
-
- static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
-                                      unsigned long flags)
- {
-       bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
bool dirty = false; 470 - int i, count; 471 - 472 - /* 473 - * 2.2.3.2 Host Dirty Support 474 - * When a non-default page size is used, software must OR the 475 - * Dirty bits in all of the replicated host PTEs used to map 476 - * the page. The IOMMU does not guarantee the Dirty bits are 477 - * set in all of the replicated PTEs. Any portion of the page 478 - * may have been written even if the Dirty bit is set in only 479 - * one of the replicated PTEs. 480 - */ 481 - count = PAGE_SIZE_PTE_COUNT(size); 482 - for (i = 0; i < count && test_only; i++) { 483 - if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) { 484 - dirty = true; 485 - break; 486 - } 487 - } 488 - 489 - for (i = 0; i < count && !test_only; i++) { 490 - if (test_and_clear_bit(IOMMU_PTE_HD_BIT, 491 - (unsigned long *)&ptep[i])) { 492 - dirty = true; 493 - } 494 - } 495 - 496 - return dirty; 497 - } 498 - 499 - static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops, 500 - unsigned long iova, size_t size, 501 - unsigned long flags, 502 - struct iommu_dirty_bitmap *dirty) 503 - { 504 - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); 505 - unsigned long end = iova + size - 1; 506 - 507 - do { 508 - unsigned long pgsize = 0; 509 - u64 *ptep, pte; 510 - 511 - ptep = fetch_pte(pgtable, iova, &pgsize); 512 - if (ptep) 513 - pte = READ_ONCE(*ptep); 514 - if (!ptep || !IOMMU_PTE_PRESENT(pte)) { 515 - pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0); 516 - iova += pgsize; 517 - continue; 518 - } 519 - 520 - /* 521 - * Mark the whole IOVA range as dirty even if only one of 522 - * the replicated PTEs was marked dirty. 
523 - */ 524 - if (pte_test_and_clear_dirty(ptep, pgsize, flags)) 525 - iommu_dirty_bitmap_record(dirty, iova, pgsize); 526 - iova += pgsize; 527 - } while (iova < end); 528 - 529 - return 0; 530 - } 531 - 532 - /* 533 - * ---------------------------------------------------- 534 - */ 535 - static void v1_free_pgtable(struct io_pgtable *iop) 536 - { 537 - struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, pgtbl); 538 - struct iommu_pages_list freelist = IOMMU_PAGES_LIST_INIT(freelist); 539 - 540 - if (pgtable->mode == PAGE_MODE_NONE) 541 - return; 542 - 543 - /* Page-table is not visible to IOMMU anymore, so free it */ 544 - BUG_ON(pgtable->mode < PAGE_MODE_NONE || 545 - pgtable->mode > amd_iommu_hpt_level); 546 - 547 - free_sub_pt(pgtable->root, pgtable->mode, &freelist); 548 - iommu_put_pages_list(&freelist); 549 - } 550 - 551 - static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) 552 - { 553 - struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg); 554 - 555 - pgtable->root = 556 - iommu_alloc_pages_node_sz(cfg->amd.nid, GFP_KERNEL, SZ_4K); 557 - if (!pgtable->root) 558 - return NULL; 559 - pgtable->mode = PAGE_MODE_3_LEVEL; 560 - seqcount_init(&pgtable->seqcount); 561 - 562 - cfg->pgsize_bitmap = amd_iommu_pgsize_bitmap; 563 - cfg->ias = IOMMU_IN_ADDR_BIT_SIZE; 564 - cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE; 565 - 566 - pgtable->pgtbl.ops.map_pages = iommu_v1_map_pages; 567 - pgtable->pgtbl.ops.unmap_pages = iommu_v1_unmap_pages; 568 - pgtable->pgtbl.ops.iova_to_phys = iommu_v1_iova_to_phys; 569 - pgtable->pgtbl.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty; 570 - 571 - return &pgtable->pgtbl; 572 - } 573 - 574 - struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = { 575 - .alloc = v1_alloc_pgtable, 576 - .free = v1_free_pgtable, 577 - };
-370
drivers/iommu/amd/io_pgtable_v2.c
··· 1 - // SPDX-License-Identifier: GPL-2.0-only 2 - /* 3 - * CPU-agnostic AMD IO page table v2 allocator. 4 - * 5 - * Copyright (C) 2022, 2023 Advanced Micro Devices, Inc. 6 - * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com> 7 - * Author: Vasant Hegde <vasant.hegde@amd.com> 8 - */ 9 - 10 - #define pr_fmt(fmt) "AMD-Vi: " fmt 11 - #define dev_fmt(fmt) pr_fmt(fmt) 12 - 13 - #include <linux/bitops.h> 14 - #include <linux/io-pgtable.h> 15 - #include <linux/kernel.h> 16 - 17 - #include <asm/barrier.h> 18 - 19 - #include "amd_iommu_types.h" 20 - #include "amd_iommu.h" 21 - #include "../iommu-pages.h" 22 - 23 - #define IOMMU_PAGE_PRESENT BIT_ULL(0) /* Is present */ 24 - #define IOMMU_PAGE_RW BIT_ULL(1) /* Writeable */ 25 - #define IOMMU_PAGE_USER BIT_ULL(2) /* Userspace addressable */ 26 - #define IOMMU_PAGE_PWT BIT_ULL(3) /* Page write through */ 27 - #define IOMMU_PAGE_PCD BIT_ULL(4) /* Page cache disabled */ 28 - #define IOMMU_PAGE_ACCESS BIT_ULL(5) /* Was accessed (updated by IOMMU) */ 29 - #define IOMMU_PAGE_DIRTY BIT_ULL(6) /* Was written to (updated by IOMMU) */ 30 - #define IOMMU_PAGE_PSE BIT_ULL(7) /* Page Size Extensions */ 31 - #define IOMMU_PAGE_NX BIT_ULL(63) /* No execute */ 32 - 33 - #define MAX_PTRS_PER_PAGE 512 34 - 35 - #define IOMMU_PAGE_SIZE_2M BIT_ULL(21) 36 - #define IOMMU_PAGE_SIZE_1G BIT_ULL(30) 37 - 38 - 39 - static inline int get_pgtable_level(void) 40 - { 41 - return amd_iommu_gpt_level; 42 - } 43 - 44 - static inline bool is_large_pte(u64 pte) 45 - { 46 - return (pte & IOMMU_PAGE_PSE); 47 - } 48 - 49 - static inline u64 set_pgtable_attr(u64 *page) 50 - { 51 - u64 prot; 52 - 53 - prot = IOMMU_PAGE_PRESENT | IOMMU_PAGE_RW | IOMMU_PAGE_USER; 54 - prot |= IOMMU_PAGE_ACCESS; 55 - 56 - return (iommu_virt_to_phys(page) | prot); 57 - } 58 - 59 - static inline void *get_pgtable_pte(u64 pte) 60 - { 61 - return iommu_phys_to_virt(pte & PM_ADDR_MASK); 62 - } 63 - 64 - static u64 set_pte_attr(u64 paddr, u64 pg_size, int prot) 65 - { 66 - u64 
pte; 67 - 68 - pte = __sme_set(paddr & PM_ADDR_MASK); 69 - pte |= IOMMU_PAGE_PRESENT | IOMMU_PAGE_USER; 70 - pte |= IOMMU_PAGE_ACCESS | IOMMU_PAGE_DIRTY; 71 - 72 - if (prot & IOMMU_PROT_IW) 73 - pte |= IOMMU_PAGE_RW; 74 - 75 - /* Large page */ 76 - if (pg_size == IOMMU_PAGE_SIZE_1G || pg_size == IOMMU_PAGE_SIZE_2M) 77 - pte |= IOMMU_PAGE_PSE; 78 - 79 - return pte; 80 - } 81 - 82 - static inline u64 get_alloc_page_size(u64 size) 83 - { 84 - if (size >= IOMMU_PAGE_SIZE_1G) 85 - return IOMMU_PAGE_SIZE_1G; 86 - 87 - if (size >= IOMMU_PAGE_SIZE_2M) 88 - return IOMMU_PAGE_SIZE_2M; 89 - 90 - return PAGE_SIZE; 91 - } 92 - 93 - static inline int page_size_to_level(u64 pg_size) 94 - { 95 - if (pg_size == IOMMU_PAGE_SIZE_1G) 96 - return PAGE_MODE_3_LEVEL; 97 - if (pg_size == IOMMU_PAGE_SIZE_2M) 98 - return PAGE_MODE_2_LEVEL; 99 - 100 - return PAGE_MODE_1_LEVEL; 101 - } 102 - 103 - static void free_pgtable(u64 *pt, int level) 104 - { 105 - u64 *p; 106 - int i; 107 - 108 - for (i = 0; i < MAX_PTRS_PER_PAGE; i++) { 109 - /* PTE present? */ 110 - if (!IOMMU_PTE_PRESENT(pt[i])) 111 - continue; 112 - 113 - if (is_large_pte(pt[i])) 114 - continue; 115 - 116 - /* 117 - * Free the next level. No need to look at l1 tables here since 118 - * they can only contain leaf PTEs; just free them directly. 
119 - */ 120 - p = get_pgtable_pte(pt[i]); 121 - if (level > 2) 122 - free_pgtable(p, level - 1); 123 - else 124 - iommu_free_pages(p); 125 - } 126 - 127 - iommu_free_pages(pt); 128 - } 129 - 130 - /* Allocate page table */ 131 - static u64 *v2_alloc_pte(int nid, u64 *pgd, unsigned long iova, 132 - unsigned long pg_size, gfp_t gfp, bool *updated) 133 - { 134 - u64 *pte, *page; 135 - int level, end_level; 136 - 137 - level = get_pgtable_level() - 1; 138 - end_level = page_size_to_level(pg_size); 139 - pte = &pgd[PM_LEVEL_INDEX(level, iova)]; 140 - iova = PAGE_SIZE_ALIGN(iova, PAGE_SIZE); 141 - 142 - while (level >= end_level) { 143 - u64 __pte, __npte; 144 - 145 - __pte = *pte; 146 - 147 - if (IOMMU_PTE_PRESENT(__pte) && is_large_pte(__pte)) { 148 - /* Unmap large pte */ 149 - cmpxchg64(pte, *pte, 0ULL); 150 - *updated = true; 151 - continue; 152 - } 153 - 154 - if (!IOMMU_PTE_PRESENT(__pte)) { 155 - page = iommu_alloc_pages_node_sz(nid, gfp, SZ_4K); 156 - if (!page) 157 - return NULL; 158 - 159 - __npte = set_pgtable_attr(page); 160 - /* pte could have been changed somewhere. */ 161 - if (!try_cmpxchg64(pte, &__pte, __npte)) 162 - iommu_free_pages(page); 163 - else if (IOMMU_PTE_PRESENT(__pte)) 164 - *updated = true; 165 - 166 - continue; 167 - } 168 - 169 - level -= 1; 170 - pte = get_pgtable_pte(__pte); 171 - pte = &pte[PM_LEVEL_INDEX(level, iova)]; 172 - } 173 - 174 - /* Tear down existing pte entries */ 175 - if (IOMMU_PTE_PRESENT(*pte)) { 176 - u64 *__pte; 177 - 178 - *updated = true; 179 - __pte = get_pgtable_pte(*pte); 180 - cmpxchg64(pte, *pte, 0ULL); 181 - if (pg_size == IOMMU_PAGE_SIZE_1G) 182 - free_pgtable(__pte, end_level - 1); 183 - else if (pg_size == IOMMU_PAGE_SIZE_2M) 184 - iommu_free_pages(__pte); 185 - } 186 - 187 - return pte; 188 - } 189 - 190 - /* 191 - * This function checks if there is a PTE for a given dma address. 192 - * If there is one, it returns the pointer to it. 
193 - */ 194 - static u64 *fetch_pte(struct amd_io_pgtable *pgtable, 195 - unsigned long iova, unsigned long *page_size) 196 - { 197 - u64 *pte; 198 - int level; 199 - 200 - level = get_pgtable_level() - 1; 201 - pte = &pgtable->pgd[PM_LEVEL_INDEX(level, iova)]; 202 - /* Default page size is 4K */ 203 - *page_size = PAGE_SIZE; 204 - 205 - while (level) { 206 - /* Not present */ 207 - if (!IOMMU_PTE_PRESENT(*pte)) 208 - return NULL; 209 - 210 - /* Walk to the next level */ 211 - pte = get_pgtable_pte(*pte); 212 - pte = &pte[PM_LEVEL_INDEX(level - 1, iova)]; 213 - 214 - /* Large page */ 215 - if (is_large_pte(*pte)) { 216 - if (level == PAGE_MODE_3_LEVEL) 217 - *page_size = IOMMU_PAGE_SIZE_1G; 218 - else if (level == PAGE_MODE_2_LEVEL) 219 - *page_size = IOMMU_PAGE_SIZE_2M; 220 - else 221 - return NULL; /* Wrongly set PSE bit in PTE */ 222 - 223 - break; 224 - } 225 - 226 - level -= 1; 227 - } 228 - 229 - return pte; 230 - } 231 - 232 - static int iommu_v2_map_pages(struct io_pgtable_ops *ops, unsigned long iova, 233 - phys_addr_t paddr, size_t pgsize, size_t pgcount, 234 - int prot, gfp_t gfp, size_t *mapped) 235 - { 236 - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); 237 - struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; 238 - u64 *pte; 239 - unsigned long map_size; 240 - unsigned long mapped_size = 0; 241 - unsigned long o_iova = iova; 242 - size_t size = pgcount << __ffs(pgsize); 243 - int ret = 0; 244 - bool updated = false; 245 - 246 - if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize) || !pgcount) 247 - return -EINVAL; 248 - 249 - if (!(prot & IOMMU_PROT_MASK)) 250 - return -EINVAL; 251 - 252 - while (mapped_size < size) { 253 - map_size = get_alloc_page_size(pgsize); 254 - pte = v2_alloc_pte(cfg->amd.nid, pgtable->pgd, 255 - iova, map_size, gfp, &updated); 256 - if (!pte) { 257 - ret = -ENOMEM; 258 - goto out; 259 - } 260 - 261 - *pte = set_pte_attr(paddr, map_size, prot); 262 - 263 - iova += map_size; 264 - paddr += map_size; 
265 - mapped_size += map_size; 266 - } 267 - 268 - out: 269 - if (updated) { 270 - struct protection_domain *pdom = io_pgtable_ops_to_domain(ops); 271 - unsigned long flags; 272 - 273 - spin_lock_irqsave(&pdom->lock, flags); 274 - amd_iommu_domain_flush_pages(pdom, o_iova, size); 275 - spin_unlock_irqrestore(&pdom->lock, flags); 276 - } 277 - 278 - if (mapped) 279 - *mapped += mapped_size; 280 - 281 - return ret; 282 - } 283 - 284 - static unsigned long iommu_v2_unmap_pages(struct io_pgtable_ops *ops, 285 - unsigned long iova, 286 - size_t pgsize, size_t pgcount, 287 - struct iommu_iotlb_gather *gather) 288 - { 289 - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); 290 - struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; 291 - unsigned long unmap_size; 292 - unsigned long unmapped = 0; 293 - size_t size = pgcount << __ffs(pgsize); 294 - u64 *pte; 295 - 296 - if (WARN_ON(!pgsize || (pgsize & cfg->pgsize_bitmap) != pgsize || !pgcount)) 297 - return 0; 298 - 299 - while (unmapped < size) { 300 - pte = fetch_pte(pgtable, iova, &unmap_size); 301 - if (!pte) 302 - return unmapped; 303 - 304 - *pte = 0ULL; 305 - 306 - iova = (iova & ~(unmap_size - 1)) + unmap_size; 307 - unmapped += unmap_size; 308 - } 309 - 310 - return unmapped; 311 - } 312 - 313 - static phys_addr_t iommu_v2_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova) 314 - { 315 - struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops); 316 - unsigned long offset_mask, pte_pgsize; 317 - u64 *pte, __pte; 318 - 319 - pte = fetch_pte(pgtable, iova, &pte_pgsize); 320 - if (!pte || !IOMMU_PTE_PRESENT(*pte)) 321 - return 0; 322 - 323 - offset_mask = pte_pgsize - 1; 324 - __pte = __sme_clr(*pte & PM_ADDR_MASK); 325 - 326 - return (__pte & ~offset_mask) | (iova & offset_mask); 327 - } 328 - 329 - /* 330 - * ---------------------------------------------------- 331 - */ 332 - static void v2_free_pgtable(struct io_pgtable *iop) 333 - { 334 - struct amd_io_pgtable *pgtable = container_of(iop, 
struct amd_io_pgtable, pgtbl); 335 - 336 - if (!pgtable || !pgtable->pgd) 337 - return; 338 - 339 - /* Free page table */ 340 - free_pgtable(pgtable->pgd, get_pgtable_level()); 341 - pgtable->pgd = NULL; 342 - } 343 - 344 - static struct io_pgtable *v2_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie) 345 - { 346 - struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg); 347 - int ias = IOMMU_IN_ADDR_BIT_SIZE; 348 - 349 - pgtable->pgd = iommu_alloc_pages_node_sz(cfg->amd.nid, GFP_KERNEL, SZ_4K); 350 - if (!pgtable->pgd) 351 - return NULL; 352 - 353 - if (get_pgtable_level() == PAGE_MODE_5_LEVEL) 354 - ias = 57; 355 - 356 - pgtable->pgtbl.ops.map_pages = iommu_v2_map_pages; 357 - pgtable->pgtbl.ops.unmap_pages = iommu_v2_unmap_pages; 358 - pgtable->pgtbl.ops.iova_to_phys = iommu_v2_iova_to_phys; 359 - 360 - cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES_V2; 361 - cfg->ias = ias; 362 - cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE; 363 - 364 - return &pgtable->pgtbl; 365 - } 366 - 367 - struct io_pgtable_init_fns io_pgtable_amd_iommu_v2_init_fns = { 368 - .alloc = v2_alloc_pgtable, 369 - .free = v2_free_pgtable, 370 - };
+306 -286
drivers/iommu/amd/iommu.c
··· 30 30 #include <linux/msi.h> 31 31 #include <linux/irqdomain.h> 32 32 #include <linux/percpu.h> 33 - #include <linux/io-pgtable.h> 34 33 #include <linux/cc_platform.h> 35 34 #include <asm/irq_remapping.h> 36 35 #include <asm/io_apic.h> ··· 40 41 #include <asm/gart.h> 41 42 #include <asm/dma.h> 42 43 #include <uapi/linux/iommufd.h> 44 + #include <linux/generic_pt/iommu.h> 43 45 44 46 #include "amd_iommu.h" 45 - #include "../dma-iommu.h" 46 47 #include "../irq_remapping.h" 47 48 #include "../iommu-pages.h" 48 49 ··· 59 60 LIST_HEAD(acpihid_map); 60 61 61 62 const struct iommu_ops amd_iommu_ops; 62 - static const struct iommu_dirty_ops amd_dirty_ops; 63 63 64 64 int amd_iommu_max_glx_val = -1; 65 65 ··· 68 70 */ 69 71 DEFINE_IDA(pdom_ids); 70 72 71 - static int amd_iommu_attach_device(struct iommu_domain *dom, 72 - struct device *dev); 73 + static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev, 74 + struct iommu_domain *old); 73 75 74 76 static void set_dte_entry(struct amd_iommu *iommu, 75 - struct iommu_dev_data *dev_data); 77 + struct iommu_dev_data *dev_data, 78 + phys_addr_t top_paddr, unsigned int top_level); 79 + 80 + static void amd_iommu_change_top(struct pt_iommu *iommu_table, 81 + phys_addr_t top_paddr, unsigned int top_level); 76 82 77 83 static void iommu_flush_dte_sync(struct amd_iommu *iommu, u16 devid); 78 84 79 85 static struct iommu_dev_data *find_dev_data(struct amd_iommu *iommu, u16 devid); 86 + static bool amd_iommu_enforce_cache_coherency(struct iommu_domain *domain); 87 + static int amd_iommu_set_dirty_tracking(struct iommu_domain *domain, 88 + bool enable); 80 89 81 90 /**************************************************************************** 82 91 * ··· 1162 1157 * 1163 1158 ****************************************************************************/ 1164 1159 1160 + static void dump_command_buffer(struct amd_iommu *iommu) 1161 + { 1162 + struct iommu_cmd *cmd; 1163 + u32 head, tail; 1164 + int i; 1165 + 1166 
+ head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); 1167 + tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); 1168 + 1169 + pr_err("CMD Buffer head=%llu tail=%llu\n", MMIO_CMD_BUFFER_HEAD(head), 1170 + MMIO_CMD_BUFFER_TAIL(tail)); 1171 + 1172 + for (i = 0; i < CMD_BUFFER_ENTRIES; i++) { 1173 + cmd = (struct iommu_cmd *)(iommu->cmd_buf + i * sizeof(*cmd)); 1174 + pr_err("%3d: %08x %08x %08x %08x\n", i, cmd->data[0], cmd->data[1], cmd->data[2], 1175 + cmd->data[3]); 1176 + } 1177 + } 1178 + 1165 1179 static int wait_on_sem(struct amd_iommu *iommu, u64 data) 1166 1180 { 1167 1181 int i = 0; ··· 1191 1167 } 1192 1168 1193 1169 if (i == LOOP_TIMEOUT) { 1194 - pr_alert("Completion-Wait loop timed out\n"); 1170 + 1171 + pr_alert("IOMMU %04x:%02x:%02x.%01x: Completion-Wait loop timed out\n", 1172 + iommu->pci_seg->id, PCI_BUS_NUM(iommu->devid), 1173 + PCI_SLOT(iommu->devid), PCI_FUNC(iommu->devid)); 1174 + 1175 + if (amd_iommu_dump) 1176 + DO_ONCE_LITE(dump_command_buffer, iommu); 1177 + 1195 1178 return -EIO; 1196 1179 } 1197 1180 ··· 1787 1756 CMD_INV_IOMMU_ALL_PAGES_ADDRESS); 1788 1757 } 1789 1758 1790 - /* Flush the not present cache if it exists */ 1791 - static void domain_flush_np_cache(struct protection_domain *domain, 1792 - dma_addr_t iova, size_t size) 1793 - { 1794 - if (unlikely(amd_iommu_np_cache)) { 1795 - unsigned long flags; 1796 - 1797 - spin_lock_irqsave(&domain->lock, flags); 1798 - amd_iommu_domain_flush_pages(domain, iova, size); 1799 - spin_unlock_irqrestore(&domain->lock, flags); 1800 - } 1801 - } 1802 - 1803 - 1804 - /* 1805 - * This function flushes the DTEs for all devices in domain 1806 - */ 1807 - void amd_iommu_update_and_flush_device_table(struct protection_domain *domain) 1808 - { 1809 - struct iommu_dev_data *dev_data; 1810 - 1811 - lockdep_assert_held(&domain->lock); 1812 - 1813 - list_for_each_entry(dev_data, &domain->dev_list, list) { 1814 - struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev); 1815 - 1816 - 
set_dte_entry(iommu, dev_data); 1817 - clone_aliases(iommu, dev_data->dev); 1818 - } 1819 - 1820 - list_for_each_entry(dev_data, &domain->dev_list, list) 1821 - device_flush_dte(dev_data); 1822 - 1823 - domain_flush_complete(domain); 1824 - } 1825 - 1826 1759 int amd_iommu_complete_ppr(struct device *dev, u32 pasid, int status, int tag) 1827 1760 { 1828 1761 struct iommu_dev_data *dev_data; ··· 2046 2051 } 2047 2052 2048 2053 static void set_dte_entry(struct amd_iommu *iommu, 2049 - struct iommu_dev_data *dev_data) 2054 + struct iommu_dev_data *dev_data, 2055 + phys_addr_t top_paddr, unsigned int top_level) 2050 2056 { 2051 2057 u16 domid; 2052 2058 u32 old_domid; ··· 2056 2060 struct protection_domain *domain = dev_data->domain; 2057 2061 struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info; 2058 2062 struct dev_table_entry *dte = &get_dev_table(iommu)[dev_data->devid]; 2059 - 2060 - if (gcr3_info && gcr3_info->gcr3_tbl) 2061 - domid = dev_data->gcr3_info.domid; 2062 - else 2063 - domid = domain->id; 2063 + struct pt_iommu_amdv1_hw_info pt_info; 2064 2064 2065 2065 make_clear_dte(dev_data, dte, &new); 2066 2066 2067 - if (domain->iop.mode != PAGE_MODE_NONE) 2068 - new.data[0] |= iommu_virt_to_phys(domain->iop.root); 2067 + if (gcr3_info && gcr3_info->gcr3_tbl) 2068 + domid = dev_data->gcr3_info.domid; 2069 + else { 2070 + domid = domain->id; 2069 2071 2070 - new.data[0] |= (domain->iop.mode & DEV_ENTRY_MODE_MASK) 2071 - << DEV_ENTRY_MODE_SHIFT; 2072 + if (domain->domain.type & __IOMMU_DOMAIN_PAGING) { 2073 + /* 2074 + * When updating the IO pagetable, the new top and level 2075 + * are provided as parameters. For other operations i.e. 2076 + * device attach, retrieve the current pagetable info 2077 + * via the IOMMU PT API. 
2078 + */ 2079 + if (top_paddr) { 2080 + pt_info.host_pt_root = top_paddr; 2081 + pt_info.mode = top_level + 1; 2082 + } else { 2083 + WARN_ON(top_paddr || top_level); 2084 + pt_iommu_amdv1_hw_info(&domain->amdv1, 2085 + &pt_info); 2086 + } 2087 + 2088 + new.data[0] |= __sme_set(pt_info.host_pt_root) | 2089 + (pt_info.mode & DEV_ENTRY_MODE_MASK) 2090 + << DEV_ENTRY_MODE_SHIFT; 2091 + } 2092 + } 2072 2093 2073 2094 new.data[0] |= DTE_FLAG_IR | DTE_FLAG_IW; 2074 2095 ··· 2151 2138 struct amd_iommu *iommu = get_amd_iommu_from_dev(dev_data->dev); 2152 2139 2153 2140 if (set) 2154 - set_dte_entry(iommu, dev_data); 2141 + set_dte_entry(iommu, dev_data, 0, 0); 2155 2142 else 2156 2143 clear_dte_entry(iommu, dev_data); 2157 2144 ··· 2169 2156 { 2170 2157 struct amd_iommu *iommu = get_amd_iommu_from_dev_data(dev_data); 2171 2158 int max_pasids = dev_data->max_pasids; 2159 + struct pt_iommu_x86_64_hw_info pt_info; 2172 2160 int ret = 0; 2173 2161 2174 2162 /* ··· 2192 2178 if (!pdom_is_v2_pgtbl_mode(pdom)) 2193 2179 return ret; 2194 2180 2195 - ret = update_gcr3(dev_data, 0, iommu_virt_to_phys(pdom->iop.pgd), true); 2181 + pt_iommu_x86_64_hw_info(&pdom->amdv2, &pt_info); 2182 + ret = update_gcr3(dev_data, 0, __sme_set(pt_info.gcr3_pt), true); 2196 2183 if (ret) 2197 2184 free_gcr3_table(&dev_data->gcr3_info); 2198 2185 ··· 2515 2500 return domain; 2516 2501 } 2517 2502 2518 - static int pdom_setup_pgtable(struct protection_domain *domain, 2519 - struct device *dev) 2520 - { 2521 - struct io_pgtable_ops *pgtbl_ops; 2522 - enum io_pgtable_fmt fmt; 2523 - 2524 - switch (domain->pd_mode) { 2525 - case PD_MODE_V1: 2526 - fmt = AMD_IOMMU_V1; 2527 - break; 2528 - case PD_MODE_V2: 2529 - fmt = AMD_IOMMU_V2; 2530 - break; 2531 - case PD_MODE_NONE: 2532 - WARN_ON_ONCE(1); 2533 - return -EPERM; 2534 - } 2535 - 2536 - domain->iop.pgtbl.cfg.amd.nid = dev_to_node(dev); 2537 - pgtbl_ops = alloc_io_pgtable_ops(fmt, &domain->iop.pgtbl.cfg, domain); 2538 - if (!pgtbl_ops) 2539 - return 
-ENOMEM; 2540 - 2541 - return 0; 2542 - } 2543 - 2544 - static inline u64 dma_max_address(enum protection_domain_mode pgtable) 2545 - { 2546 - if (pgtable == PD_MODE_V1) 2547 - return PM_LEVEL_SIZE(amd_iommu_hpt_level); 2548 - 2549 - /* 2550 - * V2 with 4/5 level page table. Note that "2.2.6.5 AMD64 4-Kbyte Page 2551 - * Translation" shows that the V2 table sign extends the top of the 2552 - * address space creating a reserved region in the middle of the 2553 - * translation, just like the CPU does. Further Vasant says the docs are 2554 - * incomplete and this only applies to non-zero PASIDs. If the AMDv2 2555 - * page table is assigned to the 0 PASID then there is no sign extension 2556 - * check. 2557 - * 2558 - * Since the IOMMU must have a fixed geometry, and the core code does 2559 - * not understand sign extended addressing, we have to chop off the high 2560 - * bit to get consistent behavior with attachments of the domain to any 2561 - * PASID. 2562 - */ 2563 - return ((1ULL << (PM_LEVEL_SHIFT(amd_iommu_gpt_level) - 1)) - 1); 2564 - } 2565 - 2566 2503 static bool amd_iommu_hd_support(struct amd_iommu *iommu) 2567 2504 { 2568 2505 if (amd_iommu_hatdis) ··· 2523 2556 return iommu && (iommu->features & FEATURE_HDSUP); 2524 2557 } 2525 2558 2526 - static struct iommu_domain * 2527 - do_iommu_domain_alloc(struct device *dev, u32 flags, 2528 - enum protection_domain_mode pgtable) 2559 + static spinlock_t *amd_iommu_get_top_lock(struct pt_iommu *iommupt) 2529 2560 { 2530 - bool dirty_tracking = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; 2531 - struct amd_iommu *iommu = get_amd_iommu_from_dev(dev); 2561 + struct protection_domain *pdom = 2562 + container_of(iommupt, struct protection_domain, iommu); 2563 + 2564 + return &pdom->lock; 2565 + } 2566 + 2567 + /* 2568 + * Update all HW references to the domain with a new pgtable configuration. 
2569 + */ 2570 + static void amd_iommu_change_top(struct pt_iommu *iommu_table, 2571 + phys_addr_t top_paddr, unsigned int top_level) 2572 + { 2573 + struct protection_domain *pdom = 2574 + container_of(iommu_table, struct protection_domain, iommu); 2575 + struct iommu_dev_data *dev_data; 2576 + 2577 + lockdep_assert_held(&pdom->lock); 2578 + 2579 + /* Update the DTE for all devices attached to this domain */ 2580 + list_for_each_entry(dev_data, &pdom->dev_list, list) { 2581 + struct amd_iommu *iommu = rlookup_amd_iommu(dev_data->dev); 2582 + 2583 + /* Update the HW references with the new level and top ptr */ 2584 + set_dte_entry(iommu, dev_data, top_paddr, top_level); 2585 + clone_aliases(iommu, dev_data->dev); 2586 + } 2587 + 2588 + list_for_each_entry(dev_data, &pdom->dev_list, list) 2589 + device_flush_dte(dev_data); 2590 + 2591 + domain_flush_complete(pdom); 2592 + } 2593 + 2594 + /* 2595 + * amd_iommu_iotlb_sync_map() is used to generate flushes for non-present to 2596 + * present (ie mapping) operations. It is a NOP if the IOMMU doesn't have non 2597 + * present caching (like hypervisor shadowing). 
2598 + */ 2599 + static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom, 2600 + unsigned long iova, size_t size) 2601 + { 2602 + struct protection_domain *domain = to_pdomain(dom); 2603 + unsigned long flags; 2604 + 2605 + if (likely(!amd_iommu_np_cache)) 2606 + return 0; 2607 + 2608 + spin_lock_irqsave(&domain->lock, flags); 2609 + amd_iommu_domain_flush_pages(domain, iova, size); 2610 + spin_unlock_irqrestore(&domain->lock, flags); 2611 + return 0; 2612 + } 2613 + 2614 + static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) 2615 + { 2616 + struct protection_domain *dom = to_pdomain(domain); 2617 + unsigned long flags; 2618 + 2619 + spin_lock_irqsave(&dom->lock, flags); 2620 + amd_iommu_domain_flush_all(dom); 2621 + spin_unlock_irqrestore(&dom->lock, flags); 2622 + } 2623 + 2624 + static void amd_iommu_iotlb_sync(struct iommu_domain *domain, 2625 + struct iommu_iotlb_gather *gather) 2626 + { 2627 + struct protection_domain *dom = to_pdomain(domain); 2628 + unsigned long flags; 2629 + 2630 + spin_lock_irqsave(&dom->lock, flags); 2631 + amd_iommu_domain_flush_pages(dom, gather->start, 2632 + gather->end - gather->start + 1); 2633 + spin_unlock_irqrestore(&dom->lock, flags); 2634 + iommu_put_pages_list(&gather->freelist); 2635 + } 2636 + 2637 + static const struct pt_iommu_driver_ops amd_hw_driver_ops_v1 = { 2638 + .get_top_lock = amd_iommu_get_top_lock, 2639 + .change_top = amd_iommu_change_top, 2640 + }; 2641 + 2642 + static const struct iommu_domain_ops amdv1_ops = { 2643 + IOMMU_PT_DOMAIN_OPS(amdv1), 2644 + .iotlb_sync_map = amd_iommu_iotlb_sync_map, 2645 + .flush_iotlb_all = amd_iommu_flush_iotlb_all, 2646 + .iotlb_sync = amd_iommu_iotlb_sync, 2647 + .attach_dev = amd_iommu_attach_device, 2648 + .free = amd_iommu_domain_free, 2649 + .enforce_cache_coherency = amd_iommu_enforce_cache_coherency, 2650 + }; 2651 + 2652 + static const struct iommu_dirty_ops amdv1_dirty_ops = { 2653 + IOMMU_PT_DIRTY_OPS(amdv1), 2654 + .set_dirty_tracking = 
amd_iommu_set_dirty_tracking, 2655 + }; 2656 + 2657 + static struct iommu_domain *amd_iommu_domain_alloc_paging_v1(struct device *dev, 2658 + u32 flags) 2659 + { 2660 + struct pt_iommu_amdv1_cfg cfg = {}; 2532 2661 struct protection_domain *domain; 2533 2662 int ret; 2663 + 2664 + if (amd_iommu_hatdis) 2665 + return ERR_PTR(-EOPNOTSUPP); 2534 2666 2535 2667 domain = protection_domain_alloc(); 2536 2668 if (!domain) 2537 2669 return ERR_PTR(-ENOMEM); 2538 2670 2539 - domain->pd_mode = pgtable; 2540 - ret = pdom_setup_pgtable(domain, dev); 2671 + domain->pd_mode = PD_MODE_V1; 2672 + domain->iommu.driver_ops = &amd_hw_driver_ops_v1; 2673 + domain->iommu.nid = dev_to_node(dev); 2674 + if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) 2675 + domain->domain.dirty_ops = &amdv1_dirty_ops; 2676 + 2677 + /* 2678 + * Someday FORCE_COHERENCE should be set by 2679 + * amd_iommu_enforce_cache_coherency() like VT-d does. 2680 + */ 2681 + cfg.common.features = BIT(PT_FEAT_DYNAMIC_TOP) | 2682 + BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | 2683 + BIT(PT_FEAT_AMDV1_FORCE_COHERENCE); 2684 + 2685 + /* 2686 + * AMD's IOMMU can flush as many pages as necessary in a single flush. 2687 + * Unless we run in a virtual machine, which can be inferred according 2688 + * to whether "non-present cache" is on, it is probably best to prefer 2689 + * (potentially) too extensive TLB flushing (i.e., more misses) over 2690 + * multiple TLB flushes (i.e., more flushes). For virtual machines the 2691 + * hypervisor needs to synchronize the host IOMMU PTEs with those of 2692 + * the guest, and the trade-off is different: unnecessary TLB flushes 2693 + * should be avoided. 
2694 + */ 2695 + if (amd_iommu_np_cache) 2696 + cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS); 2697 + else 2698 + cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE); 2699 + 2700 + cfg.common.hw_max_vasz_lg2 = 2701 + min(64, (amd_iommu_hpt_level - 1) * 9 + 21); 2702 + cfg.common.hw_max_oasz_lg2 = 52; 2703 + cfg.starting_level = 2; 2704 + domain->domain.ops = &amdv1_ops; 2705 + 2706 + ret = pt_iommu_amdv1_init(&domain->amdv1, &cfg, GFP_KERNEL); 2541 2707 if (ret) { 2542 - pdom_id_free(domain->id); 2543 - kfree(domain); 2708 + amd_iommu_domain_free(&domain->domain); 2544 2709 return ERR_PTR(ret); 2545 2710 } 2546 2711 2547 - domain->domain.geometry.aperture_start = 0; 2548 - domain->domain.geometry.aperture_end = dma_max_address(pgtable); 2549 - domain->domain.geometry.force_aperture = true; 2550 - domain->domain.pgsize_bitmap = domain->iop.pgtbl.cfg.pgsize_bitmap; 2712 + /* 2713 + * Narrow the supported page sizes to those selected by the kernel 2714 + * command line. 2715 + */ 2716 + domain->domain.pgsize_bitmap &= amd_iommu_pgsize_bitmap; 2717 + return &domain->domain; 2718 + } 2551 2719 2552 - domain->domain.type = IOMMU_DOMAIN_UNMANAGED; 2553 - domain->domain.ops = iommu->iommu.ops->default_domain_ops; 2720 + static const struct iommu_domain_ops amdv2_ops = { 2721 + IOMMU_PT_DOMAIN_OPS(x86_64), 2722 + .iotlb_sync_map = amd_iommu_iotlb_sync_map, 2723 + .flush_iotlb_all = amd_iommu_flush_iotlb_all, 2724 + .iotlb_sync = amd_iommu_iotlb_sync, 2725 + .attach_dev = amd_iommu_attach_device, 2726 + .free = amd_iommu_domain_free, 2727 + /* 2728 + * Note the AMDv2 page table format does not support a Force Coherency 2729 + * bit, so enforce_cache_coherency should not be set. However VFIO is 2730 + * not prepared to handle a case where some domains will support 2731 + * enforcement and others do not. VFIO and iommufd will have to be fixed 2732 + * before it can fully use the V2 page table. See the comment in 2733 + * iommufd_hwpt_paging_alloc(). 
For now leave things as they have 2734 + historically been and lie about enforce_cache_coherency. 2735 + */ 2736 + .enforce_cache_coherency = amd_iommu_enforce_cache_coherency, 2737 + }; 2554 2738 2555 - if (dirty_tracking) 2556 - domain->domain.dirty_ops = &amd_dirty_ops; 2739 + static struct iommu_domain *amd_iommu_domain_alloc_paging_v2(struct device *dev, 2740 + u32 flags) 2741 + { 2742 + struct pt_iommu_x86_64_cfg cfg = {}; 2743 + struct protection_domain *domain; 2744 + int ret; 2557 2745 2746 + if (!amd_iommu_v2_pgtbl_supported()) 2747 + return ERR_PTR(-EOPNOTSUPP); 2748 + 2749 + domain = protection_domain_alloc(); 2750 + if (!domain) 2751 + return ERR_PTR(-ENOMEM); 2752 + 2753 + domain->pd_mode = PD_MODE_V2; 2754 + domain->iommu.nid = dev_to_node(dev); 2755 + 2756 + cfg.common.features = BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES); 2757 + if (amd_iommu_np_cache) 2758 + cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS); 2759 + else 2760 + cfg.common.features |= BIT(PT_FEAT_FLUSH_RANGE); 2761 + 2762 + /* 2763 + * The v2 table behaves differently if it is attached to PASID 0 vs a 2764 + * non-zero PASID. On PASID 0 it has no sign extension and the full 2765 + * 57/48 bits decode the lower addresses. Otherwise it behaves like a 2766 + * normal sign extended x86 page table. Since we want the domain to work 2767 + * in both modes the top bit is removed and PT_FEAT_SIGN_EXTEND is not 2768 + * set which creates a table that is compatible in both modes. 
2769 + */ 2770 + if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) { 2771 + cfg.common.hw_max_vasz_lg2 = 56; 2772 + cfg.top_level = 4; 2773 + } else { 2774 + cfg.common.hw_max_vasz_lg2 = 47; 2775 + cfg.top_level = 3; 2776 + } 2777 + cfg.common.hw_max_oasz_lg2 = 52; 2778 + domain->domain.ops = &amdv2_ops; 2779 + 2780 + ret = pt_iommu_x86_64_init(&domain->amdv2, &cfg, GFP_KERNEL); 2781 + if (ret) { 2782 + amd_iommu_domain_free(&domain->domain); 2783 + return ERR_PTR(ret); 2784 + } 2558 2785 return &domain->domain; 2559 2786 } 2560 2787 ··· 2769 2608 /* Allocate domain with v1 page table for dirty tracking */ 2770 2609 if (!amd_iommu_hd_support(iommu)) 2771 2610 break; 2772 - return do_iommu_domain_alloc(dev, flags, PD_MODE_V1); 2611 + return amd_iommu_domain_alloc_paging_v1(dev, flags); 2773 2612 case IOMMU_HWPT_ALLOC_PASID: 2774 2613 /* Allocate domain with v2 page table if IOMMU supports PASID. */ 2775 2614 if (!amd_iommu_pasid_supported()) 2776 2615 break; 2777 - return do_iommu_domain_alloc(dev, flags, PD_MODE_V2); 2778 - case 0: 2616 + return amd_iommu_domain_alloc_paging_v2(dev, flags); 2617 + case 0: { 2618 + struct iommu_domain *ret; 2619 + 2779 2620 /* If nothing specific is required use the kernel commandline default */ 2780 - return do_iommu_domain_alloc(dev, 0, amd_iommu_pgtable); 2621 + if (amd_iommu_pgtable == PD_MODE_V1) { 2622 + ret = amd_iommu_domain_alloc_paging_v1(dev, flags); 2623 + if (ret != ERR_PTR(-EOPNOTSUPP)) 2624 + return ret; 2625 + return amd_iommu_domain_alloc_paging_v2(dev, flags); 2626 + } 2627 + ret = amd_iommu_domain_alloc_paging_v2(dev, flags); 2628 + if (ret != ERR_PTR(-EOPNOTSUPP)) 2629 + return ret; 2630 + return amd_iommu_domain_alloc_paging_v1(dev, flags); 2631 + } 2781 2632 default: 2782 2633 break; 2783 2634 } ··· 2801 2628 struct protection_domain *domain = to_pdomain(dom); 2802 2629 2803 2630 WARN_ON(!list_empty(&domain->dev_list)); 2804 - if (domain->domain.type & __IOMMU_DOMAIN_PAGING) 2805 - 
free_io_pgtable_ops(&domain->iop.pgtbl.ops); 2631 + pt_iommu_deinit(&domain->iommu); 2806 2632 pdom_id_free(domain->id); 2807 2633 kfree(domain); 2808 2634 } 2809 2635 2810 2636 static int blocked_domain_attach_device(struct iommu_domain *domain, 2811 - struct device *dev) 2637 + struct device *dev, 2638 + struct iommu_domain *old) 2812 2639 { 2813 2640 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); 2814 2641 ··· 2858 2685 protection_domain_init(&identity_domain); 2859 2686 } 2860 2687 2861 - /* Same as blocked domain except it supports only ops->attach_dev() */ 2862 - static struct iommu_domain release_domain = { 2863 - .type = IOMMU_DOMAIN_BLOCKED, 2864 - .ops = &(const struct iommu_domain_ops) { 2865 - .attach_dev = blocked_domain_attach_device, 2866 - } 2867 - }; 2868 - 2869 - static int amd_iommu_attach_device(struct iommu_domain *dom, 2870 - struct device *dev) 2688 + static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev, 2689 + struct iommu_domain *old) 2871 2690 { 2872 2691 struct iommu_dev_data *dev_data = dev_iommu_priv_get(dev); 2873 2692 struct protection_domain *domain = to_pdomain(dom); ··· 2897 2732 #endif 2898 2733 2899 2734 return ret; 2900 - } 2901 - 2902 - static int amd_iommu_iotlb_sync_map(struct iommu_domain *dom, 2903 - unsigned long iova, size_t size) 2904 - { 2905 - struct protection_domain *domain = to_pdomain(dom); 2906 - struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; 2907 - 2908 - if (ops->map_pages) 2909 - domain_flush_np_cache(domain, iova, size); 2910 - return 0; 2911 - } 2912 - 2913 - static int amd_iommu_map_pages(struct iommu_domain *dom, unsigned long iova, 2914 - phys_addr_t paddr, size_t pgsize, size_t pgcount, 2915 - int iommu_prot, gfp_t gfp, size_t *mapped) 2916 - { 2917 - struct protection_domain *domain = to_pdomain(dom); 2918 - struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; 2919 - int prot = 0; 2920 - int ret = -EINVAL; 2921 - 2922 - if ((domain->pd_mode == PD_MODE_V1) && 
2923 - (domain->iop.mode == PAGE_MODE_NONE)) 2924 - return -EINVAL; 2925 - 2926 - if (iommu_prot & IOMMU_READ) 2927 - prot |= IOMMU_PROT_IR; 2928 - if (iommu_prot & IOMMU_WRITE) 2929 - prot |= IOMMU_PROT_IW; 2930 - 2931 - if (ops->map_pages) { 2932 - ret = ops->map_pages(ops, iova, paddr, pgsize, 2933 - pgcount, prot, gfp, mapped); 2934 - } 2935 - 2936 - return ret; 2937 - } 2938 - 2939 - static void amd_iommu_iotlb_gather_add_page(struct iommu_domain *domain, 2940 - struct iommu_iotlb_gather *gather, 2941 - unsigned long iova, size_t size) 2942 - { 2943 - /* 2944 - * AMD's IOMMU can flush as many pages as necessary in a single flush. 2945 - * Unless we run in a virtual machine, which can be inferred according 2946 - * to whether "non-present cache" is on, it is probably best to prefer 2947 - * (potentially) too extensive TLB flushing (i.e., more misses) over 2948 - * mutliple TLB flushes (i.e., more flushes). For virtual machines the 2949 - * hypervisor needs to synchronize the host IOMMU PTEs with those of 2950 - * the guest, and the trade-off is different: unnecessary TLB flushes 2951 - * should be avoided. 2952 - */ 2953 - if (amd_iommu_np_cache && 2954 - iommu_iotlb_gather_is_disjoint(gather, iova, size)) 2955 - iommu_iotlb_sync(domain, gather); 2956 - 2957 - iommu_iotlb_gather_add_range(gather, iova, size); 2958 - } 2959 - 2960 - static size_t amd_iommu_unmap_pages(struct iommu_domain *dom, unsigned long iova, 2961 - size_t pgsize, size_t pgcount, 2962 - struct iommu_iotlb_gather *gather) 2963 - { 2964 - struct protection_domain *domain = to_pdomain(dom); 2965 - struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; 2966 - size_t r; 2967 - 2968 - if ((domain->pd_mode == PD_MODE_V1) && 2969 - (domain->iop.mode == PAGE_MODE_NONE)) 2970 - return 0; 2971 - 2972 - r = (ops->unmap_pages) ? 
ops->unmap_pages(ops, iova, pgsize, pgcount, NULL) : 0; 2973 - 2974 - if (r) 2975 - amd_iommu_iotlb_gather_add_page(dom, gather, iova, r); 2976 - 2977 - return r; 2978 - } 2979 - 2980 - static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, 2981 - dma_addr_t iova) 2982 - { 2983 - struct protection_domain *domain = to_pdomain(dom); 2984 - struct io_pgtable_ops *ops = &domain->iop.pgtbl.ops; 2985 - 2986 - return ops->iova_to_phys(ops, iova); 2987 2735 } 2988 2736 2989 2737 static bool amd_iommu_capable(struct device *dev, enum iommu_cap cap) ··· 2963 2885 spin_unlock_irqrestore(&pdomain->lock, flags); 2964 2886 2965 2887 return 0; 2966 - } 2967 - 2968 - static int amd_iommu_read_and_clear_dirty(struct iommu_domain *domain, 2969 - unsigned long iova, size_t size, 2970 - unsigned long flags, 2971 - struct iommu_dirty_bitmap *dirty) 2972 - { 2973 - struct protection_domain *pdomain = to_pdomain(domain); 2974 - struct io_pgtable_ops *ops = &pdomain->iop.pgtbl.ops; 2975 - unsigned long lflags; 2976 - 2977 - if (!ops || !ops->read_and_clear_dirty) 2978 - return -EOPNOTSUPP; 2979 - 2980 - spin_lock_irqsave(&pdomain->lock, lflags); 2981 - if (!pdomain->dirty_tracking && dirty->bitmap) { 2982 - spin_unlock_irqrestore(&pdomain->lock, lflags); 2983 - return -EINVAL; 2984 - } 2985 - spin_unlock_irqrestore(&pdomain->lock, lflags); 2986 - 2987 - return ops->read_and_clear_dirty(ops, iova, size, flags, dirty); 2988 2888 } 2989 2889 2990 2890 static void amd_iommu_get_resv_regions(struct device *dev, ··· 3034 2978 return dev_data->defer_attach; 3035 2979 } 3036 2980 3037 - static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain) 3038 - { 3039 - struct protection_domain *dom = to_pdomain(domain); 3040 - unsigned long flags; 3041 - 3042 - spin_lock_irqsave(&dom->lock, flags); 3043 - amd_iommu_domain_flush_all(dom); 3044 - spin_unlock_irqrestore(&dom->lock, flags); 3045 - } 3046 - 3047 - static void amd_iommu_iotlb_sync(struct iommu_domain *domain, 3048 - struct 
iommu_iotlb_gather *gather) 3049 - { 3050 - struct protection_domain *dom = to_pdomain(domain); 3051 - unsigned long flags; 3052 - 3053 - spin_lock_irqsave(&dom->lock, flags); 3054 - amd_iommu_domain_flush_pages(dom, gather->start, 3055 - gather->end - gather->start + 1); 3056 - spin_unlock_irqrestore(&dom->lock, flags); 3057 - } 3058 - 3059 2981 static int amd_iommu_def_domain_type(struct device *dev) 3060 2982 { 3061 2983 struct iommu_dev_data *dev_data; ··· 3068 3034 return true; 3069 3035 } 3070 3036 3071 - static const struct iommu_dirty_ops amd_dirty_ops = { 3072 - .set_dirty_tracking = amd_iommu_set_dirty_tracking, 3073 - .read_and_clear_dirty = amd_iommu_read_and_clear_dirty, 3074 - }; 3075 - 3076 3037 const struct iommu_ops amd_iommu_ops = { 3077 3038 .capable = amd_iommu_capable, 3078 3039 .blocked_domain = &blocked_domain, 3079 - .release_domain = &release_domain, 3040 + .release_domain = &blocked_domain, 3080 3041 .identity_domain = &identity_domain.domain, 3081 3042 .domain_alloc_paging_flags = amd_iommu_domain_alloc_paging_flags, 3082 3043 .domain_alloc_sva = amd_iommu_domain_alloc_sva, ··· 3082 3053 .is_attach_deferred = amd_iommu_is_attach_deferred, 3083 3054 .def_domain_type = amd_iommu_def_domain_type, 3084 3055 .page_response = amd_iommu_page_response, 3085 - .default_domain_ops = &(const struct iommu_domain_ops) { 3086 - .attach_dev = amd_iommu_attach_device, 3087 - .map_pages = amd_iommu_map_pages, 3088 - .unmap_pages = amd_iommu_unmap_pages, 3089 - .iotlb_sync_map = amd_iommu_iotlb_sync_map, 3090 - .iova_to_phys = amd_iommu_iova_to_phys, 3091 - .flush_iotlb_all = amd_iommu_flush_iotlb_all, 3092 - .iotlb_sync = amd_iommu_iotlb_sync, 3093 - .free = amd_iommu_domain_free, 3094 - .enforce_cache_coherency = amd_iommu_enforce_cache_coherency, 3095 - } 3096 3056 }; 3097 3057 3098 3058 #ifdef CONFIG_IRQ_REMAP ··· 3372 3354 static int modify_irte_ga(struct amd_iommu *iommu, u16 devid, int index, 3373 3355 struct irte_ga *irte) 3374 3356 { 3375 - bool 
ret; 3357 + int ret; 3376 3358 3377 3359 ret = __modify_irte_ga(iommu, devid, index, irte); 3378 3360 if (ret) ··· 4090 4072 return 0; 4091 4073 } 4092 4074 #endif 4075 + 4076 + MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
+8 -3
drivers/iommu/apple-dart.c
··· 672 672 } 673 673 674 674 static int apple_dart_attach_dev_paging(struct iommu_domain *domain, 675 - struct device *dev) 675 + struct device *dev, 676 + struct iommu_domain *old) 676 677 { 677 678 int ret, i; 678 679 struct apple_dart_stream_map *stream_map; ··· 694 693 } 695 694 696 695 static int apple_dart_attach_dev_identity(struct iommu_domain *domain, 697 - struct device *dev) 696 + struct device *dev, 697 + struct iommu_domain *old) 698 698 { 699 699 struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); 700 700 struct apple_dart_stream_map *stream_map; ··· 719 717 }; 720 718 721 719 static int apple_dart_attach_dev_blocked(struct iommu_domain *domain, 722 - struct device *dev) 720 + struct device *dev, 721 + struct iommu_domain *old) 723 722 { 724 723 struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); 725 724 struct apple_dart_stream_map *stream_map; ··· 804 801 struct apple_dart *dart = platform_get_drvdata(iommu_pdev); 805 802 struct apple_dart *cfg_dart; 806 803 int i, sid; 804 + 805 + put_device(&iommu_pdev->dev); 807 806 808 807 if (args->args_count != 1) 809 808 return -EINVAL;
+3 -2
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-iommufd.c
··· 138 138 } 139 139 140 140 static int arm_smmu_attach_dev_nested(struct iommu_domain *domain, 141 - struct device *dev) 141 + struct device *dev, 142 + struct iommu_domain *old_domain) 142 143 { 143 144 struct arm_smmu_nested_domain *nested_domain = 144 145 to_smmu_nested_domain(domain); 145 146 struct arm_smmu_master *master = dev_iommu_priv_get(dev); 146 147 struct arm_smmu_attach_state state = { 147 148 .master = master, 148 - .old_domain = iommu_get_domain_for_dev(dev), 149 + .old_domain = old_domain, 149 150 .ssid = IOMMU_NO_PASID, 150 151 }; 151 152 struct arm_smmu_ste ste;
+17 -16
drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
··· 1464 1464 cd_table->l2.l1tab = dma_alloc_coherent(smmu->dev, l1size, 1465 1465 &cd_table->cdtab_dma, 1466 1466 GFP_KERNEL); 1467 - if (!cd_table->l2.l2ptrs) { 1467 + if (!cd_table->l2.l1tab) { 1468 1468 ret = -ENOMEM; 1469 1469 goto err_free_l2ptrs; 1470 1470 } ··· 3002 3002 master->ats_enabled = state->ats_enabled; 3003 3003 } 3004 3004 3005 - static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) 3005 + static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev, 3006 + struct iommu_domain *old_domain) 3006 3007 { 3007 3008 int ret = 0; 3008 3009 struct arm_smmu_ste target; ··· 3011 3010 struct arm_smmu_device *smmu; 3012 3011 struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); 3013 3012 struct arm_smmu_attach_state state = { 3014 - .old_domain = iommu_get_domain_for_dev(dev), 3013 + .old_domain = old_domain, 3015 3014 .ssid = IOMMU_NO_PASID, 3016 3015 }; 3017 3016 struct arm_smmu_master *master; ··· 3187 3186 3188 3187 /* 3189 3188 * When the last user of the CD table goes away downgrade the STE back 3190 - * to a non-cd_table one. 3189 + * to a non-cd_table one, by re-attaching its sid_domain. 
3191 3190 */ 3192 3191 if (!arm_smmu_ssids_in_use(&master->cd_table)) { 3193 3192 struct iommu_domain *sid_domain = ··· 3195 3194 3196 3195 if (sid_domain->type == IOMMU_DOMAIN_IDENTITY || 3197 3196 sid_domain->type == IOMMU_DOMAIN_BLOCKED) 3198 - sid_domain->ops->attach_dev(sid_domain, dev); 3197 + sid_domain->ops->attach_dev(sid_domain, dev, 3198 + sid_domain); 3199 3199 } 3200 3200 return 0; 3201 3201 } 3202 3202 3203 3203 static void arm_smmu_attach_dev_ste(struct iommu_domain *domain, 3204 + struct iommu_domain *old_domain, 3204 3205 struct device *dev, 3205 3206 struct arm_smmu_ste *ste, 3206 3207 unsigned int s1dss) ··· 3210 3207 struct arm_smmu_master *master = dev_iommu_priv_get(dev); 3211 3208 struct arm_smmu_attach_state state = { 3212 3209 .master = master, 3213 - .old_domain = iommu_get_domain_for_dev(dev), 3210 + .old_domain = old_domain, 3214 3211 .ssid = IOMMU_NO_PASID, 3215 3212 }; 3216 3213 ··· 3251 3248 } 3252 3249 3253 3250 static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, 3254 - struct device *dev) 3251 + struct device *dev, 3252 + struct iommu_domain *old_domain) 3255 3253 { 3256 3254 struct arm_smmu_ste ste; 3257 3255 struct arm_smmu_master *master = dev_iommu_priv_get(dev); 3258 3256 3259 3257 arm_smmu_master_clear_vmaster(master); 3260 3258 arm_smmu_make_bypass_ste(master->smmu, &ste); 3261 - arm_smmu_attach_dev_ste(domain, dev, &ste, STRTAB_STE_1_S1DSS_BYPASS); 3259 + arm_smmu_attach_dev_ste(domain, old_domain, dev, &ste, 3260 + STRTAB_STE_1_S1DSS_BYPASS); 3262 3261 return 0; 3263 3262 } 3264 3263 ··· 3274 3269 }; 3275 3270 3276 3271 static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain, 3277 - struct device *dev) 3272 + struct device *dev, 3273 + struct iommu_domain *old_domain) 3278 3274 { 3279 3275 struct arm_smmu_ste ste; 3280 3276 struct arm_smmu_master *master = dev_iommu_priv_get(dev); 3281 3277 3282 3278 arm_smmu_master_clear_vmaster(master); 3283 3279 arm_smmu_make_abort_ste(&ste); 3284 - 
arm_smmu_attach_dev_ste(domain, dev, &ste, 3280 + arm_smmu_attach_dev_ste(domain, old_domain, dev, &ste, 3285 3281 STRTAB_STE_1_S1DSS_TERMINATE); 3286 3282 return 0; 3287 3283 } ··· 3588 3582 3589 3583 WARN_ON(master->iopf_refcount); 3590 3584 3591 - /* Put the STE back to what arm_smmu_init_strtab() sets */ 3592 - if (dev->iommu->require_direct) 3593 - arm_smmu_attach_dev_identity(&arm_smmu_identity_domain, dev); 3594 - else 3595 - arm_smmu_attach_dev_blocked(&arm_smmu_blocked_domain, dev); 3596 - 3597 3585 arm_smmu_disable_pasid(master); 3598 3586 arm_smmu_remove_master(master); 3599 3587 if (arm_smmu_cdtab_allocated(&master->cd_table)) ··· 3678 3678 static const struct iommu_ops arm_smmu_ops = { 3679 3679 .identity_domain = &arm_smmu_identity_domain, 3680 3680 .blocked_domain = &arm_smmu_blocked_domain, 3681 + .release_domain = &arm_smmu_blocked_domain, 3681 3682 .capable = arm_smmu_capable, 3682 3683 .hw_info = arm_smmu_hw_info, 3683 3684 .domain_alloc_sva = arm_smmu_sva_domain_alloc,
+18 -10
drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
··· 367 367 static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = { 368 368 { .compatible = "qcom,adreno" }, 369 369 { .compatible = "qcom,adreno-gmu" }, 370 + { .compatible = "qcom,glymur-mdss" }, 370 371 { .compatible = "qcom,mdp4" }, 371 372 { .compatible = "qcom,mdss" }, 372 373 { .compatible = "qcom,qcm2290-mdss" }, ··· 432 431 433 432 /* 434 433 * Some platforms support more than the Arm SMMU architected maximum of 435 - * 128 stream matching groups. For unknown reasons, the additional 436 - * groups don't exhibit the same behavior as the architected registers, 437 - * so limit the groups to 128 until the behavior is fixed for the other 438 - * groups. 434 + * 128 stream matching groups. The additional registers appear to have 435 + * the same behavior as the architected registers in the hardware. 436 + * However, on some firmware versions, the hypervisor does not 437 + * correctly trap and emulate accesses to the additional registers, 438 + * resulting in unexpected behavior. 439 + * 440 + * If there are more than 128 groups, use the last reliable group to 441 + * detect if we need to apply the bypass quirk. 
439 442 */ 440 - if (smmu->num_mapping_groups > 128) { 441 - dev_notice(smmu->dev, "\tLimiting the stream matching groups to 128\n"); 442 - smmu->num_mapping_groups = 128; 443 - } 444 - 445 - last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1); 443 + if (smmu->num_mapping_groups > 128) 444 + last_s2cr = ARM_SMMU_GR0_S2CR(127); 445 + else 446 + last_s2cr = ARM_SMMU_GR0_S2CR(smmu->num_mapping_groups - 1); 446 447 447 448 /* 448 449 * With some firmware versions writes to S2CR of type FAULT are ··· 467 464 468 465 reg = FIELD_PREP(ARM_SMMU_CBAR_TYPE, CBAR_TYPE_S1_TRANS_S2_BYPASS); 469 466 arm_smmu_gr1_write(smmu, ARM_SMMU_GR1_CBAR(qsmmu->bypass_cbndx), reg); 467 + 468 + if (smmu->num_mapping_groups > 128) { 469 + dev_notice(smmu->dev, "\tLimiting the stream matching groups to 128\n"); 470 + smmu->num_mapping_groups = 128; 471 + } 470 472 } 471 473 472 474 for (i = 0; i < smmu->num_mapping_groups; i++) {
+6 -3
drivers/iommu/arm/arm-smmu/arm-smmu.c
··· 1165 1165 } 1166 1166 } 1167 1167 1168 - static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) 1168 + static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev, 1169 + struct iommu_domain *old) 1169 1170 { 1170 1171 struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); 1171 1172 struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); ··· 1235 1234 } 1236 1235 1237 1236 static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, 1238 - struct device *dev) 1237 + struct device *dev, 1238 + struct iommu_domain *old) 1239 1239 { 1240 1240 return arm_smmu_attach_dev_type(dev, S2CR_TYPE_BYPASS); 1241 1241 } ··· 1251 1249 }; 1252 1250 1253 1251 static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain, 1254 - struct device *dev) 1252 + struct device *dev, 1253 + struct iommu_domain *old) 1255 1254 { 1256 1255 return arm_smmu_attach_dev_type(dev, S2CR_TYPE_FAULT); 1257 1256 }
+10 -11
drivers/iommu/arm/arm-smmu/qcom_iommu.c
··· 359 359 kfree(qcom_domain); 360 360 } 361 361 362 - static int qcom_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) 362 + static int qcom_iommu_attach_dev(struct iommu_domain *domain, 363 + struct device *dev, struct iommu_domain *old) 363 364 { 364 365 struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev); 365 366 struct qcom_iommu_domain *qcom_domain = to_qcom_iommu_domain(domain); ··· 389 388 } 390 389 391 390 static int qcom_iommu_identity_attach(struct iommu_domain *identity_domain, 392 - struct device *dev) 391 + struct device *dev, 392 + struct iommu_domain *old) 393 393 { 394 - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); 395 394 struct qcom_iommu_domain *qcom_domain; 396 395 struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); 397 396 struct qcom_iommu_dev *qcom_iommu = dev_iommu_priv_get(dev); 398 397 unsigned int i; 399 398 400 - if (domain == identity_domain || !domain) 399 + if (old == identity_domain || !old) 401 400 return 0; 402 401 403 - qcom_domain = to_qcom_iommu_domain(domain); 402 + qcom_domain = to_qcom_iommu_domain(old); 404 403 if (WARN_ON(!qcom_domain->iommu)) 405 404 return -EINVAL; 406 405 ··· 566 565 567 566 qcom_iommu = platform_get_drvdata(iommu_pdev); 568 567 568 + put_device(&iommu_pdev->dev); 569 + 569 570 /* make sure the asid specified in dt is valid, so we don't have 570 571 * to sanity check this elsewhere: 571 572 */ 572 573 if (WARN_ON(asid > qcom_iommu->max_asid) || 573 - WARN_ON(qcom_iommu->ctxs[asid] == NULL)) { 574 - put_device(&iommu_pdev->dev); 574 + WARN_ON(qcom_iommu->ctxs[asid] == NULL)) 575 575 return -EINVAL; 576 - } 577 576 578 577 if (!dev_iommu_priv_get(dev)) { 579 578 dev_iommu_priv_set(dev, qcom_iommu); ··· 582 581 * multiple different iommu devices. 
Multiple context 583 582 * banks are ok, but multiple devices are not: 584 583 */ 585 - if (WARN_ON(qcom_iommu != dev_iommu_priv_get(dev))) { 586 - put_device(&iommu_pdev->dev); 584 + if (WARN_ON(qcom_iommu != dev_iommu_priv_get(dev))) 587 585 return -EINVAL; 588 - } 589 586 } 590 587 591 588 return iommu_fwspec_add_ids(dev, &asid, 1);
+9 -11
drivers/iommu/exynos-iommu.c
··· 984 984 } 985 985 986 986 static int exynos_iommu_identity_attach(struct iommu_domain *identity_domain, 987 - struct device *dev) 987 + struct device *dev, 988 + struct iommu_domain *old) 988 989 { 989 990 struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev); 990 991 struct exynos_iommu_domain *domain; ··· 1036 1035 }; 1037 1036 1038 1037 static int exynos_iommu_attach_device(struct iommu_domain *iommu_domain, 1039 - struct device *dev) 1038 + struct device *dev, 1039 + struct iommu_domain *old) 1040 1040 { 1041 1041 struct exynos_iommu_domain *domain = to_exynos_domain(iommu_domain); 1042 1042 struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev); ··· 1046 1044 unsigned long flags; 1047 1045 int err; 1048 1046 1049 - err = exynos_iommu_identity_attach(&exynos_identity_domain, dev); 1047 + err = exynos_iommu_identity_attach(&exynos_identity_domain, dev, old); 1050 1048 if (err) 1051 1049 return err; 1052 1050 ··· 1431 1429 struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev); 1432 1430 struct sysmmu_drvdata *data; 1433 1431 1434 - WARN_ON(exynos_iommu_identity_attach(&exynos_identity_domain, dev)); 1435 - 1436 1432 list_for_each_entry(data, &owner->controllers, owner_node) 1437 1433 device_link_del(data->link); 1438 1434 } ··· 1446 1446 return -ENODEV; 1447 1447 1448 1448 data = platform_get_drvdata(sysmmu); 1449 - if (!data) { 1450 - put_device(&sysmmu->dev); 1449 + put_device(&sysmmu->dev); 1450 + if (!data) 1451 1451 return -ENODEV; 1452 - } 1453 1452 1454 1453 if (!owner) { 1455 1454 owner = kzalloc(sizeof(*owner), GFP_KERNEL); 1456 - if (!owner) { 1457 - put_device(&sysmmu->dev); 1455 + if (!owner) 1458 1456 return -ENOMEM; 1459 - } 1460 1457 1461 1458 INIT_LIST_HEAD(&owner->controllers); 1462 1459 mutex_init(&owner->rpm_lock); ··· 1473 1476 1474 1477 static const struct iommu_ops exynos_iommu_ops = { 1475 1478 .identity_domain = &exynos_identity_domain, 1479 + .release_domain = &exynos_identity_domain, 1476 1480 .domain_alloc_paging = 
exynos_iommu_domain_alloc_paging, 1477 1481 .device_group = generic_device_group, 1478 1482 .probe_device = exynos_iommu_probe_device,
+6 -6
drivers/iommu/fsl_pamu_domain.c
··· 238 238 } 239 239 240 240 static int fsl_pamu_attach_device(struct iommu_domain *domain, 241 - struct device *dev) 241 + struct device *dev, struct iommu_domain *old) 242 242 { 243 243 struct fsl_dma_domain *dma_domain = to_fsl_dma_domain(domain); 244 244 unsigned long flags; ··· 298 298 * switches to what looks like BLOCKING. 299 299 */ 300 300 static int fsl_pamu_platform_attach(struct iommu_domain *platform_domain, 301 - struct device *dev) 301 + struct device *dev, 302 + struct iommu_domain *old) 302 303 { 303 - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); 304 304 struct fsl_dma_domain *dma_domain; 305 305 const u32 *prop; 306 306 int len; ··· 311 311 * Hack to keep things working as they always have, only leaving an 312 312 * UNMANAGED domain makes it BLOCKING. 313 313 */ 314 - if (domain == platform_domain || !domain || 315 - domain->type != IOMMU_DOMAIN_UNMANAGED) 314 + if (old == platform_domain || !old || 315 + old->type != IOMMU_DOMAIN_UNMANAGED) 316 316 return 0; 317 317 318 - dma_domain = to_fsl_dma_domain(domain); 318 + dma_domain = to_fsl_dma_domain(old); 319 319 320 320 /* 321 321 * Use LIODN of the PCI controller while detaching a
+14
drivers/iommu/generic_pt/.kunitconfig
··· 1 + CONFIG_KUNIT=y 2 + CONFIG_GENERIC_PT=y 3 + CONFIG_DEBUG_GENERIC_PT=y 4 + CONFIG_IOMMU_PT=y 5 + CONFIG_IOMMU_PT_AMDV1=y 6 + CONFIG_IOMMU_PT_VTDSS=y 7 + CONFIG_IOMMU_PT_X86_64=y 8 + CONFIG_IOMMU_PT_KUNIT_TEST=y 9 + 10 + CONFIG_IOMMUFD=y 11 + CONFIG_DEBUG_KERNEL=y 12 + CONFIG_FAULT_INJECTION=y 13 + CONFIG_RUNTIME_TESTING_MENU=y 14 + CONFIG_IOMMUFD_TEST=y
+79
drivers/iommu/generic_pt/Kconfig
··· 1 + # SPDX-License-Identifier: GPL-2.0-only 2 + 3 + menuconfig GENERIC_PT 4 + bool "Generic Radix Page Table" if COMPILE_TEST 5 + help 6 + Generic library for building radix tree page tables. 7 + 8 + Generic PT provides a set of HW page table formats and a common 9 + set of APIs to work with them. 10 + 11 + if GENERIC_PT 12 + config DEBUG_GENERIC_PT 13 + bool "Extra debugging checks for GENERIC_PT" 14 + help 15 + Enable extra run time debugging checks for GENERIC_PT code. This 16 + incurs a runtime cost and should not be enabled for production 17 + kernels. 18 + 19 + The kunit tests require this to be enabled to get full coverage. 20 + 21 + config IOMMU_PT 22 + tristate "IOMMU Page Tables" 23 + select IOMMU_API 24 + depends on IOMMU_SUPPORT 25 + depends on GENERIC_PT 26 + help 27 + Generic library for building IOMMU page tables 28 + 29 + IOMMU_PT provides an implementation of the page table operations 30 + related to struct iommu_domain using GENERIC_PT. It provides a single 31 + implementation of the page table operations that can be shared by 32 + multiple drivers. 33 + 34 + if IOMMU_PT 35 + config IOMMU_PT_AMDV1 36 + tristate "IOMMU page table for 64-bit AMD IOMMU v1" 37 + depends on !GENERIC_ATOMIC64 # for cmpxchg64 38 + help 39 + iommu_domain implementation for the AMD v1 page table. AMDv1 is the 40 + "host" page table. It supports granular page sizes of almost every 41 + power of 2 and decodes the full 64-bit IOVA space. 42 + 43 + Selected automatically by an IOMMU driver that uses this format. 44 + 45 + config IOMMU_PT_VTDSS 46 + tristate "IOMMU page table for Intel VT-d Second Stage" 47 + depends on !GENERIC_ATOMIC64 # for cmpxchg64 48 + help 49 + iommu_domain implementation for the Intel VT-d's 64 bit 3/4/5 50 + level Second Stage page table. It is similar to the X86_64 format with 51 + 4K/2M/1G page sizes. 52 + 53 + Selected automatically by an IOMMU driver that uses this format. 
54 + 55 + config IOMMU_PT_X86_64 56 + tristate "IOMMU page table for x86 64-bit, 4/5 levels" 57 + depends on !GENERIC_ATOMIC64 # for cmpxchg64 58 + help 59 + iommu_domain implementation for the x86 64-bit 4/5 level page table. 60 + It supports 4K/2M/1G page sizes and can decode a sign-extended 61 + portion of the 64-bit IOVA space. 62 + 63 + Selected automatically by an IOMMU driver that uses this format. 64 + 65 + config IOMMU_PT_KUNIT_TEST 66 + tristate "IOMMU Page Table KUnit Test" if !KUNIT_ALL_TESTS 67 + depends on KUNIT 68 + depends on IOMMU_PT_AMDV1 || !IOMMU_PT_AMDV1 69 + depends on IOMMU_PT_X86_64 || !IOMMU_PT_X86_64 70 + depends on IOMMU_PT_VTDSS || !IOMMU_PT_VTDSS 71 + default KUNIT_ALL_TESTS 72 + help 73 + Enable kunit tests for GENERIC_PT and IOMMU_PT that covers all the 74 + enabled page table formats. The test covers most of the GENERIC_PT 75 + functions provided by the page table format, as well as covering the 76 + iommu_domain related functions. 77 + 78 + endif 79 + endif
+28
drivers/iommu/generic_pt/fmt/Makefile
··· 1 + # SPDX-License-Identifier: GPL-2.0 2 + 3 + iommu_pt_fmt-$(CONFIG_IOMMU_PT_AMDV1) += amdv1 4 + iommu_pt_fmt-$(CONFIG_IOMMUFD_TEST) += mock 5 + 6 + iommu_pt_fmt-$(CONFIG_IOMMU_PT_VTDSS) += vtdss 7 + 8 + iommu_pt_fmt-$(CONFIG_IOMMU_PT_X86_64) += x86_64 9 + 10 + IOMMU_PT_KUNIT_TEST := 11 + define create_format 12 + obj-$(2) += iommu_$(1).o 13 + iommu_pt_kunit_test-y += kunit_iommu_$(1).o 14 + CFLAGS_kunit_iommu_$(1).o += -DGENERIC_PT_KUNIT=1 15 + IOMMU_PT_KUNIT_TEST := iommu_pt_kunit_test.o 16 + 17 + endef 18 + 19 + $(eval $(foreach fmt,$(iommu_pt_fmt-y),$(call create_format,$(fmt),y))) 20 + $(eval $(foreach fmt,$(iommu_pt_fmt-m),$(call create_format,$(fmt),m))) 21 + 22 + # The kunit objects are constructed by compiling the main source 23 + # with -DGENERIC_PT_KUNIT 24 + $(obj)/kunit_iommu_%.o: $(src)/iommu_%.c FORCE 25 + $(call rule_mkdir) 26 + $(call if_changed_dep,cc_o_c) 27 + 28 + obj-$(CONFIG_IOMMU_PT_KUNIT_TEST) += $(IOMMU_PT_KUNIT_TEST)
+411
drivers/iommu/generic_pt/fmt/amdv1.h
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
 *
 * AMD IOMMU v1 page table
 *
 * This is described in Section "2.2.3 I/O Page Tables for Host Translations"
 * of the "AMD I/O Virtualization Technology (IOMMU) Specification"
 *
 * Note the level numbering here matches the core code, so level 0 is the same
 * as mode 1.
 */
#ifndef __GENERIC_PT_FMT_AMDV1_H
#define __GENERIC_PT_FMT_AMDV1_H

#include "defs_amdv1.h"
#include "../pt_defs.h"

#include <asm/page.h>
#include <linux/bitfield.h>
#include <linux/container_of.h>
#include <linux/mem_encrypt.h>
#include <linux/minmax.h>
#include <linux/sizes.h>
#include <linux/string.h>

enum {
	PT_ITEM_WORD_SIZE = sizeof(u64),
	/*
	 * The IOMMUFD selftest uses the AMDv1 format with some alterations. It
	 * uses a 2k page size to test cases where the CPU page size is not the
	 * same.
	 */
#ifdef AMDV1_IOMMUFD_SELFTEST
	PT_MAX_VA_ADDRESS_LG2 = 56,
	PT_MAX_OUTPUT_ADDRESS_LG2 = 51,
	PT_MAX_TOP_LEVEL = 4,
	PT_GRANULE_LG2SZ = 11,
#else
	PT_MAX_VA_ADDRESS_LG2 = 64,
	PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
	PT_MAX_TOP_LEVEL = 5,
	PT_GRANULE_LG2SZ = 12,
#endif
	PT_TABLEMEM_LG2SZ = 12,

	/* The DTE only has these bits for the top physical address */
	PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12),
};

/* PTE bits */
enum {
	AMDV1PT_FMT_PR = BIT(0),
	AMDV1PT_FMT_D = BIT(6),
	AMDV1PT_FMT_NEXT_LEVEL = GENMASK_ULL(11, 9),
	AMDV1PT_FMT_OA = GENMASK_ULL(51, 12),
	AMDV1PT_FMT_FC = BIT_ULL(60),
	AMDV1PT_FMT_IR = BIT_ULL(61),
	AMDV1PT_FMT_IW = BIT_ULL(62),
};

/*
 * gcc 13 has a bug where it thinks the output of FIELD_GET() is an enum, make
 * these defines to avoid it.
 */
#define AMDV1PT_FMT_NL_DEFAULT 0
#define AMDV1PT_FMT_NL_SIZE 7

static inline pt_oaddr_t amdv1pt_table_pa(const struct pt_state *pts)
{
	u64 entry = pts->entry;

	if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
		entry = __sme_clr(entry);
	return oalog2_mul(FIELD_GET(AMDV1PT_FMT_OA, entry), PT_GRANULE_LG2SZ);
}
#define pt_table_pa amdv1pt_table_pa

/* Returns the oa for the start of the contiguous entry */
static inline pt_oaddr_t amdv1pt_entry_oa(const struct pt_state *pts)
{
	u64 entry = pts->entry;
	pt_oaddr_t oa;

	if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
		entry = __sme_clr(entry);
	oa = FIELD_GET(AMDV1PT_FMT_OA, entry);

	if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) == AMDV1PT_FMT_NL_SIZE) {
		unsigned int sz_bits = oaffz(oa);

		oa = oalog2_set_mod(oa, 0, sz_bits);
	} else if (PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, entry) !=
			      AMDV1PT_FMT_NL_DEFAULT))
		return 0;
	return oalog2_mul(oa, PT_GRANULE_LG2SZ);
}
#define pt_entry_oa amdv1pt_entry_oa

static inline bool amdv1pt_can_have_leaf(const struct pt_state *pts)
{
	/*
	 * Table 15: Page Table Level Parameters
	 * The top most level cannot have translation entries
	 */
	return pts->level < PT_MAX_TOP_LEVEL;
}
#define pt_can_have_leaf amdv1pt_can_have_leaf

/* Body in pt_fmt_defaults.h */
static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts);

static inline unsigned int
amdv1pt_entry_num_contig_lg2(const struct pt_state *pts)
{
	u32 code;

	if (FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) ==
	    AMDV1PT_FMT_NL_DEFAULT)
		return ilog2(1);

	PT_WARN_ON(FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry) !=
		   AMDV1PT_FMT_NL_SIZE);

	/*
	 * The contiguous size is encoded in the length of a string of 1's in
	 * the low bits of the OA. Reverse the equation:
	 *   code = log2_to_int(num_contig_lg2 + item_lg2sz -
	 *                      PT_GRANULE_LG2SZ - 1) - 1
	 * Which can be expressed as:
	 *   num_contig_lg2 = oalog2_ffz(code) + 1 -
	 *                    item_lg2sz + PT_GRANULE_LG2SZ
	 *
	 * Assume the bit layout is correct and remove the masking. Reorganize
	 * the equation to move all the arithmetic before the ffz.
	 */
	code = pts->entry >> (__bf_shf(AMDV1PT_FMT_OA) - 1 +
			      pt_table_item_lg2sz(pts) - PT_GRANULE_LG2SZ);
	return ffz_t(u32, code);
}
#define pt_entry_num_contig_lg2 amdv1pt_entry_num_contig_lg2

static inline unsigned int amdv1pt_num_items_lg2(const struct pt_state *pts)
{
	/*
	 * Top entry covers bits [63:57] only, this is handled through
	 * max_vasz_lg2.
	 */
	if (PT_WARN_ON(pts->level == 5))
		return 7;
	return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
}
#define pt_num_items_lg2 amdv1pt_num_items_lg2

static inline pt_vaddr_t amdv1pt_possible_sizes(const struct pt_state *pts)
{
	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);

	if (!amdv1pt_can_have_leaf(pts))
		return 0;

	/*
	 * Table 14: Example Page Size Encodings
	 * Address bits 51:32 can be used to encode page sizes greater than 4
	 * Gbytes. Address bits 63:52 are zero-extended.
	 *
	 * 512GB Pages are not supported due to a hardware bug.
	 * Otherwise every power of two size is supported.
	 */
	return GENMASK_ULL(min(51, isz_lg2 + amdv1pt_num_items_lg2(pts) - 1),
			   isz_lg2) & ~SZ_512G;
}
#define pt_possible_sizes amdv1pt_possible_sizes

static inline enum pt_entry_type amdv1pt_load_entry_raw(struct pt_state *pts)
{
	const u64 *tablep = pt_cur_table(pts, u64) + pts->index;
	unsigned int next_level;
	u64 entry;

	pts->entry = entry = READ_ONCE(*tablep);
	if (!(entry & AMDV1PT_FMT_PR))
		return PT_ENTRY_EMPTY;

	next_level = FIELD_GET(AMDV1PT_FMT_NEXT_LEVEL, pts->entry);
	if (pts->level == 0 || next_level == AMDV1PT_FMT_NL_DEFAULT ||
	    next_level == AMDV1PT_FMT_NL_SIZE)
		return PT_ENTRY_OA;
	return PT_ENTRY_TABLE;
}
#define pt_load_entry_raw amdv1pt_load_entry_raw

static inline void
amdv1pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
			   unsigned int oasz_lg2,
			   const struct pt_write_attrs *attrs)
{
	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
	u64 *tablep = pt_cur_table(pts, u64) + pts->index;
	u64 entry;

	if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
		return;

	entry = AMDV1PT_FMT_PR |
		FIELD_PREP(AMDV1PT_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
		attrs->descriptor_bits;

	if (oasz_lg2 == isz_lg2) {
		entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
				    AMDV1PT_FMT_NL_DEFAULT);
		WRITE_ONCE(*tablep, entry);
	} else {
		unsigned int num_contig_lg2 = oasz_lg2 - isz_lg2;
		u64 *end = tablep + log2_to_int(num_contig_lg2);

		entry |= FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL,
				    AMDV1PT_FMT_NL_SIZE) |
			 FIELD_PREP(AMDV1PT_FMT_OA,
				    oalog2_to_int(oasz_lg2 - PT_GRANULE_LG2SZ -
						  1) -
					    1);

		/* See amdv1pt_clear_entries() */
		if (num_contig_lg2 <= ilog2(32)) {
			for (; tablep != end; tablep++)
				WRITE_ONCE(*tablep, entry);
		} else {
			memset64(tablep, entry, log2_to_int(num_contig_lg2));
		}
	}
	pts->entry = entry;
}
#define pt_install_leaf_entry amdv1pt_install_leaf_entry

static inline bool amdv1pt_install_table(struct pt_state *pts,
					 pt_oaddr_t table_pa,
					 const struct pt_write_attrs *attrs)
{
	u64 entry;

	/*
	 * IR and IW are ANDed from the table levels along with the PTE. We
	 * always control permissions from the PTE, so always set IR and IW for
	 * tables.
	 */
	entry = AMDV1PT_FMT_PR |
		FIELD_PREP(AMDV1PT_FMT_NEXT_LEVEL, pts->level) |
		FIELD_PREP(AMDV1PT_FMT_OA,
			   log2_div(table_pa, PT_GRANULE_LG2SZ)) |
		AMDV1PT_FMT_IR | AMDV1PT_FMT_IW;
	if (pts_feature(pts, PT_FEAT_AMDV1_ENCRYPT_TABLES))
		entry = __sme_set(entry);
	return pt_table_install64(pts, entry);
}
#define pt_install_table amdv1pt_install_table

static inline void amdv1pt_attr_from_entry(const struct pt_state *pts,
					   struct pt_write_attrs *attrs)
{
	attrs->descriptor_bits =
		pts->entry & (AMDV1PT_FMT_FC | AMDV1PT_FMT_IR | AMDV1PT_FMT_IW);
}
#define pt_attr_from_entry amdv1pt_attr_from_entry

static inline void amdv1pt_clear_entries(struct pt_state *pts,
					 unsigned int num_contig_lg2)
{
	u64 *tablep = pt_cur_table(pts, u64) + pts->index;
	u64 *end = tablep + log2_to_int(num_contig_lg2);

	/*
	 * gcc generates rep stos for the io-pgtable code, and this difference
	 * can show in microbenchmarks with larger contiguous page sizes.
	 * rep is slower for small cases.
	 */
	if (num_contig_lg2 <= ilog2(32)) {
		for (; tablep != end; tablep++)
			WRITE_ONCE(*tablep, 0);
	} else {
		memset64(tablep, 0, log2_to_int(num_contig_lg2));
	}
}
#define pt_clear_entries amdv1pt_clear_entries

static inline bool amdv1pt_entry_is_write_dirty(const struct pt_state *pts)
{
	unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
	u64 *tablep = pt_cur_table(pts, u64) +
		      log2_set_mod(pts->index, 0, num_contig_lg2);
	u64 *end = tablep + log2_to_int(num_contig_lg2);

	for (; tablep != end; tablep++)
		if (READ_ONCE(*tablep) & AMDV1PT_FMT_D)
			return true;
	return false;
}
#define pt_entry_is_write_dirty amdv1pt_entry_is_write_dirty

static inline void amdv1pt_entry_make_write_clean(struct pt_state *pts)
{
	unsigned int num_contig_lg2 = amdv1pt_entry_num_contig_lg2(pts);
	u64 *tablep = pt_cur_table(pts, u64) +
		      log2_set_mod(pts->index, 0, num_contig_lg2);
	u64 *end = tablep + log2_to_int(num_contig_lg2);

	for (; tablep != end; tablep++)
		WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)AMDV1PT_FMT_D);
}
#define pt_entry_make_write_clean amdv1pt_entry_make_write_clean

static inline bool amdv1pt_entry_make_write_dirty(struct pt_state *pts)
{
	u64 *tablep = pt_cur_table(pts, u64) + pts->index;
	u64 new = pts->entry | AMDV1PT_FMT_D;

	return try_cmpxchg64(tablep, &pts->entry, new);
}
#define pt_entry_make_write_dirty amdv1pt_entry_make_write_dirty

/* --- iommu */
#include <linux/generic_pt/iommu.h>
#include <linux/iommu.h>

#define pt_iommu_table pt_iommu_amdv1

/* The common struct is in the per-format common struct */
static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
{
	return &container_of(iommu_table, struct pt_iommu_amdv1, iommu)
			->amdpt.common;
}

static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
{
	return &container_of(common, struct pt_iommu_amdv1, amdpt.common)->iommu;
}

static inline int amdv1pt_iommu_set_prot(struct pt_common *common,
					 struct pt_write_attrs *attrs,
					 unsigned int iommu_prot)
{
	u64 pte = 0;

	if (pt_feature(common, PT_FEAT_AMDV1_FORCE_COHERENCE))
		pte |= AMDV1PT_FMT_FC;
	if (iommu_prot & IOMMU_READ)
		pte |= AMDV1PT_FMT_IR;
	if (iommu_prot & IOMMU_WRITE)
		pte |= AMDV1PT_FMT_IW;

	/*
	 * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to
	 * control this. For now if the tables use sme_set then so do the ptes.
	 */
	if (pt_feature(common, PT_FEAT_AMDV1_ENCRYPT_TABLES))
		pte = __sme_set(pte);

	attrs->descriptor_bits = pte;
	return 0;
}
#define pt_iommu_set_prot amdv1pt_iommu_set_prot

static inline int amdv1pt_iommu_fmt_init(struct pt_iommu_amdv1 *iommu_table,
					 const struct pt_iommu_amdv1_cfg *cfg)
{
	struct pt_amdv1 *table = &iommu_table->amdpt;
	unsigned int max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2;

	if (cfg->starting_level == 0 || cfg->starting_level > PT_MAX_TOP_LEVEL)
		return -EINVAL;

	if (!pt_feature(&table->common, PT_FEAT_DYNAMIC_TOP) &&
	    cfg->starting_level != PT_MAX_TOP_LEVEL)
		max_vasz_lg2 = PT_GRANULE_LG2SZ +
			       (PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64))) *
				       (cfg->starting_level + 1);

	table->common.max_vasz_lg2 =
		min(max_vasz_lg2, cfg->common.hw_max_vasz_lg2);
	table->common.max_oasz_lg2 =
		min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2);
	pt_top_set_level(&table->common, cfg->starting_level);
	return 0;
}
#define pt_iommu_fmt_init amdv1pt_iommu_fmt_init

#ifndef PT_FMT_VARIANT
static inline void
amdv1pt_iommu_fmt_hw_info(struct pt_iommu_amdv1 *table,
			  const struct pt_range *top_range,
			  struct pt_iommu_amdv1_hw_info *info)
{
	info->host_pt_root = virt_to_phys(top_range->top_table);
	PT_WARN_ON(info->host_pt_root & ~PT_TOP_PHYS_MASK);
	info->mode = top_range->top_level + 1;
}
#define pt_iommu_fmt_hw_info amdv1pt_iommu_fmt_hw_info
#endif

#if defined(GENERIC_PT_KUNIT)
static const struct pt_iommu_amdv1_cfg amdv1_kunit_fmt_cfgs[] = {
	/* Matches what io_pgtable does */
	[0] = { .starting_level = 2 },
};
#define kunit_fmt_cfgs amdv1_kunit_fmt_cfgs
enum { KUNIT_FMT_FEATURES = 0 };
#endif

#endif
+21
drivers/iommu/generic_pt/fmt/defs_amdv1.h
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
 */
#ifndef __GENERIC_PT_FMT_DEFS_AMDV1_H
#define __GENERIC_PT_FMT_DEFS_AMDV1_H

#include <linux/generic_pt/common.h>
#include <linux/types.h>

typedef u64 pt_vaddr_t;
typedef u64 pt_oaddr_t;

struct amdv1pt_write_attrs {
	u64 descriptor_bits;
	gfp_t gfp;
};
#define pt_write_attrs amdv1pt_write_attrs

#endif
+21
drivers/iommu/generic_pt/fmt/defs_vtdss.h
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
 */
#ifndef __GENERIC_PT_FMT_DEFS_VTDSS_H
#define __GENERIC_PT_FMT_DEFS_VTDSS_H

#include <linux/generic_pt/common.h>
#include <linux/types.h>

typedef u64 pt_vaddr_t;
typedef u64 pt_oaddr_t;

struct vtdss_pt_write_attrs {
	u64 descriptor_bits;
	gfp_t gfp;
};
#define pt_write_attrs vtdss_pt_write_attrs

#endif
+21
drivers/iommu/generic_pt/fmt/defs_x86_64.h
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
 */
#ifndef __GENERIC_PT_FMT_DEFS_X86_64_H
#define __GENERIC_PT_FMT_DEFS_X86_64_H

#include <linux/generic_pt/common.h>
#include <linux/types.h>

typedef u64 pt_vaddr_t;
typedef u64 pt_oaddr_t;

struct x86_64_pt_write_attrs {
	u64 descriptor_bits;
	gfp_t gfp;
};
#define pt_write_attrs x86_64_pt_write_attrs

#endif
+15
drivers/iommu/generic_pt/fmt/iommu_amdv1.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
 */
#define PT_FMT amdv1
#define PT_SUPPORTED_FEATURES                                          \
	(BIT(PT_FEAT_FULL_VA) | BIT(PT_FEAT_DYNAMIC_TOP) |             \
	 BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS) | \
	 BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) |                           \
	 BIT(PT_FEAT_AMDV1_FORCE_COHERENCE))
#define PT_FORCE_ENABLED_FEATURES                                       \
	(BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | \
	 BIT(PT_FEAT_AMDV1_FORCE_COHERENCE))

#include "iommu_template.h"
+10
drivers/iommu/generic_pt/fmt/iommu_mock.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
 */
#define AMDV1_IOMMUFD_SELFTEST 1
#define PT_FMT amdv1
#define PT_FMT_VARIANT mock
#define PT_SUPPORTED_FEATURES 0

#include "iommu_template.h"
+48
drivers/iommu/generic_pt/fmt/iommu_template.h
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
 *
 * Template to build the iommu module and kunit from the format and
 * implementation headers.
 *
 * The format should have:
 *  #define PT_FMT <name>
 *  #define PT_SUPPORTED_FEATURES (BIT(PT_FEAT_xx) | BIT(PT_FEAT_yy))
 * And optionally:
 *  #define PT_FORCE_ENABLED_FEATURES ..
 *  #define PT_FMT_VARIANT <suffix>
 */
#include <linux/args.h>
#include <linux/stringify.h>

#ifdef PT_FMT_VARIANT
#define PTPFX_RAW \
	CONCATENATE(CONCATENATE(PT_FMT, _), PT_FMT_VARIANT)
#else
#define PTPFX_RAW PT_FMT
#endif

#define PTPFX CONCATENATE(PTPFX_RAW, _)

#define _PT_FMT_H PT_FMT.h
#define PT_FMT_H __stringify(_PT_FMT_H)

#define _PT_DEFS_H CONCATENATE(defs_, _PT_FMT_H)
#define PT_DEFS_H __stringify(_PT_DEFS_H)

#include <linux/generic_pt/common.h>
#include PT_DEFS_H
#include "../pt_defs.h"
#include PT_FMT_H
#include "../pt_common.h"

#ifndef GENERIC_PT_KUNIT
#include "../iommu_pt.h"
#else
/*
 * The makefile will compile the .c file twice, once with GENERIC_PT_KUNIT set,
 * which means we are building the kunit module.
 */
#include "../kunit_generic_pt.h"
#include "../kunit_iommu_pt.h"
#endif
+10
drivers/iommu/generic_pt/fmt/iommu_vtdss.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
 */
#define PT_FMT vtdss
#define PT_SUPPORTED_FEATURES                                              \
	(BIT(PT_FEAT_FLUSH_RANGE) | BIT(PT_FEAT_VTDSS_FORCE_COHERENCE) |   \
	 BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) | BIT(PT_FEAT_DMA_INCOHERENT))

#include "iommu_template.h"
+11
drivers/iommu/generic_pt/fmt/iommu_x86_64.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
 */
#define PT_FMT x86_64
#define PT_SUPPORTED_FEATURES                                                \
	(BIT(PT_FEAT_SIGN_EXTEND) | BIT(PT_FEAT_FLUSH_RANGE) |               \
	 BIT(PT_FEAT_FLUSH_RANGE_NO_GAPS) |                                  \
	 BIT(PT_FEAT_X86_64_AMD_ENCRYPT_TABLES) | BIT(PT_FEAT_DMA_INCOHERENT))

#include "iommu_template.h"
+285
drivers/iommu/generic_pt/fmt/vtdss.h
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
 *
 * Intel VT-d Second Stage 5/4 level page table
 *
 * This is described in
 *   Section "3.7 Second-Stage Translation"
 *   Section "9.8 Second-Stage Paging Entries"
 *
 * Of the "Intel Virtualization Technology for Directed I/O Architecture
 * Specification".
 *
 * The named levels in the spec map to the pts->level as:
 *   Table/SS-PTE - 0
 *   Directory/SS-PDE - 1
 *   Directory Ptr/SS-PDPTE - 2
 *   PML4/SS-PML4E - 3
 *   PML5/SS-PML5E - 4
 */
#ifndef __GENERIC_PT_FMT_VTDSS_H
#define __GENERIC_PT_FMT_VTDSS_H

#include "defs_vtdss.h"
#include "../pt_defs.h"

#include <linux/bitfield.h>
#include <linux/container_of.h>
#include <linux/log2.h>

enum {
	PT_MAX_OUTPUT_ADDRESS_LG2 = 52,
	PT_MAX_VA_ADDRESS_LG2 = 57,
	PT_ITEM_WORD_SIZE = sizeof(u64),
	PT_MAX_TOP_LEVEL = 4,
	PT_GRANULE_LG2SZ = 12,
	PT_TABLEMEM_LG2SZ = 12,

	/* SSPTPTR is 4k aligned and limited by HAW */
	PT_TOP_PHYS_MASK = GENMASK_ULL(63, 12),
};

/* Shared descriptor bits */
enum {
	VTDSS_FMT_R = BIT(0),
	VTDSS_FMT_W = BIT(1),
	VTDSS_FMT_A = BIT(8),
	VTDSS_FMT_D = BIT(9),
	VTDSS_FMT_SNP = BIT(11),
	VTDSS_FMT_OA = GENMASK_ULL(51, 12),
};

/* PDPTE/PDE */
enum {
	VTDSS_FMT_PS = BIT(7),
};

#define common_to_vtdss_pt(common_ptr) \
	container_of_const(common_ptr, struct pt_vtdss, common)
#define to_vtdss_pt(pts) common_to_vtdss_pt((pts)->range->common)

static inline pt_oaddr_t vtdss_pt_table_pa(const struct pt_state *pts)
{
	return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry),
			  PT_TABLEMEM_LG2SZ);
}
#define pt_table_pa vtdss_pt_table_pa

static inline pt_oaddr_t vtdss_pt_entry_oa(const struct pt_state *pts)
{
	return oalog2_mul(FIELD_GET(VTDSS_FMT_OA, pts->entry),
			  PT_GRANULE_LG2SZ);
}
#define pt_entry_oa vtdss_pt_entry_oa

static inline bool vtdss_pt_can_have_leaf(const struct pt_state *pts)
{
	return pts->level <= 2;
}
#define pt_can_have_leaf vtdss_pt_can_have_leaf

static inline unsigned int vtdss_pt_num_items_lg2(const struct pt_state *pts)
{
	return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
}
#define pt_num_items_lg2 vtdss_pt_num_items_lg2

static inline enum pt_entry_type vtdss_pt_load_entry_raw(struct pt_state *pts)
{
	const u64 *tablep = pt_cur_table(pts, u64);
	u64 entry;

	pts->entry = entry = READ_ONCE(tablep[pts->index]);
	if (!entry)
		return PT_ENTRY_EMPTY;
	if (pts->level == 0 ||
	    (vtdss_pt_can_have_leaf(pts) && (pts->entry & VTDSS_FMT_PS)))
		return PT_ENTRY_OA;
	return PT_ENTRY_TABLE;
}
#define pt_load_entry_raw vtdss_pt_load_entry_raw

static inline void
vtdss_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
			    unsigned int oasz_lg2,
			    const struct pt_write_attrs *attrs)
{
	u64 *tablep = pt_cur_table(pts, u64);
	u64 entry;

	if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
		return;

	entry = FIELD_PREP(VTDSS_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
		attrs->descriptor_bits;
	if (pts->level != 0)
		entry |= VTDSS_FMT_PS;

	WRITE_ONCE(tablep[pts->index], entry);
	pts->entry = entry;
}
#define pt_install_leaf_entry vtdss_pt_install_leaf_entry

static inline bool vtdss_pt_install_table(struct pt_state *pts,
					  pt_oaddr_t table_pa,
					  const struct pt_write_attrs *attrs)
{
	u64 entry;

	entry = VTDSS_FMT_R | VTDSS_FMT_W |
		FIELD_PREP(VTDSS_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ));
	return pt_table_install64(pts, entry);
}
#define pt_install_table vtdss_pt_install_table

static inline void vtdss_pt_attr_from_entry(const struct pt_state *pts,
					    struct pt_write_attrs *attrs)
{
	attrs->descriptor_bits = pts->entry &
				 (VTDSS_FMT_R | VTDSS_FMT_W | VTDSS_FMT_SNP);
}
#define pt_attr_from_entry vtdss_pt_attr_from_entry

static inline bool vtdss_pt_entry_is_write_dirty(const struct pt_state *pts)
{
	u64 *tablep = pt_cur_table(pts, u64) + pts->index;

	return READ_ONCE(*tablep) & VTDSS_FMT_D;
}
#define pt_entry_is_write_dirty vtdss_pt_entry_is_write_dirty

static inline void vtdss_pt_entry_make_write_clean(struct pt_state *pts)
{
	u64 *tablep = pt_cur_table(pts, u64) + pts->index;

	WRITE_ONCE(*tablep, READ_ONCE(*tablep) & ~(u64)VTDSS_FMT_D);
}
#define pt_entry_make_write_clean vtdss_pt_entry_make_write_clean

static inline bool vtdss_pt_entry_make_write_dirty(struct pt_state *pts)
{
	u64 *tablep = pt_cur_table(pts, u64) + pts->index;
	u64 new = pts->entry | VTDSS_FMT_D;

	return try_cmpxchg64(tablep, &pts->entry, new);
}
#define pt_entry_make_write_dirty vtdss_pt_entry_make_write_dirty

static inline unsigned int vtdss_pt_max_sw_bit(struct pt_common *common)
{
	return 10;
}
#define pt_max_sw_bit vtdss_pt_max_sw_bit

static inline u64 vtdss_pt_sw_bit(unsigned int bitnr)
{
	if (__builtin_constant_p(bitnr) && bitnr > 10)
		BUILD_BUG();

	/* Bits marked Ignored in the specification */
	switch (bitnr) {
	case 0:
		return BIT(10);
	case 1 ... 9:
		return BIT_ULL((bitnr - 1) + 52);
	case 10:
		return BIT_ULL(63);
	/* Some bits in 9-3 are available in some entries */
	default:
		PT_WARN_ON(true);
		return 0;
	}
}
#define pt_sw_bit vtdss_pt_sw_bit

/* --- iommu */
#include <linux/generic_pt/iommu.h>
#include <linux/iommu.h>

#define pt_iommu_table pt_iommu_vtdss

/* The common struct is in the per-format common struct */
static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
{
	return &container_of(iommu_table, struct pt_iommu_table, iommu)
			->vtdss_pt.common;
}

static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
{
	return &container_of(common, struct pt_iommu_table, vtdss_pt.common)
			->iommu;
}

static inline int vtdss_pt_iommu_set_prot(struct pt_common *common,
					  struct pt_write_attrs *attrs,
					  unsigned int iommu_prot)
{
	u64 pte = 0;

	/*
	 * VTDSS does not have a present bit, so we tell if any entry is present
	 * by checking for R or W.
	 */
	if (!(iommu_prot & (IOMMU_READ | IOMMU_WRITE)))
		return -EINVAL;

	if (iommu_prot & IOMMU_READ)
		pte |= VTDSS_FMT_R;
	if (iommu_prot & IOMMU_WRITE)
		pte |= VTDSS_FMT_W;
	if (pt_feature(common, PT_FEAT_VTDSS_FORCE_COHERENCE))
		pte |= VTDSS_FMT_SNP;

	if (pt_feature(common, PT_FEAT_VTDSS_FORCE_WRITEABLE) &&
	    !(iommu_prot & IOMMU_WRITE)) {
		pr_err_ratelimited(
			"Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
		return -EINVAL;
	}

	attrs->descriptor_bits = pte;
	return 0;
}
#define pt_iommu_set_prot vtdss_pt_iommu_set_prot

static inline int vtdss_pt_iommu_fmt_init(struct pt_iommu_vtdss *iommu_table,
					  const struct pt_iommu_vtdss_cfg *cfg)
{
	struct pt_vtdss *table = &iommu_table->vtdss_pt;

	if (cfg->top_level > 4 || cfg->top_level < 2)
		return -EOPNOTSUPP;

	pt_top_set_level(&table->common, cfg->top_level);
	return 0;
}
#define pt_iommu_fmt_init vtdss_pt_iommu_fmt_init

static inline void
vtdss_pt_iommu_fmt_hw_info(struct pt_iommu_vtdss *table,
			   const struct pt_range *top_range,
			   struct pt_iommu_vtdss_hw_info *info)
{
	info->ssptptr = virt_to_phys(top_range->top_table);
	PT_WARN_ON(info->ssptptr & ~PT_TOP_PHYS_MASK);
	/*
	 * top_level = 2 = 3 level table aw=1
	 * top_level = 3 = 4 level table aw=2
	 * top_level = 4 = 5 level table aw=3
	 */
	info->aw = top_range->top_level - 1;
}
#define pt_iommu_fmt_hw_info vtdss_pt_iommu_fmt_hw_info

#if defined(GENERIC_PT_KUNIT)
static const struct pt_iommu_vtdss_cfg vtdss_kunit_fmt_cfgs[] = {
	[0] = { .common.hw_max_vasz_lg2 = 39, .top_level = 2 },
	[1] = { .common.hw_max_vasz_lg2 = 48, .top_level = 3 },
	[2] = { .common.hw_max_vasz_lg2 = 57, .top_level = 4 },
};
#define kunit_fmt_cfgs vtdss_kunit_fmt_cfgs
enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE) };
#endif
#endif
+279
drivers/iommu/generic_pt/fmt/x86_64.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + /* 3 + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES 4 + * 5 + * x86 page table. Supports the 4 and 5 level variations. 6 + * 7 + * The 4 and 5 level version is described in: 8 + * Section "4.4 4-Level Paging and 5-Level Paging" of the Intel Software 9 + * Developer's Manual Volume 3 10 + * 11 + * Section "9.7 First-Stage Paging Entries" of the "Intel Virtualization 12 + * Technology for Directed I/O Architecture Specification" 13 + * 14 + * Section "2.2.6 I/O Page Tables for Guest Translations" of the "AMD I/O 15 + * Virtualization Technology (IOMMU) Specification" 16 + * 17 + * It is used by x86 CPUs, AMD and VT-d IOMMU HW. 18 + * 19 + * Note the 3 level format is very similar and almost implemented here. The 20 + * reserved/ignored layout is different and there are functional bit 21 + * differences. 22 + * 23 + * This format uses PT_FEAT_SIGN_EXTEND to have a upper/non-canonical/lower 24 + * split. PT_FEAT_SIGN_EXTEND is optional as AMD IOMMU sometimes uses non-sign 25 + * extended addressing with this page table format. 26 + * 27 + * The named levels in the spec map to the pts->level as: 28 + * Table/PTE - 0 29 + * Directory/PDE - 1 30 + * Directory Ptr/PDPTE - 2 31 + * PML4/PML4E - 3 32 + * PML5/PML5E - 4 33 + */ 34 + #ifndef __GENERIC_PT_FMT_X86_64_H 35 + #define __GENERIC_PT_FMT_X86_64_H 36 + 37 + #include "defs_x86_64.h" 38 + #include "../pt_defs.h" 39 + 40 + #include <linux/bitfield.h> 41 + #include <linux/container_of.h> 42 + #include <linux/log2.h> 43 + #include <linux/mem_encrypt.h> 44 + 45 + enum { 46 + PT_MAX_OUTPUT_ADDRESS_LG2 = 52, 47 + PT_MAX_VA_ADDRESS_LG2 = 57, 48 + PT_ITEM_WORD_SIZE = sizeof(u64), 49 + PT_MAX_TOP_LEVEL = 4, 50 + PT_GRANULE_LG2SZ = 12, 51 + PT_TABLEMEM_LG2SZ = 12, 52 + 53 + /* 54 + * For AMD the GCR3 Base only has these bits. 
 * For VT-d FSPTPTR is 4k
 * aligned and is limited by the architected HAW
 */
	PT_TOP_PHYS_MASK = GENMASK_ULL(51, 12),
};

/* Shared descriptor bits */
enum {
	X86_64_FMT_P = BIT(0),
	X86_64_FMT_RW = BIT(1),
	X86_64_FMT_U = BIT(2),
	X86_64_FMT_A = BIT(5),
	X86_64_FMT_D = BIT(6),
	X86_64_FMT_OA = GENMASK_ULL(51, 12),
	X86_64_FMT_XD = BIT_ULL(63),
};

/* PDPTE/PDE */
enum {
	X86_64_FMT_PS = BIT(7),
};

static inline pt_oaddr_t x86_64_pt_table_pa(const struct pt_state *pts)
{
	u64 entry = pts->entry;

	if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
		entry = __sme_clr(entry);
	return oalog2_mul(FIELD_GET(X86_64_FMT_OA, entry), PT_TABLEMEM_LG2SZ);
}
#define pt_table_pa x86_64_pt_table_pa

static inline pt_oaddr_t x86_64_pt_entry_oa(const struct pt_state *pts)
{
	u64 entry = pts->entry;

	if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
		entry = __sme_clr(entry);
	return oalog2_mul(FIELD_GET(X86_64_FMT_OA, entry), PT_GRANULE_LG2SZ);
}
#define pt_entry_oa x86_64_pt_entry_oa

static inline bool x86_64_pt_can_have_leaf(const struct pt_state *pts)
{
	return pts->level <= 2;
}
#define pt_can_have_leaf x86_64_pt_can_have_leaf

static inline unsigned int x86_64_pt_num_items_lg2(const struct pt_state *pts)
{
	return PT_TABLEMEM_LG2SZ - ilog2(sizeof(u64));
}
#define pt_num_items_lg2 x86_64_pt_num_items_lg2

static inline enum pt_entry_type x86_64_pt_load_entry_raw(struct pt_state *pts)
{
	const u64 *tablep = pt_cur_table(pts, u64);
	u64 entry;

	pts->entry = entry = READ_ONCE(tablep[pts->index]);
	if (!(entry & X86_64_FMT_P))
		return PT_ENTRY_EMPTY;
	if (pts->level == 0 ||
	    (x86_64_pt_can_have_leaf(pts) && (entry & X86_64_FMT_PS)))
		return PT_ENTRY_OA;
	return PT_ENTRY_TABLE;
}
#define pt_load_entry_raw x86_64_pt_load_entry_raw

static inline void
x86_64_pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
			     unsigned int oasz_lg2,
			     const struct pt_write_attrs *attrs)
{
	u64 *tablep = pt_cur_table(pts, u64);
	u64 entry;

	if (!pt_check_install_leaf_args(pts, oa, oasz_lg2))
		return;

	entry = X86_64_FMT_P |
		FIELD_PREP(X86_64_FMT_OA, log2_div(oa, PT_GRANULE_LG2SZ)) |
		attrs->descriptor_bits;
	if (pts->level != 0)
		entry |= X86_64_FMT_PS;

	WRITE_ONCE(tablep[pts->index], entry);
	pts->entry = entry;
}
#define pt_install_leaf_entry x86_64_pt_install_leaf_entry

static inline bool x86_64_pt_install_table(struct pt_state *pts,
					   pt_oaddr_t table_pa,
					   const struct pt_write_attrs *attrs)
{
	u64 entry;

	entry = X86_64_FMT_P | X86_64_FMT_RW | X86_64_FMT_U | X86_64_FMT_A |
		FIELD_PREP(X86_64_FMT_OA, log2_div(table_pa, PT_GRANULE_LG2SZ));
	if (pts_feature(pts, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
		entry = __sme_set(entry);
	return pt_table_install64(pts, entry);
}
#define pt_install_table x86_64_pt_install_table

static inline void x86_64_pt_attr_from_entry(const struct pt_state *pts,
					     struct pt_write_attrs *attrs)
{
	attrs->descriptor_bits = pts->entry &
				 (X86_64_FMT_RW | X86_64_FMT_U | X86_64_FMT_A |
				  X86_64_FMT_D | X86_64_FMT_XD);
}
#define pt_attr_from_entry x86_64_pt_attr_from_entry

static inline unsigned int x86_64_pt_max_sw_bit(struct pt_common *common)
{
	return 12;
}
#define pt_max_sw_bit x86_64_pt_max_sw_bit

static inline u64 x86_64_pt_sw_bit(unsigned int bitnr)
{
	if (__builtin_constant_p(bitnr) && bitnr > 12)
		BUILD_BUG();

	/* Bits marked Ignored/AVL in the specification */
	switch (bitnr) {
	case 0:
		return BIT(9);
	case 1:
		return BIT(11);
	case 2 ... 12:
		return BIT_ULL((bitnr - 2) + 52);
	/* Some bits in 8,6,4,3 are available in some entries */
	default:
		PT_WARN_ON(true);
		return 0;
	}
}
#define pt_sw_bit x86_64_pt_sw_bit

/* --- iommu */
#include <linux/generic_pt/iommu.h>
#include <linux/iommu.h>

#define pt_iommu_table pt_iommu_x86_64

/* The common struct is in the per-format common struct */
static inline struct pt_common *common_from_iommu(struct pt_iommu *iommu_table)
{
	return &container_of(iommu_table, struct pt_iommu_table, iommu)
			->x86_64_pt.common;
}

static inline struct pt_iommu *iommu_from_common(struct pt_common *common)
{
	return &container_of(common, struct pt_iommu_table, x86_64_pt.common)
			->iommu;
}

static inline int x86_64_pt_iommu_set_prot(struct pt_common *common,
					   struct pt_write_attrs *attrs,
					   unsigned int iommu_prot)
{
	u64 pte;

	pte = X86_64_FMT_U | X86_64_FMT_A;
	if (iommu_prot & IOMMU_WRITE)
		pte |= X86_64_FMT_RW | X86_64_FMT_D;

	/*
	 * Ideally we'd have an IOMMU_ENCRYPTED flag set by higher levels to
	 * control this. For now if the tables use sme_set then so do the ptes.
	 */
	if (pt_feature(common, PT_FEAT_X86_64_AMD_ENCRYPT_TABLES))
		pte = __sme_set(pte);

	attrs->descriptor_bits = pte;
	return 0;
}
#define pt_iommu_set_prot x86_64_pt_iommu_set_prot

static inline int
x86_64_pt_iommu_fmt_init(struct pt_iommu_x86_64 *iommu_table,
			 const struct pt_iommu_x86_64_cfg *cfg)
{
	struct pt_x86_64 *table = &iommu_table->x86_64_pt;

	if (cfg->top_level < 3 || cfg->top_level > 4)
		return -EOPNOTSUPP;

	pt_top_set_level(&table->common, cfg->top_level);

	table->common.max_oasz_lg2 =
		min(PT_MAX_OUTPUT_ADDRESS_LG2, cfg->common.hw_max_oasz_lg2);
	return 0;
}
#define pt_iommu_fmt_init x86_64_pt_iommu_fmt_init

static inline void
x86_64_pt_iommu_fmt_hw_info(struct pt_iommu_x86_64 *table,
			    const struct pt_range *top_range,
			    struct pt_iommu_x86_64_hw_info *info)
{
	info->gcr3_pt = virt_to_phys(top_range->top_table);
	PT_WARN_ON(info->gcr3_pt & ~PT_TOP_PHYS_MASK);
	info->levels = top_range->top_level + 1;
}
#define pt_iommu_fmt_hw_info x86_64_pt_iommu_fmt_hw_info

#if defined(GENERIC_PT_KUNIT)
static const struct pt_iommu_x86_64_cfg x86_64_kunit_fmt_cfgs[] = {
	[0] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND),
		.common.hw_max_vasz_lg2 = 48, .top_level = 3 },
	[1] = { .common.features = BIT(PT_FEAT_SIGN_EXTEND),
		.common.hw_max_vasz_lg2 = 57, .top_level = 4 },
	/* AMD IOMMU PASID 0 formats with no SIGN_EXTEND */
	[2] = { .common.hw_max_vasz_lg2 = 47, .top_level = 3 },
	[3] = { .common.hw_max_vasz_lg2 = 56, .top_level = 4 },
};
#define kunit_fmt_cfgs x86_64_kunit_fmt_cfgs
enum { KUNIT_FMT_FEATURES = BIT(PT_FEAT_SIGN_EXTEND) };
#endif
#endif
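As a cross-check of the `x86_64_pt_sw_bit()` layout above, here is a minimal Python sketch of the same Ignored/AVL bit mapping (software bit 0 lands on PTE bit 9, bit 1 on PTE bit 11, and bits 2..12 on the high ignored bits 52..62). The function name is illustrative, not a kernel symbol:

```python
def x86_64_sw_bit(bitnr: int) -> int:
    """Mirror of the x86_64_pt_sw_bit() switch: map a software-bit
    index onto an Ignored/AVL position in an x86-64 PTE (sketch)."""
    if bitnr == 0:
        return 1 << 9               # AVL bit 9
    if bitnr == 1:
        return 1 << 11              # AVL bit 11 (bit 10 is skipped)
    if 2 <= bitnr <= 12:
        return 1 << (bitnr - 2 + 52)  # high ignored bits 52..62
    raise ValueError("only 13 software bits are available")
```

Note that all 13 positions stay clear of the OA field (bits 51:12) and the XD bit (63), which is why `pt_max_sw_bit()` returns 12.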
drivers/iommu/generic_pt/iommu_pt.h (+1289)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
 *
 * "Templated C code" for implementing the iommu operations for page tables.
 * This is compiled multiple times, over all the page table formats to pick up
 * the per-format definitions.
 */
#ifndef __GENERIC_PT_IOMMU_PT_H
#define __GENERIC_PT_IOMMU_PT_H

#include "pt_iter.h"

#include <linux/export.h>
#include <linux/iommu.h>
#include "../iommu-pages.h"
#include <linux/cleanup.h>
#include <linux/dma-mapping.h>

enum {
	SW_BIT_CACHE_FLUSH_DONE = 0,
};

static void flush_writes_range(const struct pt_state *pts,
			       unsigned int start_index, unsigned int end_index)
{
	if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT))
		iommu_pages_flush_incoherent(
			iommu_from_common(pts->range->common)->iommu_device,
			pts->table, start_index * PT_ITEM_WORD_SIZE,
			(end_index - start_index) * PT_ITEM_WORD_SIZE);
}

static void flush_writes_item(const struct pt_state *pts)
{
	if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT))
		iommu_pages_flush_incoherent(
			iommu_from_common(pts->range->common)->iommu_device,
			pts->table, pts->index * PT_ITEM_WORD_SIZE,
			PT_ITEM_WORD_SIZE);
}

static void gather_range_pages(struct iommu_iotlb_gather *iotlb_gather,
			       struct pt_iommu *iommu_table, pt_vaddr_t iova,
			       pt_vaddr_t len,
			       struct iommu_pages_list *free_list)
{
	struct pt_common *common = common_from_iommu(iommu_table);

	if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
		iommu_pages_stop_incoherent_list(free_list,
						 iommu_table->iommu_device);

	if (pt_feature(common, PT_FEAT_FLUSH_RANGE_NO_GAPS) &&
	    iommu_iotlb_gather_is_disjoint(iotlb_gather, iova, len)) {
		iommu_iotlb_sync(&iommu_table->domain, iotlb_gather);
		/*
		 * Note that the sync frees the gather's free list, so we must
		 * not have any pages on that list that are covered by iova/len
		 */
	} else if (pt_feature(common, PT_FEAT_FLUSH_RANGE)) {
		iommu_iotlb_gather_add_range(iotlb_gather, iova, len);
	}

	iommu_pages_list_splice(free_list, &iotlb_gather->freelist);
}

#define DOMAIN_NS(op) CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), op)

static int make_range_ul(struct pt_common *common, struct pt_range *range,
			 unsigned long iova, unsigned long len)
{
	unsigned long last;

	if (unlikely(len == 0))
		return -EINVAL;

	if (check_add_overflow(iova, len - 1, &last))
		return -EOVERFLOW;

	*range = pt_make_range(common, iova, last);
	if (sizeof(iova) > sizeof(range->va)) {
		if (unlikely(range->va != iova || range->last_va != last))
			return -EOVERFLOW;
	}
	return 0;
}

static __maybe_unused int make_range_u64(struct pt_common *common,
					 struct pt_range *range, u64 iova,
					 u64 len)
{
	if (unlikely(iova > ULONG_MAX || len > ULONG_MAX))
		return -EOVERFLOW;
	return make_range_ul(common, range, iova, len);
}

/*
 * Some APIs use unsigned long, while others use dma_addr_t as the type.
 * Dispatch to the correct validation based on the type.
 */
#define make_range_no_check(common, range, iova, len)                     \
	({                                                                \
		int ret;                                                  \
		if (sizeof(iova) > sizeof(unsigned long) ||               \
		    sizeof(len) > sizeof(unsigned long))                  \
			ret = make_range_u64(common, range, iova, len);   \
		else                                                      \
			ret = make_range_ul(common, range, iova, len);    \
		ret;                                                      \
	})

#define make_range(common, range, iova, len)                              \
	({                                                                \
		int ret = make_range_no_check(common, range, iova, len);  \
		if (!ret)                                                 \
			ret = pt_check_range(range);                      \
		ret;                                                      \
	})

static inline unsigned int compute_best_pgsize(struct pt_state *pts,
					       pt_oaddr_t oa)
{
	struct pt_iommu *iommu_table = iommu_from_common(pts->range->common);

	if (!pt_can_have_leaf(pts))
		return 0;

	/*
	 * The page size is limited by the domain's bitmap. This allows the core
	 * code to reduce the supported page sizes by changing the bitmap.
	 */
	return pt_compute_best_pgsize(pt_possible_sizes(pts) &
				      iommu_table->domain.pgsize_bitmap,
				      pts->range->va, pts->range->last_va, oa);
}

static __always_inline int __do_iova_to_phys(struct pt_range *range, void *arg,
					     unsigned int level,
					     struct pt_table_p *table,
					     pt_level_fn_t descend_fn)
{
	struct pt_state pts = pt_init(range, level, table);
	pt_oaddr_t *res = arg;

	switch (pt_load_single_entry(&pts)) {
	case PT_ENTRY_EMPTY:
		return -ENOENT;
	case PT_ENTRY_TABLE:
		return pt_descend(&pts, arg, descend_fn);
	case PT_ENTRY_OA:
		*res = pt_entry_oa_exact(&pts);
		return 0;
	}
	return -ENOENT;
}
PT_MAKE_LEVELS(__iova_to_phys, __do_iova_to_phys);

/**
 * iova_to_phys() - Return the output address for the given IOVA
 * @domain: Table to query
 * @iova: IO virtual address to query
 *
 * Determine the output address from the given IOVA. @iova may have any
 * alignment, the returned physical will be adjusted with any sub page offset.
 *
 * Context: The caller must hold a read range lock that includes @iova.
 *
 * Return: 0 if there is no translation for the given iova.
 */
phys_addr_t DOMAIN_NS(iova_to_phys)(struct iommu_domain *domain,
				    dma_addr_t iova)
{
	struct pt_iommu *iommu_table =
		container_of(domain, struct pt_iommu, domain);
	struct pt_range range;
	pt_oaddr_t res;
	int ret;

	ret = make_range(common_from_iommu(iommu_table), &range, iova, 1);
	if (ret)
		return ret;

	ret = pt_walk_range(&range, __iova_to_phys, &res);
	/* PHYS_ADDR_MAX would be a better error code */
	if (ret)
		return 0;
	return res;
}
EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(iova_to_phys), "GENERIC_PT_IOMMU");

struct pt_iommu_dirty_args {
	struct iommu_dirty_bitmap *dirty;
	unsigned int flags;
};

static void record_dirty(struct pt_state *pts,
			 struct pt_iommu_dirty_args *dirty,
			 unsigned int num_contig_lg2)
{
	pt_vaddr_t dirty_len;

	if (num_contig_lg2 != ilog2(1)) {
		unsigned int index = pts->index;
		unsigned int end_index = log2_set_mod_max_t(
			unsigned int, pts->index, num_contig_lg2);

		/* Adjust for being contained inside a contiguous page */
		end_index = min(end_index, pts->end_index);
		dirty_len = (end_index - index) *
			    log2_to_int(pt_table_item_lg2sz(pts));
	} else {
		dirty_len = log2_to_int(pt_table_item_lg2sz(pts));
	}

	if (dirty->dirty->bitmap)
		iova_bitmap_set(dirty->dirty->bitmap, pts->range->va,
				dirty_len);

	if (!(dirty->flags & IOMMU_DIRTY_NO_CLEAR)) {
		/*
		 * No write log required because DMA incoherence and atomic
		 * dirty tracking bits can't work together
		 */
		pt_entry_make_write_clean(pts);
		iommu_iotlb_gather_add_range(dirty->dirty->gather,
					     pts->range->va, dirty_len);
	}
}

static inline int __read_and_clear_dirty(struct pt_range *range, void *arg,
					 unsigned int level,
					 struct pt_table_p *table)
{
	struct pt_state pts = pt_init(range, level, table);
	struct pt_iommu_dirty_args *dirty = arg;
	int ret;

	for_each_pt_level_entry(&pts) {
		if (pts.type == PT_ENTRY_TABLE) {
			ret = pt_descend(&pts, arg, __read_and_clear_dirty);
			if (ret)
				return ret;
			continue;
		}
		if (pts.type == PT_ENTRY_OA && pt_entry_is_write_dirty(&pts))
			record_dirty(&pts, dirty,
				     pt_entry_num_contig_lg2(&pts));
	}
	return 0;
}

/**
 * read_and_clear_dirty() - Manipulate the HW set write dirty state
 * @domain: Domain to manipulate
 * @iova: IO virtual address to start
 * @size: Length of the IOVA
 * @flags: A bitmap of IOMMU_DIRTY_NO_CLEAR
 * @dirty: Place to store the dirty bits
 *
 * Iterate over all the entries in the mapped range and record their write dirty
 * status in iommu_dirty_bitmap. If IOMMU_DIRTY_NO_CLEAR is not specified then
 * the entries will be left dirty, otherwise they are returned to being not
 * write dirty.
 *
 * Context: The caller must hold a read range lock that includes @iova.
 *
 * Returns: -ERRNO on failure, 0 on success.
 */
int DOMAIN_NS(read_and_clear_dirty)(struct iommu_domain *domain,
				    unsigned long iova, size_t size,
				    unsigned long flags,
				    struct iommu_dirty_bitmap *dirty)
{
	struct pt_iommu *iommu_table =
		container_of(domain, struct pt_iommu, domain);
	struct pt_iommu_dirty_args dirty_args = {
		.dirty = dirty,
		.flags = flags,
	};
	struct pt_range range;
	int ret;

#if !IS_ENABLED(CONFIG_IOMMUFD_DRIVER) || !defined(pt_entry_is_write_dirty)
	return -EOPNOTSUPP;
#endif

	ret = make_range(common_from_iommu(iommu_table), &range, iova, size);
	if (ret)
		return ret;

	ret = pt_walk_range(&range, __read_and_clear_dirty, &dirty_args);
	PT_WARN_ON(ret);
	return ret;
}
EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(read_and_clear_dirty), "GENERIC_PT_IOMMU");

static inline int __set_dirty(struct pt_range *range, void *arg,
			      unsigned int level, struct pt_table_p *table)
{
	struct pt_state pts = pt_init(range, level, table);

	switch (pt_load_single_entry(&pts)) {
	case PT_ENTRY_EMPTY:
		return -ENOENT;
	case PT_ENTRY_TABLE:
		return pt_descend(&pts, arg, __set_dirty);
	case PT_ENTRY_OA:
		if (!pt_entry_make_write_dirty(&pts))
			return -EAGAIN;
		return 0;
	}
	return -ENOENT;
}

static int __maybe_unused NS(set_dirty)(struct pt_iommu *iommu_table,
					dma_addr_t iova)
{
	struct pt_range range;
	int ret;

	ret = make_range(common_from_iommu(iommu_table), &range, iova, 1);
	if (ret)
		return ret;

	/*
	 * Note: There is no locking here yet, if the test suite races this it
	 * can crash. It should use RCU locking eventually.
	 */
	return pt_walk_range(&range, __set_dirty, NULL);
}

struct pt_iommu_collect_args {
	struct iommu_pages_list free_list;
	/* Fail if any OAs are within the range */
	u8 check_mapped : 1;
};

static int __collect_tables(struct pt_range *range, void *arg,
			    unsigned int level, struct pt_table_p *table)
{
	struct pt_state pts = pt_init(range, level, table);
	struct pt_iommu_collect_args *collect = arg;
	int ret;

	if (!collect->check_mapped && !pt_can_have_table(&pts))
		return 0;

	for_each_pt_level_entry(&pts) {
		if (pts.type == PT_ENTRY_TABLE) {
			iommu_pages_list_add(&collect->free_list,
					     pts.table_lower);
			ret = pt_descend(&pts, arg, __collect_tables);
			if (ret)
				return ret;
			continue;
		}
		if (pts.type == PT_ENTRY_OA && collect->check_mapped)
			return -EADDRINUSE;
	}
	return 0;
}

enum alloc_mode { ALLOC_NORMAL, ALLOC_DEFER_COHERENT_FLUSH };

/* Allocate a table, the empty table will be ready to be installed. */
static inline struct pt_table_p *_table_alloc(struct pt_common *common,
					      size_t lg2sz, gfp_t gfp,
					      enum alloc_mode mode)
{
	struct pt_iommu *iommu_table = iommu_from_common(common);
	struct pt_table_p *table_mem;

	table_mem = iommu_alloc_pages_node_sz(iommu_table->nid, gfp,
					      log2_to_int(lg2sz));
	if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) &&
	    mode == ALLOC_NORMAL) {
		int ret = iommu_pages_start_incoherent(
			table_mem, iommu_table->iommu_device);
		if (ret) {
			iommu_free_pages(table_mem);
			return ERR_PTR(ret);
		}
	}
	return table_mem;
}

static inline struct pt_table_p *table_alloc_top(struct pt_common *common,
						 uintptr_t top_of_table,
						 gfp_t gfp,
						 enum alloc_mode mode)
{
	/*
	 * Top doesn't need the free list or otherwise, so it technically
	 * doesn't need to use iommu pages. Use the API anyhow as the top is
	 * usually not smaller than PAGE_SIZE to keep things simple.
	 */
	return _table_alloc(common, pt_top_memsize_lg2(common, top_of_table),
			    gfp, mode);
}

/* Allocate an interior table */
static inline struct pt_table_p *table_alloc(const struct pt_state *parent_pts,
					     gfp_t gfp, enum alloc_mode mode)
{
	struct pt_state child_pts =
		pt_init(parent_pts->range, parent_pts->level - 1, NULL);

	return _table_alloc(parent_pts->range->common,
			    pt_num_items_lg2(&child_pts) +
				    ilog2(PT_ITEM_WORD_SIZE),
			    gfp, mode);
}

static inline int pt_iommu_new_table(struct pt_state *pts,
				     struct pt_write_attrs *attrs)
{
	struct pt_table_p *table_mem;
	phys_addr_t phys;

	/* Given PA/VA/length can't be represented */
	if (PT_WARN_ON(!pt_can_have_table(pts)))
		return -ENXIO;

	table_mem = table_alloc(pts, attrs->gfp, ALLOC_NORMAL);
	if (IS_ERR(table_mem))
		return PTR_ERR(table_mem);

	phys = virt_to_phys(table_mem);
	if (!pt_install_table(pts, phys, attrs)) {
		iommu_pages_free_incoherent(
			table_mem,
			iommu_from_common(pts->range->common)->iommu_device);
		return -EAGAIN;
	}

	if (pts_feature(pts, PT_FEAT_DMA_INCOHERENT)) {
		flush_writes_item(pts);
		pt_set_sw_bit_release(pts, SW_BIT_CACHE_FLUSH_DONE);
	}

	if (IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)) {
		/*
		 * The underlying table can't store the physical table address.
		 * This happens when kunit testing tables outside their normal
		 * environment where a CPU might be limited.
		 */
		pt_load_single_entry(pts);
		if (PT_WARN_ON(pt_table_pa(pts) != phys)) {
			pt_clear_entries(pts, ilog2(1));
			iommu_pages_free_incoherent(
				table_mem, iommu_from_common(pts->range->common)
						   ->iommu_device);
			return -EINVAL;
		}
	}

	pts->table_lower = table_mem;
	return 0;
}

struct pt_iommu_map_args {
	struct iommu_iotlb_gather *iotlb_gather;
	struct pt_write_attrs attrs;
	pt_oaddr_t oa;
	unsigned int leaf_pgsize_lg2;
	unsigned int leaf_level;
};

/*
 * This will recursively check any tables in the block to validate they are
 * empty and then free them through the gather.
 */
static int clear_contig(const struct pt_state *start_pts,
			struct iommu_iotlb_gather *iotlb_gather,
			unsigned int step, unsigned int pgsize_lg2)
{
	struct pt_iommu *iommu_table =
		iommu_from_common(start_pts->range->common);
	struct pt_range range = *start_pts->range;
	struct pt_state pts =
		pt_init(&range, start_pts->level, start_pts->table);
	struct pt_iommu_collect_args collect = { .check_mapped = true };
	int ret;

	pts.index = start_pts->index;
	pts.end_index = start_pts->index + step;
	for (; _pt_iter_load(&pts); pt_next_entry(&pts)) {
		if (pts.type == PT_ENTRY_TABLE) {
			collect.free_list =
				IOMMU_PAGES_LIST_INIT(collect.free_list);
			ret = pt_walk_descend_all(&pts, __collect_tables,
						  &collect);
			if (ret)
				return ret;

			/*
			 * The table item must be cleared before we can update
			 * the gather
			 */
			pt_clear_entries(&pts, ilog2(1));
			flush_writes_item(&pts);

			iommu_pages_list_add(&collect.free_list,
					     pt_table_ptr(&pts));
			gather_range_pages(
				iotlb_gather, iommu_table, range.va,
				log2_to_int(pt_table_item_lg2sz(&pts)),
				&collect.free_list);
		} else if (pts.type != PT_ENTRY_EMPTY) {
			return -EADDRINUSE;
		}
	}
	return 0;
}

static int __map_range_leaf(struct pt_range *range, void *arg,
			    unsigned int level, struct pt_table_p *table)
{
	struct pt_state pts = pt_init(range, level, table);
	struct pt_iommu_map_args *map = arg;
	unsigned int leaf_pgsize_lg2 = map->leaf_pgsize_lg2;
	unsigned int start_index;
	pt_oaddr_t oa = map->oa;
	unsigned int step;
	bool need_contig;
	int ret = 0;

	PT_WARN_ON(map->leaf_level != level);
	PT_WARN_ON(!pt_can_have_leaf(&pts));

	step = log2_to_int_t(unsigned int,
			     leaf_pgsize_lg2 - pt_table_item_lg2sz(&pts));
	need_contig = leaf_pgsize_lg2 != pt_table_item_lg2sz(&pts);

	_pt_iter_first(&pts);
	start_index = pts.index;
	do {
		pts.type = pt_load_entry_raw(&pts);
		if (pts.type != PT_ENTRY_EMPTY || need_contig) {
			if (pts.index != start_index)
				pt_index_to_va(&pts);
			ret = clear_contig(&pts, map->iotlb_gather, step,
					   leaf_pgsize_lg2);
			if (ret)
				break;
		}

		if (IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)) {
			pt_index_to_va(&pts);
			PT_WARN_ON(compute_best_pgsize(&pts, oa) !=
				   leaf_pgsize_lg2);
		}
		pt_install_leaf_entry(&pts, oa, leaf_pgsize_lg2, &map->attrs);

		oa += log2_to_int(leaf_pgsize_lg2);
		pts.index += step;
	} while (pts.index < pts.end_index);

	flush_writes_range(&pts, start_index, pts.index);

	map->oa = oa;
	return ret;
}

static int __map_range(struct pt_range *range, void *arg, unsigned int level,
		       struct pt_table_p *table)
{
	struct pt_state pts = pt_init(range, level, table);
	struct pt_iommu_map_args *map = arg;
	int ret;

	PT_WARN_ON(map->leaf_level == level);
	PT_WARN_ON(!pt_can_have_table(&pts));

	_pt_iter_first(&pts);

	/* Descend to a child table */
	do {
		pts.type = pt_load_entry_raw(&pts);

		if (pts.type != PT_ENTRY_TABLE) {
			if (pts.type != PT_ENTRY_EMPTY)
				return -EADDRINUSE;
			ret = pt_iommu_new_table(&pts, &map->attrs);
			if (ret) {
				/*
				 * Racing with another thread installing a table
				 */
				if (ret == -EAGAIN)
					continue;
				return ret;
			}
		} else {
			pts.table_lower = pt_table_ptr(&pts);
			/*
			 * Racing with a shared pt_iommu_new_table()? The other
			 * thread is still flushing the cache, so we have to
			 * also flush it to ensure that when our thread's map
			 * completes all the table items leading to our mapping
			 * are visible.
			 *
			 * This requires the pt_set_bit_release() to be a
			 * release of the cache flush so that this can acquire
			 * visibility at the iommu.
			 */
			if (pts_feature(&pts, PT_FEAT_DMA_INCOHERENT) &&
			    !pt_test_sw_bit_acquire(&pts,
						    SW_BIT_CACHE_FLUSH_DONE))
				flush_writes_item(&pts);
		}

		/*
		 * The already present table can possibly be shared with another
		 * concurrent map.
		 */
		if (map->leaf_level == level - 1)
			ret = pt_descend(&pts, arg, __map_range_leaf);
		else
			ret = pt_descend(&pts, arg, __map_range);
		if (ret)
			return ret;

		pts.index++;
		pt_index_to_va(&pts);
		if (pts.index >= pts.end_index)
			break;
	} while (true);
	return 0;
}

/*
 * Fast path for the easy case of mapping a 4k page to an already allocated
 * table. This is a common workload. If it returns EAGAIN run the full algorithm
 * instead.
 */
static __always_inline int __do_map_single_page(struct pt_range *range,
						void *arg, unsigned int level,
						struct pt_table_p *table,
						pt_level_fn_t descend_fn)
{
	struct pt_state pts = pt_init(range, level, table);
	struct pt_iommu_map_args *map = arg;

	pts.type = pt_load_single_entry(&pts);
	if (level == 0) {
		if (pts.type != PT_ENTRY_EMPTY)
			return -EADDRINUSE;
		pt_install_leaf_entry(&pts, map->oa, PAGE_SHIFT,
				      &map->attrs);
		/* No flush, not used when incoherent */
		map->oa += PAGE_SIZE;
		return 0;
	}
	if (pts.type == PT_ENTRY_TABLE)
		return pt_descend(&pts, arg, descend_fn);
	/* Something else, use the slow path */
	return -EAGAIN;
}
PT_MAKE_LEVELS(__map_single_page, __do_map_single_page);

/*
 * Add a table to the top, increasing the top level as much as necessary to
 * encompass range.
 */
static int increase_top(struct pt_iommu *iommu_table, struct pt_range *range,
			struct pt_iommu_map_args *map)
{
	struct iommu_pages_list free_list = IOMMU_PAGES_LIST_INIT(free_list);
	struct pt_common *common = common_from_iommu(iommu_table);
	uintptr_t top_of_table = READ_ONCE(common->top_of_table);
	uintptr_t new_top_of_table = top_of_table;
	struct pt_table_p *table_mem;
	unsigned int new_level;
	spinlock_t *domain_lock;
	unsigned long flags;
	int ret;

	while (true) {
		struct pt_range top_range =
			_pt_top_range(common, new_top_of_table);
		struct pt_state pts = pt_init_top(&top_range);

		top_range.va = range->va;
		top_range.last_va = range->last_va;

		if (!pt_check_range(&top_range) &&
		    map->leaf_level <= pts.level) {
			new_level = pts.level;
			break;
		}

		pts.level++;
		if (pts.level > PT_MAX_TOP_LEVEL ||
		    pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2) {
			ret = -ERANGE;
			goto err_free;
		}

		table_mem = table_alloc_top(common,
					    _pt_top_set(NULL, pts.level),
					    map->attrs.gfp,
					    ALLOC_DEFER_COHERENT_FLUSH);
		if (IS_ERR(table_mem)) {
			ret = PTR_ERR(table_mem);
			goto err_free;
		}
		iommu_pages_list_add(&free_list, table_mem);

		/* The new table links to the lower table always at index 0 */
		top_range.va = 0;
		top_range.top_level = pts.level;
		pts.table_lower = pts.table;
		pts.table = table_mem;
		pt_load_single_entry(&pts);
		PT_WARN_ON(pts.index != 0);
		pt_install_table(&pts, virt_to_phys(pts.table_lower),
				 &map->attrs);
		new_top_of_table = _pt_top_set(pts.table, pts.level);
	}

	/*
	 * Avoid double flushing, flush it once after all pt_install_table()
	 */
	if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) {
		ret = iommu_pages_start_incoherent_list(
			&free_list, iommu_table->iommu_device);
		if (ret)
			goto err_free;
	}

	/*
	 * top_of_table is write locked by the spinlock, but readers can use
	 * READ_ONCE() to get the value. Since we encode both the level and the
	 * pointer in one quanta the lockless reader will always see something
	 * valid. The HW must be updated to the new level under the spinlock
	 * before top_of_table is updated so that concurrent readers don't map
	 * into the new level until it is fully functional. If another thread
	 * already updated it while we were working then throw everything away
	 * and try again.
	 */
	domain_lock = iommu_table->driver_ops->get_top_lock(iommu_table);
	spin_lock_irqsave(domain_lock, flags);
	if (common->top_of_table != top_of_table ||
	    top_of_table == new_top_of_table) {
		spin_unlock_irqrestore(domain_lock, flags);
		ret = -EAGAIN;
		goto err_free;
	}

	/*
	 * We do not issue any flushes for change_top on the expectation that
	 * any walk cache will not become a problem by adding another layer to
	 * the tree. Misses will rewalk from the updated top pointer, hits
	 * continue to be correct. Negative caching is fine too since all the
	 * new IOVA added by the new top is non-present.
	 */
	iommu_table->driver_ops->change_top(
		iommu_table, virt_to_phys(table_mem), new_level);
	WRITE_ONCE(common->top_of_table, new_top_of_table);
	spin_unlock_irqrestore(domain_lock, flags);
	return 0;

err_free:
	if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
		iommu_pages_stop_incoherent_list(&free_list,
						 iommu_table->iommu_device);
	iommu_put_pages_list(&free_list);
	return ret;
}

static int check_map_range(struct pt_iommu *iommu_table, struct pt_range *range,
			   struct pt_iommu_map_args *map)
{
	struct pt_common *common = common_from_iommu(iommu_table);
	int ret;

	do {
		ret = pt_check_range(range);
		if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP))
			return ret;

		if (!ret && map->leaf_level <= range->top_level)
			break;

		ret = increase_top(iommu_table, range, map);
		if (ret && ret != -EAGAIN)
			return ret;

		/* Reload the new top */
		*range = pt_make_range(common, range->va, range->last_va);
	} while (ret);
	PT_WARN_ON(pt_check_range(range));
	return 0;
}

static int do_map(struct pt_range *range, struct pt_common *common,
		  bool single_page, struct pt_iommu_map_args *map)
{
	/*
	 * The __map_single_page() fast path does not support DMA_INCOHERENT
	 * flushing to keep its .text small.
	 */
	if (single_page && !pt_feature(common, PT_FEAT_DMA_INCOHERENT)) {
		int ret;

		ret = pt_walk_range(range, __map_single_page, map);
		if (ret != -EAGAIN)
			return ret;
		/* EAGAIN falls through to the full path */
	}

	if (map->leaf_level == range->top_level)
		return pt_walk_range(range, __map_range_leaf, map);
	return pt_walk_range(range, __map_range, map);
}

/**
 * map_pages() - Install translation for an IOVA range
 * @domain: Domain to manipulate
 * @iova: IO virtual address to start
 * @paddr: Physical/Output address to start
 * @pgsize: Length of each page
 * @pgcount: Length of the range in pgsize units starting from @iova
 * @prot: A bitmap of IOMMU_READ/WRITE/CACHE/NOEXEC/MMIO
 * @gfp: GFP flags for any memory allocations
 * @mapped: Total bytes successfully mapped
 *
 * The range starting at IOVA will have paddr installed into it. The caller
 * must specify a valid pgsize and pgcount to segment the range into compatible
 * blocks.
 *
 * On error the caller will probably want to invoke unmap on the range from iova
 * up to the amount indicated by @mapped to return the table back to an
 * unchanged state.
 *
 * Context: The caller must hold a write range lock that includes the whole
 * range.
 *
 * Returns: -ERRNO on failure, 0 on success. The number of bytes of VA that were
 * mapped are added to @mapped, @mapped is not zeroed first.
 */
int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova,
			 phys_addr_t paddr, size_t pgsize, size_t pgcount,
			 int prot, gfp_t gfp, size_t *mapped)
{
	struct pt_iommu *iommu_table =
		container_of(domain, struct pt_iommu, domain);
	pt_vaddr_t pgsize_bitmap = iommu_table->domain.pgsize_bitmap;
	struct pt_common *common = common_from_iommu(iommu_table);
	struct iommu_iotlb_gather iotlb_gather;
	pt_vaddr_t len = pgsize * pgcount;
	struct pt_iommu_map_args map = {
		.iotlb_gather = &iotlb_gather,
		.oa = paddr,
		.leaf_pgsize_lg2 = vaffs(pgsize),
	};
	bool single_page = false;
	struct pt_range range;
	int ret;

	iommu_iotlb_gather_init(&iotlb_gather);

	if (WARN_ON(!(prot & (IOMMU_READ | IOMMU_WRITE))))
		return -EINVAL;

	/* Check the paddr doesn't exceed what the table can store */
	if ((sizeof(pt_oaddr_t) < sizeof(paddr) &&
	     (pt_vaddr_t)paddr > PT_VADDR_MAX) ||
	    (common->max_oasz_lg2 != PT_VADDR_MAX_LG2 &&
	     oalog2_div(paddr, common->max_oasz_lg2)))
		return -ERANGE;

	ret = pt_iommu_set_prot(common, &map.attrs, prot);
	if (ret)
		return ret;
	map.attrs.gfp = gfp;

	ret = make_range_no_check(common, &range, iova, len);
	if (ret)
		return ret;

	/* Calculate target page size and level for the leaves */
	if (pt_has_system_page_size(common) && pgsize == PAGE_SIZE &&
	    pgcount == 1) {
		PT_WARN_ON(!(pgsize_bitmap & PAGE_SIZE));
		if (log2_mod(iova | paddr, PAGE_SHIFT))
			return -ENXIO;
		map.leaf_pgsize_lg2 = PAGE_SHIFT;
		map.leaf_level = 0;
		single_page = true;
	} else {
		map.leaf_pgsize_lg2 = pt_compute_best_pgsize(
			pgsize_bitmap, range.va, range.last_va, paddr);
		if (!map.leaf_pgsize_lg2)
			return -ENXIO;
		map.leaf_level =
			pt_pgsz_lg2_to_level(common, map.leaf_pgsize_lg2);
	}

	ret = check_map_range(iommu_table, &range, &map);
	if (ret)
		return ret;

	PT_WARN_ON(map.leaf_level > range.top_level);

	ret = do_map(&range, common, single_page, &map);

	/*
	 * Table levels were freed and replaced with large items, flush any walk
	 * cache that may refer to the freed levels.
	 */
	if (!iommu_pages_list_empty(&iotlb_gather.freelist))
		iommu_iotlb_sync(&iommu_table->domain, &iotlb_gather);

	/* Bytes successfully mapped */
	PT_WARN_ON(!ret && map.oa - paddr != len);
	*mapped += map.oa - paddr;
	return ret;
}
EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(map_pages), "GENERIC_PT_IOMMU");

struct pt_unmap_args {
	struct iommu_pages_list free_list;
	pt_vaddr_t unmapped;
};

static __maybe_unused int __unmap_range(struct pt_range *range, void *arg,
					unsigned int level,
					struct pt_table_p *table)
{
	struct pt_state pts = pt_init(range, level, table);
	struct pt_unmap_args *unmap = arg;
	unsigned int num_oas = 0;
	unsigned int start_index;
	int ret = 0;

	_pt_iter_first(&pts);
	start_index = pts.index;
	pts.type = pt_load_entry_raw(&pts);
	/*
	 * A starting index is in the middle of a contiguous entry
	 *
	 * The IOMMU API does not require drivers to support unmapping parts of
	 * large pages. Long ago VFIO would try to split maps but the current
	 * version never does.
	 *
	 * Instead when unmap reaches a partial unmap of the start of a large
	 * IOPTE it should remove the entire IOPTE and return that size to the
	 * caller.
	 */
	if (pts.type == PT_ENTRY_OA) {
		if (log2_mod(range->va, pt_entry_oa_lg2sz(&pts)))
			return -EINVAL;
		/* Micro optimization */
		goto start_oa;
	}

	do {
		if (pts.type != PT_ENTRY_OA) {
			bool fully_covered;

			if (pts.type != PT_ENTRY_TABLE) {
				ret = -EINVAL;
				break;
			}

			if (pts.index != start_index)
				pt_index_to_va(&pts);
			pts.table_lower = pt_table_ptr(&pts);

			fully_covered = pt_entry_fully_covered(
				&pts, pt_table_item_lg2sz(&pts));

			ret = pt_descend(&pts, arg, __unmap_range);
			if (ret)
				break;

			/*
			 * If the unmapping range fully covers the table then we
			 * can free it as well. The clear is delayed until we
			 * succeed in clearing the lower table levels.
			 */
			if (fully_covered) {
				iommu_pages_list_add(&unmap->free_list,
						     pts.table_lower);
				pt_clear_entries(&pts, ilog2(1));
			}
			pts.index++;
		} else {
			unsigned int num_contig_lg2;
start_oa:
			/*
			 * If the caller requested a last that falls within a
			 * single entry then the entire entry is unmapped and
			 * the length returned will be larger than requested.
995 + */ 996 + num_contig_lg2 = pt_entry_num_contig_lg2(&pts); 997 + pt_clear_entries(&pts, num_contig_lg2); 998 + num_oas += log2_to_int(num_contig_lg2); 999 + pts.index += log2_to_int(num_contig_lg2); 1000 + } 1001 + if (pts.index >= pts.end_index) 1002 + break; 1003 + pts.type = pt_load_entry_raw(&pts); 1004 + } while (true); 1005 + 1006 + unmap->unmapped += log2_mul(num_oas, pt_table_item_lg2sz(&pts)); 1007 + flush_writes_range(&pts, start_index, pts.index); 1008 + 1009 + return ret; 1010 + } 1011 + 1012 + /** 1013 + * unmap_pages() - Make a range of IOVA empty/not present 1014 + * @domain: Domain to manipulate 1015 + * @iova: IO virtual address to start 1016 + * @pgsize: Length of each page 1017 + * @pgcount: Length of the range in pgsize units starting from @iova 1018 + * @iotlb_gather: Gather struct that must be flushed on return 1019 + * 1020 + * unmap_pages() will remove a translation created by map_pages(). It cannot 1021 + * subdivide a mapping created by map_pages(), so it should be called with IOVA 1022 + * ranges that match those passed to map_pages(). The IOVA range can aggregate 1023 + * contiguous map_pages() calls so long as no individual range is split. 1024 + * 1025 + * Context: The caller must hold a write range lock that includes 1026 + * the whole range. 1027 + * 1028 + * Returns: Number of bytes of VA unmapped. iova + res will be the point 1029 + * unmapping stopped. 
1030 + */ 1031 + size_t DOMAIN_NS(unmap_pages)(struct iommu_domain *domain, unsigned long iova, 1032 + size_t pgsize, size_t pgcount, 1033 + struct iommu_iotlb_gather *iotlb_gather) 1034 + { 1035 + struct pt_iommu *iommu_table = 1036 + container_of(domain, struct pt_iommu, domain); 1037 + struct pt_unmap_args unmap = { .free_list = IOMMU_PAGES_LIST_INIT( 1038 + unmap.free_list) }; 1039 + pt_vaddr_t len = pgsize * pgcount; 1040 + struct pt_range range; 1041 + int ret; 1042 + 1043 + ret = make_range(common_from_iommu(iommu_table), &range, iova, len); 1044 + if (ret) 1045 + return 0; 1046 + 1047 + pt_walk_range(&range, __unmap_range, &unmap); 1048 + 1049 + gather_range_pages(iotlb_gather, iommu_table, iova, len, 1050 + &unmap.free_list); 1051 + 1052 + return unmap.unmapped; 1053 + } 1054 + EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(unmap_pages), "GENERIC_PT_IOMMU"); 1055 + 1056 + static void NS(get_info)(struct pt_iommu *iommu_table, 1057 + struct pt_iommu_info *info) 1058 + { 1059 + struct pt_common *common = common_from_iommu(iommu_table); 1060 + struct pt_range range = pt_top_range(common); 1061 + struct pt_state pts = pt_init_top(&range); 1062 + pt_vaddr_t pgsize_bitmap = 0; 1063 + 1064 + if (pt_feature(common, PT_FEAT_DYNAMIC_TOP)) { 1065 + for (pts.level = 0; pts.level <= PT_MAX_TOP_LEVEL; 1066 + pts.level++) { 1067 + if (pt_table_item_lg2sz(&pts) >= common->max_vasz_lg2) 1068 + break; 1069 + pgsize_bitmap |= pt_possible_sizes(&pts); 1070 + } 1071 + } else { 1072 + for (pts.level = 0; pts.level <= range.top_level; pts.level++) 1073 + pgsize_bitmap |= pt_possible_sizes(&pts); 1074 + } 1075 + 1076 + /* Hide page sizes larger than the maximum OA */ 1077 + info->pgsize_bitmap = oalog2_mod(pgsize_bitmap, common->max_oasz_lg2); 1078 + } 1079 + 1080 + static void NS(deinit)(struct pt_iommu *iommu_table) 1081 + { 1082 + struct pt_common *common = common_from_iommu(iommu_table); 1083 + struct pt_range range = pt_all_range(common); 1084 + struct pt_iommu_collect_args collect = { 1085 
		.free_list = IOMMU_PAGES_LIST_INIT(collect.free_list),
	};

	iommu_pages_list_add(&collect.free_list, range.top_table);
	pt_walk_range(&range, __collect_tables, &collect);

	/*
	 * The driver has to already have fenced the HW access to the page table
	 * and invalidated any caching referring to this memory.
	 */
	if (pt_feature(common, PT_FEAT_DMA_INCOHERENT))
		iommu_pages_stop_incoherent_list(&collect.free_list,
						 iommu_table->iommu_device);
	iommu_put_pages_list(&collect.free_list);
}

static const struct pt_iommu_ops NS(ops) = {
#if IS_ENABLED(CONFIG_IOMMUFD_DRIVER) && defined(pt_entry_is_write_dirty) && \
	IS_ENABLED(CONFIG_IOMMUFD_TEST) && defined(pt_entry_make_write_dirty)
	.set_dirty = NS(set_dirty),
#endif
	.get_info = NS(get_info),
	.deinit = NS(deinit),
};

static int pt_init_common(struct pt_common *common)
{
	struct pt_range top_range = pt_top_range(common);

	if (PT_WARN_ON(top_range.top_level > PT_MAX_TOP_LEVEL))
		return -EINVAL;

	if (top_range.top_level == PT_MAX_TOP_LEVEL ||
	    common->max_vasz_lg2 == top_range.max_vasz_lg2)
		common->features &= ~BIT(PT_FEAT_DYNAMIC_TOP);

	if (top_range.max_vasz_lg2 == PT_VADDR_MAX_LG2)
		common->features |= BIT(PT_FEAT_FULL_VA);

	/* Requested features must match features compiled into this format */
	if ((common->features & ~(unsigned int)PT_SUPPORTED_FEATURES) ||
	    (!IS_ENABLED(CONFIG_DEBUG_GENERIC_PT) &&
	     (common->features & PT_FORCE_ENABLED_FEATURES) !=
		     PT_FORCE_ENABLED_FEATURES))
		return -EOPNOTSUPP;

	/*
	 * Check if the top level of the page table is too small to hold the
	 * specified maxvasz.
	 */
	if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP) &&
	    top_range.top_level != PT_MAX_TOP_LEVEL) {
		struct pt_state pts = { .range = &top_range,
					.level = top_range.top_level };

		if (common->max_vasz_lg2 >
		    pt_num_items_lg2(&pts) + pt_table_item_lg2sz(&pts))
			return -EOPNOTSUPP;
	}

	if (common->max_oasz_lg2 == 0)
		common->max_oasz_lg2 = pt_max_oa_lg2(common);
	else
		common->max_oasz_lg2 = min(common->max_oasz_lg2,
					   pt_max_oa_lg2(common));
	return 0;
}

static int pt_iommu_init_domain(struct pt_iommu *iommu_table,
				struct iommu_domain *domain)
{
	struct pt_common *common = common_from_iommu(iommu_table);
	struct pt_iommu_info info;
	struct pt_range range;

	NS(get_info)(iommu_table, &info);

	domain->type = __IOMMU_DOMAIN_PAGING;
	domain->pgsize_bitmap = info.pgsize_bitmap;

	if (pt_feature(common, PT_FEAT_DYNAMIC_TOP))
		range = _pt_top_range(common,
				      _pt_top_set(NULL, PT_MAX_TOP_LEVEL));
	else
		range = pt_top_range(common);

	/* A 64-bit high address space table on a 32-bit system cannot work. */
	domain->geometry.aperture_start = (unsigned long)range.va;
	if ((pt_vaddr_t)domain->geometry.aperture_start != range.va)
		return -EOVERFLOW;

	/*
	 * The aperture is limited to what the API can do after considering all
	 * the different types dma_addr_t/unsigned long/pt_vaddr_t that are used
	 * to store a VA. Set the aperture to something that is valid for all
	 * cases. Saturate instead of truncate the end if the types are smaller
	 * than the top range. aperture_end should be called aperture_last.
	 */
	domain->geometry.aperture_end = (unsigned long)range.last_va;
	if ((pt_vaddr_t)domain->geometry.aperture_end != range.last_va) {
		domain->geometry.aperture_end = ULONG_MAX;
		domain->pgsize_bitmap &= ULONG_MAX;
	}
	domain->geometry.force_aperture = true;

	return 0;
}

static void pt_iommu_zero(struct pt_iommu_table *fmt_table)
{
	struct pt_iommu *iommu_table = &fmt_table->iommu;
	struct pt_iommu cfg = *iommu_table;

	static_assert(offsetof(struct pt_iommu_table, iommu.domain) == 0);
	memset_after(fmt_table, 0, iommu.domain);

	/* The caller can initialize some of these values */
	iommu_table->iommu_device = cfg.iommu_device;
	iommu_table->driver_ops = cfg.driver_ops;
	iommu_table->nid = cfg.nid;
}

#define pt_iommu_table_cfg CONCATENATE(pt_iommu_table, _cfg)
#define pt_iommu_init CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), init)

int pt_iommu_init(struct pt_iommu_table *fmt_table,
		  const struct pt_iommu_table_cfg *cfg, gfp_t gfp)
{
	struct pt_iommu *iommu_table = &fmt_table->iommu;
	struct pt_common *common = common_from_iommu(iommu_table);
	struct pt_table_p *table_mem;
	int ret;

	if (cfg->common.hw_max_vasz_lg2 > PT_MAX_VA_ADDRESS_LG2 ||
	    !cfg->common.hw_max_vasz_lg2 || !cfg->common.hw_max_oasz_lg2)
		return -EINVAL;

	pt_iommu_zero(fmt_table);
	common->features = cfg->common.features;
	common->max_vasz_lg2 = cfg->common.hw_max_vasz_lg2;
	common->max_oasz_lg2 = cfg->common.hw_max_oasz_lg2;
	ret = pt_iommu_fmt_init(fmt_table, cfg);
	if (ret)
		return ret;

	if (cfg->common.hw_max_oasz_lg2 > pt_max_oa_lg2(common))
		return -EINVAL;

	ret = pt_init_common(common);
	if (ret)
		return ret;

	if (pt_feature(common, PT_FEAT_DYNAMIC_TOP) &&
	    WARN_ON(!iommu_table->driver_ops ||
		    !iommu_table->driver_ops->change_top ||
		    !iommu_table->driver_ops->get_top_lock))
		return -EINVAL;

	if (pt_feature(common, PT_FEAT_SIGN_EXTEND) &&
	    (pt_feature(common, PT_FEAT_FULL_VA) ||
	     pt_feature(common, PT_FEAT_DYNAMIC_TOP)))
		return -EINVAL;

	if (pt_feature(common, PT_FEAT_DMA_INCOHERENT) &&
	    WARN_ON(!iommu_table->iommu_device))
		return -EINVAL;

	ret = pt_iommu_init_domain(iommu_table, &iommu_table->domain);
	if (ret)
		return ret;

	table_mem = table_alloc_top(common, common->top_of_table, gfp,
				    ALLOC_NORMAL);
	if (IS_ERR(table_mem))
		return PTR_ERR(table_mem);
	pt_top_set(common, table_mem, pt_top_get_level(common));

	/* Must be last, see pt_iommu_deinit() */
	iommu_table->ops = &NS(ops);
	return 0;
}
EXPORT_SYMBOL_NS_GPL(pt_iommu_init, "GENERIC_PT_IOMMU");

#ifdef pt_iommu_fmt_hw_info
#define pt_iommu_table_hw_info CONCATENATE(pt_iommu_table, _hw_info)
#define pt_iommu_hw_info CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), hw_info)
void pt_iommu_hw_info(struct pt_iommu_table *fmt_table,
		      struct pt_iommu_table_hw_info *info)
{
	struct pt_iommu *iommu_table = &fmt_table->iommu;
	struct pt_common *common = common_from_iommu(iommu_table);
	struct pt_range top_range = pt_top_range(common);

	pt_iommu_fmt_hw_info(fmt_table, &top_range, info);
}
EXPORT_SYMBOL_NS_GPL(pt_iommu_hw_info, "GENERIC_PT_IOMMU");
#endif

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("IOMMU Page table implementation for " __stringify(PTPFX_RAW));
MODULE_IMPORT_NS("GENERIC_PT");
/* For iommu_dirty_bitmap_record() */
MODULE_IMPORT_NS("IOMMUFD");

#endif /* __GENERIC_PT_IOMMU_PT_H */
drivers/iommu/generic_pt/kunit_generic_pt.h
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
 *
 * Test the format API directly.
 *
 */
#include "kunit_iommu.h"
#include "pt_iter.h"

static void do_map(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa,
		   pt_vaddr_t len)
{
	struct kunit_iommu_priv *priv = test->priv;
	int ret;

	KUNIT_ASSERT_EQ(test, len, (size_t)len);

	ret = iommu_map(&priv->domain, va, pa, len, IOMMU_READ | IOMMU_WRITE,
			GFP_KERNEL);
	KUNIT_ASSERT_NO_ERRNO_FN(test, "map_pages", ret);
}

#define KUNIT_ASSERT_PT_LOAD(test, pts, entry)             \
	({                                                 \
		pt_load_entry(pts);                        \
		KUNIT_ASSERT_EQ(test, (pts)->type, entry); \
	})

struct check_levels_arg {
	struct kunit *test;
	void *fn_arg;
	void (*fn)(struct kunit *test, struct pt_state *pts, void *arg);
};

static int __check_all_levels(struct pt_range *range, void *arg,
			      unsigned int level, struct pt_table_p *table)
{
	struct pt_state pts = pt_init(range, level, table);
	struct check_levels_arg *chk = arg;
	struct kunit *test = chk->test;
	int ret;

	_pt_iter_first(&pts);

	/*
	 * If we were able to use the full VA space this should always be the
	 * last index in each table.
	 */
	if (!(IS_32BIT && range->max_vasz_lg2 > 32)) {
		if (pt_feature(range->common, PT_FEAT_SIGN_EXTEND) &&
		    pts.level == pts.range->top_level)
			KUNIT_ASSERT_EQ(test, pts.index,
					log2_to_int(range->max_vasz_lg2 - 1 -
						    pt_table_item_lg2sz(&pts)) -
						1);
		else
			KUNIT_ASSERT_EQ(test, pts.index,
					log2_to_int(pt_table_oa_lg2sz(&pts) -
						    pt_table_item_lg2sz(&pts)) -
						1);
	}

	if (pt_can_have_table(&pts)) {
		pt_load_single_entry(&pts);
		KUNIT_ASSERT_EQ(test, pts.type, PT_ENTRY_TABLE);
		ret = pt_descend(&pts, arg, __check_all_levels);
		KUNIT_ASSERT_EQ(test, ret, 0);

		/* Index 0 is used by the test */
		if (IS_32BIT && !pts.index)
			return 0;
		KUNIT_ASSERT_NE(chk->test, pts.index, 0);
	}

	/*
	 * A format should not create a table with only one entry, at least this
	 * test approach won't work.
	 */
	KUNIT_ASSERT_GT(chk->test, pts.end_index, 1);

	/*
	 * For increase top we end up using index 0 for the original top's tree,
	 * so use index 1 for testing instead.
	 */
	pts.index = 0;
	pt_index_to_va(&pts);
	pt_load_single_entry(&pts);
	if (pts.type == PT_ENTRY_TABLE && pts.end_index > 2) {
		pts.index = 1;
		pt_index_to_va(&pts);
	}
	(*chk->fn)(chk->test, &pts, chk->fn_arg);
	return 0;
}

/*
 * Call fn for each level in the table with a pts setup to index 0 in a table
 * for that level. This allows writing tests that run on every level.
 * The test can use every index in the table except the last one.
 */
static void check_all_levels(struct kunit *test,
			     void (*fn)(struct kunit *test,
					struct pt_state *pts, void *arg),
			     void *fn_arg)
{
	struct kunit_iommu_priv *priv = test->priv;
	struct pt_range range = pt_top_range(priv->common);
	struct check_levels_arg chk = {
		.test = test,
		.fn = fn,
		.fn_arg = fn_arg,
	};
	int ret;

	if (pt_feature(priv->common, PT_FEAT_DYNAMIC_TOP) &&
	    priv->common->max_vasz_lg2 > range.max_vasz_lg2)
		range.last_va = fvalog2_set_mod_max(range.va,
						    priv->common->max_vasz_lg2);

	/*
	 * Map a page at the highest VA, this will populate all the levels so we
	 * can then iterate over them. Index 0 will be used for testing.
	 */
	if (IS_32BIT && range.max_vasz_lg2 > 32)
		range.last_va = (u32)range.last_va;
	range.va = range.last_va - (priv->smallest_pgsz - 1);
	do_map(test, range.va, 0, priv->smallest_pgsz);

	range = pt_make_range(priv->common, range.va, range.last_va);
	ret = pt_walk_range(&range, __check_all_levels, &chk);
	KUNIT_ASSERT_EQ(test, ret, 0);
}

static void test_init(struct kunit *test)
{
	struct kunit_iommu_priv *priv = test->priv;

	/* Fixture does the setup */
	KUNIT_ASSERT_NE(test, priv->info.pgsize_bitmap, 0);
}

/*
 * Basic check that the log2_* functions are working, especially at the integer
 * limits.
 */
static void test_bitops(struct kunit *test)
{
	int i;

	KUNIT_ASSERT_EQ(test, fls_t(u32, 0), 0);
	KUNIT_ASSERT_EQ(test, fls_t(u32, 1), 1);
	KUNIT_ASSERT_EQ(test, fls_t(u32, BIT(2)), 3);
	KUNIT_ASSERT_EQ(test, fls_t(u32, U32_MAX), 32);

	KUNIT_ASSERT_EQ(test, fls_t(u64, 0), 0);
	KUNIT_ASSERT_EQ(test, fls_t(u64, 1), 1);
	KUNIT_ASSERT_EQ(test, fls_t(u64, BIT(2)), 3);
	KUNIT_ASSERT_EQ(test, fls_t(u64, U64_MAX), 64);

	KUNIT_ASSERT_EQ(test, ffs_t(u32, 1), 0);
	KUNIT_ASSERT_EQ(test, ffs_t(u32, BIT(2)), 2);
	KUNIT_ASSERT_EQ(test, ffs_t(u32, BIT(31)), 31);

	KUNIT_ASSERT_EQ(test, ffs_t(u64, 1), 0);
	KUNIT_ASSERT_EQ(test, ffs_t(u64, BIT(2)), 2);
	KUNIT_ASSERT_EQ(test, ffs_t(u64, BIT_ULL(63)), 63);

	for (i = 0; i != 31; i++)
		KUNIT_ASSERT_EQ(test, ffz_t(u32, BIT_ULL(i) - 1), i);

	for (i = 0; i != 63; i++)
		KUNIT_ASSERT_EQ(test, ffz_t(u64, BIT_ULL(i) - 1), i);

	for (i = 0; i != 32; i++) {
		u64 val = get_random_u64();

		KUNIT_ASSERT_EQ(test, log2_mod_t(u32, val, ffs_t(u32, val)), 0);
		KUNIT_ASSERT_EQ(test, log2_mod_t(u64, val, ffs_t(u64, val)), 0);

		KUNIT_ASSERT_EQ(test, log2_mod_t(u32, val, ffz_t(u32, val)),
				log2_to_max_int_t(u32, ffz_t(u32, val)));
		KUNIT_ASSERT_EQ(test, log2_mod_t(u64, val, ffz_t(u64, val)),
				log2_to_max_int_t(u64, ffz_t(u64, val)));
	}
}

static unsigned int ref_best_pgsize(pt_vaddr_t pgsz_bitmap, pt_vaddr_t va,
				    pt_vaddr_t last_va, pt_oaddr_t oa)
{
	pt_vaddr_t pgsz_lg2;

	/* Brute force the constraints described in pt_compute_best_pgsize() */
	for (pgsz_lg2 = PT_VADDR_MAX_LG2 - 1; pgsz_lg2 != 0; pgsz_lg2--) {
		if ((pgsz_bitmap & log2_to_int(pgsz_lg2)) &&
		    log2_mod(va, pgsz_lg2) == 0 &&
		    oalog2_mod(oa, pgsz_lg2) == 0 &&
		    va + log2_to_int(pgsz_lg2) - 1 <= last_va &&
		    log2_div_eq(va, va + log2_to_int(pgsz_lg2) - 1, pgsz_lg2) &&
		    oalog2_div_eq(oa, oa + log2_to_int(pgsz_lg2) - 1, pgsz_lg2))
			return pgsz_lg2;
	}
	return 0;
}

/* Check that the bit logic in pt_compute_best_pgsize() works. */
static void test_best_pgsize(struct kunit *test)
{
	unsigned int a_lg2;
	unsigned int b_lg2;
	unsigned int c_lg2;

	/* Try random prefixes with every suffix combination */
	for (a_lg2 = 1; a_lg2 != 10; a_lg2++) {
		for (b_lg2 = 1; b_lg2 != 10; b_lg2++) {
			for (c_lg2 = 1; c_lg2 != 10; c_lg2++) {
				pt_vaddr_t pgsz_bitmap = get_random_u64();
				pt_vaddr_t va = get_random_u64() << a_lg2;
				pt_oaddr_t oa = get_random_u64() << b_lg2;
				pt_vaddr_t last_va = log2_set_mod_max(
					get_random_u64(), c_lg2);

				if (va > last_va)
					swap(va, last_va);
				KUNIT_ASSERT_EQ(
					test,
					pt_compute_best_pgsize(pgsz_bitmap, va,
							       last_va, oa),
					ref_best_pgsize(pgsz_bitmap, va,
							last_va, oa));
			}
		}
	}

	/* 0 prefix, every suffix */
	for (c_lg2 = 1; c_lg2 != PT_VADDR_MAX_LG2 - 1; c_lg2++) {
		pt_vaddr_t pgsz_bitmap = get_random_u64();
		pt_vaddr_t va = 0;
		pt_oaddr_t oa = 0;
		pt_vaddr_t last_va = log2_set_mod_max(0, c_lg2);

		KUNIT_ASSERT_EQ(test,
				pt_compute_best_pgsize(pgsz_bitmap, va, last_va,
						       oa),
				ref_best_pgsize(pgsz_bitmap, va, last_va, oa));
	}

	/* 1's prefix, every suffix */
	for (a_lg2 = 1; a_lg2 != 10; a_lg2++) {
		for (b_lg2 = 1; b_lg2 != 10; b_lg2++) {
			for (c_lg2 = 1; c_lg2 != 10; c_lg2++) {
				pt_vaddr_t pgsz_bitmap = get_random_u64();
				pt_vaddr_t va = PT_VADDR_MAX << a_lg2;
				pt_oaddr_t oa = PT_VADDR_MAX << b_lg2;
				pt_vaddr_t last_va = PT_VADDR_MAX;

				KUNIT_ASSERT_EQ(
					test,
					pt_compute_best_pgsize(pgsz_bitmap, va,
							       last_va, oa),
					ref_best_pgsize(pgsz_bitmap, va,
							last_va, oa));
			}
		}
	}

	/* pgsize_bitmap is always 0 */
	for (a_lg2 = 1; a_lg2 != 10; a_lg2++) {
		for (b_lg2 = 1; b_lg2 != 10; b_lg2++) {
			for (c_lg2 = 1; c_lg2 != 10; c_lg2++) {
				pt_vaddr_t pgsz_bitmap = 0;
				pt_vaddr_t va = get_random_u64() << a_lg2;
				pt_oaddr_t oa = get_random_u64() << b_lg2;
				pt_vaddr_t last_va = log2_set_mod_max(
					get_random_u64(), c_lg2);

				if (va > last_va)
					swap(va, last_va);
				KUNIT_ASSERT_EQ(
					test,
					pt_compute_best_pgsize(pgsz_bitmap, va,
							       last_va, oa),
					0);
			}
		}
	}

	if (sizeof(pt_vaddr_t) <= 4)
		return;

	/* over 32 bit page sizes */
	for (a_lg2 = 32; a_lg2 != 42; a_lg2++) {
		for (b_lg2 = 32; b_lg2 != 42; b_lg2++) {
			for (c_lg2 = 32; c_lg2 != 42; c_lg2++) {
				pt_vaddr_t pgsz_bitmap = get_random_u64();
				pt_vaddr_t va = get_random_u64() << a_lg2;
				pt_oaddr_t oa = get_random_u64() << b_lg2;
				pt_vaddr_t last_va = log2_set_mod_max(
					get_random_u64(), c_lg2);

				if (va > last_va)
					swap(va, last_va);
				KUNIT_ASSERT_EQ(
					test,
					pt_compute_best_pgsize(pgsz_bitmap, va,
							       last_va, oa),
					ref_best_pgsize(pgsz_bitmap, va,
							last_va, oa));
			}
		}
	}
}

/*
 * Check that pt_install_table() and pt_table_pa() match
 */
static void test_lvl_table_ptr(struct kunit *test, struct pt_state *pts,
			       void *arg)
{
	struct kunit_iommu_priv *priv = test->priv;
	pt_oaddr_t paddr =
		log2_set_mod(priv->test_oa, 0, priv->smallest_pgsz_lg2);
	struct pt_write_attrs attrs = {};

	if (!pt_can_have_table(pts))
		return;

	KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot",
				 pt_iommu_set_prot(pts->range->common, &attrs,
						   IOMMU_READ));

	pt_load_single_entry(pts);
	KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);

	KUNIT_ASSERT_TRUE(test, pt_install_table(pts, paddr, &attrs));

	/* A second install should pass because install updates pts->entry. */
	KUNIT_ASSERT_EQ(test, pt_install_table(pts, paddr, &attrs), true);

	KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_TABLE);
	KUNIT_ASSERT_EQ(test, pt_table_pa(pts), paddr);

	pt_clear_entries(pts, ilog2(1));
	KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
}

static void test_table_ptr(struct kunit *test)
{
	check_all_levels(test, test_lvl_table_ptr, NULL);
}

struct lvl_radix_arg {
	pt_vaddr_t vbits;
};

/*
 * Check pt_table_oa_lg2sz() and pt_table_item_lg2sz(): they need to decode a
 * continuous list of VA across all the levels that covers the entire advertised
 * VA space.
 */
static void test_lvl_radix(struct kunit *test, struct pt_state *pts, void *arg)
{
	unsigned int table_lg2sz = pt_table_oa_lg2sz(pts);
	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
	struct lvl_radix_arg *radix = arg;

	/* Every bit below us is decoded */
	KUNIT_ASSERT_EQ(test, log2_set_mod_max(0, isz_lg2), radix->vbits);

	/* We are not decoding bits someone else is */
	KUNIT_ASSERT_EQ(test, log2_div(radix->vbits, isz_lg2), 0);

	/* Can't decode past the pt_vaddr_t size */
	KUNIT_ASSERT_LE(test, table_lg2sz, PT_VADDR_MAX_LG2);
	KUNIT_ASSERT_EQ(test, fvalog2_div(table_lg2sz, PT_MAX_VA_ADDRESS_LG2),
			0);

	radix->vbits = fvalog2_set_mod_max(0, table_lg2sz);
}

static void test_max_va(struct kunit *test)
{
	struct kunit_iommu_priv *priv = test->priv;
	struct pt_range range = pt_top_range(priv->common);

	KUNIT_ASSERT_GE(test, priv->common->max_vasz_lg2, range.max_vasz_lg2);
}

static void test_table_radix(struct kunit *test)
{
	struct kunit_iommu_priv *priv = test->priv;
	struct lvl_radix_arg radix = { .vbits = priv->smallest_pgsz - 1 };
	struct pt_range range;

	check_all_levels(test, test_lvl_radix, &radix);

	range = pt_top_range(priv->common);
	if (range.max_vasz_lg2 == PT_VADDR_MAX_LG2) {
		KUNIT_ASSERT_EQ(test, radix.vbits, PT_VADDR_MAX);
	} else {
		if (!IS_32BIT)
			KUNIT_ASSERT_EQ(test,
					log2_set_mod_max(0, range.max_vasz_lg2),
					radix.vbits);
		KUNIT_ASSERT_EQ(test, log2_div(radix.vbits, range.max_vasz_lg2),
				0);
	}
}

static unsigned int safe_pt_num_items_lg2(const struct pt_state *pts)
{
	struct pt_range top_range = pt_top_range(pts->range->common);
	struct pt_state top_pts = pt_init_top(&top_range);

	/*
	 * Avoid calling pt_num_items_lg2() on the top, instead we can derive
	 * the size of the top table from the top range.
	 */
	if (pts->level == top_range.top_level)
		return ilog2(pt_range_to_end_index(&top_pts));
	return pt_num_items_lg2(pts);
}

static void test_lvl_possible_sizes(struct kunit *test, struct pt_state *pts,
				    void *arg)
{
	unsigned int num_items_lg2 = safe_pt_num_items_lg2(pts);
	pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);

	if (!pt_can_have_leaf(pts)) {
		KUNIT_ASSERT_EQ(test, pgsize_bitmap, 0);
		return;
	}

	/* No bits for sizes that would be outside this table */
	KUNIT_ASSERT_EQ(test, log2_mod(pgsize_bitmap, isz_lg2), 0);
	KUNIT_ASSERT_EQ(
		test, fvalog2_div(pgsize_bitmap, num_items_lg2 + isz_lg2), 0);

	/*
	 * Non contiguous must be supported. AMDv1 has a HW bug where it does
	 * not support it on one of the levels.
	 */
	if ((u64)pgsize_bitmap != 0xff0000000000ULL ||
	    strcmp(__stringify(PTPFX_RAW), "amdv1") != 0)
		KUNIT_ASSERT_TRUE(test, pgsize_bitmap & log2_to_int(isz_lg2));
	else
		KUNIT_ASSERT_NE(test, pgsize_bitmap, 0);

	/* A contiguous entry should not span the whole table */
	if (num_items_lg2 + isz_lg2 != PT_VADDR_MAX_LG2)
		KUNIT_ASSERT_FALSE(
			test,
			pgsize_bitmap & log2_to_int(num_items_lg2 + isz_lg2));
}

static void test_entry_possible_sizes(struct kunit *test)
{
	check_all_levels(test, test_lvl_possible_sizes, NULL);
}

static void sweep_all_pgsizes(struct kunit *test, struct pt_state *pts,
			      struct pt_write_attrs *attrs,
			      pt_oaddr_t test_oaddr)
{
	pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
	unsigned int len_lg2;

	if (pts->index != 0)
		return;

	for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2 - 1; len_lg2++) {
		struct pt_state sub_pts = *pts;
		pt_oaddr_t oaddr;

		if (!(pgsize_bitmap & log2_to_int(len_lg2)))
			continue;

		oaddr = log2_set_mod(test_oaddr, 0, len_lg2);
		pt_install_leaf_entry(pts, oaddr, len_lg2, attrs);
		/* Verify that every contiguous item translates correctly */
		for (sub_pts.index = 0;
		     sub_pts.index != log2_to_int(len_lg2 - isz_lg2);
		     sub_pts.index++) {
			KUNIT_ASSERT_PT_LOAD(test, &sub_pts, PT_ENTRY_OA);
			KUNIT_ASSERT_EQ(test, pt_item_oa(&sub_pts),
					oaddr + sub_pts.index *
							oalog2_mul(1, isz_lg2));
			KUNIT_ASSERT_EQ(test, pt_entry_oa(&sub_pts), oaddr);
			KUNIT_ASSERT_EQ(test, pt_entry_num_contig_lg2(&sub_pts),
					len_lg2 - isz_lg2);
		}

		pt_clear_entries(pts, len_lg2 - isz_lg2);
		KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);
	}
}

/*
 * Check that pt_install_leaf_entry() and pt_entry_oa() match.
 * Check that pt_clear_entries() works.
 */
static void test_lvl_entry_oa(struct kunit *test, struct pt_state *pts,
			      void *arg)
{
	unsigned int max_oa_lg2 = pts->range->common->max_oasz_lg2;
	struct kunit_iommu_priv *priv = test->priv;
	struct pt_write_attrs attrs = {};

	if (!pt_can_have_leaf(pts))
		return;

	KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot",
				 pt_iommu_set_prot(pts->range->common, &attrs,
						   IOMMU_READ));

	sweep_all_pgsizes(test, pts, &attrs, priv->test_oa);

	/* Check that the table can store the boundary OAs */
	sweep_all_pgsizes(test, pts, &attrs, 0);
	if (max_oa_lg2 == PT_OADDR_MAX_LG2)
		sweep_all_pgsizes(test, pts, &attrs, PT_OADDR_MAX);
	else
		sweep_all_pgsizes(test, pts, &attrs,
				  oalog2_to_max_int(max_oa_lg2));
}

static void test_entry_oa(struct kunit *test)
{
	check_all_levels(test, test_lvl_entry_oa, NULL);
}

/* Test pt_attr_from_entry() */
static void test_lvl_attr_from_entry(struct kunit *test, struct pt_state *pts,
				     void *arg)
{
	pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
	struct kunit_iommu_priv *priv = test->priv;
	unsigned int len_lg2;
	unsigned int prot;

	if (!pt_can_have_leaf(pts))
		return;

	for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2; len_lg2++) {
		if (!(pgsize_bitmap & log2_to_int(len_lg2)))
			continue;
		for (prot = 0; prot <= (IOMMU_READ | IOMMU_WRITE | IOMMU_CACHE |
					IOMMU_NOEXEC | IOMMU_MMIO);
		     prot++) {
			pt_oaddr_t oaddr;
			struct pt_write_attrs attrs = {};
			u64 good_entry;

			/*
			 * If the format doesn't support this combination of
			 * prot bits skip it
			 */
			if (pt_iommu_set_prot(pts->range->common, &attrs,
					      prot)) {
				/* But RW has to be supported */
				KUNIT_ASSERT_NE(test, prot,
						IOMMU_READ | IOMMU_WRITE);
				continue;
			}

			oaddr = log2_set_mod(priv->test_oa, 0, len_lg2);
			pt_install_leaf_entry(pts, oaddr, len_lg2, &attrs);
			KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_OA);

			good_entry = pts->entry;

			memset(&attrs, 0, sizeof(attrs));
			pt_attr_from_entry(pts, &attrs);

			pt_clear_entries(pts, len_lg2 - isz_lg2);
			KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY);

			pt_install_leaf_entry(pts, oaddr, len_lg2, &attrs);
			KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_OA);

			/*
			 * The descriptor produced by pt_attr_from_entry() must
			 * produce an identical entry value when re-written.
			 */
			KUNIT_ASSERT_EQ(test, good_entry, pts->entry);

			pt_clear_entries(pts, len_lg2 - isz_lg2);
		}
	}
}

static void test_attr_from_entry(struct kunit *test)
{
	check_all_levels(test, test_lvl_attr_from_entry, NULL);
}

static void test_lvl_dirty(struct kunit *test, struct pt_state *pts, void *arg)
{
	pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
	struct kunit_iommu_priv *priv = test->priv;
	unsigned int start_idx = pts->index;
	struct pt_write_attrs attrs = {};
	unsigned int len_lg2;

	if (!pt_can_have_leaf(pts))
		return;

	KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot",
				 pt_iommu_set_prot(pts->range->common, &attrs,
						   IOMMU_READ | IOMMU_WRITE));

	for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2; len_lg2++) {
		pt_oaddr_t oaddr;
		unsigned int i;

		if (!(pgsize_bitmap & log2_to_int(len_lg2)))
			continue;

		oaddr = log2_set_mod(priv->test_oa, 0, len_lg2);
		pt_install_leaf_entry(pts, oaddr, len_lg2, &attrs);
		KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_OA);

		pt_load_entry(pts);
		pt_entry_make_write_clean(pts);
		pt_load_entry(pts);
		KUNIT_ASSERT_FALSE(test, pt_entry_is_write_dirty(pts));

		for (i = 0; i != log2_to_int(len_lg2 - isz_lg2); i++) {
			/* dirty every contiguous entry */
			pts->index = start_idx + i;
			pt_load_entry(pts);
			KUNIT_ASSERT_TRUE(test, pt_entry_make_write_dirty(pts));
			pts->index = start_idx;
			pt_load_entry(pts);
			KUNIT_ASSERT_TRUE(test, pt_entry_is_write_dirty(pts));

			pt_entry_make_write_clean(pts);
			pt_load_entry(pts);
			KUNIT_ASSERT_FALSE(test, pt_entry_is_write_dirty(pts));
		}

		pt_clear_entries(pts, len_lg2 - isz_lg2);
	}
}

static __maybe_unused void test_dirty(struct kunit *test)
{
	struct kunit_iommu_priv *priv = test->priv;

	if (!pt_dirty_supported(priv->common))
		kunit_skip(test,
			   "Page table features do not support dirty tracking");

	check_all_levels(test, test_lvl_dirty, NULL);
}

static void test_lvl_sw_bit_leaf(struct kunit *test, struct pt_state *pts,
				 void *arg)
{
	struct kunit_iommu_priv *priv = test->priv;
	pt_vaddr_t pgsize_bitmap = pt_possible_sizes(pts);
	unsigned int isz_lg2 = pt_table_item_lg2sz(pts);
	struct pt_write_attrs attrs = {};
	unsigned int len_lg2;

	if (!pt_can_have_leaf(pts))
		return;
	if (pts->index != 0)
		return;

	KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot",
				 pt_iommu_set_prot(pts->range->common, &attrs,
						   IOMMU_READ));

	for (len_lg2 = 0; len_lg2 < PT_VADDR_MAX_LG2 - 1; len_lg2++) {
		pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, len_lg2);
		struct pt_write_attrs new_attrs = {};
		unsigned int bitnr;

		if (!(pgsize_bitmap & log2_to_int(len_lg2)))
			continue;

		pt_install_leaf_entry(pts, paddr, len_lg2, &attrs);

		for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common);
692 + bitnr++) 693 + KUNIT_ASSERT_FALSE(test, 694 + pt_test_sw_bit_acquire(pts, bitnr)); 695 + 696 + for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); 697 + bitnr++) { 698 + KUNIT_ASSERT_FALSE(test, 699 + pt_test_sw_bit_acquire(pts, bitnr)); 700 + pt_set_sw_bit_release(pts, bitnr); 701 + KUNIT_ASSERT_TRUE(test, 702 + pt_test_sw_bit_acquire(pts, bitnr)); 703 + } 704 + 705 + for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); 706 + bitnr++) 707 + KUNIT_ASSERT_TRUE(test, 708 + pt_test_sw_bit_acquire(pts, bitnr)); 709 + 710 + KUNIT_ASSERT_EQ(test, pt_item_oa(pts), paddr); 711 + 712 + /* SW bits didn't leak into the attrs */ 713 + pt_attr_from_entry(pts, &new_attrs); 714 + KUNIT_ASSERT_MEMEQ(test, &new_attrs, &attrs, sizeof(attrs)); 715 + 716 + pt_clear_entries(pts, len_lg2 - isz_lg2); 717 + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY); 718 + } 719 + } 720 + 721 + static __maybe_unused void test_sw_bit_leaf(struct kunit *test) 722 + { 723 + check_all_levels(test, test_lvl_sw_bit_leaf, NULL); 724 + } 725 + 726 + static void test_lvl_sw_bit_table(struct kunit *test, struct pt_state *pts, 727 + void *arg) 728 + { 729 + struct kunit_iommu_priv *priv = test->priv; 730 + struct pt_write_attrs attrs = {}; 731 + pt_oaddr_t paddr = 732 + log2_set_mod(priv->test_oa, 0, priv->smallest_pgsz_lg2); 733 + unsigned int bitnr; 734 + 735 + if (!pt_can_have_leaf(pts)) 736 + return; 737 + if (pts->index != 0) 738 + return; 739 + 740 + KUNIT_ASSERT_NO_ERRNO_FN(test, "pt_iommu_set_prot", 741 + pt_iommu_set_prot(pts->range->common, &attrs, 742 + IOMMU_READ)); 743 + 744 + KUNIT_ASSERT_TRUE(test, pt_install_table(pts, paddr, &attrs)); 745 + 746 + for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++) 747 + KUNIT_ASSERT_FALSE(test, pt_test_sw_bit_acquire(pts, bitnr)); 748 + 749 + for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++) { 750 + KUNIT_ASSERT_FALSE(test, pt_test_sw_bit_acquire(pts, bitnr)); 751 + pt_set_sw_bit_release(pts, bitnr); 752 
+ KUNIT_ASSERT_TRUE(test, pt_test_sw_bit_acquire(pts, bitnr)); 753 + } 754 + 755 + for (bitnr = 0; bitnr <= pt_max_sw_bit(pts->range->common); bitnr++) 756 + KUNIT_ASSERT_TRUE(test, pt_test_sw_bit_acquire(pts, bitnr)); 757 + 758 + KUNIT_ASSERT_EQ(test, pt_table_pa(pts), paddr); 759 + 760 + pt_clear_entries(pts, ilog2(1)); 761 + KUNIT_ASSERT_PT_LOAD(test, pts, PT_ENTRY_EMPTY); 762 + } 763 + 764 + static __maybe_unused void test_sw_bit_table(struct kunit *test) 765 + { 766 + check_all_levels(test, test_lvl_sw_bit_table, NULL); 767 + } 768 + 769 + static struct kunit_case generic_pt_test_cases[] = { 770 + KUNIT_CASE_FMT(test_init), 771 + KUNIT_CASE_FMT(test_bitops), 772 + KUNIT_CASE_FMT(test_best_pgsize), 773 + KUNIT_CASE_FMT(test_table_ptr), 774 + KUNIT_CASE_FMT(test_max_va), 775 + KUNIT_CASE_FMT(test_table_radix), 776 + KUNIT_CASE_FMT(test_entry_possible_sizes), 777 + KUNIT_CASE_FMT(test_entry_oa), 778 + KUNIT_CASE_FMT(test_attr_from_entry), 779 + #ifdef pt_entry_is_write_dirty 780 + KUNIT_CASE_FMT(test_dirty), 781 + #endif 782 + #ifdef pt_sw_bit 783 + KUNIT_CASE_FMT(test_sw_bit_leaf), 784 + KUNIT_CASE_FMT(test_sw_bit_table), 785 + #endif 786 + {}, 787 + }; 788 + 789 + static int pt_kunit_generic_pt_init(struct kunit *test) 790 + { 791 + struct kunit_iommu_priv *priv; 792 + int ret; 793 + 794 + priv = kunit_kzalloc(test, sizeof(*priv), GFP_KERNEL); 795 + if (!priv) 796 + return -ENOMEM; 797 + ret = pt_kunit_priv_init(test, priv); 798 + if (ret) { 799 + kunit_kfree(test, priv); 800 + return ret; 801 + } 802 + test->priv = priv; 803 + return 0; 804 + } 805 + 806 + static void pt_kunit_generic_pt_exit(struct kunit *test) 807 + { 808 + struct kunit_iommu_priv *priv = test->priv; 809 + 810 + if (!test->priv) 811 + return; 812 + 813 + pt_iommu_deinit(priv->iommu); 814 + kunit_kfree(test, test->priv); 815 + } 816 + 817 + static struct kunit_suite NS(generic_pt_suite) = { 818 + .name = __stringify(NS(fmt_test)), 819 + .init = pt_kunit_generic_pt_init, 820 + .exit = 
pt_kunit_generic_pt_exit, 821 + .test_cases = generic_pt_test_cases, 822 + }; 823 + kunit_test_suites(&NS(generic_pt_suite));
drivers/iommu/generic_pt/kunit_iommu.h (+184 lines)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
 */
#ifndef __GENERIC_PT_KUNIT_IOMMU_H
#define __GENERIC_PT_KUNIT_IOMMU_H

#define GENERIC_PT_KUNIT 1
#include <kunit/device.h>
#include <kunit/test.h>
#include "../iommu-pages.h"
#include "pt_iter.h"

#define pt_iommu_table_cfg CONCATENATE(pt_iommu_table, _cfg)
#define pt_iommu_init CONCATENATE(CONCATENATE(pt_iommu_, PTPFX), init)
int pt_iommu_init(struct pt_iommu_table *fmt_table,
		  const struct pt_iommu_table_cfg *cfg, gfp_t gfp);

/* The format can provide a list of configurations it would like to test */
#ifdef kunit_fmt_cfgs
static const void *kunit_pt_gen_params_cfg(struct kunit *test, const void *prev,
					   char *desc)
{
	uintptr_t cfg_id = (uintptr_t)prev;

	cfg_id++;
	if (cfg_id >= ARRAY_SIZE(kunit_fmt_cfgs) + 1)
		return NULL;
	snprintf(desc, KUNIT_PARAM_DESC_SIZE, "%s_cfg_%u",
		 __stringify(PTPFX_RAW), (unsigned int)(cfg_id - 1));
	return (void *)cfg_id;
}
#define KUNIT_CASE_FMT(test_name) \
	KUNIT_CASE_PARAM(test_name, kunit_pt_gen_params_cfg)
#else
#define KUNIT_CASE_FMT(test_name) KUNIT_CASE(test_name)
#endif

#define KUNIT_ASSERT_NO_ERRNO(test, ret) \
	KUNIT_ASSERT_EQ_MSG(test, ret, 0, KUNIT_SUBSUBTEST_INDENT "errno %pe", \
			    ERR_PTR(ret))

#define KUNIT_ASSERT_NO_ERRNO_FN(test, fn, ret) \
	KUNIT_ASSERT_EQ_MSG(test, ret, 0, \
			    KUNIT_SUBSUBTEST_INDENT "errno %pe from %s", \
			    ERR_PTR(ret), fn)

/*
 * When the test is run on a 32 bit system unsigned long can be 32 bits. This
 * causes the iommu op signatures to be restricted to 32 bits, meaning the test
 * has to be mindful not to create any VAs over the 32 bit limit. Reduce the
 * scope of the testing, as the main purpose of checking on full 32 bit is to
 * look for 32bitisms in the core code. Run the test on i386 with X86_PAE=y to
 * get the full coverage when dma_addr_t & phys_addr_t are 8 bytes.
 */
#define IS_32BIT (sizeof(unsigned long) == 4)

struct kunit_iommu_priv {
	union {
		struct iommu_domain domain;
		struct pt_iommu_table fmt_table;
	};
	spinlock_t top_lock;
	struct device *dummy_dev;
	struct pt_iommu *iommu;
	struct pt_common *common;
	struct pt_iommu_table_cfg cfg;
	struct pt_iommu_info info;
	unsigned int smallest_pgsz_lg2;
	pt_vaddr_t smallest_pgsz;
	unsigned int largest_pgsz_lg2;
	pt_oaddr_t test_oa;
	pt_vaddr_t safe_pgsize_bitmap;
	unsigned long orig_nr_secondary_pagetable;
};
PT_IOMMU_CHECK_DOMAIN(struct kunit_iommu_priv, fmt_table.iommu, domain);

static void pt_kunit_iotlb_sync(struct iommu_domain *domain,
				struct iommu_iotlb_gather *gather)
{
	iommu_put_pages_list(&gather->freelist);
}

#define IOMMU_PT_DOMAIN_OPS1(x) IOMMU_PT_DOMAIN_OPS(x)
static const struct iommu_domain_ops kunit_pt_ops = {
	IOMMU_PT_DOMAIN_OPS1(PTPFX_RAW),
	.iotlb_sync = &pt_kunit_iotlb_sync,
};

static void pt_kunit_change_top(struct pt_iommu *iommu_table,
				phys_addr_t top_paddr, unsigned int top_level)
{
}

static spinlock_t *pt_kunit_get_top_lock(struct pt_iommu *iommu_table)
{
	struct kunit_iommu_priv *priv = container_of(
		iommu_table, struct kunit_iommu_priv, fmt_table.iommu);

	return &priv->top_lock;
}

static const struct pt_iommu_driver_ops pt_kunit_driver_ops = {
	.change_top = &pt_kunit_change_top,
	.get_top_lock = &pt_kunit_get_top_lock,
};

static int pt_kunit_priv_init(struct kunit *test, struct kunit_iommu_priv *priv)
{
	unsigned int va_lg2sz;
	int ret;

	/* Enough so the memory allocator works */
	priv->dummy_dev = kunit_device_register(test, "pt_kunit_dev");
	if (IS_ERR(priv->dummy_dev))
		return PTR_ERR(priv->dummy_dev);
	set_dev_node(priv->dummy_dev, NUMA_NO_NODE);

	spin_lock_init(&priv->top_lock);

#ifdef kunit_fmt_cfgs
	priv->cfg = kunit_fmt_cfgs[((uintptr_t)test->param_value) - 1];
	/*
	 * The format can set a list of features that the kunit_fmt_cfgs
	 * controls, other features default to on.
	 */
	priv->cfg.common.features |= PT_SUPPORTED_FEATURES &
				     (~KUNIT_FMT_FEATURES);
#else
	priv->cfg.common.features = PT_SUPPORTED_FEATURES;
#endif

	/* Defaults, for the kunit */
	if (!priv->cfg.common.hw_max_vasz_lg2)
		priv->cfg.common.hw_max_vasz_lg2 = PT_MAX_VA_ADDRESS_LG2;
	if (!priv->cfg.common.hw_max_oasz_lg2)
		priv->cfg.common.hw_max_oasz_lg2 = pt_max_oa_lg2(NULL);

	priv->fmt_table.iommu.nid = NUMA_NO_NODE;
	priv->fmt_table.iommu.driver_ops = &pt_kunit_driver_ops;
	priv->fmt_table.iommu.iommu_device = priv->dummy_dev;
	priv->domain.ops = &kunit_pt_ops;
	ret = pt_iommu_init(&priv->fmt_table, &priv->cfg, GFP_KERNEL);
	if (ret) {
		if (ret == -EOVERFLOW)
			kunit_skip(test,
				   "This configuration cannot be tested on 32 bit");
		return ret;
	}

	priv->iommu = &priv->fmt_table.iommu;
	priv->common = common_from_iommu(&priv->fmt_table.iommu);
	priv->iommu->ops->get_info(priv->iommu, &priv->info);

	/*
	 * size_t is used to pass the mapping length, it can be 32 bit, truncate
	 * the pagesizes so we don't use large sizes.
	 */
	priv->info.pgsize_bitmap = (size_t)priv->info.pgsize_bitmap;

	priv->smallest_pgsz_lg2 = vaffs(priv->info.pgsize_bitmap);
	priv->smallest_pgsz = log2_to_int(priv->smallest_pgsz_lg2);
	priv->largest_pgsz_lg2 =
		vafls((dma_addr_t)priv->info.pgsize_bitmap) - 1;

	priv->test_oa =
		oalog2_mod(0x74a71445deadbeef, priv->common->max_oasz_lg2);

	/*
	 * We run out of VA space if the mappings get too big, make something
	 * smaller that can safely pass through dma_addr_t API.
	 */
	va_lg2sz = priv->common->max_vasz_lg2;
	if (IS_32BIT && va_lg2sz > 32)
		va_lg2sz = 32;
	priv->safe_pgsize_bitmap =
		log2_mod(priv->info.pgsize_bitmap, va_lg2sz - 1);

	return 0;
}

#endif
drivers/iommu/generic_pt/kunit_iommu_pt.h (+487 lines)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
 */
#include "kunit_iommu.h"
#include "pt_iter.h"
#include <linux/generic_pt/iommu.h>
#include <linux/iommu.h>

static void do_map(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa,
		   pt_vaddr_t len);

struct count_valids {
	u64 per_size[PT_VADDR_MAX_LG2];
};

static int __count_valids(struct pt_range *range, void *arg, unsigned int level,
			  struct pt_table_p *table)
{
	struct pt_state pts = pt_init(range, level, table);
	struct count_valids *valids = arg;

	for_each_pt_level_entry(&pts) {
		if (pts.type == PT_ENTRY_TABLE) {
			pt_descend(&pts, arg, __count_valids);
			continue;
		}
		if (pts.type == PT_ENTRY_OA) {
			valids->per_size[pt_entry_oa_lg2sz(&pts)]++;
			continue;
		}
	}
	return 0;
}

/*
 * Number of valid table entries. This counts contiguous entries as a single
 * valid.
 */
static unsigned int count_valids(struct kunit *test)
{
	struct kunit_iommu_priv *priv = test->priv;
	struct pt_range range = pt_top_range(priv->common);
	struct count_valids valids = {};
	u64 total = 0;
	unsigned int i;

	KUNIT_ASSERT_NO_ERRNO(test,
			      pt_walk_range(&range, __count_valids, &valids));

	for (i = 0; i != ARRAY_SIZE(valids.per_size); i++)
		total += valids.per_size[i];
	return total;
}

/* Only a single page size is present, count the number of valid entries */
static unsigned int count_valids_single(struct kunit *test, pt_vaddr_t pgsz)
{
	struct kunit_iommu_priv *priv = test->priv;
	struct pt_range range = pt_top_range(priv->common);
	struct count_valids valids = {};
	u64 total = 0;
	unsigned int i;

	KUNIT_ASSERT_NO_ERRNO(test,
			      pt_walk_range(&range, __count_valids, &valids));

	for (i = 0; i != ARRAY_SIZE(valids.per_size); i++) {
		if ((1ULL << i) == pgsz)
			total = valids.per_size[i];
		else
			KUNIT_ASSERT_EQ(test, valids.per_size[i], 0);
	}
	return total;
}

static void do_unmap(struct kunit *test, pt_vaddr_t va, pt_vaddr_t len)
{
	struct kunit_iommu_priv *priv = test->priv;
	size_t ret;

	ret = iommu_unmap(&priv->domain, va, len);
	KUNIT_ASSERT_EQ(test, ret, len);
}

static void check_iova(struct kunit *test, pt_vaddr_t va, pt_oaddr_t pa,
		       pt_vaddr_t len)
{
	struct kunit_iommu_priv *priv = test->priv;
	pt_vaddr_t pfn = log2_div(va, priv->smallest_pgsz_lg2);
	pt_vaddr_t end_pfn = pfn + log2_div(len, priv->smallest_pgsz_lg2);

	for (; pfn != end_pfn; pfn++) {
		phys_addr_t res = iommu_iova_to_phys(&priv->domain,
						     pfn * priv->smallest_pgsz);

		KUNIT_ASSERT_EQ(test, res, (phys_addr_t)pa);
		if (res != pa)
			break;
		pa += priv->smallest_pgsz;
	}
}

static void test_increase_level(struct kunit *test)
{
	struct kunit_iommu_priv *priv = test->priv;
	struct pt_common *common = priv->common;

	if (!pt_feature(common, PT_FEAT_DYNAMIC_TOP))
		kunit_skip(test, "PT_FEAT_DYNAMIC_TOP not set for this format");

	if (IS_32BIT)
		kunit_skip(test, "Unable to test on 32bit");

	KUNIT_ASSERT_GT(test, common->max_vasz_lg2,
			pt_top_range(common).max_vasz_lg2);

	/* Add every possible level to the max */
	while (common->max_vasz_lg2 != pt_top_range(common).max_vasz_lg2) {
		struct pt_range top_range = pt_top_range(common);

		if (top_range.va == 0)
			do_map(test, top_range.last_va + 1, 0,
			       priv->smallest_pgsz);
		else
			do_map(test, top_range.va - priv->smallest_pgsz, 0,
			       priv->smallest_pgsz);

		KUNIT_ASSERT_EQ(test, pt_top_range(common).top_level,
				top_range.top_level + 1);
		KUNIT_ASSERT_GE(test, common->max_vasz_lg2,
				pt_top_range(common).max_vasz_lg2);
	}
}

static void test_map_simple(struct kunit *test)
{
	struct kunit_iommu_priv *priv = test->priv;
	struct pt_range range = pt_top_range(priv->common);
	struct count_valids valids = {};
	pt_vaddr_t pgsize_bitmap = priv->safe_pgsize_bitmap;
	unsigned int pgsz_lg2;
	pt_vaddr_t cur_va;

	/* Map every reported page size */
	cur_va = range.va + priv->smallest_pgsz * 256;
	for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
		pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, pgsz_lg2);
		u64 len = log2_to_int(pgsz_lg2);

		if (!(pgsize_bitmap & len))
			continue;

		cur_va = ALIGN(cur_va, len);
		do_map(test, cur_va, paddr, len);
		if (len <= SZ_2G)
			check_iova(test, cur_va, paddr, len);
		cur_va += len;
	}

	/* The read interface reports that every page size was created */
	range = pt_top_range(priv->common);
	KUNIT_ASSERT_NO_ERRNO(test,
			      pt_walk_range(&range, __count_valids, &valids));
	for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
		if (pgsize_bitmap & (1ULL << pgsz_lg2))
			KUNIT_ASSERT_EQ(test, valids.per_size[pgsz_lg2], 1);
		else
			KUNIT_ASSERT_EQ(test, valids.per_size[pgsz_lg2], 0);
	}

	/* Unmap works */
	range = pt_top_range(priv->common);
	cur_va = range.va + priv->smallest_pgsz * 256;
	for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
		u64 len = log2_to_int(pgsz_lg2);

		if (!(pgsize_bitmap & len))
			continue;
		cur_va = ALIGN(cur_va, len);
		do_unmap(test, cur_va, len);
		cur_va += len;
	}
	KUNIT_ASSERT_EQ(test, count_valids(test), 0);
}

/*
 * Test to convert a table pointer into an OA by mapping something small,
 * unmapping it so as to leave behind a table pointer, then mapping something
 * larger that will convert the table into an OA.
 */
static void test_map_table_to_oa(struct kunit *test)
{
	struct kunit_iommu_priv *priv = test->priv;
	pt_vaddr_t limited_pgbitmap =
		priv->info.pgsize_bitmap % (IS_32BIT ? SZ_2G : SZ_16G);
	struct pt_range range = pt_top_range(priv->common);
	unsigned int pgsz_lg2;
	pt_vaddr_t max_pgsize;
	pt_vaddr_t cur_va;

	max_pgsize = 1ULL << (vafls(limited_pgbitmap) - 1);
	KUNIT_ASSERT_TRUE(test, priv->info.pgsize_bitmap & max_pgsize);

	for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
		pt_oaddr_t paddr = log2_set_mod(priv->test_oa, 0, pgsz_lg2);
		u64 len = log2_to_int(pgsz_lg2);
		pt_vaddr_t offset;

		if (!(priv->info.pgsize_bitmap & len))
			continue;
		if (len > max_pgsize)
			break;

		cur_va = ALIGN(range.va + priv->smallest_pgsz * 256,
			       max_pgsize);
		for (offset = 0; offset != max_pgsize; offset += len)
			do_map(test, cur_va + offset, paddr + offset, len);
		check_iova(test, cur_va, paddr, max_pgsize);
		KUNIT_ASSERT_EQ(test, count_valids_single(test, len),
				log2_div(max_pgsize, pgsz_lg2));

		if (len == max_pgsize) {
			do_unmap(test, cur_va, max_pgsize);
		} else {
			do_unmap(test, cur_va, max_pgsize / 2);
			for (offset = max_pgsize / 2; offset != max_pgsize;
			     offset += len)
				do_unmap(test, cur_va + offset, len);
		}

		KUNIT_ASSERT_EQ(test, count_valids(test), 0);
	}
}

/*
 * Test unmapping a small page at the start of a large page. This always unmaps
 * the large page.
 */
static void test_unmap_split(struct kunit *test)
{
	struct kunit_iommu_priv *priv = test->priv;
	struct pt_range top_range = pt_top_range(priv->common);
	pt_vaddr_t pgsize_bitmap = priv->safe_pgsize_bitmap;
	unsigned int pgsz_lg2;
	unsigned int count = 0;

	for (pgsz_lg2 = 0; pgsz_lg2 != PT_VADDR_MAX_LG2; pgsz_lg2++) {
		pt_vaddr_t base_len = log2_to_int(pgsz_lg2);
		unsigned int next_pgsz_lg2;

		if (!(pgsize_bitmap & base_len))
			continue;

		for (next_pgsz_lg2 = pgsz_lg2 + 1;
		     next_pgsz_lg2 != PT_VADDR_MAX_LG2; next_pgsz_lg2++) {
			pt_vaddr_t next_len = log2_to_int(next_pgsz_lg2);
			pt_vaddr_t vaddr = top_range.va;
			pt_oaddr_t paddr = 0;
			size_t gnmapped;

			if (!(pgsize_bitmap & next_len))
				continue;

			do_map(test, vaddr, paddr, next_len);
			gnmapped = iommu_unmap(&priv->domain, vaddr, base_len);
			KUNIT_ASSERT_EQ(test, gnmapped, next_len);

			/* Make sure unmap doesn't keep going */
			do_map(test, vaddr, paddr, next_len);
			do_map(test, vaddr + next_len, paddr, next_len);
			gnmapped = iommu_unmap(&priv->domain, vaddr, base_len);
			KUNIT_ASSERT_EQ(test, gnmapped, next_len);
			gnmapped = iommu_unmap(&priv->domain, vaddr + next_len,
					       next_len);
			KUNIT_ASSERT_EQ(test, gnmapped, next_len);

			count++;
		}
	}

	if (count == 0)
		kunit_skip(test, "Test needs two page sizes");
}

static void unmap_collisions(struct kunit *test, struct maple_tree *mt,
			     pt_vaddr_t start, pt_vaddr_t last)
{
	struct kunit_iommu_priv *priv = test->priv;
	MA_STATE(mas, mt, start, last);
	void *entry;

	mtree_lock(mt);
	mas_for_each(&mas, entry, last) {
		pt_vaddr_t mas_start = mas.index;
		pt_vaddr_t len = (mas.last - mas_start) + 1;
		pt_oaddr_t paddr;

		mas_erase(&mas);
		mas_pause(&mas);
		mtree_unlock(mt);

		paddr = oalog2_mod(mas_start, priv->common->max_oasz_lg2);
		check_iova(test, mas_start, paddr, len);
		do_unmap(test, mas_start, len);
		mtree_lock(mt);
	}
	mtree_unlock(mt);
}

static void clamp_range(struct kunit *test, struct pt_range *range)
{
	struct kunit_iommu_priv *priv = test->priv;

	if (range->last_va - range->va > SZ_1G)
		range->last_va = range->va + SZ_1G;
	KUNIT_ASSERT_NE(test, range->last_va, PT_VADDR_MAX);
	if (range->va <= MAPLE_RESERVED_RANGE)
		range->va =
			ALIGN(MAPLE_RESERVED_RANGE, priv->smallest_pgsz);
}

/*
 * Randomly map and unmap ranges that can use large physical pages. If a random
 * range overlaps with existing ranges then unmap them. This hits all the
 * special cases.
 */
static void test_random_map(struct kunit *test)
{
	struct kunit_iommu_priv *priv = test->priv;
	struct pt_range upper_range = pt_upper_range(priv->common);
	struct pt_range top_range = pt_top_range(priv->common);
	struct maple_tree mt;
	unsigned int iter;

	mt_init(&mt);

	/*
	 * Shrink the range so randomization is more likely to have
	 * intersections
	 */
	clamp_range(test, &top_range);
	clamp_range(test, &upper_range);

	for (iter = 0; iter != 1000; iter++) {
		struct pt_range *range = &top_range;
		pt_oaddr_t paddr;
		pt_vaddr_t start;
		pt_vaddr_t end;
		int ret;

		if (pt_feature(priv->common, PT_FEAT_SIGN_EXTEND) &&
		    ULONG_MAX >= PT_VADDR_MAX && get_random_u32_inclusive(0, 1))
			range = &upper_range;

		start = get_random_u32_below(
			min(U32_MAX, range->last_va - range->va));
		end = get_random_u32_below(
			min(U32_MAX, range->last_va - start));

		start = ALIGN_DOWN(start, priv->smallest_pgsz);
		end = ALIGN(end, priv->smallest_pgsz);
		start += range->va;
		end += start;
		if (start < range->va || end > range->last_va + 1 ||
		    start >= end)
			continue;

		/* Try overmapping to test the failure handling */
		paddr = oalog2_mod(start, priv->common->max_oasz_lg2);
		ret = iommu_map(&priv->domain, start, paddr, end - start,
				IOMMU_READ | IOMMU_WRITE, GFP_KERNEL);
		if (ret) {
			KUNIT_ASSERT_EQ(test, ret, -EADDRINUSE);
			unmap_collisions(test, &mt, start, end - 1);
			do_map(test, start, paddr, end - start);
		}

		KUNIT_ASSERT_NO_ERRNO_FN(test, "mtree_insert_range",
					 mtree_insert_range(&mt, start, end - 1,
							    XA_ZERO_ENTRY,
							    GFP_KERNEL));

		check_iova(test, start, paddr, end - start);
		if (iter % 100)
			cond_resched();
	}

	unmap_collisions(test, &mt, 0, PT_VADDR_MAX);
	KUNIT_ASSERT_EQ(test, count_valids(test), 0);

	mtree_destroy(&mt);
}

/* See https://lore.kernel.org/r/b9b18a03-63a2-4065-a27e-d92dd5c860bc@amd.com */
static void test_pgsize_boundary(struct kunit *test)
{
	struct kunit_iommu_priv *priv = test->priv;
	struct pt_range top_range = pt_top_range(priv->common);

	if (top_range.va != 0 || top_range.last_va < 0xfef9ffff ||
	    priv->smallest_pgsz != SZ_4K)
		kunit_skip(test, "Format does not have the required range");

	do_map(test, 0xfef80000, 0x208b95d000, 0xfef9ffff - 0xfef80000 + 1);
}

/* See https://lore.kernel.org/r/20250826143816.38686-1-eugkoira@amazon.com */
static void test_mixed(struct kunit *test)
{
	struct kunit_iommu_priv *priv = test->priv;
	struct pt_range top_range = pt_top_range(priv->common);
	u64 start = 0x3fe400ULL << 12;
	u64 end = 0x4c0600ULL << 12;
	pt_vaddr_t len = end - start;
	pt_oaddr_t oa = start;

	if (top_range.last_va <= start || sizeof(unsigned long) == 4)
		kunit_skip(test, "range is too small");
	if ((priv->safe_pgsize_bitmap & GENMASK(30, 21)) != (BIT(30) | BIT(21)))
		kunit_skip(test, "incompatible psize");

	do_map(test, start, oa, len);
	/* 14 2M, 3 1G, 3 2M */
	KUNIT_ASSERT_EQ(test, count_valids(test), 20);
	check_iova(test, start, oa, len);
}

static struct kunit_case iommu_test_cases[] = {
	KUNIT_CASE_FMT(test_increase_level),
	KUNIT_CASE_FMT(test_map_simple),
	KUNIT_CASE_FMT(test_map_table_to_oa),
	KUNIT_CASE_FMT(test_unmap_split),
	KUNIT_CASE_FMT(test_random_map),
	KUNIT_CASE_FMT(test_pgsize_boundary),
	KUNIT_CASE_FMT(test_mixed),
	{},
};

static int pt_kunit_iommu_init(struct kunit *test)
{
	struct kunit_iommu_priv *priv;
	int ret;

	priv = kunit_kzalloc(test, sizeof(*priv), GFP_KERNEL);
	if (!priv)
		return -ENOMEM;

	priv->orig_nr_secondary_pagetable =
		global_node_page_state(NR_SECONDARY_PAGETABLE);
	ret = pt_kunit_priv_init(test, priv);
	if (ret) {
		kunit_kfree(test, priv);
		return ret;
	}
	test->priv = priv;
	return 0;
}

static void pt_kunit_iommu_exit(struct kunit *test)
{
	struct kunit_iommu_priv *priv = test->priv;

	if (!test->priv)
		return;

	pt_iommu_deinit(priv->iommu);
	/*
	 * Look for memory leaks, assumes kunit is running isolated and nothing
	 * else is using secondary page tables.
	 */
	KUNIT_ASSERT_EQ(test, priv->orig_nr_secondary_pagetable,
			global_node_page_state(NR_SECONDARY_PAGETABLE));
	kunit_kfree(test, test->priv);
}

static struct kunit_suite NS(iommu_suite) = {
	.name = __stringify(NS(iommu_test)),
	.init = pt_kunit_iommu_init,
	.exit = pt_kunit_iommu_exit,
	.test_cases = iommu_test_cases,
};
kunit_test_suites(&NS(iommu_suite));

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Kunit for generic page table");
MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
drivers/iommu/generic_pt/pt_common.h (+389 lines)
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
 *
 * This header is included after the format. It contains definitions
 * that build on the format definitions to create the basic format API.
 *
 * The format API is listed here, with kdocs. The functions without bodies are
 * implemented in the format using the pattern:
 *     static inline FMTpt_XXX(..) {..}
 *     #define pt_XXX FMTpt_XXX
 *
 * If the format doesn't implement a function then pt_fmt_defaults.h can provide
 * a generic version.
 *
 * The routines marked "@pts: Entry to query" operate on the entire contiguous
 * entry and can be called with a pts->index pointing to any sub item that makes
 * up that entry.
 *
 * The header order is:
 *     pt_defs.h
 *     FMT.h
 *     pt_common.h
 */
#ifndef __GENERIC_PT_PT_COMMON_H
#define __GENERIC_PT_PT_COMMON_H

#include "pt_defs.h"
#include "pt_fmt_defaults.h"

/**
 * pt_attr_from_entry() - Convert the permission bits back to attrs
 * @pts: Entry to convert from
 * @attrs: Resulting attrs
 *
 * Fill in the attrs with the permission bits encoded in the current leaf entry.
 * The attrs should be usable with pt_install_leaf_entry() to reconstruct the
 * same entry.
 */
static inline void pt_attr_from_entry(const struct pt_state *pts,
                                      struct pt_write_attrs *attrs);

/**
 * pt_can_have_leaf() - True if the current level can have an OA entry
 * @pts: The current level
 *
 * True if the current level can support pt_install_leaf_entry(). A leaf
 * entry produces an OA.
 */
static inline bool pt_can_have_leaf(const struct pt_state *pts);

/**
 * pt_can_have_table() - True if the current level can have a lower table
 * @pts: The current level
 *
 * Every level except 0 is allowed to have a lower table.
 */
static inline bool pt_can_have_table(const struct pt_state *pts)
{
        /* No further tables at level 0 */
        return pts->level > 0;
}

/**
 * pt_clear_entries() - Make entries empty (non-present)
 * @pts: Starting table index
 * @num_contig_lg2: Number of contiguous items to clear
 *
 * Clear a run of entries. A cleared entry will load back as PT_ENTRY_EMPTY
 * and does not have any effect on table walking. The starting index must be
 * aligned to num_contig_lg2.
 */
static inline void pt_clear_entries(struct pt_state *pts,
                                    unsigned int num_contig_lg2);

/**
 * pt_entry_make_write_dirty() - Make an entry dirty
 * @pts: Table entry to change
 *
 * Make pt_entry_is_write_dirty() return true for this entry. This can be called
 * asynchronously with any other table manipulation under an RCU lock and must
 * not corrupt the table.
 */
static inline bool pt_entry_make_write_dirty(struct pt_state *pts);

/**
 * pt_entry_make_write_clean() - Make the entry write clean
 * @pts: Table entry to change
 *
 * Modify the entry so that pt_entry_is_write_dirty() == false. The HW will
 * eventually be notified of this change via a TLB flush, which is the point
 * that the HW must become synchronized. Any "write dirty" prior to the TLB
 * flush can be lost, but once the TLB flush completes all writes must make
 * their entries write dirty.
 *
 * The format should alter the entry in a way that is compatible with any
 * concurrent update from HW. The entire contiguous entry is changed.
 */
static inline void pt_entry_make_write_clean(struct pt_state *pts);

/**
 * pt_entry_is_write_dirty() - True if the entry has been written to
 * @pts: Entry to query
 *
 * "write dirty" means that the HW has written to the OA translated
 * by this entry. If the entry is contiguous then the consolidated
 * "write dirty" for all the items must be returned.
 */
static inline bool pt_entry_is_write_dirty(const struct pt_state *pts);

/**
 * pt_dirty_supported() - True if the page table supports dirty tracking
 * @common: Page table to query
 */
static inline bool pt_dirty_supported(struct pt_common *common);

/**
 * pt_entry_num_contig_lg2() - Number of contiguous items for this leaf entry
 * @pts: Entry to query
 *
 * Return the number of contiguous items this leaf entry spans. If the entry
 * is a single item it returns ilog2(1).
 */
static inline unsigned int pt_entry_num_contig_lg2(const struct pt_state *pts);

/**
 * pt_entry_oa() - Output Address for this leaf entry
 * @pts: Entry to query
 *
 * Return the output address for the start of the entry. If the entry
 * is contiguous this returns the same value for each sub-item. I.e.::
 *
 *     log2_mod(pt_entry_oa(), pt_entry_oa_lg2sz()) == 0
 *
 * See pt_item_oa(). The format should implement one of these two functions
 * depending on how it stores the OAs in the table.
 */
static inline pt_oaddr_t pt_entry_oa(const struct pt_state *pts);

/**
 * pt_entry_oa_lg2sz() - Return the size of an OA entry
 * @pts: Entry to query
 *
 * If the entry is not contiguous this returns pt_table_item_lg2sz(), otherwise
 * it returns the total VA/OA size of the entire contiguous entry.
 */
static inline unsigned int pt_entry_oa_lg2sz(const struct pt_state *pts)
{
        return pt_entry_num_contig_lg2(pts) + pt_table_item_lg2sz(pts);
}

/**
 * pt_entry_oa_exact() - Return the complete OA for an entry
 * @pts: Entry to query
 *
 * During iteration the first entry could have a VA with an offset from the
 * natural start of the entry. Return the exact OA including the pts's VA
 * offset.
 */
static inline pt_oaddr_t pt_entry_oa_exact(const struct pt_state *pts)
{
        return _pt_entry_oa_fast(pts) |
               log2_mod(pts->range->va, pt_entry_oa_lg2sz(pts));
}

/**
 * pt_full_va_prefix() - The top bits of the VA
 * @common: Page table to query
 *
 * This is usually 0, but some formats have their VA space going downward from
 * PT_VADDR_MAX, and will return that instead. This value must always be
 * adjusted by struct pt_common max_vasz_lg2.
 */
static inline pt_vaddr_t pt_full_va_prefix(const struct pt_common *common);

/**
 * pt_has_system_page_size() - True if level 0 can install a PAGE_SHIFT entry
 * @common: Page table to query
 *
 * If true the caller can use, at level 0, pt_install_leaf_entry(PAGE_SHIFT).
 * This is useful to create optimized paths for common cases of PAGE_SIZE
 * mappings.
 */
static inline bool pt_has_system_page_size(const struct pt_common *common);

/**
 * pt_install_leaf_entry() - Write a leaf entry to the table
 * @pts: Table index to change
 * @oa: Output Address for this leaf
 * @oasz_lg2: Size in VA/OA for this leaf
 * @attrs: Attributes to modify the entry
 *
 * A leaf OA entry will return PT_ENTRY_OA from pt_load_entry(). It translates
 * the VA indicated by pts to the given OA.
 *
 * For a single item non-contiguous entry oasz_lg2 is pt_table_item_lg2sz().
 * For contiguous it is pt_table_item_lg2sz() + num_contig_lg2.
 *
 * This must not be called if pt_can_have_leaf() == false. Contiguous sizes
 * not indicated by pt_possible_sizes() must not be specified.
 */
static inline void pt_install_leaf_entry(struct pt_state *pts, pt_oaddr_t oa,
                                         unsigned int oasz_lg2,
                                         const struct pt_write_attrs *attrs);

/**
 * pt_install_table() - Write a table entry to the table
 * @pts: Table index to change
 * @table_pa: CPU physical address of the lower table's memory
 * @attrs: Attributes to modify the table index
 *
 * A table entry will return PT_ENTRY_TABLE from pt_load_entry(). The table_pa
 * is the table at pts->level - 1. This is done by cmpxchg so pts must have the
 * current entry loaded. The pts is updated with the installed entry.
 *
 * This must not be called if pt_can_have_table() == false.
 *
 * Returns: true if the table was installed successfully.
 */
static inline bool pt_install_table(struct pt_state *pts, pt_oaddr_t table_pa,
                                    const struct pt_write_attrs *attrs);

/**
 * pt_item_oa() - Output Address for this leaf item
 * @pts: Item to query
 *
 * Return the output address for this item. If the item is part of a contiguous
 * entry it returns the value of the OA for this individual sub item.
 *
 * See pt_entry_oa(). The format should implement one of these two functions
 * depending on how it stores the OAs in the table.
 */
static inline pt_oaddr_t pt_item_oa(const struct pt_state *pts);

/**
 * pt_load_entry_raw() - Read from the location pts points at into the pts
 * @pts: Table index to load
 *
 * Return the type of entry that was loaded. pts->entry will be filled in with
 * the entry's content. See pt_load_entry().
 */
static inline enum pt_entry_type pt_load_entry_raw(struct pt_state *pts);

/**
 * pt_max_oa_lg2() - Return the maximum OA the table format can hold
 * @common: Page table to query
 *
 * The value oalog2_to_max_int(pt_max_oa_lg2()) is the MAX for the
 * OA. This is the absolute maximum address the table can hold. struct pt_common
 * max_oasz_lg2 sets a lower dynamic maximum based on HW capability.
 */
static inline unsigned int
pt_max_oa_lg2(const struct pt_common *common);

/**
 * pt_num_items_lg2() - Return the number of items in this table level
 * @pts: The current level
 *
 * The number of items in a table level defines the number of bits this level
 * decodes from the VA. This function is not called for the top level,
 * so it does not need to compute a special value for the top case. The
 * result for the top is based on pt_common max_vasz_lg2.
 *
 * The value is used as part of determining the table indexes via the
 * equation::
 *
 *     log2_mod(log2_div(VA, pt_table_item_lg2sz()), pt_num_items_lg2())
 */
static inline unsigned int pt_num_items_lg2(const struct pt_state *pts);

/**
 * pt_pgsz_lg2_to_level() - Return the level that maps the page size
 * @common: Page table to query
 * @pgsize_lg2: Log2 page size
 *
 * Returns the table level that will map the given page size. The page
 * size must be part of pt_possible_sizes() for some level.
 */
static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common,
                                                unsigned int pgsize_lg2);

/**
 * pt_possible_sizes() - Return a bitmap of possible output sizes at this level
 * @pts: The current level
 *
 * Each level has a list of possible output sizes that can be installed as
 * leaf entries. If pt_can_have_leaf() is false returns zero.
 *
 * Otherwise the bit in position pt_table_item_lg2sz() should be set indicating
 * that a non-contiguous single item leaf entry is supported. The following
 * pt_num_items_lg2() number of bits can be set indicating contiguous entries
 * are supported. Bit pt_table_item_lg2sz() + pt_num_items_lg2() must not be
 * set; contiguous entries cannot span the entire table.
 *
 * The OR of pt_possible_sizes() of all levels is the typical bitmask of all
 * supported sizes in the entire table.
 */
static inline pt_vaddr_t pt_possible_sizes(const struct pt_state *pts);

/**
 * pt_table_item_lg2sz() - Size of a single item entry in this table level
 * @pts: The current level
 *
 * The size of the item specifies how much VA and OA a single item occupies.
 *
 * See pt_entry_oa_lg2sz() for the same value including the effect of contiguous
 * entries.
 */
static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts);

/**
 * pt_table_oa_lg2sz() - Return the VA/OA size of the entire table
 * @pts: The current level
 *
 * Return the size of VA decoded by the entire table level.
 */
static inline unsigned int pt_table_oa_lg2sz(const struct pt_state *pts)
{
        if (pts->range->top_level == pts->level)
                return pts->range->max_vasz_lg2;
        return min_t(unsigned int, pts->range->common->max_vasz_lg2,
                     pt_num_items_lg2(pts) + pt_table_item_lg2sz(pts));
}

/**
 * pt_table_pa() - Return the CPU physical address of the table entry
 * @pts: Entry to query
 *
 * This is only ever called on PT_ENTRY_TABLE entries. Must return the same
 * value passed to pt_install_table().
 */
static inline pt_oaddr_t pt_table_pa(const struct pt_state *pts);

/**
 * pt_table_ptr() - Return a CPU pointer for a table item
 * @pts: Entry to query
 *
 * Same as pt_table_pa() but returns a CPU pointer.
 */
static inline struct pt_table_p *pt_table_ptr(const struct pt_state *pts)
{
        return __va(pt_table_pa(pts));
}

/**
 * pt_max_sw_bit() - Return the maximum software bit usable for any level and
 *                   entry
 * @common: Page table
 *
 * The swbit can be passed as bitnr to the other sw_bit functions.
 */
static inline unsigned int pt_max_sw_bit(struct pt_common *common);

/**
 * pt_test_sw_bit_acquire() - Read a software bit in an item
 * @pts: Entry to read
 * @bitnr: Bit to read
 *
 * Software bits are ignored by HW and can be used for any purpose by the
 * software. This does a test bit and acquire operation.
 */
static inline bool pt_test_sw_bit_acquire(struct pt_state *pts,
                                          unsigned int bitnr);

/**
 * pt_set_sw_bit_release() - Set a software bit in an item
 * @pts: Entry to set
 * @bitnr: Bit to set
 *
 * Software bits are ignored by HW and can be used for any purpose by the
 * software. This does a set bit and release operation.
 */
static inline void pt_set_sw_bit_release(struct pt_state *pts,
                                         unsigned int bitnr);

/**
 * pt_load_entry() - Read from the location pts points at into the pts
 * @pts: Table index to load
 *
 * Set the type of entry that was loaded. pts->entry and pts->table_lower
 * will be filled in with the entry's content.
 */
static inline void pt_load_entry(struct pt_state *pts)
{
        pts->type = pt_load_entry_raw(pts);
        if (pts->type == PT_ENTRY_TABLE)
                pts->table_lower = pt_table_ptr(pts);
}
#endif
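The "FMTpt_XXX / #define pt_XXX FMTpt_XXX" pattern from the header comment above can be sketched in isolation. Everything in this sketch is an illustrative assumption: the "amdv1" prefix, the cut-down stand-in for struct pt_state, and the level rule are invented for demonstration and are not the real format code.

```c
#include <stdbool.h>

/* Minimal stand-in for struct pt_state; the real one lives in pt_defs.h */
struct pt_state {
        unsigned char level;
};

/*
 * A format (here a hypothetical "amdv1") implements the hook under its own
 * prefix, then maps the generic name onto it. pt_common.h and the shared
 * iteration code only ever refer to the generic pt_XXX name.
 */
static inline bool amdv1pt_can_have_leaf(const struct pt_state *pts)
{
        /* Illustrative rule only: pretend leaves are allowed at levels 0..2 */
        return pts->level <= 2;
}
#define pt_can_have_leaf amdv1pt_can_have_leaf
```

Because the mapping is a plain #define resolved at compile time, each instantiation of the generic code inlines its format's hooks with no indirect calls.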
drivers/iommu/generic_pt/pt_defs.h
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
 *
 * This header is included before the format. It contains definitions
 * that are required to compile the format. The header order is:
 *     pt_defs.h
 *     fmt_XX.h
 *     pt_common.h
 */
#ifndef __GENERIC_PT_DEFS_H
#define __GENERIC_PT_DEFS_H

#include <linux/generic_pt/common.h>

#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/bits.h>
#include <linux/limits.h>
#include <linux/bug.h>
#include <linux/kconfig.h>
#include "pt_log2.h"

/* Header self-compile default defines */
#ifndef pt_write_attrs
typedef u64 pt_vaddr_t;
typedef u64 pt_oaddr_t;
#endif

struct pt_table_p;

enum {
        PT_VADDR_MAX = sizeof(pt_vaddr_t) == 8 ? U64_MAX : U32_MAX,
        PT_VADDR_MAX_LG2 = sizeof(pt_vaddr_t) == 8 ? 64 : 32,
        PT_OADDR_MAX = sizeof(pt_oaddr_t) == 8 ? U64_MAX : U32_MAX,
        PT_OADDR_MAX_LG2 = sizeof(pt_oaddr_t) == 8 ? 64 : 32,
};

/*
 * The format instantiation can have features wired off or on to optimize the
 * code gen. Supported features are just a reflection of what the current set of
 * kernel users want to use.
 */
#ifndef PT_SUPPORTED_FEATURES
#define PT_SUPPORTED_FEATURES 0
#endif

/*
 * When in debug mode we compile all formats with all features. This allows the
 * kunit to test the full matrix. SIGN_EXTEND can't co-exist with DYNAMIC_TOP or
 * FULL_VA. DMA_INCOHERENT requires a SW bit that not all formats have.
 */
#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)
enum {
        PT_ORIG_SUPPORTED_FEATURES = PT_SUPPORTED_FEATURES,
        PT_DEBUG_SUPPORTED_FEATURES =
                UINT_MAX &
                ~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_DMA_INCOHERENT) ?
                           0 :
                           BIT(PT_FEAT_DMA_INCOHERENT))) &
                ~((PT_ORIG_SUPPORTED_FEATURES & BIT(PT_FEAT_SIGN_EXTEND)) ?
                          BIT(PT_FEAT_DYNAMIC_TOP) | BIT(PT_FEAT_FULL_VA) :
                          BIT(PT_FEAT_SIGN_EXTEND)),
};
#undef PT_SUPPORTED_FEATURES
#define PT_SUPPORTED_FEATURES PT_DEBUG_SUPPORTED_FEATURES
#endif

#ifndef PT_FORCE_ENABLED_FEATURES
#define PT_FORCE_ENABLED_FEATURES 0
#endif

/**
 * DOC: Generic Page Table Language
 *
 * Language used in Generic Page Table
 * VA
 *     The input address to the page table, often the virtual address.
 * OA
 *     The output address from the page table, often the physical address.
 * leaf
 *     An entry that results in an output address.
 * start/end
 *     A half-open range, e.g. [0,0) refers to no VA.
 * start/last
 *     An inclusive closed range, e.g. [0,0] refers to the VA 0.
 * common
 *     The generic page table container struct pt_common.
 * level
 *     Level 0 is always a table of only leaves with no further table pointers.
 *     Increasing levels increase the size of the table items. The least
 *     significant VA bits used to index page tables are used to index the Level
 *     0 table. The various labels for table levels used by HW descriptions are
 *     not used.
 * top_level
 *     The inclusive highest level of the table. A two-level table
 *     has a top level of 1.
 * table
 *     A linear array of translation items for that level.
 * index
 *     The position in a table of an element: item = table[index]
 * item
 *     A single index in a table.
 * entry
 *     A single logical element in a table. If contiguous pages are not
 *     supported then item and entry are the same thing, otherwise entry refers
 *     to all the items that comprise a single contiguous translation.
 * item/entry_size
 *     The number of bytes of VA the table index translates for.
 *     If the item is a table entry then the next table covers
 *     this size. If the entry translates to an output address then the
 *     full OA is: OA | (VA % entry_size)
 * contig_count
 *     The number of consecutive items fused into a single entry.
 *     item_size * contig_count is the size of that entry's translation.
 * lg2
 *     Indicates the value is encoded as log2, i.e. 1 << x is the actual value.
 *     Normally the compiler is fine to optimize divide and mod with log2 values
 *     automatically when inlining, however if the values are not constant
 *     expressions it can't. So we do it by hand; we want to avoid 64-bit
 *     divmod.
 */

/* Returned by pt_load_entry() and for_each_pt_level_entry() */
enum pt_entry_type {
        PT_ENTRY_EMPTY,
        /* Entry is valid and points to a lower table level */
        PT_ENTRY_TABLE,
        /* Entry is valid and returns an output address */
        PT_ENTRY_OA,
};

struct pt_range {
        struct pt_common *common;
        struct pt_table_p *top_table;
        pt_vaddr_t va;
        pt_vaddr_t last_va;
        u8 top_level;
        u8 max_vasz_lg2;
};

/*
 * Similar to xa_state, this records information about an in-progress parse at a
 * single level.
 */
struct pt_state {
        struct pt_range *range;
        struct pt_table_p *table;
        struct pt_table_p *table_lower;
        u64 entry;
        enum pt_entry_type type;
        unsigned short index;
        unsigned short end_index;
        u8 level;
};

#define pt_cur_table(pts, type) ((type *)((pts)->table))

/*
 * Try to install a new table pointer. The locking methodology requires this to
 * be atomic (multiple threads can race to install a pointer). The losing
 * threads will fail the atomic and return false. They should free any memory
 * and reparse the table level again.
 */
#if !IS_ENABLED(CONFIG_GENERIC_ATOMIC64)
static inline bool pt_table_install64(struct pt_state *pts, u64 table_entry)
{
        u64 *entryp = pt_cur_table(pts, u64) + pts->index;
        u64 old_entry = pts->entry;
        bool ret;

        /*
         * Ensure the zero'd table content itself is visible before its PTE can
         * be. release is a NOP on !SMP, but the HW is still doing an acquire.
         */
        if (!IS_ENABLED(CONFIG_SMP))
                dma_wmb();
        ret = try_cmpxchg64_release(entryp, &old_entry, table_entry);
        if (ret)
                pts->entry = table_entry;
        return ret;
}
#endif

static inline bool pt_table_install32(struct pt_state *pts, u32 table_entry)
{
        u32 *entryp = pt_cur_table(pts, u32) + pts->index;
        u32 old_entry = pts->entry;
        bool ret;

        /*
         * Ensure the zero'd table content itself is visible before its PTE can
         * be. release is a NOP on !SMP, but the HW is still doing an acquire.
         */
        if (!IS_ENABLED(CONFIG_SMP))
                dma_wmb();
        ret = try_cmpxchg_release(entryp, &old_entry, table_entry);
        if (ret)
                pts->entry = table_entry;
        return ret;
}

#define PT_SUPPORTED_FEATURE(feature_nr) (PT_SUPPORTED_FEATURES & BIT(feature_nr))

static inline bool pt_feature(const struct pt_common *common,
                              unsigned int feature_nr)
{
        if (PT_FORCE_ENABLED_FEATURES & BIT(feature_nr))
                return true;
        if (!PT_SUPPORTED_FEATURE(feature_nr))
                return false;
        return common->features & BIT(feature_nr);
}

static inline bool pts_feature(const struct pt_state *pts,
                               unsigned int feature_nr)
{
        return pt_feature(pts->range->common, feature_nr);
}

/*
 * PT_WARN_ON is used for invariants that the kunit should be checking can't
 * happen.
 */
#if IS_ENABLED(CONFIG_DEBUG_GENERIC_PT)
#define PT_WARN_ON WARN_ON
#else
static inline bool PT_WARN_ON(bool condition)
{
        return false;
}
#endif

/* These all work on the VA type */
#define log2_to_int(a_lg2) log2_to_int_t(pt_vaddr_t, a_lg2)
#define log2_to_max_int(a_lg2) log2_to_max_int_t(pt_vaddr_t, a_lg2)
#define log2_div(a, b_lg2) log2_div_t(pt_vaddr_t, a, b_lg2)
#define log2_div_eq(a, b, c_lg2) log2_div_eq_t(pt_vaddr_t, a, b, c_lg2)
#define log2_mod(a, b_lg2) log2_mod_t(pt_vaddr_t, a, b_lg2)
#define log2_mod_eq_max(a, b_lg2) log2_mod_eq_max_t(pt_vaddr_t, a, b_lg2)
#define log2_set_mod(a, val, b_lg2) log2_set_mod_t(pt_vaddr_t, a, val, b_lg2)
#define log2_set_mod_max(a, b_lg2) log2_set_mod_max_t(pt_vaddr_t, a, b_lg2)
#define log2_mul(a, b_lg2) log2_mul_t(pt_vaddr_t, a, b_lg2)
#define vaffs(a) ffs_t(pt_vaddr_t, a)
#define vafls(a) fls_t(pt_vaddr_t, a)
#define vaffz(a) ffz_t(pt_vaddr_t, a)

/*
 * The full VA (fva) versions permit the lg2 value to be == PT_VADDR_MAX_LG2 and
 * generate a useful defined result. The non-fva versions will malfunction at
 * this extreme.
 */
static inline pt_vaddr_t fvalog2_div(pt_vaddr_t a, unsigned int b_lg2)
{
        if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
                return 0;
        return log2_div_t(pt_vaddr_t, a, b_lg2);
}

static inline pt_vaddr_t fvalog2_mod(pt_vaddr_t a, unsigned int b_lg2)
{
        if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
                return a;
        return log2_mod_t(pt_vaddr_t, a, b_lg2);
}

static inline bool fvalog2_div_eq(pt_vaddr_t a, pt_vaddr_t b,
                                  unsigned int c_lg2)
{
        if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && c_lg2 == PT_VADDR_MAX_LG2)
                return true;
        return log2_div_eq_t(pt_vaddr_t, a, b, c_lg2);
}

static inline pt_vaddr_t fvalog2_set_mod(pt_vaddr_t a, pt_vaddr_t val,
                                         unsigned int b_lg2)
{
        if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
                return val;
        return log2_set_mod_t(pt_vaddr_t, a, val, b_lg2);
}

static inline pt_vaddr_t fvalog2_set_mod_max(pt_vaddr_t a, unsigned int b_lg2)
{
        if (PT_SUPPORTED_FEATURE(PT_FEAT_FULL_VA) && b_lg2 == PT_VADDR_MAX_LG2)
                return PT_VADDR_MAX;
        return log2_set_mod_max_t(pt_vaddr_t, a, b_lg2);
}

/* These all work on the OA type */
#define oalog2_to_int(a_lg2) log2_to_int_t(pt_oaddr_t, a_lg2)
#define oalog2_to_max_int(a_lg2) log2_to_max_int_t(pt_oaddr_t, a_lg2)
#define oalog2_div(a, b_lg2) log2_div_t(pt_oaddr_t, a, b_lg2)
#define oalog2_div_eq(a, b, c_lg2) log2_div_eq_t(pt_oaddr_t, a, b, c_lg2)
#define oalog2_mod(a, b_lg2) log2_mod_t(pt_oaddr_t, a, b_lg2)
#define oalog2_mod_eq_max(a, b_lg2) log2_mod_eq_max_t(pt_oaddr_t, a, b_lg2)
#define oalog2_set_mod(a, val, b_lg2) log2_set_mod_t(pt_oaddr_t, a, val, b_lg2)
#define oalog2_set_mod_max(a, b_lg2) log2_set_mod_max_t(pt_oaddr_t, a, b_lg2)
#define oalog2_mul(a, b_lg2) log2_mul_t(pt_oaddr_t, a, b_lg2)
#define oaffs(a) ffs_t(pt_oaddr_t, a)
#define oafls(a) fls_t(pt_oaddr_t, a)
#define oaffz(a) ffz_t(pt_oaddr_t, a)

static inline uintptr_t _pt_top_set(struct pt_table_p *table_mem,
                                    unsigned int top_level)
{
        return top_level | (uintptr_t)table_mem;
}

static inline void pt_top_set(struct pt_common *common,
                              struct pt_table_p *table_mem,
                              unsigned int top_level)
{
        WRITE_ONCE(common->top_of_table, _pt_top_set(table_mem, top_level));
}

static inline void pt_top_set_level(struct pt_common *common,
                                    unsigned int top_level)
{
        pt_top_set(common, NULL, top_level);
}

static inline unsigned int pt_top_get_level(const struct pt_common *common)
{
        return READ_ONCE(common->top_of_table) % (1 << PT_TOP_LEVEL_BITS);
}

static inline bool pt_check_install_leaf_args(struct pt_state *pts,
                                              pt_oaddr_t oa,
                                              unsigned int oasz_lg2);

#endif
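The lg2 convention above exists so the walkers can replace 64-bit divide/mod with shifts and masks. As a rough illustration, the sketch below reimplements log2_div()/log2_mod() as plain shift/mask operations and applies the index equation quoted in pt_common.h; the concrete geometry (4KiB items, 512-entry tables, the va_to_index name) is an assumption for the example only.

```c
#include <stdint.h>

typedef uint64_t pt_vaddr_t;

/* Shift form of log2_div(): a / (1 << b_lg2) */
static inline pt_vaddr_t example_log2_div(pt_vaddr_t a, unsigned int b_lg2)
{
        return a >> b_lg2;
}

/* Mask form of log2_mod(): a % (1 << b_lg2) */
static inline pt_vaddr_t example_log2_mod(pt_vaddr_t a, unsigned int b_lg2)
{
        return a & ((((pt_vaddr_t)1) << b_lg2) - 1);
}

/*
 * Index of a VA within one table level, per the equation in the
 * pt_num_items_lg2() kdoc:
 *     log2_mod(log2_div(VA, pt_table_item_lg2sz()), pt_num_items_lg2())
 */
static inline unsigned int va_to_index(pt_vaddr_t va, unsigned int item_lg2sz,
                                       unsigned int num_items_lg2)
{
        return example_log2_mod(example_log2_div(va, item_lg2sz),
                                num_items_lg2);
}
```

With 4KiB items (item_lg2sz = 12) and 512-entry tables (num_items_lg2 = 9), VA 0x201000 lands at index 1 of its level-0 table, and also at index 1 of its level-1 table when item_lg2sz = 21.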
drivers/iommu/generic_pt/pt_fmt_defaults.h
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
 *
 * Default definitions for formats that don't define these functions.
 */
#ifndef __GENERIC_PT_PT_FMT_DEFAULTS_H
#define __GENERIC_PT_PT_FMT_DEFAULTS_H

#include "pt_defs.h"
#include <linux/log2.h>

/* Header self-compile default defines */
#ifndef pt_load_entry_raw
#include "fmt/amdv1.h"
#endif

/*
 * The format must provide PT_GRANULE_LG2SZ, PT_TABLEMEM_LG2SZ, and
 * PT_ITEM_WORD_SIZE. They must be the same at every level excluding the top.
 */
#ifndef pt_table_item_lg2sz
static inline unsigned int pt_table_item_lg2sz(const struct pt_state *pts)
{
        return PT_GRANULE_LG2SZ +
               (PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE)) * pts->level;
}
#endif

#ifndef pt_pgsz_lg2_to_level
static inline unsigned int pt_pgsz_lg2_to_level(struct pt_common *common,
                                                unsigned int pgsize_lg2)
{
        return ((unsigned int)(pgsize_lg2 - PT_GRANULE_LG2SZ)) /
               (PT_TABLEMEM_LG2SZ - ilog2(PT_ITEM_WORD_SIZE));
}
#endif

/*
 * If not supplied by the format then contiguous pages are not supported.
 *
 * If contiguous pages are supported then the format must also provide
 * pt_contig_count_lg2() if it supports a single contiguous size per level,
 * or pt_possible_sizes() if it supports multiple sizes per level.
 */
#ifndef pt_entry_num_contig_lg2
static inline unsigned int pt_entry_num_contig_lg2(const struct pt_state *pts)
{
        return ilog2(1);
}

/*
 * Return the number of contiguous OA items forming an entry at this table level
 */
static inline unsigned short pt_contig_count_lg2(const struct pt_state *pts)
{
        return ilog2(1);
}
#endif

/* If not supplied by the format then dirty tracking is not supported */
#ifndef pt_entry_is_write_dirty
static inline bool pt_entry_is_write_dirty(const struct pt_state *pts)
{
        return false;
}

static inline void pt_entry_make_write_clean(struct pt_state *pts)
{
}

static inline bool pt_dirty_supported(struct pt_common *common)
{
        return false;
}
#else
/* If not supplied then dirty tracking is always enabled */
#ifndef pt_dirty_supported
static inline bool pt_dirty_supported(struct pt_common *common)
{
        return true;
}
#endif
#endif

#ifndef pt_entry_make_write_dirty
static inline bool pt_entry_make_write_dirty(struct pt_state *pts)
{
        return false;
}
#endif

/*
 * Format supplies either:
 *     pt_entry_oa - OA is at the start of a contiguous entry
 * or
 *     pt_item_oa - OA is adjusted for every item in a contiguous entry
 *
 * Build the missing one.
 *
 * The internal helper _pt_entry_oa_fast() allows generating
 * an efficient pt_entry_oa_exact(); it doesn't care which
 * option is selected.
 */
#ifdef pt_entry_oa
static inline pt_oaddr_t pt_item_oa(const struct pt_state *pts)
{
        return pt_entry_oa(pts) |
               log2_mul(pts->index, pt_table_item_lg2sz(pts));
}
#define _pt_entry_oa_fast pt_entry_oa
#endif

#ifdef pt_item_oa
static inline pt_oaddr_t pt_entry_oa(const struct pt_state *pts)
{
        return log2_set_mod(pt_item_oa(pts), 0,
                            pt_entry_num_contig_lg2(pts) +
                                    pt_table_item_lg2sz(pts));
}
#define _pt_entry_oa_fast pt_item_oa
#endif

/*
 * If not supplied by the format then use the constant
 * PT_MAX_OUTPUT_ADDRESS_LG2.
 */
#ifndef pt_max_oa_lg2
static inline unsigned int
pt_max_oa_lg2(const struct pt_common *common)
{
        return PT_MAX_OUTPUT_ADDRESS_LG2;
}
#endif

#ifndef pt_has_system_page_size
static inline bool pt_has_system_page_size(const struct pt_common *common)
{
        return PT_GRANULE_LG2SZ == PAGE_SHIFT;
}
#endif

/*
 * If not supplied by the format then assume only one contiguous size determined
 * by pt_contig_count_lg2()
 */
#ifndef pt_possible_sizes
static inline unsigned short pt_contig_count_lg2(const struct pt_state *pts);

/* Return a bitmap of possible leaf page sizes at this level */
static inline pt_vaddr_t pt_possible_sizes(const struct pt_state *pts)
{
        unsigned int isz_lg2 = pt_table_item_lg2sz(pts);

        if (!pt_can_have_leaf(pts))
                return 0;
        return log2_to_int(isz_lg2) |
               log2_to_int(pt_contig_count_lg2(pts) + isz_lg2);
}
#endif

/* If not supplied by the format then use 0. */
#ifndef pt_full_va_prefix
static inline pt_vaddr_t pt_full_va_prefix(const struct pt_common *common)
{
        return 0;
}
#endif

/* If not supplied by the format then zero fill using PT_ITEM_WORD_SIZE */
#ifndef pt_clear_entries
static inline void pt_clear_entries64(struct pt_state *pts,
                                      unsigned int num_contig_lg2)
{
        u64 *tablep = pt_cur_table(pts, u64) + pts->index;
        u64 *end = tablep + log2_to_int(num_contig_lg2);

        PT_WARN_ON(log2_mod(pts->index, num_contig_lg2));
        for (; tablep != end; tablep++)
                WRITE_ONCE(*tablep, 0);
}

static inline void pt_clear_entries32(struct pt_state *pts,
                                      unsigned int num_contig_lg2)
{
        u32 *tablep = pt_cur_table(pts, u32) + pts->index;
        u32 *end = tablep + log2_to_int(num_contig_lg2);

        PT_WARN_ON(log2_mod(pts->index, num_contig_lg2));
        for (; tablep != end; tablep++)
                WRITE_ONCE(*tablep, 0);
}

static inline void pt_clear_entries(struct pt_state *pts,
                                    unsigned int num_contig_lg2)
{
        if (PT_ITEM_WORD_SIZE == sizeof(u32))
                pt_clear_entries32(pts, num_contig_lg2);
        else
                pt_clear_entries64(pts, num_contig_lg2);
}
#define pt_clear_entries pt_clear_entries
#endif

/* If not supplied then SW bits are not supported */
#ifdef pt_sw_bit
static inline bool pt_test_sw_bit_acquire(struct pt_state *pts,
                                          unsigned int bitnr)
{
        /* Acquire, pairs with pt_set_sw_bit_release() */
        smp_mb();
        /* For a contiguous entry the sw bit is only stored in the first item. */
        return pts->entry & pt_sw_bit(bitnr);
}
#define pt_test_sw_bit_acquire pt_test_sw_bit_acquire

static inline void pt_set_sw_bit_release(struct pt_state *pts,
                                         unsigned int bitnr)
{
#if !IS_ENABLED(CONFIG_GENERIC_ATOMIC64)
        if (PT_ITEM_WORD_SIZE == sizeof(u64)) {
                u64 *entryp = pt_cur_table(pts, u64) + pts->index;
                u64 old_entry = pts->entry;
                u64 new_entry;

                do {
                        new_entry = old_entry | pt_sw_bit(bitnr);
                } while (!try_cmpxchg64_release(entryp, &old_entry, new_entry));
                pts->entry = new_entry;
                return;
        }
#endif
        if (PT_ITEM_WORD_SIZE == sizeof(u32)) {
                u32 *entryp = pt_cur_table(pts, u32) + pts->index;
                u32 old_entry = pts->entry;
                u32 new_entry;

                do {
                        new_entry = old_entry | pt_sw_bit(bitnr);
                } while (!try_cmpxchg_release(entryp, &old_entry, new_entry));
                pts->entry = new_entry;
        } else
                BUILD_BUG();
}
#define pt_set_sw_bit_release pt_set_sw_bit_release
#else
static inline unsigned int pt_max_sw_bit(struct pt_common *common)
{
        return 0;
}

extern void __pt_no_sw_bit(void);
static inline bool pt_test_sw_bit_acquire(struct pt_state *pts,
                                          unsigned int bitnr)
{
        __pt_no_sw_bit();
        return false;
}

static inline void pt_set_sw_bit_release(struct pt_state *pts,
                                         unsigned int bitnr)
{
        __pt_no_sw_bit();
}
#endif

/*
 * The format can call this in pt_install_leaf_entry() to check that the
 * arguments are all aligned correctly.
 */
static inline bool pt_check_install_leaf_args(struct pt_state *pts,
                                              pt_oaddr_t oa,
                                              unsigned int oasz_lg2)
{
        unsigned int isz_lg2 = pt_table_item_lg2sz(pts);

        if (PT_WARN_ON(oalog2_mod(oa, oasz_lg2)))
                return false;

#ifdef pt_possible_sizes
        if (PT_WARN_ON(isz_lg2 > oasz_lg2 ||
                       oasz_lg2 > isz_lg2 + pt_num_items_lg2(pts)))
                return false;
#else
        if (PT_WARN_ON(oasz_lg2 != isz_lg2 &&
                       oasz_lg2 != isz_lg2 + pt_contig_count_lg2(pts)))
                return false;
#endif

        if (PT_WARN_ON(oalog2_mod(pts->index, oasz_lg2 - isz_lg2)))
                return false;
        return true;
}

#endif
+636
drivers/iommu/generic_pt/pt_iter.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + /* 3 + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES 4 + * 5 + * Iterators for Generic Page Table 6 + */ 7 + #ifndef __GENERIC_PT_PT_ITER_H 8 + #define __GENERIC_PT_PT_ITER_H 9 + 10 + #include "pt_common.h" 11 + 12 + #include <linux/errno.h> 13 + 14 + /* 15 + * Use to mangle symbols so that backtraces and the symbol table are 16 + * understandable. Any non-inlined function should get mangled like this. 17 + */ 18 + #define NS(fn) CONCATENATE(PTPFX, fn) 19 + 20 + /** 21 + * pt_check_range() - Validate the range can be iterated 22 + * @range: Range to validate 23 + * 24 + * Check that VA and last_va fall within the permitted range of VAs. If the 25 + * format is using PT_FEAT_SIGN_EXTEND then this also checks the sign extension 26 + * is correct. 27 + */ 28 + static inline int pt_check_range(struct pt_range *range) 29 + { 30 + pt_vaddr_t prefix; 31 + 32 + PT_WARN_ON(!range->max_vasz_lg2); 33 + 34 + if (pt_feature(range->common, PT_FEAT_SIGN_EXTEND)) { 35 + PT_WARN_ON(range->common->max_vasz_lg2 != range->max_vasz_lg2); 36 + prefix = fvalog2_div(range->va, range->max_vasz_lg2 - 1) ? 37 + PT_VADDR_MAX : 38 + 0; 39 + } else { 40 + prefix = pt_full_va_prefix(range->common); 41 + } 42 + 43 + if (!fvalog2_div_eq(range->va, prefix, range->max_vasz_lg2) || 44 + !fvalog2_div_eq(range->last_va, prefix, range->max_vasz_lg2)) 45 + return -ERANGE; 46 + return 0; 47 + } 48 + 49 + /** 50 + * pt_index_to_va() - Update range->va to the current pts->index 51 + * @pts: Iteration State 52 + * 53 + * Adjust range->va to match the current index. This is done in a lazy manner 54 + * since computing the VA takes several instructions and is rarely required. 
55 + */ 56 + static inline void pt_index_to_va(struct pt_state *pts) 57 + { 58 + pt_vaddr_t lower_va; 59 + 60 + lower_va = log2_mul(pts->index, pt_table_item_lg2sz(pts)); 61 + pts->range->va = fvalog2_set_mod(pts->range->va, lower_va, 62 + pt_table_oa_lg2sz(pts)); 63 + } 64 + 65 + /* 66 + * Add index_count_lg2 number of entries to pts's VA and index. The VA will be 67 + * adjusted to the end of the contiguous block if it is currently in the middle. 68 + */ 69 + static inline void _pt_advance(struct pt_state *pts, 70 + unsigned int index_count_lg2) 71 + { 72 + pts->index = log2_set_mod(pts->index + log2_to_int(index_count_lg2), 0, 73 + index_count_lg2); 74 + } 75 + 76 + /** 77 + * pt_entry_fully_covered() - Check if the item or entry is entirely contained 78 + * within pts->range 79 + * @pts: Iteration State 80 + * @oasz_lg2: The size of the item to check, pt_table_item_lg2sz() or 81 + * pt_entry_oa_lg2sz() 82 + * 83 + * Returns: true if the item is fully enclosed by the pts->range. 84 + */ 85 + static inline bool pt_entry_fully_covered(const struct pt_state *pts, 86 + unsigned int oasz_lg2) 87 + { 88 + struct pt_range *range = pts->range; 89 + 90 + /* Range begins at the start of the entry */ 91 + if (log2_mod(pts->range->va, oasz_lg2)) 92 + return false; 93 + 94 + /* Range ends past the end of the entry */ 95 + if (!log2_div_eq(range->va, range->last_va, oasz_lg2)) 96 + return true; 97 + 98 + /* Range ends at the end of the entry */ 99 + return log2_mod_eq_max(range->last_va, oasz_lg2); 100 + } 101 + 102 + /** 103 + * pt_range_to_index() - Starting index for an iteration 104 + * @pts: Iteration State 105 + * 106 + * Return: the starting index for the iteration in pts. 
107 + */ 108 + static inline unsigned int pt_range_to_index(const struct pt_state *pts) 109 + { 110 + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); 111 + 112 + PT_WARN_ON(pts->level > pts->range->top_level); 113 + if (pts->range->top_level == pts->level) 114 + return log2_div(fvalog2_mod(pts->range->va, 115 + pts->range->max_vasz_lg2), 116 + isz_lg2); 117 + return log2_mod(log2_div(pts->range->va, isz_lg2), 118 + pt_num_items_lg2(pts)); 119 + } 120 + 121 + /** 122 + * pt_range_to_end_index() - Ending index for an iteration 123 + * @pts: Iteration State 124 + * 125 + * Return: one past the last index for the iteration in pts. 126 + */ 127 + static inline unsigned int pt_range_to_end_index(const struct pt_state *pts) 128 + { 129 + unsigned int isz_lg2 = pt_table_item_lg2sz(pts); 130 + struct pt_range *range = pts->range; 131 + unsigned int num_entries_lg2; 132 + 133 + if (range->va == range->last_va) 134 + return pts->index + 1; 135 + 136 + if (pts->range->top_level == pts->level) 137 + return log2_div(fvalog2_mod(pts->range->last_va, 138 + pts->range->max_vasz_lg2), 139 + isz_lg2) + 140 + 1; 141 + 142 + num_entries_lg2 = pt_num_items_lg2(pts); 143 + 144 + /* last_va falls within this table */ 145 + if (log2_div_eq(range->va, range->last_va, num_entries_lg2 + isz_lg2)) 146 + return log2_mod(log2_div(pts->range->last_va, isz_lg2), 147 + num_entries_lg2) + 148 + 1; 149 + 150 + return log2_to_int(num_entries_lg2); 151 + } 152 + 153 + static inline void _pt_iter_first(struct pt_state *pts) 154 + { 155 + pts->index = pt_range_to_index(pts); 156 + pts->end_index = pt_range_to_end_index(pts); 157 + PT_WARN_ON(pts->index > pts->end_index); 158 + } 159 + 160 + static inline bool _pt_iter_load(struct pt_state *pts) 161 + { 162 + if (pts->index >= pts->end_index) 163 + return false; 164 + pt_load_entry(pts); 165 + return true; 166 + } 167 + 168 + /** 169 + * pt_next_entry() - Advance pts to the next entry 170 + * @pts: Iteration State 171 + * 172 + * Update pts to go to the next index at
this level. If pts is pointing at a 173 + * contiguous entry then the index may advance by more than one. 174 + */ 175 + static inline void pt_next_entry(struct pt_state *pts) 176 + { 177 + if (pts->type == PT_ENTRY_OA && 178 + !__builtin_constant_p(pt_entry_num_contig_lg2(pts) == 0)) 179 + _pt_advance(pts, pt_entry_num_contig_lg2(pts)); 180 + else 181 + pts->index++; 182 + pt_index_to_va(pts); 183 + } 184 + 185 + /** 186 + * for_each_pt_level_entry() - For loop wrapper over entries in the range 187 + * @pts: Iteration State 188 + * 189 + * This is the basic iteration primitive. It iterates over all the entries in 190 + * pts->range that fall within the pts's current table level. Each step does 191 + * pt_load_entry(pts). 192 + */ 193 + #define for_each_pt_level_entry(pts) \ 194 + for (_pt_iter_first(pts); _pt_iter_load(pts); pt_next_entry(pts)) 195 + 196 + /** 197 + * pt_load_single_entry() - Version of pt_load_entry() usable within a walker 198 + * @pts: Iteration State 199 + * 200 + * Alternative to for_each_pt_level_entry() if the walker function uses only a 201 + * single entry.
202 + */ 203 + static inline enum pt_entry_type pt_load_single_entry(struct pt_state *pts) 204 + { 205 + pts->index = pt_range_to_index(pts); 206 + pt_load_entry(pts); 207 + return pts->type; 208 + } 209 + 210 + static __always_inline struct pt_range _pt_top_range(struct pt_common *common, 211 + uintptr_t top_of_table) 212 + { 213 + struct pt_range range = { 214 + .common = common, 215 + .top_table = 216 + (struct pt_table_p *)(top_of_table & 217 + ~(uintptr_t)PT_TOP_LEVEL_MASK), 218 + .top_level = top_of_table % (1 << PT_TOP_LEVEL_BITS), 219 + }; 220 + struct pt_state pts = { .range = &range, .level = range.top_level }; 221 + unsigned int max_vasz_lg2; 222 + 223 + max_vasz_lg2 = common->max_vasz_lg2; 224 + if (pt_feature(common, PT_FEAT_DYNAMIC_TOP) && 225 + pts.level != PT_MAX_TOP_LEVEL) 226 + max_vasz_lg2 = min_t(unsigned int, common->max_vasz_lg2, 227 + pt_num_items_lg2(&pts) + 228 + pt_table_item_lg2sz(&pts)); 229 + 230 + /* 231 + * The top range will default to the lower region only with sign extend. 232 + */ 233 + range.max_vasz_lg2 = max_vasz_lg2; 234 + if (pt_feature(common, PT_FEAT_SIGN_EXTEND)) 235 + max_vasz_lg2--; 236 + 237 + range.va = fvalog2_set_mod(pt_full_va_prefix(common), 0, max_vasz_lg2); 238 + range.last_va = 239 + fvalog2_set_mod_max(pt_full_va_prefix(common), max_vasz_lg2); 240 + return range; 241 + } 242 + 243 + /** 244 + * pt_top_range() - Return a range that spans part of the top level 245 + * @common: Table 246 + * 247 + * For PT_FEAT_SIGN_EXTEND this will return the lower range, and cover half the 248 + * total page table. Otherwise it returns the entire page table. 249 + */ 250 + static __always_inline struct pt_range pt_top_range(struct pt_common *common) 251 + { 252 + /* 253 + * The top pointer can change without locking. We capture the value and 254 + * its level here and are safe to walk it so long as both values are 255 + * captured without tearing.
256 + */ 257 + return _pt_top_range(common, READ_ONCE(common->top_of_table)); 258 + } 259 + 260 + /** 261 + * pt_all_range() - Return a range that spans the entire page table 262 + * @common: Table 263 + * 264 + * The returned range spans the whole page table. Due to how PT_FEAT_SIGN_EXTEND 265 + * is supported range->va and range->last_va will be incorrect during the 266 + * iteration and must not be accessed. 267 + */ 268 + static inline struct pt_range pt_all_range(struct pt_common *common) 269 + { 270 + struct pt_range range = pt_top_range(common); 271 + 272 + if (!pt_feature(common, PT_FEAT_SIGN_EXTEND)) 273 + return range; 274 + 275 + /* 276 + * Pretend the table is linear from 0 without a sign extension. This 277 + * generates the correct indexes for iteration. 278 + */ 279 + range.last_va = fvalog2_set_mod_max(0, range.max_vasz_lg2); 280 + return range; 281 + } 282 + 283 + /** 284 + * pt_upper_range() - Return a range that spans part of the top level 285 + * @common: Table 286 + * 287 + * For PT_FEAT_SIGN_EXTEND this will return the upper range, and cover half the 288 + * total page table. Otherwise it returns the entire page table. 289 + */ 290 + static inline struct pt_range pt_upper_range(struct pt_common *common) 291 + { 292 + struct pt_range range = pt_top_range(common); 293 + 294 + if (!pt_feature(common, PT_FEAT_SIGN_EXTEND)) 295 + return range; 296 + 297 + range.va = fvalog2_set_mod(PT_VADDR_MAX, 0, range.max_vasz_lg2 - 1); 298 + range.last_va = PT_VADDR_MAX; 299 + return range; 300 + } 301 + 302 + /** 303 + * pt_make_range() - Return a range that spans part of the table 304 + * @common: Table 305 + * @va: Start address 306 + * @last_va: Last address 307 + * 308 + * The caller must validate the range with pt_check_range() before using it. 
309 + */ 310 + static __always_inline struct pt_range 311 + pt_make_range(struct pt_common *common, pt_vaddr_t va, pt_vaddr_t last_va) 312 + { 313 + struct pt_range range = 314 + _pt_top_range(common, READ_ONCE(common->top_of_table)); 315 + 316 + range.va = va; 317 + range.last_va = last_va; 318 + 319 + return range; 320 + } 321 + 322 + /* 323 + * Span a slice of the table starting at a lower table level from an active 324 + * walk. 325 + */ 326 + static __always_inline struct pt_range 327 + pt_make_child_range(const struct pt_range *parent, pt_vaddr_t va, 328 + pt_vaddr_t last_va) 329 + { 330 + struct pt_range range = *parent; 331 + 332 + range.va = va; 333 + range.last_va = last_va; 334 + 335 + PT_WARN_ON(last_va < va); 336 + PT_WARN_ON(pt_check_range(&range)); 337 + 338 + return range; 339 + } 340 + 341 + /** 342 + * pt_init() - Initialize a pt_state on the stack 343 + * @range: Range pointer to embed in the state 344 + * @level: Table level for the state 345 + * @table: Pointer to the table memory at level 346 + * 347 + * Helper to initialize the on-stack pt_state from walker arguments. 348 + */ 349 + static __always_inline struct pt_state 350 + pt_init(struct pt_range *range, unsigned int level, struct pt_table_p *table) 351 + { 352 + struct pt_state pts = { 353 + .range = range, 354 + .table = table, 355 + .level = level, 356 + }; 357 + return pts; 358 + } 359 + 360 + /** 361 + * pt_init_top() - Initialize a pt_state on the stack 362 + * @range: Range pointer to embed in the state 363 + * 364 + * The pt_state points to the top most level. 
365 + */ 366 + static __always_inline struct pt_state pt_init_top(struct pt_range *range) 367 + { 368 + return pt_init(range, range->top_level, range->top_table); 369 + } 370 + 371 + typedef int (*pt_level_fn_t)(struct pt_range *range, void *arg, 372 + unsigned int level, struct pt_table_p *table); 373 + 374 + /** 375 + * pt_descend() - Recursively invoke the walker for the lower level 376 + * @pts: Iteration State 377 + * @arg: Value to pass to the function 378 + * @fn: Walker function to call 379 + * 380 + * pts must point to a table item. Invoke fn as a walker on the table 381 + * pts points to. 382 + */ 383 + static __always_inline int pt_descend(struct pt_state *pts, void *arg, 384 + pt_level_fn_t fn) 385 + { 386 + int ret; 387 + 388 + if (PT_WARN_ON(!pts->table_lower)) 389 + return -EINVAL; 390 + 391 + ret = (*fn)(pts->range, arg, pts->level - 1, pts->table_lower); 392 + return ret; 393 + } 394 + 395 + /** 396 + * pt_walk_range() - Walk over a VA range 397 + * @range: Range pointer 398 + * @fn: Walker function to call 399 + * @arg: Value to pass to the function 400 + * 401 + * Walk over a VA range. The caller should have done a validity check, at 402 + * least calling pt_check_range(), when building range. The walk will 403 + * start at the top most table. 404 + */ 405 + static __always_inline int pt_walk_range(struct pt_range *range, 406 + pt_level_fn_t fn, void *arg) 407 + { 408 + return fn(range, arg, range->top_level, range->top_table); 409 + } 410 + 411 + /* 412 + * pt_walk_descend() - Recursively invoke the walker for a slice of a lower 413 + * level 414 + * @pts: Iteration State 415 + * @va: Start address 416 + * @last_va: Last address 417 + * @fn: Walker function to call 418 + * @arg: Value to pass to the function 419 + * 420 + * With pts pointing at a table item this will descend and iterate over a slice 421 + * of the lower table. The caller must ensure that va/last_va are within the 422 + * table item.
This creates a new walk and does not alter pts or pts->range. 423 + */ 424 + static __always_inline int pt_walk_descend(const struct pt_state *pts, 425 + pt_vaddr_t va, pt_vaddr_t last_va, 426 + pt_level_fn_t fn, void *arg) 427 + { 428 + struct pt_range range = pt_make_child_range(pts->range, va, last_va); 429 + 430 + if (PT_WARN_ON(!pt_can_have_table(pts)) || 431 + PT_WARN_ON(!pts->table_lower)) 432 + return -EINVAL; 433 + 434 + return fn(&range, arg, pts->level - 1, pts->table_lower); 435 + } 436 + 437 + /* 438 + * pt_walk_descend_all() - Recursively invoke the walker for a table item 439 + * @parent_pts: Iteration State 440 + * @fn: Walker function to call 441 + * @arg: Value to pass to the function 442 + * 443 + * With pts pointing at a table item this will descend and iterate over the 444 + * entire lower table. This creates a new walk and does not alter pts or pts->range. 445 + */ 446 + static __always_inline int 447 + pt_walk_descend_all(const struct pt_state *parent_pts, pt_level_fn_t fn, 448 + void *arg) 449 + { 450 + unsigned int isz_lg2 = pt_table_item_lg2sz(parent_pts); 451 + 452 + return pt_walk_descend(parent_pts, 453 + log2_set_mod(parent_pts->range->va, 0, isz_lg2), 454 + log2_set_mod_max(parent_pts->range->va, isz_lg2), 455 + fn, arg); 456 + } 457 + 458 + /** 459 + * pt_range_slice() - Return a range that spans indexes 460 + * @pts: Iteration State 461 + * @start_index: Starting index within pts 462 + * @end_index: Ending index within pts 463 + * 464 + * Create a range that spans an index range of the current table level 465 + * pt_state points at.
466 + */ 467 + static inline struct pt_range pt_range_slice(const struct pt_state *pts, 468 + unsigned int start_index, 469 + unsigned int end_index) 470 + { 471 + unsigned int table_lg2sz = pt_table_oa_lg2sz(pts); 472 + pt_vaddr_t last_va; 473 + pt_vaddr_t va; 474 + 475 + va = fvalog2_set_mod(pts->range->va, 476 + log2_mul(start_index, pt_table_item_lg2sz(pts)), 477 + table_lg2sz); 478 + last_va = fvalog2_set_mod( 479 + pts->range->va, 480 + log2_mul(end_index, pt_table_item_lg2sz(pts)) - 1, table_lg2sz); 481 + return pt_make_child_range(pts->range, va, last_va); 482 + } 483 + 484 + /** 485 + * pt_top_memsize_lg2() 486 + * @common: Table 487 + * @top_of_table: Top of table value from _pt_top_set() 488 + * 489 + * Compute the allocation size of the top table. For PT_FEAT_DYNAMIC_TOP this 490 + * will compute the top size assuming the table will grow. 491 + */ 492 + static inline unsigned int pt_top_memsize_lg2(struct pt_common *common, 493 + uintptr_t top_of_table) 494 + { 495 + struct pt_range range = _pt_top_range(common, top_of_table); 496 + struct pt_state pts = pt_init_top(&range); 497 + unsigned int num_items_lg2; 498 + 499 + num_items_lg2 = common->max_vasz_lg2 - pt_table_item_lg2sz(&pts); 500 + if (range.top_level != PT_MAX_TOP_LEVEL && 501 + pt_feature(common, PT_FEAT_DYNAMIC_TOP)) 502 + num_items_lg2 = min(num_items_lg2, pt_num_items_lg2(&pts)); 503 + 504 + /* Round up the allocation size to the minimum alignment */ 505 + return max(ffs_t(u64, PT_TOP_PHYS_MASK), 506 + num_items_lg2 + ilog2(PT_ITEM_WORD_SIZE)); 507 + } 508 + 509 + /** 510 + * pt_compute_best_pgsize() - Determine the best page size for leaf entries 511 + * @pgsz_bitmap: Permitted page sizes 512 + * @va: Starting virtual address for the leaf entry 513 + * @last_va: Last virtual address for the leaf entry, sets the max page size 514 + * @oa: Starting output address for the leaf entry 515 + * 516 + * Compute the largest page size for va, last_va, and oa together and return it 517 + * in lg2. 
The largest page size depends on the format's supported page sizes at 518 + * this level, and the relative alignment of the VA and OA addresses. 0 means 519 + * the OA cannot be stored with the provided pgsz_bitmap. 520 + */ 521 + static inline unsigned int pt_compute_best_pgsize(pt_vaddr_t pgsz_bitmap, 522 + pt_vaddr_t va, 523 + pt_vaddr_t last_va, 524 + pt_oaddr_t oa) 525 + { 526 + unsigned int best_pgsz_lg2; 527 + unsigned int pgsz_lg2; 528 + pt_vaddr_t len = last_va - va + 1; 529 + pt_vaddr_t mask; 530 + 531 + if (PT_WARN_ON(va >= last_va)) 532 + return 0; 533 + 534 + /* 535 + * Given a VA/OA pair the best page size is the largest page size 536 + * where: 537 + * 538 + * 1) VA and OA start at the page. Bitwise this is the count of least 539 + * significant 0 bits. 540 + * This also implies that last_va/oa has the same prefix as va/oa. 541 + */ 542 + mask = va | oa; 543 + 544 + /* 545 + * 2) The page size is not larger than the last_va (length). Since page 546 + * sizes are always power of two this can't be larger than the 547 + * largest power of two factor of the length. 
548 + */ 549 + mask |= log2_to_int(vafls(len) - 1); 550 + 551 + best_pgsz_lg2 = vaffs(mask); 552 + 553 + /* Choose the highest bit <= best_pgsz_lg2 */ 554 + if (best_pgsz_lg2 < PT_VADDR_MAX_LG2 - 1) 555 + pgsz_bitmap = log2_mod(pgsz_bitmap, best_pgsz_lg2 + 1); 556 + 557 + pgsz_lg2 = vafls(pgsz_bitmap); 558 + if (!pgsz_lg2) 559 + return 0; 560 + 561 + pgsz_lg2--; 562 + 563 + PT_WARN_ON(log2_mod(va, pgsz_lg2) != 0); 564 + PT_WARN_ON(oalog2_mod(oa, pgsz_lg2) != 0); 565 + PT_WARN_ON(va + log2_to_int(pgsz_lg2) - 1 > last_va); 566 + PT_WARN_ON(!log2_div_eq(va, va + log2_to_int(pgsz_lg2) - 1, pgsz_lg2)); 567 + PT_WARN_ON( 568 + !oalog2_div_eq(oa, oa + log2_to_int(pgsz_lg2) - 1, pgsz_lg2)); 569 + return pgsz_lg2; 570 + } 571 + 572 + #define _PT_MAKE_CALL_LEVEL(fn) \ 573 + static __always_inline int fn(struct pt_range *range, void *arg, \ 574 + unsigned int level, \ 575 + struct pt_table_p *table) \ 576 + { \ 577 + static_assert(PT_MAX_TOP_LEVEL <= 5); \ 578 + if (level == 0) \ 579 + return CONCATENATE(fn, 0)(range, arg, 0, table); \ 580 + if (level == 1 || PT_MAX_TOP_LEVEL == 1) \ 581 + return CONCATENATE(fn, 1)(range, arg, 1, table); \ 582 + if (level == 2 || PT_MAX_TOP_LEVEL == 2) \ 583 + return CONCATENATE(fn, 2)(range, arg, 2, table); \ 584 + if (level == 3 || PT_MAX_TOP_LEVEL == 3) \ 585 + return CONCATENATE(fn, 3)(range, arg, 3, table); \ 586 + if (level == 4 || PT_MAX_TOP_LEVEL == 4) \ 587 + return CONCATENATE(fn, 4)(range, arg, 4, table); \ 588 + return CONCATENATE(fn, 5)(range, arg, 5, table); \ 589 + } 590 + 591 + static inline int __pt_make_level_fn_err(struct pt_range *range, void *arg, 592 + unsigned int unused_level, 593 + struct pt_table_p *table) 594 + { 595 + static_assert(PT_MAX_TOP_LEVEL <= 5); 596 + return -EPROTOTYPE; 597 + } 598 + 599 + #define __PT_MAKE_LEVEL_FN(fn, level, descend_fn, do_fn) \ 600 + static inline int fn(struct pt_range *range, void *arg, \ 601 + unsigned int unused_level, \ 602 + struct pt_table_p *table) \ 603 + { \ 604 + return 
do_fn(range, arg, level, table, descend_fn); \ 605 + } 606 + 607 + /** 608 + * PT_MAKE_LEVELS() - Build an unwound walker 609 + * @fn: Name of the walker function 610 + * @do_fn: Function to call at each level 611 + * 612 + * This builds a function call tree that can be fully inlined. 613 + * The caller must provide a function body in an __always_inline function:: 614 + * 615 + * static __always_inline int do_fn(struct pt_range *range, void *arg, 616 + * unsigned int level, struct pt_table_p *table, 617 + * pt_level_fn_t descend_fn) 618 + * 619 + * An inline function will be created for each table level that calls do_fn with 620 + * a compile time constant for level and a pointer to the next lower function. 621 + * This generates an optimally inlined walk where each of the functions sees a 622 + * constant level and can codegen the exact constants/etc for that level. 623 + * 624 + * Note this can produce a lot of code! 625 + */ 626 + #define PT_MAKE_LEVELS(fn, do_fn) \ 627 + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 0), 0, __pt_make_level_fn_err, \ 628 + do_fn); \ 629 + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 1), 1, CONCATENATE(fn, 0), do_fn); \ 630 + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 2), 2, CONCATENATE(fn, 1), do_fn); \ 631 + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 3), 3, CONCATENATE(fn, 2), do_fn); \ 632 + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 4), 4, CONCATENATE(fn, 3), do_fn); \ 633 + __PT_MAKE_LEVEL_FN(CONCATENATE(fn, 5), 5, CONCATENATE(fn, 4), do_fn); \ 634 + _PT_MAKE_CALL_LEVEL(fn) 635 + 636 + #endif
+122
drivers/iommu/generic_pt/pt_log2.h
··· 1 + /* SPDX-License-Identifier: GPL-2.0-only */ 2 + /* 3 + * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES 4 + * 5 + * Helper macros for working with log2 values 6 + * 7 + */ 8 + #ifndef __GENERIC_PT_LOG2_H 9 + #define __GENERIC_PT_LOG2_H 10 + #include <linux/bitops.h> 11 + #include <linux/limits.h> 12 + 13 + /* Compute a */ 14 + #define log2_to_int_t(type, a_lg2) ((type)(((type)1) << (a_lg2))) 15 + static_assert(log2_to_int_t(unsigned int, 0) == 1); 16 + 17 + /* Compute a - 1 (aka all low bits set) */ 18 + #define log2_to_max_int_t(type, a_lg2) ((type)(log2_to_int_t(type, a_lg2) - 1)) 19 + 20 + /* Compute a / b */ 21 + #define log2_div_t(type, a, b_lg2) ((type)(((type)a) >> (b_lg2))) 22 + static_assert(log2_div_t(unsigned int, 4, 2) == 1); 23 + 24 + /* 25 + * Compute: 26 + * a / c == b / c 27 + * aka the high bits are equal 28 + */ 29 + #define log2_div_eq_t(type, a, b, c_lg2) \ 30 + (log2_div_t(type, (a) ^ (b), c_lg2) == 0) 31 + static_assert(log2_div_eq_t(unsigned int, 1, 1, 2)); 32 + 33 + /* Compute a % b */ 34 + #define log2_mod_t(type, a, b_lg2) \ 35 + ((type)(((type)a) & log2_to_max_int_t(type, b_lg2))) 36 + static_assert(log2_mod_t(unsigned int, 1, 2) == 1); 37 + 38 + /* 39 + * Compute: 40 + * a % b == b - 1 41 + * aka the low bits are all 1s 42 + */ 43 + #define log2_mod_eq_max_t(type, a, b_lg2) \ 44 + (log2_mod_t(type, a, b_lg2) == log2_to_max_int_t(type, b_lg2)) 45 + static_assert(log2_mod_eq_max_t(unsigned int, 3, 2)); 46 + 47 + /* 48 + * Return a value such that: 49 + * a / b == ret / b 50 + * ret % b == val 51 + * aka set the low bits to val. 
val must be < b 52 + */ 53 + #define log2_set_mod_t(type, a, val, b_lg2) \ 54 + ((((type)(a)) & (~log2_to_max_int_t(type, b_lg2))) | ((type)(val))) 55 + static_assert(log2_set_mod_t(unsigned int, 3, 1, 2) == 1); 56 + 57 + /* Return a value such that: 58 + * a / b == ret / b 59 + * ret % b == b - 1 60 + * aka set the low bits to all 1s 61 + */ 62 + #define log2_set_mod_max_t(type, a, b_lg2) \ 63 + (((type)(a)) | log2_to_max_int_t(type, b_lg2)) 64 + static_assert(log2_set_mod_max_t(unsigned int, 2, 2) == 3); 65 + 66 + /* Compute a * b */ 67 + #define log2_mul_t(type, a, b_lg2) ((type)(((type)a) << (b_lg2))) 68 + static_assert(log2_mul_t(unsigned int, 2, 2) == 8); 69 + 70 + #define _dispatch_sz(type, fn, a) \ 71 + (sizeof(type) == 4 ? fn##32((u32)a) : fn##64(a)) 72 + 73 + /* 74 + * Return the highest value such that: 75 + * fls_t(u32, 0) == 0 76 + * fls_t(u32, 1) == 1 77 + * a >= log2_to_int(ret - 1) 78 + * aka find last set bit 79 + */ 80 + static inline unsigned int fls32(u32 a) 81 + { 82 + return fls(a); 83 + } 84 + #define fls_t(type, a) _dispatch_sz(type, fls, a) 85 + 86 + /* 87 + * Return the highest value such that: 88 + * ffs_t(u32, 0) == UNDEFINED 89 + * ffs_t(u32, 1) == 0 90 + * log2_mod(a, ret) == 0 91 + * aka find first set bit 92 + */ 93 + static inline unsigned int __ffs32(u32 a) 94 + { 95 + return __ffs(a); 96 + } 97 + #define ffs_t(type, a) _dispatch_sz(type, __ffs, a) 98 + 99 + /* 100 + * Return the highest value such that: 101 + * ffz_t(u32, U32_MAX) == UNDEFINED 102 + * ffz_t(u32, 0) == 0 103 + * ffz_t(u32, 1) == 1 104 + * log2_mod(a, ret) == log2_to_max_int(ret) 105 + * aka find first zero bit 106 + */ 107 + static inline unsigned int ffz32(u32 a) 108 + { 109 + return ffz(a); 110 + } 111 + static inline unsigned int ffz64(u64 a) 112 + { 113 + if (sizeof(u64) == sizeof(unsigned long)) 114 + return ffz(a); 115 + 116 + if ((u32)a == U32_MAX) 117 + return ffz32(a >> 32) + 32; 118 + return ffz32(a); 119 + } 120 + #define ffz_t(type, a) _dispatch_sz(type,
ffz, a) 121 + 122 + #endif
+5 -1
drivers/iommu/intel/Kconfig
··· 13 13 bool "Support for Intel IOMMU using DMA Remapping Devices" 14 14 depends on PCI_MSI && ACPI && X86 15 15 select IOMMU_API 16 + select GENERIC_PT 17 + select IOMMU_PT 18 + select IOMMU_PT_X86_64 19 + select IOMMU_PT_VTDSS 16 20 select IOMMU_IOVA 17 21 select IOMMU_IOPF 18 22 select IOMMUFD_DRIVER if IOMMUFD ··· 70 66 71 67 config INTEL_IOMMU_FLOPPY_WA 72 68 def_bool y 73 - depends on X86 69 + depends on X86 && BLK_DEV_FD 74 70 help 75 71 Floppy disk drivers are known to bypass DMA API calls 76 72 thereby failing to work when IOMMU is enabled. This
+175 -756
drivers/iommu/intel/iommu.c
··· 45 45 46 46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57 47 47 48 - #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1) 49 - #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1) 50 - 51 - /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR 52 - to match. That way, we can use 'unsigned long' for PFNs with impunity. */ 53 - #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \ 54 - __DOMAIN_MAX_PFN(gaw), (unsigned long)-1)) 55 - #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT) 56 - 57 48 static void __init check_tylersburg_isoch(void); 49 + static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, 50 + bool enable); 58 51 static int rwbf_quirk; 59 52 60 53 #define rwbf_required(iommu) (rwbf_quirk || cap_rwbf((iommu)->cap)) ··· 210 217 #define IDENTMAP_AZALIA 4 211 218 212 219 const struct iommu_ops intel_iommu_ops; 213 - static const struct iommu_dirty_ops intel_dirty_ops; 214 220 215 221 static bool translation_pre_enabled(struct intel_iommu *iommu) 216 222 { ··· 277 285 } 278 286 __setup("intel_iommu=", intel_iommu_setup); 279 287 280 - static int domain_pfn_supported(struct dmar_domain *domain, unsigned long pfn) 281 - { 282 - int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; 283 - 284 - return !(addr_width < BITS_PER_LONG && pfn >> addr_width); 285 - } 286 - 287 288 /* 288 289 * Calculate the Supported Adjusted Guest Address Widths of an IOMMU. 289 290 * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of ··· 336 351 { 337 352 return sm_supported(iommu) ? 338 353 ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap); 339 - } 340 - 341 - /* Return the super pagesize bitmap if supported. 
 */
-static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
-{
-	unsigned long bitmap = 0;
-
-	/*
-	 * 1-level super page supports page size of 2MiB, 2-level super page
-	 * supports page size of both 2MiB and 1GiB.
-	 */
-	if (domain->iommu_superpage == 1)
-		bitmap |= SZ_2M;
-	else if (domain->iommu_superpage == 2)
-		bitmap |= SZ_2M | SZ_1G;
-
-	return bitmap;
 }
 
 struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
···
 	return iommu;
 }
 
-static void domain_flush_cache(struct dmar_domain *domain,
-			       void *addr, int size)
-{
-	if (!domain->iommu_coherency)
-		clflush_cache_range(addr, size);
-}
-
 static void free_context_table(struct intel_iommu *iommu)
 {
 	struct context_entry *context;
···
 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
 }
 #endif
-
-static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
-				      unsigned long pfn, int *target_level,
-				      gfp_t gfp)
-{
-	struct dma_pte *parent, *pte;
-	int level = agaw_to_level(domain->agaw);
-	int offset;
-
-	if (!domain_pfn_supported(domain, pfn))
-		/* Address beyond IOMMU's addressing capabilities. */
-		return NULL;
-
-	parent = domain->pgd;
-
-	while (1) {
-		void *tmp_page;
-
-		offset = pfn_level_offset(pfn, level);
-		pte = &parent[offset];
-		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
-			break;
-		if (level == *target_level)
-			break;
-
-		if (!dma_pte_present(pte)) {
-			uint64_t pteval, tmp;
-
-			tmp_page = iommu_alloc_pages_node_sz(domain->nid, gfp,
-							     SZ_4K);
-
-			if (!tmp_page)
-				return NULL;
-
-			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
-			pteval = virt_to_phys(tmp_page) | DMA_PTE_READ |
-				 DMA_PTE_WRITE;
-			if (domain->use_first_level)
-				pteval |= DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
-
-			tmp = 0ULL;
-			if (!try_cmpxchg64(&pte->val, &tmp, pteval))
-				/* Someone else set it while we were thinking; use theirs. */
-				iommu_free_pages(tmp_page);
-			else
-				domain_flush_cache(domain, pte, sizeof(*pte));
-		}
-		if (level == 1)
-			break;
-
-		parent = phys_to_virt(dma_pte_addr(pte));
-		level--;
-	}
-
-	if (!*target_level)
-		*target_level = level;
-
-	return pte;
-}
-
-/* return address's pte at specific level */
-static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
-					 unsigned long pfn,
-					 int level, int *large_page)
-{
-	struct dma_pte *parent, *pte;
-	int total = agaw_to_level(domain->agaw);
-	int offset;
-
-	parent = domain->pgd;
-	while (level <= total) {
-		offset = pfn_level_offset(pfn, total);
-		pte = &parent[offset];
-		if (level == total)
-			return pte;
-
-		if (!dma_pte_present(pte)) {
-			*large_page = total;
-			break;
-		}
-
-		if (dma_pte_superpage(pte)) {
-			*large_page = total;
-			return pte;
-		}
-
-		parent = phys_to_virt(dma_pte_addr(pte));
-		total--;
-	}
-	return NULL;
-}
-
-/* clear last level pte, a tlb flush should be followed */
-static void dma_pte_clear_range(struct dmar_domain *domain,
-				unsigned long start_pfn,
-				unsigned long last_pfn)
-{
-	unsigned int large_page;
-	struct dma_pte *first_pte, *pte;
-
-	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
-	    WARN_ON(start_pfn > last_pfn))
-		return;
-
-	/* we don't need lock here; nobody else touches the iova range */
-	do {
-		large_page = 1;
-		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
-		if (!pte) {
-			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
-			continue;
-		}
-		do {
-			dma_clear_pte(pte);
-			start_pfn += lvl_to_nr_pages(large_page);
-			pte++;
-		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
-
-		domain_flush_cache(domain, first_pte,
-				   (void *)pte - (void *)first_pte);
-
-	} while (start_pfn && start_pfn <= last_pfn);
-}
-
-static void dma_pte_free_level(struct dmar_domain *domain, int level,
-			       int retain_level, struct dma_pte *pte,
-			       unsigned long pfn, unsigned long start_pfn,
-			       unsigned long last_pfn)
-{
-	pfn = max(start_pfn, pfn);
-	pte = &pte[pfn_level_offset(pfn, level)];
-
-	do {
-		unsigned long level_pfn;
-		struct dma_pte *level_pte;
-
-		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
-			goto next;
-
-		level_pfn = pfn & level_mask(level);
-		level_pte = phys_to_virt(dma_pte_addr(pte));
-
-		if (level > 2) {
-			dma_pte_free_level(domain, level - 1, retain_level,
-					   level_pte, level_pfn, start_pfn,
-					   last_pfn);
-		}
-
-		/*
-		 * Free the page table if we're below the level we want to
-		 * retain and the range covers the entire table.
-		 */
-		if (level < retain_level && !(start_pfn > level_pfn ||
-		      last_pfn < level_pfn + level_size(level) - 1)) {
-			dma_clear_pte(pte);
-			domain_flush_cache(domain, pte, sizeof(*pte));
-			iommu_free_pages(level_pte);
-		}
-next:
-		pfn += level_size(level);
-	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
-}
-
-/*
- * clear last level (leaf) ptes and free page table pages below the
- * level we wish to keep intact.
- */
-static void dma_pte_free_pagetable(struct dmar_domain *domain,
-				   unsigned long start_pfn,
-				   unsigned long last_pfn,
-				   int retain_level)
-{
-	dma_pte_clear_range(domain, start_pfn, last_pfn);
-
-	/* We don't need lock here; nobody else touches the iova range */
-	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
-			   domain->pgd, 0, start_pfn, last_pfn);
-
-	/* free pgd */
-	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
-		iommu_free_pages(domain->pgd);
-		domain->pgd = NULL;
-	}
-}
-
-/* When a page at a given level is being unlinked from its parent, we don't
-   need to *modify* it at all. All we need to do is make a list of all the
-   pages which can be freed just as soon as we've flushed the IOTLB and we
-   know the hardware page-walk will no longer touch them.
-   The 'pte' argument is the *parent* PTE, pointing to the page that is to
-   be freed. */
-static void dma_pte_list_pagetables(struct dmar_domain *domain,
-				    int level, struct dma_pte *parent_pte,
-				    struct iommu_pages_list *freelist)
-{
-	struct dma_pte *pte = phys_to_virt(dma_pte_addr(parent_pte));
-
-	iommu_pages_list_add(freelist, pte);
-
-	if (level == 1)
-		return;
-
-	do {
-		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
-			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
-		pte++;
-	} while (!first_pte_in_page(pte));
-}
-
-static void dma_pte_clear_level(struct dmar_domain *domain, int level,
-				struct dma_pte *pte, unsigned long pfn,
-				unsigned long start_pfn, unsigned long last_pfn,
-				struct iommu_pages_list *freelist)
-{
-	struct dma_pte *first_pte = NULL, *last_pte = NULL;
-
-	pfn = max(start_pfn, pfn);
-	pte = &pte[pfn_level_offset(pfn, level)];
-
-	do {
-		unsigned long level_pfn = pfn & level_mask(level);
-
-		if (!dma_pte_present(pte))
-			goto next;
-
-		/* If range covers entire pagetable, free it */
-		if (start_pfn <= level_pfn &&
-		    last_pfn >= level_pfn + level_size(level) - 1) {
-			/* These suborbinate page tables are going away entirely. Don't
-			   bother to clear them; we're just going to *free* them. */
-			if (level > 1 && !dma_pte_superpage(pte))
-				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
-
-			dma_clear_pte(pte);
-			if (!first_pte)
-				first_pte = pte;
-			last_pte = pte;
-		} else if (level > 1) {
-			/* Recurse down into a level that isn't *entirely* obsolete */
-			dma_pte_clear_level(domain, level - 1,
-					    phys_to_virt(dma_pte_addr(pte)),
-					    level_pfn, start_pfn, last_pfn,
-					    freelist);
-		}
-next:
-		pfn = level_pfn + level_size(level);
-	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
-
-	if (first_pte)
-		domain_flush_cache(domain, first_pte,
-				   (void *)++last_pte - (void *)first_pte);
-}
-
-/* We can't just free the pages because the IOMMU may still be walking
-   the page tables, and may have cached the intermediate levels. The
-   pages can only be freed after the IOTLB flush has been done. */
-static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
-			 unsigned long last_pfn,
-			 struct iommu_pages_list *freelist)
-{
-	if (WARN_ON(!domain_pfn_supported(domain, last_pfn)) ||
-	    WARN_ON(start_pfn > last_pfn))
-		return;
-
-	/* we don't need lock here; nobody else touches the iova range */
-	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
-			    domain->pgd, 0, start_pfn, last_pfn, freelist);
-
-	/* free pgd */
-	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
-		iommu_pages_list_add(freelist, domain->pgd);
-		domain->pgd = NULL;
-	}
-}
 
 /* iommu handling */
 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
···
 		domain_lookup_dev_info(domain, iommu, bus, devfn);
 	u16 did = domain_id_iommu(domain, iommu);
 	int translation = CONTEXT_TT_MULTI_LEVEL;
-	struct dma_pte *pgd = domain->pgd;
+	struct pt_iommu_vtdss_hw_info pt_info;
 	struct context_entry *context;
 	int ret;
 
 	if (WARN_ON(!intel_domain_is_ss_paging(domain)))
 		return -EINVAL;
+
+	pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info);
 
 	pr_debug("Set context mapping for %02x:%02x.%d\n",
 		 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
···
 	else
 		translation = CONTEXT_TT_MULTI_LEVEL;
 
-	context_set_address_root(context, virt_to_phys(pgd));
-	context_set_address_width(context, domain->agaw);
+	context_set_address_root(context, pt_info.ssptptr);
+	context_set_address_width(context, pt_info.aw);
 	context_set_translation_type(context, translation);
 	context_set_fault_enable(context);
 	context_set_present(context);
···
 		return ret;
 
 	iommu_enable_pci_ats(info);
-
-	return 0;
-}
-
-/* Return largest possible superpage level for a given mapping */
-static int hardware_largepage_caps(struct dmar_domain *domain, unsigned long iov_pfn,
-				   unsigned long phy_pfn, unsigned long pages)
-{
-	int support, level = 1;
-	unsigned long pfnmerge;
-
-	support = domain->iommu_superpage;
-
-	/* To use a large page, the virtual *and* physical addresses
-	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
-	   of them will mean we have to use smaller pages. So just
-	   merge them and check both at once. */
-	pfnmerge = iov_pfn | phy_pfn;
-
-	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
-		pages >>= VTD_STRIDE_SHIFT;
-		if (!pages)
-			break;
-		pfnmerge >>= VTD_STRIDE_SHIFT;
-		level++;
-		support--;
-	}
-	return level;
-}
-
-/*
- * Ensure that old small page tables are removed to make room for superpage(s).
- * We're going to add new large pages, so make sure we don't remove their parent
- * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
- */
-static void switch_to_super_page(struct dmar_domain *domain,
-				 unsigned long start_pfn,
-				 unsigned long end_pfn, int level)
-{
-	unsigned long lvl_pages = lvl_to_nr_pages(level);
-	struct dma_pte *pte = NULL;
-
-	if (WARN_ON(!IS_ALIGNED(start_pfn, lvl_pages) ||
-		    !IS_ALIGNED(end_pfn + 1, lvl_pages)))
-		return;
-
-	while (start_pfn <= end_pfn) {
-		if (!pte)
-			pte = pfn_to_dma_pte(domain, start_pfn, &level,
-					     GFP_ATOMIC);
-
-		if (dma_pte_present(pte)) {
-			dma_pte_free_pagetable(domain, start_pfn,
-					       start_pfn + lvl_pages - 1,
-					       level + 1);
-
-			cache_tag_flush_range(domain, start_pfn << VTD_PAGE_SHIFT,
-					      end_pfn << VTD_PAGE_SHIFT, 0);
-		}
-
-		pte++;
-		start_pfn += lvl_pages;
-		if (first_pte_in_page(pte))
-			pte = NULL;
-	}
-}
-
-static int
-__domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
-		 unsigned long phys_pfn, unsigned long nr_pages, int prot,
-		 gfp_t gfp)
-{
-	struct dma_pte *first_pte = NULL, *pte = NULL;
-	unsigned int largepage_lvl = 0;
-	unsigned long lvl_pages = 0;
-	phys_addr_t pteval;
-	u64 attr;
-
-	if (unlikely(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1)))
-		return -EINVAL;
-
-	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
-		return -EINVAL;
-
-	if (!(prot & DMA_PTE_WRITE) && domain->nested_parent) {
-		pr_err_ratelimited("Read-only mapping is disallowed on the domain which serves as the parent in a nested configuration, due to HW errata (ERRATA_772415_SPR17)\n");
-		return -EINVAL;
-	}
-
-	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
-	if (domain->use_first_level) {
-		attr |= DMA_FL_PTE_PRESENT | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
-		if (prot & DMA_PTE_WRITE)
-			attr |= DMA_FL_PTE_DIRTY;
-	}
-
-	domain->has_mappings = true;
-
-	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
-
-	while (nr_pages > 0) {
-		uint64_t tmp;
-
-		if (!pte) {
-			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
-								phys_pfn, nr_pages);
-
-			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl,
-					     gfp);
-			if (!pte)
-				return -ENOMEM;
-			first_pte = pte;
-
-			lvl_pages = lvl_to_nr_pages(largepage_lvl);
-
-			/* It is large page*/
-			if (largepage_lvl > 1) {
-				unsigned long end_pfn;
-				unsigned long pages_to_remove;
-
-				pteval |= DMA_PTE_LARGE_PAGE;
-				pages_to_remove = min_t(unsigned long,
-							round_down(nr_pages, lvl_pages),
-							nr_pte_to_next_page(pte) * lvl_pages);
-				end_pfn = iov_pfn + pages_to_remove - 1;
-				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
-			} else {
-				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
-			}
-
-		}
-		/* We don't need lock here, nobody else
-		 * touches the iova range
-		 */
-		tmp = 0ULL;
-		if (!try_cmpxchg64_local(&pte->val, &tmp, pteval)) {
-			static int dumps = 5;
-			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
-				iov_pfn, tmp, (unsigned long long)pteval);
-			if (dumps) {
-				dumps--;
-				debug_dma_dump_mappings(NULL);
-			}
-			WARN_ON(1);
-		}
-
-		nr_pages -= lvl_pages;
-		iov_pfn += lvl_pages;
-		phys_pfn += lvl_pages;
-		pteval += lvl_pages * VTD_PAGE_SIZE;
-
-		/* If the next PTE would be the first in a new page, then we
-		 * need to flush the cache on the entries we've just written.
-		 * And then we'll need to recalculate 'pte', so clear it and
-		 * let it get set again in the if (!pte) block above.
-		 *
-		 * If we're done (!nr_pages) we need to flush the cache too.
-		 *
-		 * Also if we've been setting superpages, we may need to
-		 * recalculate 'pte' and switch back to smaller pages for the
-		 * end of the mapping, if the trailing size is not enough to
-		 * use another superpage (i.e. nr_pages < lvl_pages).
-		 */
-		pte++;
-		if (!nr_pages || first_pte_in_page(pte) ||
-		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
-			domain_flush_cache(domain, first_pte,
-					   (void *)pte - (void *)first_pte);
-			pte = NULL;
-		}
-	}
 
 	return 0;
 }
···
 				     struct device *dev,
 				     u32 pasid, struct iommu_domain *old)
 {
-	struct dma_pte *pgd = domain->pgd;
-	int level, flags = 0;
+	struct pt_iommu_x86_64_hw_info pt_info;
+	unsigned int flags = 0;
 
-	level = agaw_to_level(domain->agaw);
-	if (level != 4 && level != 5)
+	pt_iommu_x86_64_hw_info(&domain->fspt, &pt_info);
+	if (WARN_ON(pt_info.levels != 4 && pt_info.levels != 5))
 		return -EINVAL;
 
-	if (level == 5)
+	if (pt_info.levels == 5)
 		flags |= PASID_FLAG_FL5LP;
 
 	if (domain->force_snooping)
 		flags |= PASID_FLAG_PAGE_SNOOP;
 
+	if (!(domain->fspt.x86_64_pt.common.features &
+	      BIT(PT_FEAT_DMA_INCOHERENT)))
+		flags |= PASID_FLAG_PWSNP;
+
 	return __domain_setup_first_level(iommu, dev, pasid,
 					  domain_id_iommu(domain, iommu),
-					  __pa(pgd), flags, old);
+					  pt_info.gcr3_pt, flags, old);
 }
 
 static int dmar_domain_attach_device(struct dmar_domain *domain,
···
 }
 
 static int blocking_domain_attach_dev(struct iommu_domain *domain,
-				      struct device *dev)
+				      struct device *dev,
+				      struct iommu_domain *old)
 {
 	struct device_domain_info *info = dev_iommu_priv_get(dev);
···
 	}
 };
 
-static int iommu_superpage_capability(struct intel_iommu *iommu, bool first_stage)
+static struct dmar_domain *paging_domain_alloc(void)
 {
-	if (!intel_iommu_superpage)
-		return 0;
-
-	if (first_stage)
-		return cap_fl1gp_support(iommu->cap) ? 2 : 1;
-
-	return fls(cap_super_page_val(iommu->cap));
-}
-
-static struct dmar_domain *paging_domain_alloc(struct device *dev, bool first_stage)
-{
-	struct device_domain_info *info = dev_iommu_priv_get(dev);
-	struct intel_iommu *iommu = info->iommu;
 	struct dmar_domain *domain;
-	int addr_width;
 
 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
 	if (!domain)
···
 	INIT_LIST_HEAD(&domain->s1_domains);
 	spin_lock_init(&domain->s1_lock);
 
-	domain->nid = dev_to_node(dev);
-	domain->use_first_level = first_stage;
+	return domain;
+}
 
-	domain->domain.type = IOMMU_DOMAIN_UNMANAGED;
-
-	/* calculate the address width */
-	addr_width = agaw_to_width(iommu->agaw);
-	if (addr_width > cap_mgaw(iommu->cap))
-		addr_width = cap_mgaw(iommu->cap);
-	domain->gaw = addr_width;
-	domain->agaw = iommu->agaw;
-	domain->max_addr = __DOMAIN_MAX_ADDR(addr_width);
-
-	/* iommu memory access coherency */
-	domain->iommu_coherency = iommu_paging_structure_coherency(iommu);
-
-	/* pagesize bitmap */
-	domain->domain.pgsize_bitmap = SZ_4K;
-	domain->iommu_superpage = iommu_superpage_capability(iommu, first_stage);
-	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
+static unsigned int compute_vasz_lg2_fs(struct intel_iommu *iommu,
+					unsigned int *top_level)
+{
+	unsigned int mgaw = cap_mgaw(iommu->cap);
 
 	/*
-	 * IOVA aperture: First-level translation restricts the input-address
-	 * to a canonical address (i.e., address bits 63:N have the same value
-	 * as address bit [N-1], where N is 48-bits with 4-level paging and
-	 * 57-bits with 5-level paging). Hence, skip bit [N-1].
+	 * Spec 3.6 First-Stage Translation:
+	 *
+	 * Software must limit addresses to less than the minimum of MGAW
+	 * and the lower canonical address width implied by FSPM (i.e.,
+	 * 47-bit when FSPM is 4-level and 56-bit when FSPM is 5-level).
 	 */
-	domain->domain.geometry.force_aperture = true;
-	domain->domain.geometry.aperture_start = 0;
-	if (first_stage)
-		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
-	else
-		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
-
-	/* always allocate the top pgd */
-	domain->pgd = iommu_alloc_pages_node_sz(domain->nid, GFP_KERNEL, SZ_4K);
-	if (!domain->pgd) {
-		kfree(domain);
-		return ERR_PTR(-ENOMEM);
+	if (mgaw > 48 && cap_fl5lp_support(iommu->cap)) {
+		*top_level = 4;
+		return min(57, mgaw);
 	}
-	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
 
-	return domain;
+	/* Four level is always supported */
+	*top_level = 3;
+	return min(48, mgaw);
 }
 
 static struct iommu_domain *
 intel_iommu_domain_alloc_first_stage(struct device *dev,
 				     struct intel_iommu *iommu, u32 flags)
 {
+	struct pt_iommu_x86_64_cfg cfg = {};
 	struct dmar_domain *dmar_domain;
+	int ret;
 
 	if (flags & ~IOMMU_HWPT_ALLOC_PASID)
 		return ERR_PTR(-EOPNOTSUPP);
···
 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
 		return ERR_PTR(-EOPNOTSUPP);
 
-	dmar_domain = paging_domain_alloc(dev, true);
+	dmar_domain = paging_domain_alloc();
 	if (IS_ERR(dmar_domain))
 		return ERR_CAST(dmar_domain);
 
+	cfg.common.hw_max_vasz_lg2 =
+		compute_vasz_lg2_fs(iommu, &cfg.top_level);
+	cfg.common.hw_max_oasz_lg2 = 52;
+	cfg.common.features = BIT(PT_FEAT_SIGN_EXTEND) |
+			      BIT(PT_FEAT_FLUSH_RANGE);
+	/* First stage always uses scalable mode */
+	if (!ecap_smpwc(iommu->ecap))
+		cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT);
+	dmar_domain->iommu.iommu_device = dev;
+	dmar_domain->iommu.nid = dev_to_node(dev);
 	dmar_domain->domain.ops = &intel_fs_paging_domain_ops;
 	/*
 	 * iotlb sync for map is only needed for legacy implementations that
···
 	if (rwbf_required(iommu))
 		dmar_domain->iotlb_sync_map = true;
 
+	ret = pt_iommu_x86_64_init(&dmar_domain->fspt, &cfg, GFP_KERNEL);
+	if (ret) {
+		kfree(dmar_domain);
+		return ERR_PTR(ret);
+	}
+
+	if (!cap_fl1gp_support(iommu->cap))
+		dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G;
+	if (!intel_iommu_superpage)
+		dmar_domain->domain.pgsize_bitmap = SZ_4K;
+
 	return &dmar_domain->domain;
 }
+
+static unsigned int compute_vasz_lg2_ss(struct intel_iommu *iommu,
+					unsigned int *top_level)
+{
+	unsigned int sagaw = cap_sagaw(iommu->cap);
+	unsigned int mgaw = cap_mgaw(iommu->cap);
+
+	/*
+	 * Find the largest table size that both the mgaw and sagaw support.
+	 * This sets the valid range of IOVA and the top starting level.
+	 * Some HW may only support a 4 or 5 level walk but must limit IOVA to
+	 * 3 levels.
+	 */
+	if (mgaw > 48 && sagaw >= BIT(3)) {
+		*top_level = 4;
+		return min(57, mgaw);
+	} else if (mgaw > 39 && sagaw >= BIT(2)) {
+		*top_level = 3 + ffs(sagaw >> 3);
+		return min(48, mgaw);
+	} else if (mgaw > 30 && sagaw >= BIT(1)) {
+		*top_level = 2 + ffs(sagaw >> 2);
+		return min(39, mgaw);
+	}
+	return 0;
+}
+
+static const struct iommu_dirty_ops intel_second_stage_dirty_ops = {
+	IOMMU_PT_DIRTY_OPS(vtdss),
+	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
+};
 
 static struct iommu_domain *
 intel_iommu_domain_alloc_second_stage(struct device *dev,
 				      struct intel_iommu *iommu, u32 flags)
 {
+	struct pt_iommu_vtdss_cfg cfg = {};
 	struct dmar_domain *dmar_domain;
+	unsigned int sslps;
+	int ret;
 
 	if (flags &
 	    (~(IOMMU_HWPT_ALLOC_NEST_PARENT | IOMMU_HWPT_ALLOC_DIRTY_TRACKING |
···
 	if (sm_supported(iommu) && !ecap_slts(iommu->ecap))
 		return ERR_PTR(-EOPNOTSUPP);
 
-	dmar_domain = paging_domain_alloc(dev, false);
+	dmar_domain = paging_domain_alloc();
 	if (IS_ERR(dmar_domain))
 		return ERR_CAST(dmar_domain);
 
+	cfg.common.hw_max_vasz_lg2 = compute_vasz_lg2_ss(iommu, &cfg.top_level);
+	cfg.common.hw_max_oasz_lg2 = 52;
+	cfg.common.features = BIT(PT_FEAT_FLUSH_RANGE);
+
+	/*
+	 * Read-only mapping is disallowed on the domain which serves as the
+	 * parent in a nested configuration, due to HW errata
+	 * (ERRATA_772415_SPR17)
+	 */
+	if (flags & IOMMU_HWPT_ALLOC_NEST_PARENT)
+		cfg.common.features |= BIT(PT_FEAT_VTDSS_FORCE_WRITEABLE);
+
+	if (!iommu_paging_structure_coherency(iommu))
+		cfg.common.features |= BIT(PT_FEAT_DMA_INCOHERENT);
+	dmar_domain->iommu.iommu_device = dev;
+	dmar_domain->iommu.nid = dev_to_node(dev);
 	dmar_domain->domain.ops = &intel_ss_paging_domain_ops;
 	dmar_domain->nested_parent = flags & IOMMU_HWPT_ALLOC_NEST_PARENT;
 
 	if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING)
-		dmar_domain->domain.dirty_ops = &intel_dirty_ops;
+		dmar_domain->domain.dirty_ops = &intel_second_stage_dirty_ops;
+
+	ret = pt_iommu_vtdss_init(&dmar_domain->sspt, &cfg, GFP_KERNEL);
+	if (ret) {
+		kfree(dmar_domain);
+		return ERR_PTR(ret);
+	}
+
+	/* Adjust the supported page sizes to HW capability */
+	sslps = cap_super_page_val(iommu->cap);
+	if (!(sslps & BIT(0)))
+		dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_2M;
+	if (!(sslps & BIT(1)))
+		dmar_domain->domain.pgsize_bitmap &= ~(u64)SZ_1G;
+	if (!intel_iommu_superpage)
+		dmar_domain->domain.pgsize_bitmap = SZ_4K;
 
 	/*
 	 * Besides the internal write buffer flush, the caching mode used for
···
 	if (WARN_ON(!list_empty(&dmar_domain->devices)))
 		return;
 
-	if (dmar_domain->pgd) {
-		struct iommu_pages_list freelist =
-			IOMMU_PAGES_LIST_INIT(freelist);
-
-		domain_unmap(dmar_domain, 0, DOMAIN_MAX_PFN(dmar_domain->gaw),
-			     &freelist);
-		iommu_put_pages_list(&freelist);
-	}
+	pt_iommu_deinit(&dmar_domain->iommu);
 
 	kfree(dmar_domain->qi_batch);
 	kfree(dmar_domain);
···
 
 	/* Only SL is available in legacy mode */
 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
 		return -EINVAL;
+
+	if (!ecap_smpwc(iommu->ecap) &&
+	    !(dmar_domain->fspt.x86_64_pt.common.features &
+	      BIT(PT_FEAT_DMA_INCOHERENT)))
+		return -EINVAL;
+
+	/* Supports the number of table levels */
+	if (!cap_fl5lp_support(iommu->cap) &&
+	    dmar_domain->fspt.x86_64_pt.common.max_vasz_lg2 > 48)
+		return -EINVAL;
 
 	/* Same page size support */
···
 paging_domain_compatible_second_stage(struct dmar_domain *dmar_domain,
 				      struct intel_iommu *iommu)
 {
+	unsigned int vasz_lg2 = dmar_domain->sspt.vtdss_pt.common.max_vasz_lg2;
 	unsigned int sslps = cap_super_page_val(iommu->cap);
+	struct pt_iommu_vtdss_hw_info pt_info;
+
+	pt_iommu_vtdss_hw_info(&dmar_domain->sspt, &pt_info);
 
 	if (dmar_domain->domain.dirty_ops && !ssads_supported(iommu))
 		return -EINVAL;
···
 
 	/* Legacy mode always supports second stage */
 	if (sm_supported(iommu) && !ecap_slts(iommu->ecap))
 		return -EINVAL;
+
+	if (!iommu_paging_structure_coherency(iommu) &&
+	    !(dmar_domain->sspt.vtdss_pt.common.features &
+	      BIT(PT_FEAT_DMA_INCOHERENT)))
+		return -EINVAL;
+
+	/* Address width falls within the capability */
+	if (cap_mgaw(iommu->cap) < vasz_lg2)
+		return -EINVAL;
+
+	/* Page table level is supported. */
+	if (!(cap_sagaw(iommu->cap) & BIT(pt_info.aw)))
+		return -EINVAL;
 
 	/* Same page size support */
···
 	    !dmar_domain->iotlb_sync_map)
 		return -EINVAL;
 
+	/*
+	 * FIXME this is locked wrong, it needs to be under the
+	 * dmar_domain->lock
+	 */
+	if ((dmar_domain->sspt.vtdss_pt.common.features &
+	     BIT(PT_FEAT_VTDSS_FORCE_COHERENCE)) &&
+	    !ecap_sc_support(iommu->ecap))
+		return -EINVAL;
 	return 0;
 }
···
 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
 	struct intel_iommu *iommu = info->iommu;
 	int ret = -EINVAL;
-	int addr_width;
 
 	if (intel_domain_is_fs_paging(dmar_domain))
 		ret = paging_domain_compatible_first_stage(dmar_domain, iommu);
···
 	if (ret)
 		return ret;
 
-	/*
-	 * FIXME this is locked wrong, it needs to be under the
-	 * dmar_domain->lock
-	 */
-	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
-		return -EINVAL;
-
-	if (dmar_domain->iommu_coherency !=
-	    iommu_paging_structure_coherency(iommu))
-		return -EINVAL;
-
-
-	/* check if this iommu agaw is sufficient for max mapped address */
-	addr_width = agaw_to_width(iommu->agaw);
-	if (addr_width > cap_mgaw(iommu->cap))
-		addr_width = cap_mgaw(iommu->cap);
-
-	if (dmar_domain->gaw > addr_width || dmar_domain->agaw > iommu->agaw)
-		return -EINVAL;
-
 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev) &&
 	    context_copied(iommu, info->bus, info->devfn))
 		return intel_pasid_setup_sm_context(dev);
···
 }
 
 static int intel_iommu_attach_device(struct iommu_domain *domain,
-				     struct device *dev)
+				     struct device *dev,
+				     struct iommu_domain *old)
 {
 	int ret;
···
 	return ret;
 }
 
-static int intel_iommu_map(struct iommu_domain *domain,
-			   unsigned long iova, phys_addr_t hpa,
-			   size_t size, int iommu_prot, gfp_t gfp)
-{
-	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
-	u64 max_addr;
-	int prot = 0;
-
-	if (iommu_prot & IOMMU_READ)
-		prot |= DMA_PTE_READ;
-	if (iommu_prot & IOMMU_WRITE)
-		prot |= DMA_PTE_WRITE;
-	if (dmar_domain->set_pte_snp)
-		prot |= DMA_PTE_SNP;
-
-	max_addr = iova + size;
-	if (dmar_domain->max_addr < max_addr) {
-		u64 end;
-
-		/* check if minimum agaw is sufficient for mapped address */
-		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
-		if (end < max_addr) {
-			pr_err("%s: iommu width (%d) is not "
-			       "sufficient for the mapped address (%llx)\n",
-			       __func__, dmar_domain->gaw, max_addr);
-			return -EFAULT;
-		}
-		dmar_domain->max_addr = max_addr;
-	}
-	/* Round up size to next multiple of PAGE_SIZE, if it and
-	   the low bits of hpa would take us onto the next page */
-	size = aligned_nrpages(hpa, size);
-	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
-				hpa >> VTD_PAGE_SHIFT, size, prot, gfp);
-}
-
-static int intel_iommu_map_pages(struct iommu_domain *domain,
-				 unsigned long iova, phys_addr_t paddr,
-				 size_t pgsize, size_t pgcount,
-				 int prot, gfp_t gfp, size_t *mapped)
-{
-	unsigned long pgshift = __ffs(pgsize);
-	size_t size = pgcount << pgshift;
-	int ret;
-
-	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
-		return -EINVAL;
-
-	if (!IS_ALIGNED(iova | paddr, pgsize))
-		return -EINVAL;
-
-	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
-	if (!ret && mapped)
-		*mapped = size;
-
-	return ret;
-}
-
-static size_t intel_iommu_unmap(struct iommu_domain *domain,
-				unsigned long iova, size_t size,
-				struct iommu_iotlb_gather *gather)
-{
-	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
-	unsigned long start_pfn, last_pfn;
-	int level = 0;
-
-	/* Cope with horrid API which requires us to unmap more than the
-	   size argument if it happens to be a large-page mapping. */
-	if (unlikely(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT,
-				     &level, GFP_ATOMIC)))
-		return 0;
-
-	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
-		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
-
-	start_pfn = iova >> VTD_PAGE_SHIFT;
-	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
-
-	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
-
-	if (dmar_domain->max_addr == iova + size)
-		dmar_domain->max_addr = iova;
-
-	/*
-	 * We do not use page-selective IOTLB invalidation in flush queue,
-	 * so there is no need to track page and sync iotlb.
-	 */
-	if (!iommu_iotlb_gather_queued(gather))
-		iommu_iotlb_gather_add_page(domain, gather, iova, size);
-
-	return size;
-}
-
-static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
-				      unsigned long iova,
-				      size_t pgsize, size_t pgcount,
-				      struct iommu_iotlb_gather *gather)
-{
-	unsigned long pgshift = __ffs(pgsize);
-	size_t size = pgcount << pgshift;
-
-	return intel_iommu_unmap(domain, iova, size, gather);
-}
-
 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
 				 struct iommu_iotlb_gather *gather)
 {
···
 					      gather->end,
 					      iommu_pages_list_empty(&gather->freelist));
 	iommu_put_pages_list(&gather->freelist);
-}
-
-static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
-					    dma_addr_t iova)
-{
-	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
-	struct dma_pte *pte;
-	int level = 0;
-	u64 phys = 0;
-
-	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level,
-			     GFP_ATOMIC);
-	if (pte && dma_pte_present(pte))
-		phys = dma_pte_addr(pte) +
-			(iova & (BIT_MASK(level_to_offset_bits(level) +
-					  VTD_PAGE_SHIFT) - 1));
-
-	return phys;
 }
 
 static bool domain_support_force_snooping(struct dmar_domain *domain)
···
 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
 
 	guard(spinlock_irqsave)(&dmar_domain->lock);
-	if (!domain_support_force_snooping(dmar_domain) ||
-	    dmar_domain->has_mappings)
+	if (!domain_support_force_snooping(dmar_domain))
 		return false;
 
 	/*
 	 * Second level page table supports per-PTE snoop control. The
 	 * iommu_map() interface will handle this by setting SNP bit.
 	 */
-	dmar_domain->set_pte_snp = true;
+	dmar_domain->sspt.vtdss_pt.common.features |=
+		BIT(PT_FEAT_VTDSS_FORCE_COHERENCE);
 	dmar_domain->force_snooping = true;
 	return true;
 }
···
 	return ret;
 }
 
-static int intel_iommu_read_and_clear_dirty(struct iommu_domain *domain,
-					    unsigned long iova, size_t size,
-					    unsigned long flags,
-					    struct iommu_dirty_bitmap *dirty)
-{
-	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
-	unsigned long end = iova + size - 1;
-	unsigned long pgsize;
-
-	/*
-	 * IOMMUFD core calls into a dirty tracking disabled domain without an
-	 * IOVA bitmap set in order to clean dirty bits in all PTEs that might
-	 * have occurred when we stopped dirty tracking. This ensures that we
-	 * never inherit dirtied bits from a previous cycle.
-	 */
-	if (!dmar_domain->dirty_tracking && dirty->bitmap)
-		return -EINVAL;
-
-	do {
-		struct dma_pte *pte;
-		int lvl = 0;
-
-		pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &lvl,
-				     GFP_ATOMIC);
-		pgsize = level_size(lvl) << VTD_PAGE_SHIFT;
-		if (!pte || !dma_pte_present(pte)) {
-			iova += pgsize;
-			continue;
-		}
-
-		if (dma_sl_pte_test_and_clear_dirty(pte, flags))
-			iommu_dirty_bitmap_record(dirty, iova, pgsize);
-		iova += pgsize;
-	} while (iova < end);
-
-	return 0;
-}
-
-static const struct iommu_dirty_ops intel_dirty_ops = {
-	.set_dirty_tracking = intel_iommu_set_dirty_tracking,
-	.read_and_clear_dirty = intel_iommu_read_and_clear_dirty,
-};
-
 static int context_setup_pass_through(struct device *dev, u8 bus, u8 devfn)
 {
 	struct device_domain_info *info = dev_iommu_priv_get(dev);
···
 					  context_setup_pass_through_cb, dev);
 }
 
-static int identity_domain_attach_dev(struct iommu_domain *domain, struct device *dev)
+static int identity_domain_attach_dev(struct iommu_domain *domain,
+				      struct device *dev,
+				      struct iommu_domain *old)
 {
 	struct device_domain_info *info = dev_iommu_priv_get(dev);
 	struct intel_iommu *iommu = info->iommu;
···
 };
 
 const struct iommu_domain_ops intel_fs_paging_domain_ops = {
+	IOMMU_PT_DOMAIN_OPS(x86_64),
 	.attach_dev = intel_iommu_attach_device,
 	.set_dev_pasid = intel_iommu_set_dev_pasid,
-	.map_pages = intel_iommu_map_pages,
-	.unmap_pages = intel_iommu_unmap_pages,
 	.iotlb_sync_map = intel_iommu_iotlb_sync_map,
 	.flush_iotlb_all = intel_flush_iotlb_all,
 	.iotlb_sync = intel_iommu_tlb_sync,
-	.iova_to_phys = intel_iommu_iova_to_phys,
 	.free = intel_iommu_domain_free,
 	.enforce_cache_coherency = intel_iommu_enforce_cache_coherency_fs,
 };
 
 const struct iommu_domain_ops intel_ss_paging_domain_ops = {
+	IOMMU_PT_DOMAIN_OPS(vtdss),
 	.attach_dev = intel_iommu_attach_device,
 	.set_dev_pasid = intel_iommu_set_dev_pasid,
-	.map_pages = intel_iommu_map_pages,
-	.unmap_pages = intel_iommu_unmap_pages,
 	.iotlb_sync_map = intel_iommu_iotlb_sync_map,
 	.flush_iotlb_all = intel_flush_iotlb_all,
 	.iotlb_sync = intel_iommu_tlb_sync,
-	.iova_to_phys = intel_iommu_iova_to_phys,
 	.free = intel_iommu_domain_free,
 	.enforce_cache_coherency = intel_iommu_enforce_cache_coherency_ss,
 };
···
 
 	return ret;
 }
+
+MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
+15 -84
drivers/iommu/intel/iommu.h
··· 23 23 #include <linux/xarray.h> 24 24 #include <linux/perf_event.h> 25 25 #include <linux/pci.h> 26 + #include <linux/generic_pt/iommu.h> 26 27 27 - #include <asm/cacheflush.h> 28 28 #include <asm/iommu.h> 29 29 #include <uapi/linux/iommufd.h> 30 30 ··· 595 595 }; 596 596 597 597 struct dmar_domain { 598 - int nid; /* node id */ 598 + union { 599 + struct iommu_domain domain; 600 + struct pt_iommu iommu; 601 + /* First stage page table */ 602 + struct pt_iommu_x86_64 fspt; 603 + /* Second stage page table */ 604 + struct pt_iommu_vtdss sspt; 605 + }; 606 + 599 607 struct xarray iommu_array; /* Attached IOMMU array */ 600 608 601 - u8 iommu_coherency: 1; /* indicate coherency of iommu access */ 602 - u8 force_snooping : 1; /* Create IOPTEs with snoop control */ 603 - u8 set_pte_snp:1; 604 - u8 use_first_level:1; /* DMA translation for the domain goes 605 - * through the first level page table, 606 - * otherwise, goes through the second 607 - * level. 608 - */ 609 + u8 force_snooping:1; /* Create PASID entry with snoop control */ 609 610 u8 dirty_tracking:1; /* Dirty tracking is enabled */ 610 611 u8 nested_parent:1; /* Has other domains nested on it */ 611 - u8 has_mappings:1; /* Has mappings configured through 612 - * iommu_map() interface. 613 - */ 614 612 u8 iotlb_sync_map:1; /* Need to flush IOTLB cache or write 615 613 * buffer when creating mappings. 
616 614 */ ··· 621 623 struct list_head cache_tags; /* Cache tag list */ 622 624 struct qi_batch *qi_batch; /* Batched QI descriptors */ 623 625 624 - int iommu_superpage;/* Level of superpages supported: 625 - 0 == 4KiB (no superpages), 1 == 2MiB, 626 - 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */ 627 626 union { 628 627 /* DMA remapping domain */ 629 628 struct { 630 - /* virtual address */ 631 - struct dma_pte *pgd; 632 - /* max guest address width */ 633 - int gaw; 634 - /* 635 - * adjusted guest address width: 636 - * 0: level 2 30-bit 637 - * 1: level 3 39-bit 638 - * 2: level 4 48-bit 639 - * 3: level 5 57-bit 640 - */ 641 - int agaw; 642 - /* maximum mapped address */ 643 - u64 max_addr; 644 629 /* Protect the s1_domains list */ 645 630 spinlock_t s1_lock; 646 631 /* Track s1_domains nested on this domain */ ··· 645 664 struct mmu_notifier notifier; 646 665 }; 647 666 }; 648 - 649 - struct iommu_domain domain; /* generic domain data structure for 650 - iommu core */ 651 667 }; 668 + PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, iommu, domain); 669 + PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, sspt.iommu, domain); 670 + PT_IOMMU_CHECK_DOMAIN(struct dmar_domain, fspt.iommu, domain); 652 671 653 672 /* 654 673 * In theory, the VT-d 4.0 spec can support up to 2 ^ 16 counters. 
··· 847 866 u64 val; 848 867 }; 849 868 850 - static inline void dma_clear_pte(struct dma_pte *pte) 851 - { 852 - pte->val = 0; 853 - } 854 - 855 869 static inline u64 dma_pte_addr(struct dma_pte *pte) 856 870 { 857 871 #ifdef CONFIG_64BIT ··· 862 886 return (pte->val & 3) != 0; 863 887 } 864 888 865 - static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte, 866 - unsigned long flags) 867 - { 868 - if (flags & IOMMU_DIRTY_NO_CLEAR) 869 - return (pte->val & DMA_SL_PTE_DIRTY) != 0; 870 - 871 - return test_and_clear_bit(DMA_SL_PTE_DIRTY_BIT, 872 - (unsigned long *)&pte->val); 873 - } 874 - 875 889 static inline bool dma_pte_superpage(struct dma_pte *pte) 876 890 { 877 891 return (pte->val & DMA_PTE_LARGE_PAGE); 878 - } 879 - 880 - static inline bool first_pte_in_page(struct dma_pte *pte) 881 - { 882 - return IS_ALIGNED((unsigned long)pte, VTD_PAGE_SIZE); 883 - } 884 - 885 - static inline int nr_pte_to_next_page(struct dma_pte *pte) 886 - { 887 - return first_pte_in_page(pte) ? 
BIT_ULL(VTD_STRIDE_SHIFT) : 888 - (struct dma_pte *)ALIGN((unsigned long)pte, VTD_PAGE_SIZE) - pte; 889 892 } 890 893 891 894 static inline bool context_present(struct context_entry *context) ··· 882 927 return agaw + 2; 883 928 } 884 929 885 - static inline int agaw_to_width(int agaw) 886 - { 887 - return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH); 888 - } 889 - 890 930 static inline int width_to_agaw(int width) 891 931 { 892 932 return DIV_ROUND_UP(width - 30, LEVEL_STRIDE); ··· 897 947 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK; 898 948 } 899 949 900 - static inline u64 level_mask(int level) 901 - { 902 - return -1ULL << level_to_offset_bits(level); 903 - } 904 - 905 - static inline u64 level_size(int level) 906 - { 907 - return 1ULL << level_to_offset_bits(level); 908 - } 909 - 910 - static inline u64 align_to_level(u64 pfn, int level) 911 - { 912 - return (pfn + level_size(level) - 1) & level_mask(level); 913 - } 914 - 915 - static inline unsigned long lvl_to_nr_pages(unsigned int lvl) 916 - { 917 - return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH); 918 - } 919 950 920 951 static inline void context_set_present(struct context_entry *context) 921 952 { ··· 1028 1097 struct qi_desc *desc) 1029 1098 { 1030 1099 u8 dw = 0, dr = 0; 1031 - int ih = 0; 1100 + int ih = addr & 1; 1032 1101 1033 1102 if (cap_write_drain(iommu->cap)) 1034 1103 dw = 1;
+1 -6
drivers/iommu/intel/nested.c
··· 19 19 #include "pasid.h" 20 20 21 21 static int intel_nested_attach_dev(struct iommu_domain *domain, 22 - struct device *dev) 22 + struct device *dev, struct iommu_domain *old) 23 23 { 24 24 struct device_domain_info *info = dev_iommu_priv_get(dev); 25 25 struct dmar_domain *dmar_domain = to_dmar_domain(domain); ··· 28 28 int ret = 0; 29 29 30 30 device_block_translation(dev); 31 - 32 - if (iommu->agaw < dmar_domain->s2_domain->agaw) { 33 - dev_err_ratelimited(dev, "Adjusted guest address width not compatible\n"); 34 - return -ENODEV; 35 - } 36 31 37 32 /* 38 33 * Stage-1 domain cannot work alone, it is nested on a s2_domain.
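Several hunks in this merge follow one iommu core API change: the `attach_dev` domain op now receives the previously attached domain as a third argument (visible here in `intel_nested_attach_dev`, and in `identity_domain_attach_dev` and `__iommu_attach_device` in the intel/iommu.c and iommu.c hunks). A driver-side sketch of the new callback shape, with purely illustrative names (not compilable outside a kernel tree):

```c
/* Illustrative only: a driver attach callback after the signature change.
 * 'old' is the domain previously attached to the device, or NULL when
 * there was none (e.g. deferred attach). */
static int my_attach_dev(struct iommu_domain *domain, struct device *dev,
			 struct iommu_domain *old)
{
	/* Drivers can validate or optimize the transition based on 'old'. */
	if (old == domain)
		return 0;

	/* ... program the hardware to translate through 'domain' ... */
	return 0;
}
```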
+20 -24
drivers/iommu/intel/pasid.c
··· 366 366 367 367 pasid_set_domain_id(pte, did); 368 368 pasid_set_address_width(pte, iommu->agaw); 369 - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); 369 + pasid_set_page_snoop(pte, flags & PASID_FLAG_PWSNP); 370 370 371 371 /* Setup Present and PASID Granular Transfer Type: */ 372 372 pasid_set_translation_type(pte, PASID_ENTRY_PGTT_FL_ONLY); ··· 461 461 */ 462 462 static void pasid_pte_config_second_level(struct intel_iommu *iommu, 463 463 struct pasid_entry *pte, 464 - u64 pgd_val, int agaw, u16 did, 465 - bool dirty_tracking) 464 + struct dmar_domain *domain, u16 did) 466 465 { 466 + struct pt_iommu_vtdss_hw_info pt_info; 467 + 467 468 lockdep_assert_held(&iommu->lock); 468 469 470 + pt_iommu_vtdss_hw_info(&domain->sspt, &pt_info); 469 471 pasid_clear_entry(pte); 470 472 pasid_set_domain_id(pte, did); 471 - pasid_set_slptr(pte, pgd_val); 472 - pasid_set_address_width(pte, agaw); 473 + pasid_set_slptr(pte, pt_info.ssptptr); 474 + pasid_set_address_width(pte, pt_info.aw); 473 475 pasid_set_translation_type(pte, PASID_ENTRY_PGTT_SL_ONLY); 474 476 pasid_set_fault_enable(pte); 475 - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); 476 - if (dirty_tracking) 477 + pasid_set_page_snoop(pte, !(domain->sspt.vtdss_pt.common.features & 478 + BIT(PT_FEAT_DMA_INCOHERENT))); 479 + if (domain->dirty_tracking) 477 480 pasid_set_ssade(pte); 478 481 479 482 pasid_set_present(pte); ··· 487 484 struct device *dev, u32 pasid) 488 485 { 489 486 struct pasid_entry *pte; 490 - struct dma_pte *pgd; 491 - u64 pgd_val; 492 487 u16 did; 488 + 493 489 494 490 /* 495 491 * If hardware advertises no support for second level ··· 500 498 return -EINVAL; 501 499 } 502 500 503 - pgd = domain->pgd; 504 - pgd_val = virt_to_phys(pgd); 505 501 did = domain_id_iommu(domain, iommu); 506 502 507 503 spin_lock(&iommu->lock); ··· 514 514 return -EBUSY; 515 515 } 516 516 517 - pasid_pte_config_second_level(iommu, pte, pgd_val, domain->agaw, 518 - did, domain->dirty_tracking); 517 + 
pasid_pte_config_second_level(iommu, pte, domain, did); 519 518 spin_unlock(&iommu->lock); 520 519 521 520 pasid_flush_caches(iommu, pte, pasid, did); ··· 528 529 u32 pasid) 529 530 { 530 531 struct pasid_entry *pte, new_pte; 531 - struct dma_pte *pgd; 532 - u64 pgd_val; 533 532 u16 did; 534 533 535 534 /* ··· 540 543 return -EINVAL; 541 544 } 542 545 543 - pgd = domain->pgd; 544 - pgd_val = virt_to_phys(pgd); 545 546 did = domain_id_iommu(domain, iommu); 546 547 547 - pasid_pte_config_second_level(iommu, &new_pte, pgd_val, 548 - domain->agaw, did, 549 - domain->dirty_tracking); 548 + pasid_pte_config_second_level(iommu, &new_pte, domain, did); 550 549 551 550 spin_lock(&iommu->lock); 552 551 pte = intel_pasid_get_entry(dev, pasid); ··· 740 747 struct dmar_domain *s2_domain, 741 748 u16 did) 742 749 { 743 - struct dma_pte *pgd = s2_domain->pgd; 750 + struct pt_iommu_vtdss_hw_info pt_info; 744 751 745 752 lockdep_assert_held(&iommu->lock); 753 + 754 + pt_iommu_vtdss_hw_info(&s2_domain->sspt, &pt_info); 746 755 747 756 pasid_clear_entry(pte); 748 757 ··· 765 770 if (s2_domain->force_snooping) 766 771 pasid_set_pgsnp(pte); 767 772 768 - pasid_set_slptr(pte, virt_to_phys(pgd)); 773 + pasid_set_slptr(pte, pt_info.ssptptr); 769 774 pasid_set_fault_enable(pte); 770 775 pasid_set_domain_id(pte, did); 771 - pasid_set_address_width(pte, s2_domain->agaw); 772 - pasid_set_page_snoop(pte, !!ecap_smpwc(iommu->ecap)); 776 + pasid_set_address_width(pte, pt_info.aw); 777 + pasid_set_page_snoop(pte, !(s2_domain->sspt.vtdss_pt.common.features & 778 + BIT(PT_FEAT_DMA_INCOHERENT))); 773 779 if (s2_domain->dirty_tracking) 774 780 pasid_set_ssade(pte); 775 781 pasid_set_translation_type(pte, PASID_ENTRY_PGTT_NESTED);
+1
drivers/iommu/intel/pasid.h
··· 24 24 25 25 #define PASID_FLAG_NESTED BIT(1) 26 26 #define PASID_FLAG_PAGE_SNOOP BIT(2) 27 + #define PASID_FLAG_PWSNP BIT(2) 27 28 28 29 /* 29 30 * The PASID_FLAG_FL5LP flag Indicates using 5-level paging for first-
+1
drivers/iommu/intel/svm.c
··· 170 170 171 171 /* Setup the pasid table: */ 172 172 sflags = cpu_feature_enabled(X86_FEATURE_LA57) ? PASID_FLAG_FL5LP : 0; 173 + sflags |= PASID_FLAG_PWSNP; 173 174 ret = __domain_setup_first_level(iommu, dev, pasid, 174 175 FLPT_DEFAULT_DID, __pa(mm->pgd), 175 176 sflags, old);
+214
drivers/iommu/io-pgtable-arm-selftests.c
··· 1 + // SPDX-License-Identifier: GPL-2.0-only 2 + /* 3 + * CPU-agnostic ARM page table allocator. 4 + * 5 + * Copyright (C) 2014 ARM Limited 6 + * 7 + * Author: Will Deacon <will.deacon@arm.com> 8 + */ 9 + 10 + #define pr_fmt(fmt) "arm-lpae io-pgtable: " fmt 11 + 12 + #include <kunit/device.h> 13 + #include <kunit/test.h> 14 + #include <linux/io-pgtable.h> 15 + #include <linux/kernel.h> 16 + 17 + #include "io-pgtable-arm.h" 18 + 19 + static struct io_pgtable_cfg *cfg_cookie; 20 + 21 + static void dummy_tlb_flush_all(void *cookie) 22 + { 23 + WARN_ON(cookie != cfg_cookie); 24 + } 25 + 26 + static void dummy_tlb_flush(unsigned long iova, size_t size, 27 + size_t granule, void *cookie) 28 + { 29 + WARN_ON(cookie != cfg_cookie); 30 + WARN_ON(!(size & cfg_cookie->pgsize_bitmap)); 31 + } 32 + 33 + static void dummy_tlb_add_page(struct iommu_iotlb_gather *gather, 34 + unsigned long iova, size_t granule, 35 + void *cookie) 36 + { 37 + dummy_tlb_flush(iova, granule, granule, cookie); 38 + } 39 + 40 + static const struct iommu_flush_ops dummy_tlb_ops = { 41 + .tlb_flush_all = dummy_tlb_flush_all, 42 + .tlb_flush_walk = dummy_tlb_flush, 43 + .tlb_add_page = dummy_tlb_add_page, 44 + }; 45 + 46 + #define __FAIL(test, i) ({ \ 47 + KUNIT_FAIL(test, "test failed for fmt idx %d\n", (i)); \ 48 + -EFAULT; \ 49 + }) 50 + 51 + static int arm_lpae_run_tests(struct kunit *test, struct io_pgtable_cfg *cfg) 52 + { 53 + static const enum io_pgtable_fmt fmts[] = { 54 + ARM_64_LPAE_S1, 55 + ARM_64_LPAE_S2, 56 + }; 57 + 58 + int i, j; 59 + unsigned long iova; 60 + size_t size, mapped; 61 + struct io_pgtable_ops *ops; 62 + 63 + for (i = 0; i < ARRAY_SIZE(fmts); ++i) { 64 + cfg_cookie = cfg; 65 + ops = alloc_io_pgtable_ops(fmts[i], cfg, cfg); 66 + if (!ops) { 67 + kunit_err(test, "failed to allocate io pgtable ops\n"); 68 + return -ENOMEM; 69 + } 70 + 71 + /* 72 + * Initial sanity checks. 73 + * Empty page tables shouldn't provide any translations. 
74 + */ 75 + if (ops->iova_to_phys(ops, 42)) 76 + return __FAIL(test, i); 77 + 78 + if (ops->iova_to_phys(ops, SZ_1G + 42)) 79 + return __FAIL(test, i); 80 + 81 + if (ops->iova_to_phys(ops, SZ_2G + 42)) 82 + return __FAIL(test, i); 83 + 84 + /* 85 + * Distinct mappings of different granule sizes. 86 + */ 87 + iova = 0; 88 + for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) { 89 + size = 1UL << j; 90 + 91 + if (ops->map_pages(ops, iova, iova, size, 1, 92 + IOMMU_READ | IOMMU_WRITE | 93 + IOMMU_NOEXEC | IOMMU_CACHE, 94 + GFP_KERNEL, &mapped)) 95 + return __FAIL(test, i); 96 + 97 + /* Overlapping mappings */ 98 + if (!ops->map_pages(ops, iova, iova + size, size, 1, 99 + IOMMU_READ | IOMMU_NOEXEC, 100 + GFP_KERNEL, &mapped)) 101 + return __FAIL(test, i); 102 + 103 + if (ops->iova_to_phys(ops, iova + 42) != (iova + 42)) 104 + return __FAIL(test, i); 105 + 106 + iova += SZ_1G; 107 + } 108 + 109 + /* Full unmap */ 110 + iova = 0; 111 + for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) { 112 + size = 1UL << j; 113 + 114 + if (ops->unmap_pages(ops, iova, size, 1, NULL) != size) 115 + return __FAIL(test, i); 116 + 117 + if (ops->iova_to_phys(ops, iova + 42)) 118 + return __FAIL(test, i); 119 + 120 + /* Remap full block */ 121 + if (ops->map_pages(ops, iova, iova, size, 1, 122 + IOMMU_WRITE, GFP_KERNEL, &mapped)) 123 + return __FAIL(test, i); 124 + 125 + if (ops->iova_to_phys(ops, iova + 42) != (iova + 42)) 126 + return __FAIL(test, i); 127 + 128 + iova += SZ_1G; 129 + } 130 + 131 + /* 132 + * Map/unmap the last largest supported page of the IAS, this can 133 + * trigger corner cases in the concatenated page tables. 
134 + */ 135 + mapped = 0; 136 + size = 1UL << __fls(cfg->pgsize_bitmap); 137 + iova = (1UL << cfg->ias) - size; 138 + if (ops->map_pages(ops, iova, iova, size, 1, 139 + IOMMU_READ | IOMMU_WRITE | 140 + IOMMU_NOEXEC | IOMMU_CACHE, 141 + GFP_KERNEL, &mapped)) 142 + return __FAIL(test, i); 143 + if (mapped != size) 144 + return __FAIL(test, i); 145 + if (ops->unmap_pages(ops, iova, size, 1, NULL) != size) 146 + return __FAIL(test, i); 147 + 148 + free_io_pgtable_ops(ops); 149 + } 150 + 151 + return 0; 152 + } 153 + 154 + static void arm_lpae_do_selftests(struct kunit *test) 155 + { 156 + static const unsigned long pgsize[] = { 157 + SZ_4K | SZ_2M | SZ_1G, 158 + SZ_16K | SZ_32M, 159 + SZ_64K | SZ_512M, 160 + }; 161 + 162 + static const unsigned int address_size[] = { 163 + 32, 36, 40, 42, 44, 48, 164 + }; 165 + 166 + int i, j, k, pass = 0, fail = 0; 167 + struct device *dev; 168 + struct io_pgtable_cfg cfg = { 169 + .tlb = &dummy_tlb_ops, 170 + .coherent_walk = true, 171 + .quirks = IO_PGTABLE_QUIRK_NO_WARN, 172 + }; 173 + 174 + dev = kunit_device_register(test, "io-pgtable-test"); 175 + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, dev); 176 + if (IS_ERR_OR_NULL(dev)) 177 + return; 178 + 179 + cfg.iommu_dev = dev; 180 + 181 + for (i = 0; i < ARRAY_SIZE(pgsize); ++i) { 182 + for (j = 0; j < ARRAY_SIZE(address_size); ++j) { 183 + /* Don't use ias > oas as it is not valid for stage-2. 
*/ 184 + for (k = 0; k <= j; ++k) { 185 + cfg.pgsize_bitmap = pgsize[i]; 186 + cfg.ias = address_size[k]; 187 + cfg.oas = address_size[j]; 188 + kunit_info(test, "pgsize_bitmap 0x%08lx, IAS %u OAS %u\n", 189 + pgsize[i], cfg.ias, cfg.oas); 190 + if (arm_lpae_run_tests(test, &cfg)) 191 + fail++; 192 + else 193 + pass++; 194 + } 195 + } 196 + } 197 + 198 + kunit_info(test, "completed with %d PASS %d FAIL\n", pass, fail); 199 + } 200 + 201 + static struct kunit_case io_pgtable_arm_test_cases[] = { 202 + KUNIT_CASE(arm_lpae_do_selftests), 203 + {}, 204 + }; 205 + 206 + static struct kunit_suite io_pgtable_arm_test = { 207 + .name = "io-pgtable-arm-test", 208 + .test_cases = io_pgtable_arm_test_cases, 209 + }; 210 + 211 + kunit_test_suite(io_pgtable_arm_test); 212 + 213 + MODULE_DESCRIPTION("io-pgtable-arm library kunit tests"); 214 + MODULE_LICENSE("GPL");
-203
drivers/iommu/io-pgtable-arm.c
··· 12 12 #include <linux/atomic.h> 13 13 #include <linux/bitops.h> 14 14 #include <linux/io-pgtable.h> 15 - #include <linux/kernel.h> 16 - #include <linux/device/faux.h> 17 15 #include <linux/sizes.h> 18 16 #include <linux/slab.h> 19 17 #include <linux/types.h> ··· 1265 1267 .alloc = arm_mali_lpae_alloc_pgtable, 1266 1268 .free = arm_lpae_free_pgtable, 1267 1269 }; 1268 - 1269 - #ifdef CONFIG_IOMMU_IO_PGTABLE_LPAE_SELFTEST 1270 - 1271 - static struct io_pgtable_cfg *cfg_cookie __initdata; 1272 - 1273 - static void __init dummy_tlb_flush_all(void *cookie) 1274 - { 1275 - WARN_ON(cookie != cfg_cookie); 1276 - } 1277 - 1278 - static void __init dummy_tlb_flush(unsigned long iova, size_t size, 1279 - size_t granule, void *cookie) 1280 - { 1281 - WARN_ON(cookie != cfg_cookie); 1282 - WARN_ON(!(size & cfg_cookie->pgsize_bitmap)); 1283 - } 1284 - 1285 - static void __init dummy_tlb_add_page(struct iommu_iotlb_gather *gather, 1286 - unsigned long iova, size_t granule, 1287 - void *cookie) 1288 - { 1289 - dummy_tlb_flush(iova, granule, granule, cookie); 1290 - } 1291 - 1292 - static const struct iommu_flush_ops dummy_tlb_ops __initconst = { 1293 - .tlb_flush_all = dummy_tlb_flush_all, 1294 - .tlb_flush_walk = dummy_tlb_flush, 1295 - .tlb_add_page = dummy_tlb_add_page, 1296 - }; 1297 - 1298 - static void __init arm_lpae_dump_ops(struct io_pgtable_ops *ops) 1299 - { 1300 - struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); 1301 - struct io_pgtable_cfg *cfg = &data->iop.cfg; 1302 - 1303 - pr_err("cfg: pgsize_bitmap 0x%lx, ias %u-bit\n", 1304 - cfg->pgsize_bitmap, cfg->ias); 1305 - pr_err("data: %d levels, 0x%zx pgd_size, %u pg_shift, %u bits_per_level, pgd @ %p\n", 1306 - ARM_LPAE_MAX_LEVELS - data->start_level, ARM_LPAE_PGD_SIZE(data), 1307 - ilog2(ARM_LPAE_GRANULE(data)), data->bits_per_level, data->pgd); 1308 - } 1309 - 1310 - #define __FAIL(ops, i) ({ \ 1311 - WARN(1, "selftest: test failed for fmt idx %d\n", (i)); \ 1312 - arm_lpae_dump_ops(ops); \ 1313 - 
-EFAULT; \ 1314 - }) 1315 - 1316 - static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg) 1317 - { 1318 - static const enum io_pgtable_fmt fmts[] __initconst = { 1319 - ARM_64_LPAE_S1, 1320 - ARM_64_LPAE_S2, 1321 - }; 1322 - 1323 - int i, j; 1324 - unsigned long iova; 1325 - size_t size, mapped; 1326 - struct io_pgtable_ops *ops; 1327 - 1328 - for (i = 0; i < ARRAY_SIZE(fmts); ++i) { 1329 - cfg_cookie = cfg; 1330 - ops = alloc_io_pgtable_ops(fmts[i], cfg, cfg); 1331 - if (!ops) { 1332 - pr_err("selftest: failed to allocate io pgtable ops\n"); 1333 - return -ENOMEM; 1334 - } 1335 - 1336 - /* 1337 - * Initial sanity checks. 1338 - * Empty page tables shouldn't provide any translations. 1339 - */ 1340 - if (ops->iova_to_phys(ops, 42)) 1341 - return __FAIL(ops, i); 1342 - 1343 - if (ops->iova_to_phys(ops, SZ_1G + 42)) 1344 - return __FAIL(ops, i); 1345 - 1346 - if (ops->iova_to_phys(ops, SZ_2G + 42)) 1347 - return __FAIL(ops, i); 1348 - 1349 - /* 1350 - * Distinct mappings of different granule sizes. 
1351 - */ 1352 - iova = 0; 1353 - for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) { 1354 - size = 1UL << j; 1355 - 1356 - if (ops->map_pages(ops, iova, iova, size, 1, 1357 - IOMMU_READ | IOMMU_WRITE | 1358 - IOMMU_NOEXEC | IOMMU_CACHE, 1359 - GFP_KERNEL, &mapped)) 1360 - return __FAIL(ops, i); 1361 - 1362 - /* Overlapping mappings */ 1363 - if (!ops->map_pages(ops, iova, iova + size, size, 1, 1364 - IOMMU_READ | IOMMU_NOEXEC, 1365 - GFP_KERNEL, &mapped)) 1366 - return __FAIL(ops, i); 1367 - 1368 - if (ops->iova_to_phys(ops, iova + 42) != (iova + 42)) 1369 - return __FAIL(ops, i); 1370 - 1371 - iova += SZ_1G; 1372 - } 1373 - 1374 - /* Full unmap */ 1375 - iova = 0; 1376 - for_each_set_bit(j, &cfg->pgsize_bitmap, BITS_PER_LONG) { 1377 - size = 1UL << j; 1378 - 1379 - if (ops->unmap_pages(ops, iova, size, 1, NULL) != size) 1380 - return __FAIL(ops, i); 1381 - 1382 - if (ops->iova_to_phys(ops, iova + 42)) 1383 - return __FAIL(ops, i); 1384 - 1385 - /* Remap full block */ 1386 - if (ops->map_pages(ops, iova, iova, size, 1, 1387 - IOMMU_WRITE, GFP_KERNEL, &mapped)) 1388 - return __FAIL(ops, i); 1389 - 1390 - if (ops->iova_to_phys(ops, iova + 42) != (iova + 42)) 1391 - return __FAIL(ops, i); 1392 - 1393 - iova += SZ_1G; 1394 - } 1395 - 1396 - /* 1397 - * Map/unmap the last largest supported page of the IAS, this can 1398 - * trigger corner cases in the concatednated page tables. 
1399 - */ 1400 - mapped = 0; 1401 - size = 1UL << __fls(cfg->pgsize_bitmap); 1402 - iova = (1UL << cfg->ias) - size; 1403 - if (ops->map_pages(ops, iova, iova, size, 1, 1404 - IOMMU_READ | IOMMU_WRITE | 1405 - IOMMU_NOEXEC | IOMMU_CACHE, 1406 - GFP_KERNEL, &mapped)) 1407 - return __FAIL(ops, i); 1408 - if (mapped != size) 1409 - return __FAIL(ops, i); 1410 - if (ops->unmap_pages(ops, iova, size, 1, NULL) != size) 1411 - return __FAIL(ops, i); 1412 - 1413 - free_io_pgtable_ops(ops); 1414 - } 1415 - 1416 - return 0; 1417 - } 1418 - 1419 - static int __init arm_lpae_do_selftests(void) 1420 - { 1421 - static const unsigned long pgsize[] __initconst = { 1422 - SZ_4K | SZ_2M | SZ_1G, 1423 - SZ_16K | SZ_32M, 1424 - SZ_64K | SZ_512M, 1425 - }; 1426 - 1427 - static const unsigned int address_size[] __initconst = { 1428 - 32, 36, 40, 42, 44, 48, 1429 - }; 1430 - 1431 - int i, j, k, pass = 0, fail = 0; 1432 - struct faux_device *dev; 1433 - struct io_pgtable_cfg cfg = { 1434 - .tlb = &dummy_tlb_ops, 1435 - .coherent_walk = true, 1436 - .quirks = IO_PGTABLE_QUIRK_NO_WARN, 1437 - }; 1438 - 1439 - dev = faux_device_create("io-pgtable-test", NULL, 0); 1440 - if (!dev) 1441 - return -ENOMEM; 1442 - 1443 - cfg.iommu_dev = &dev->dev; 1444 - 1445 - for (i = 0; i < ARRAY_SIZE(pgsize); ++i) { 1446 - for (j = 0; j < ARRAY_SIZE(address_size); ++j) { 1447 - /* Don't use ias > oas as it is not valid for stage-2. */ 1448 - for (k = 0; k <= j; ++k) { 1449 - cfg.pgsize_bitmap = pgsize[i]; 1450 - cfg.ias = address_size[k]; 1451 - cfg.oas = address_size[j]; 1452 - pr_info("selftest: pgsize_bitmap 0x%08lx, IAS %u OAS %u\n", 1453 - pgsize[i], cfg.ias, cfg.oas); 1454 - if (arm_lpae_run_tests(&cfg)) 1455 - fail++; 1456 - else 1457 - pass++; 1458 - } 1459 - } 1460 - } 1461 - 1462 - pr_info("selftest: completed with %d PASS %d FAIL\n", pass, fail); 1463 - faux_device_destroy(dev); 1464 - 1465 - return fail ? -EFAULT : 0; 1466 - } 1467 - subsys_initcall(arm_lpae_do_selftests); 1468 - #endif
-4
drivers/iommu/io-pgtable.c
··· 28 28 #ifdef CONFIG_IOMMU_IO_PGTABLE_ARMV7S 29 29 [ARM_V7S] = &io_pgtable_arm_v7s_init_fns, 30 30 #endif 31 - #ifdef CONFIG_AMD_IOMMU 32 - [AMD_IOMMU_V1] = &io_pgtable_amd_iommu_v1_init_fns, 33 - [AMD_IOMMU_V2] = &io_pgtable_amd_iommu_v2_init_fns, 34 - #endif 35 31 }; 36 32 37 33 static int check_custom_allocator(enum io_pgtable_fmt fmt,
+135 -1
drivers/iommu/iommu-pages.c
··· 4 4 * Pasha Tatashin <pasha.tatashin@soleen.com> 5 5 */ 6 6 #include "iommu-pages.h" 7 + #include <linux/dma-mapping.h> 7 8 #include <linux/gfp.h> 8 9 #include <linux/mm.h> 9 10 ··· 23 22 #undef IOPTDESC_MATCH 24 23 static_assert(sizeof(struct ioptdesc) <= sizeof(struct page)); 25 24 25 + static inline size_t ioptdesc_mem_size(struct ioptdesc *desc) 26 + { 27 + return 1UL << (folio_order(ioptdesc_folio(desc)) + PAGE_SHIFT); 28 + } 29 + 26 30 /** 27 31 * iommu_alloc_pages_node_sz - Allocate a zeroed page of a given size from 28 32 * specific NUMA node ··· 42 36 */ 43 37 void *iommu_alloc_pages_node_sz(int nid, gfp_t gfp, size_t size) 44 38 { 39 + struct ioptdesc *iopt; 45 40 unsigned long pgcnt; 46 41 struct folio *folio; 47 42 unsigned int order; ··· 67 60 if (unlikely(!folio)) 68 61 return NULL; 69 62 63 + iopt = folio_ioptdesc(folio); 64 + iopt->incoherent = false; 65 + 70 66 /* 71 67 * All page allocations that should be reported to as "iommu-pagetables" 72 68 * to userspace must use one of the functions below. This includes ··· 90 80 static void __iommu_free_desc(struct ioptdesc *iopt) 91 81 { 92 82 struct folio *folio = ioptdesc_folio(iopt); 93 - const unsigned long pgcnt = 1UL << folio_order(folio); 83 + const unsigned long pgcnt = folio_nr_pages(folio); 84 + 85 + if (IOMMU_PAGES_USE_DMA_API) 86 + WARN_ON_ONCE(iopt->incoherent); 94 87 95 88 mod_node_page_state(folio_pgdat(folio), NR_IOMMU_PAGES, -pgcnt); 96 89 lruvec_stat_mod_folio(folio, NR_SECONDARY_PAGETABLE, -pgcnt); ··· 130 117 __iommu_free_desc(iopt); 131 118 } 132 119 EXPORT_SYMBOL_GPL(iommu_put_pages_list); 120 + 121 + /** 122 + * iommu_pages_start_incoherent - Setup the page for cache incoherent operation 123 + * @virt: The page to setup 124 + * @dma_dev: The iommu device 125 + * 126 + * For incoherent memory this will use the DMA API to manage the cache flushing 127 + * on some arches. 
This is a lot of complexity compared to just calling 128 + * arch_sync_dma_for_device(), but it is what the existing ARM iommu drivers 129 + * have been doing. The DMA API requires keeping track of the DMA map and 130 + * freeing it when required. This keeps track of the dma map inside the ioptdesc 131 + * so that error paths are simple for the caller. 132 + */ 133 + int iommu_pages_start_incoherent(void *virt, struct device *dma_dev) 134 + { 135 + struct ioptdesc *iopt = virt_to_ioptdesc(virt); 136 + dma_addr_t dma; 137 + 138 + if (WARN_ON(iopt->incoherent)) 139 + return -EINVAL; 140 + 141 + if (!IOMMU_PAGES_USE_DMA_API) { 142 + iommu_pages_flush_incoherent(dma_dev, virt, 0, 143 + ioptdesc_mem_size(iopt)); 144 + } else { 145 + dma = dma_map_single(dma_dev, virt, ioptdesc_mem_size(iopt), 146 + DMA_TO_DEVICE); 147 + if (dma_mapping_error(dma_dev, dma)) 148 + return -EINVAL; 149 + 150 + /* 151 + * The DMA API is not allowed to do anything other than DMA 152 + * direct. It would be nice to also check 153 + * dev_is_dma_coherent(dma_dev)); 154 + */ 155 + if (WARN_ON(dma != virt_to_phys(virt))) { 156 + dma_unmap_single(dma_dev, dma, ioptdesc_mem_size(iopt), 157 + DMA_TO_DEVICE); 158 + return -EOPNOTSUPP; 159 + } 160 + } 161 + 162 + iopt->incoherent = 1; 163 + return 0; 164 + } 165 + EXPORT_SYMBOL_GPL(iommu_pages_start_incoherent); 166 + 167 + /** 168 + * iommu_pages_start_incoherent_list - Make a list of pages incoherent 169 + * @list: The list of pages to setup 170 + * @dma_dev: The iommu device 171 + * 172 + * Perform iommu_pages_start_incoherent() across all of list. 173 + * 174 + * If this fails the caller must call iommu_pages_stop_incoherent_list(). 
175 + */ 176 + int iommu_pages_start_incoherent_list(struct iommu_pages_list *list, 177 + struct device *dma_dev) 178 + { 179 + struct ioptdesc *cur; 180 + int ret; 181 + 182 + list_for_each_entry(cur, &list->pages, iopt_freelist_elm) { 183 + if (WARN_ON(cur->incoherent)) 184 + continue; 185 + 186 + ret = iommu_pages_start_incoherent( 187 + folio_address(ioptdesc_folio(cur)), dma_dev); 188 + if (ret) 189 + return ret; 190 + } 191 + return 0; 192 + } 193 + EXPORT_SYMBOL_GPL(iommu_pages_start_incoherent_list); 194 + 195 + /** 196 + * iommu_pages_stop_incoherent_list - Undo incoherence across a list 197 + * @list: The list of pages to release 198 + * @dma_dev: The iommu device 199 + * 200 + * Revert iommu_pages_start_incoherent() across all of the list. Pages on which 201 + * iommu_pages_start_incoherent() was not called, or did not succeed, are ignored. 202 + */ 203 + #if IOMMU_PAGES_USE_DMA_API 204 + void iommu_pages_stop_incoherent_list(struct iommu_pages_list *list, 205 + struct device *dma_dev) 206 + { 207 + struct ioptdesc *cur; 208 + 209 + list_for_each_entry(cur, &list->pages, iopt_freelist_elm) { 210 + struct folio *folio = ioptdesc_folio(cur); 211 + 212 + if (!cur->incoherent) 213 + continue; 214 + dma_unmap_single(dma_dev, virt_to_phys(folio_address(folio)), 215 + ioptdesc_mem_size(cur), DMA_TO_DEVICE); 216 + cur->incoherent = 0; 217 + } 218 + } 219 + EXPORT_SYMBOL_GPL(iommu_pages_stop_incoherent_list); 220 + 221 + /** 222 + * iommu_pages_free_incoherent - Free an incoherent page 223 + * @virt: virtual address of the page to be freed. 224 + * @dma_dev: The iommu device 225 + * 226 + * If the page is incoherent it is made coherent again, then freed. 
227 + */ 228 + void iommu_pages_free_incoherent(void *virt, struct device *dma_dev) 229 + { 230 + struct ioptdesc *iopt = virt_to_ioptdesc(virt); 231 + 232 + if (iopt->incoherent) { 233 + dma_unmap_single(dma_dev, virt_to_phys(virt), 234 + ioptdesc_mem_size(iopt), DMA_TO_DEVICE); 235 + iopt->incoherent = 0; 236 + } 237 + __iommu_free_desc(iopt); 238 + } 239 + EXPORT_SYMBOL_GPL(iommu_pages_free_incoherent); 240 + #endif
+49 -2
drivers/iommu/iommu-pages.h
··· 21 21 22 22 struct list_head iopt_freelist_elm; 23 23 unsigned long __page_mapping; 24 - pgoff_t __index; 24 + union { 25 + u8 incoherent; 26 + pgoff_t __index; 27 + }; 25 28 void *_private; 26 29 27 30 unsigned int __page_type; ··· 101 98 return iommu_alloc_pages_node_sz(NUMA_NO_NODE, gfp, size); 102 99 } 103 100 104 - #endif /* __IOMMU_PAGES_H */ 101 + int iommu_pages_start_incoherent(void *virt, struct device *dma_dev); 102 + int iommu_pages_start_incoherent_list(struct iommu_pages_list *list, 103 + struct device *dma_dev); 104 + 105 + #ifdef CONFIG_X86 106 + #define IOMMU_PAGES_USE_DMA_API 0 107 + #include <linux/cacheflush.h> 108 + 109 + static inline void iommu_pages_flush_incoherent(struct device *dma_dev, 110 + void *virt, size_t offset, 111 + size_t len) 112 + { 113 + clflush_cache_range(virt + offset, len); 114 + } 115 + static inline void 116 + iommu_pages_stop_incoherent_list(struct iommu_pages_list *list, 117 + struct device *dma_dev) 118 + { 119 + /* 120 + * For performance leave the incoherent flag alone which turns this into 121 + * a NOP. For X86 the rest of the stop/free flow ignores the flag. 122 + */ 123 + } 124 + static inline void iommu_pages_free_incoherent(void *virt, 125 + struct device *dma_dev) 126 + { 127 + iommu_free_pages(virt); 128 + } 129 + #else 130 + #define IOMMU_PAGES_USE_DMA_API 1 131 + #include <linux/dma-mapping.h> 132 + 133 + static inline void iommu_pages_flush_incoherent(struct device *dma_dev, 134 + void *virt, size_t offset, 135 + size_t len) 136 + { 137 + dma_sync_single_for_device(dma_dev, (uintptr_t)virt + offset, len, 138 + DMA_TO_DEVICE); 139 + } 140 + void iommu_pages_stop_incoherent_list(struct iommu_pages_list *list, 141 + struct device *dma_dev); 142 + void iommu_pages_free_incoherent(void *virt, struct device *dma_dev); 143 + #endif 144 + 145 + #endif /* __IOMMU_PAGES_H */
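Together, the iommu-pages.c and iommu-pages.h hunks above give page-table code a single pattern for non-coherent table walks: flag the page at allocation time via iommu_pages_start_incoherent() (a cache flush on x86, a DMA map elsewhere) and undo it at free time. A hypothetical caller might look like the sketch below, using only the functions added in this merge (illustrative, not compilable on its own):

```c
/* Sketch: allocate a page-table page and, when the IOMMU's table walker
 * is not cache-coherent, register it with the incoherent helpers. */
static void *alloc_table(struct device *dma_dev, bool coherent_walk)
{
	void *tbl = iommu_alloc_pages_node_sz(NUMA_NO_NODE, GFP_KERNEL, SZ_4K);

	if (!tbl)
		return NULL;

	/* On failure the page was never marked incoherent, so plain free. */
	if (!coherent_walk && iommu_pages_start_incoherent(tbl, dma_dev)) {
		iommu_free_pages(tbl);
		return NULL;
	}
	return tbl;
}

/* On teardown, this DMA-unmaps the page if needed (a no-op on x86,
 * where only clflush was used) and then frees it. */
static void free_table(struct device *dma_dev, void *tbl)
{
	iommu_pages_free_incoherent(tbl, dma_dev);
}
```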
+31 -13
drivers/iommu/iommu.c
··· 100 100 unsigned long action, void *data); 101 101 static void iommu_release_device(struct device *dev); 102 102 static int __iommu_attach_device(struct iommu_domain *domain, 103 - struct device *dev); 103 + struct device *dev, struct iommu_domain *old); 104 104 static int __iommu_attach_group(struct iommu_domain *domain, 105 105 struct iommu_group *group); 106 106 static struct iommu_domain *__iommu_paging_domain_alloc_flags(struct device *dev, ··· 114 114 static int __iommu_device_set_domain(struct iommu_group *group, 115 115 struct device *dev, 116 116 struct iommu_domain *new_domain, 117 + struct iommu_domain *old_domain, 117 118 unsigned int flags); 118 119 static int __iommu_group_set_domain_internal(struct iommu_group *group, 119 120 struct iommu_domain *new_domain, ··· 543 542 * Regardless, if a delayed attach never occurred, then the release 544 543 * should still avoid touching any hardware configuration either. 545 544 */ 546 - if (!dev->iommu->attach_deferred && ops->release_domain) 547 - ops->release_domain->ops->attach_dev(ops->release_domain, dev); 545 + if (!dev->iommu->attach_deferred && ops->release_domain) { 546 + struct iommu_domain *release_domain = ops->release_domain; 547 + 548 + /* 549 + * If the device requires direct mappings then it should not 550 + * be parked on a BLOCKED domain during release as that would 551 + * break the direct mappings. 
552 + */ 553 + if (dev->iommu->require_direct && ops->identity_domain && 554 + release_domain == ops->blocked_domain) 555 + release_domain = ops->identity_domain; 556 + 557 + release_domain->ops->attach_dev(release_domain, dev, 558 + group->domain); 559 + } 548 560 549 561 if (ops->release_device) 550 562 ops->release_device(dev); ··· 642 628 if (group->default_domain) 643 629 iommu_create_device_direct_mappings(group->default_domain, dev); 644 630 if (group->domain) { 645 - ret = __iommu_device_set_domain(group, dev, group->domain, 0); 631 + ret = __iommu_device_set_domain(group, dev, group->domain, NULL, 632 + 0); 646 633 if (ret) 647 634 goto err_remove_gdev; 648 635 } else if (!group->default_domain && !group_list) { ··· 2130 2115 } 2131 2116 2132 2117 static int __iommu_attach_device(struct iommu_domain *domain, 2133 - struct device *dev) 2118 + struct device *dev, struct iommu_domain *old) 2134 2119 { 2135 2120 int ret; 2136 2121 2137 2122 if (unlikely(domain->ops->attach_dev == NULL)) 2138 2123 return -ENODEV; 2139 2124 2140 - ret = domain->ops->attach_dev(domain, dev); 2125 + ret = domain->ops->attach_dev(domain, dev, old); 2141 2126 if (ret) 2142 2127 return ret; 2143 2128 dev->iommu->attach_deferred = 0; ··· 2186 2171 int iommu_deferred_attach(struct device *dev, struct iommu_domain *domain) 2187 2172 { 2188 2173 if (dev->iommu && dev->iommu->attach_deferred) 2189 - return __iommu_attach_device(domain, dev); 2174 + return __iommu_attach_device(domain, dev, NULL); 2190 2175 2191 2176 return 0; 2192 2177 } ··· 2299 2284 static int __iommu_device_set_domain(struct iommu_group *group, 2300 2285 struct device *dev, 2301 2286 struct iommu_domain *new_domain, 2287 + struct iommu_domain *old_domain, 2302 2288 unsigned int flags) 2303 2289 { 2304 2290 int ret; ··· 2325 2309 dev->iommu->attach_deferred = 0; 2326 2310 } 2327 2311 2328 - ret = __iommu_attach_device(new_domain, dev); 2312 + ret = __iommu_attach_device(new_domain, dev, old_domain); 2329 2313 if (ret) { 
2330 2314 /* 2331 2315 * If we have a blocking domain then try to attach that in hopes ··· 2335 2319 if ((flags & IOMMU_SET_DOMAIN_MUST_SUCCEED) && 2336 2320 group->blocking_domain && 2337 2321 group->blocking_domain != new_domain) 2338 - __iommu_attach_device(group->blocking_domain, dev); 2322 + __iommu_attach_device(group->blocking_domain, dev, 2323 + old_domain); 2339 2324 return ret; 2340 2325 } 2341 2326 return 0; ··· 2383 2366 result = 0; 2384 2367 for_each_group_device(group, gdev) { 2385 2368 ret = __iommu_device_set_domain(group, gdev->dev, new_domain, 2386 - flags); 2369 + group->domain, flags); 2387 2370 if (ret) { 2388 2371 result = ret; 2389 2372 /* ··· 2408 2391 */ 2409 2392 last_gdev = gdev; 2410 2393 for_each_group_device(group, gdev) { 2394 + /* No need to revert the last gdev that failed to set domain */ 2395 + if (gdev == last_gdev) 2396 + break; 2411 2397 /* 2412 2398 * A NULL domain can happen only for first probe, in which case 2413 2399 * we leave group->domain as NULL and let release clean ··· 2418 2398 */ 2419 2399 if (group->domain) 2420 2400 WARN_ON(__iommu_device_set_domain( 2421 - group, gdev->dev, group->domain, 2401 + group, gdev->dev, group->domain, new_domain, 2422 2402 IOMMU_SET_DOMAIN_MUST_SUCCEED)); 2423 - if (gdev == last_gdev) 2424 - break; 2425 2403 } 2426 2404 return ret; 2427 2405 }
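The core change running through this iommu.c hunk and the driver hunks below is that `attach_dev()` now receives the outgoing domain as a third argument, so drivers no longer call `iommu_get_domain_for_dev()` to discover what to tear down. A minimal userspace model of the new driver-side pattern (names are illustrative, not the kernel's):

```c
#include <assert.h>
#include <stddef.h>

/*
 * Userspace model of the new attach_dev(domain, dev, old) flow:
 * an identity/blocked attach can test 'old' directly instead of
 * looking the current domain up. Illustrative only.
 */
struct model_domain {
	int is_identity;
	int detached;	/* counts context/utlb teardowns in the model */
};

/*
 * Model of a driver's identity_attach(): tear down 'old' unless it
 * is already the identity domain or there was no old domain at all,
 * mirroring the "if (old == identity_domain || !old) return 0;"
 * pattern in the msm/ipmmu hunks.
 */
static int model_identity_attach(struct model_domain *identity,
				 struct model_domain *old)
{
	if (old == identity || !old)
		return 0;	/* nothing to undo */
	old->detached++;	/* kernel: disable utlbs/context banks */
	return 0;
}
```

Passing `old` explicitly also lets the core hand a meaningful previous domain to the error-recovery attach of the blocking domain, as the `__iommu_device_set_domain()` changes above show.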
+1
drivers/iommu/iommufd/Kconfig
··· 41 41 depends on DEBUG_KERNEL 42 42 depends on FAULT_INJECTION 43 43 depends on RUNTIME_TESTING_MENU 44 + depends on IOMMU_PT_AMDV1 44 45 select IOMMUFD_DRIVER 45 46 default n 46 47 help
+10 -1
drivers/iommu/iommufd/iommufd_test.h
··· 32 32 }; 33 33 34 34 enum { 35 + MOCK_IOMMUPT_DEFAULT = 0, 36 + MOCK_IOMMUPT_HUGE, 37 + MOCK_IOMMUPT_AMDV1, 38 + }; 39 + 40 + /* These values are true for MOCK_IOMMUPT_DEFAULT */ 41 + enum { 35 42 MOCK_APERTURE_START = 1UL << 24, 36 43 MOCK_APERTURE_LAST = (1UL << 31) - 1, 44 + MOCK_PAGE_SIZE = 2048, 45 + MOCK_HUGE_PAGE_SIZE = 512 * MOCK_PAGE_SIZE, 37 46 }; 38 47 39 48 enum { ··· 61 52 62 53 enum { 63 54 MOCK_FLAGS_DEVICE_NO_DIRTY = 1 << 0, 64 - MOCK_FLAGS_DEVICE_HUGE_IOVA = 1 << 1, 65 55 MOCK_FLAGS_DEVICE_PASID = 1 << 2, 66 56 }; 67 57 ··· 213 205 */ 214 206 struct iommu_hwpt_selftest { 215 207 __u32 iotlb; 208 + __u32 pagetable_type; 216 209 }; 217 210 218 211 /* Should not be equal to any defined value in enum iommu_hwpt_invalidate_data_type */
+178 -262
drivers/iommu/iommufd/selftest.c
··· 12 12 #include <linux/slab.h> 13 13 #include <linux/xarray.h> 14 14 #include <uapi/linux/iommufd.h> 15 + #include <linux/generic_pt/iommu.h> 16 + #include "../iommu-pages.h" 15 17 16 18 #include "../iommu-priv.h" 17 19 #include "io_pagetable.h" ··· 43 41 44 42 enum { 45 43 MOCK_DIRTY_TRACK = 1, 46 - MOCK_IO_PAGE_SIZE = PAGE_SIZE / 2, 47 - MOCK_HUGE_PAGE_SIZE = 512 * MOCK_IO_PAGE_SIZE, 48 - 49 - /* 50 - * Like a real page table alignment requires the low bits of the address 51 - * to be zero. xarray also requires the high bit to be zero, so we store 52 - * the pfns shifted. The upper bits are used for metadata. 53 - */ 54 - MOCK_PFN_MASK = ULONG_MAX / MOCK_IO_PAGE_SIZE, 55 - 56 - _MOCK_PFN_START = MOCK_PFN_MASK + 1, 57 - MOCK_PFN_START_IOVA = _MOCK_PFN_START, 58 - MOCK_PFN_LAST_IOVA = _MOCK_PFN_START, 59 - MOCK_PFN_DIRTY_IOVA = _MOCK_PFN_START << 1, 60 - MOCK_PFN_HUGE_IOVA = _MOCK_PFN_START << 2, 61 44 }; 62 45 63 46 static int mock_dev_enable_iopf(struct device *dev, struct iommu_domain *domain); ··· 111 124 } 112 125 113 126 struct mock_iommu_domain { 127 + union { 128 + struct iommu_domain domain; 129 + struct pt_iommu iommu; 130 + struct pt_iommu_amdv1 amdv1; 131 + }; 114 132 unsigned long flags; 115 - struct iommu_domain domain; 116 - struct xarray pfns; 117 133 }; 134 + PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, iommu, domain); 135 + PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, amdv1.iommu, domain); 118 136 119 137 static inline struct mock_iommu_domain * 120 138 to_mock_domain(struct iommu_domain *domain) ··· 208 216 } 209 217 210 218 static int mock_domain_nop_attach(struct iommu_domain *domain, 211 - struct device *dev) 219 + struct device *dev, struct iommu_domain *old) 212 220 { 213 221 struct mock_dev *mdev = to_mock_dev(dev); 214 222 struct mock_viommu *new_viommu = NULL; ··· 336 344 return 0; 337 345 } 338 346 339 - static bool mock_test_and_clear_dirty(struct mock_iommu_domain *mock, 340 - unsigned long iova, size_t page_size, 341 - 
unsigned long flags) 342 - { 343 - unsigned long cur, end = iova + page_size - 1; 344 - bool dirty = false; 345 - void *ent, *old; 346 - 347 - for (cur = iova; cur < end; cur += MOCK_IO_PAGE_SIZE) { 348 - ent = xa_load(&mock->pfns, cur / MOCK_IO_PAGE_SIZE); 349 - if (!ent || !(xa_to_value(ent) & MOCK_PFN_DIRTY_IOVA)) 350 - continue; 351 - 352 - dirty = true; 353 - /* Clear dirty */ 354 - if (!(flags & IOMMU_DIRTY_NO_CLEAR)) { 355 - unsigned long val; 356 - 357 - val = xa_to_value(ent) & ~MOCK_PFN_DIRTY_IOVA; 358 - old = xa_store(&mock->pfns, cur / MOCK_IO_PAGE_SIZE, 359 - xa_mk_value(val), GFP_KERNEL); 360 - WARN_ON_ONCE(ent != old); 361 - } 362 - } 363 - 364 - return dirty; 365 - } 366 - 367 - static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, 368 - unsigned long iova, size_t size, 369 - unsigned long flags, 370 - struct iommu_dirty_bitmap *dirty) 371 - { 372 - struct mock_iommu_domain *mock = to_mock_domain(domain); 373 - unsigned long end = iova + size; 374 - void *ent; 375 - 376 - if (!(mock->flags & MOCK_DIRTY_TRACK) && dirty->bitmap) 377 - return -EINVAL; 378 - 379 - do { 380 - unsigned long pgsize = MOCK_IO_PAGE_SIZE; 381 - unsigned long head; 382 - 383 - ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); 384 - if (!ent) { 385 - iova += pgsize; 386 - continue; 387 - } 388 - 389 - if (xa_to_value(ent) & MOCK_PFN_HUGE_IOVA) 390 - pgsize = MOCK_HUGE_PAGE_SIZE; 391 - head = iova & ~(pgsize - 1); 392 - 393 - /* Clear dirty */ 394 - if (mock_test_and_clear_dirty(mock, head, pgsize, flags)) 395 - iommu_dirty_bitmap_record(dirty, iova, pgsize); 396 - iova += pgsize; 397 - } while (iova < end); 398 - 399 - return 0; 400 - } 401 - 402 - static const struct iommu_dirty_ops dirty_ops = { 403 - .set_dirty_tracking = mock_domain_set_dirty_tracking, 404 - .read_and_clear_dirty = mock_domain_read_and_clear_dirty, 405 - }; 406 - 407 347 static struct mock_iommu_domain_nested * 408 348 __mock_domain_alloc_nested(const struct iommu_user_data 
*user_data) 409 349 { ··· 370 446 371 447 if (flags & ~IOMMU_HWPT_ALLOC_PASID) 372 448 return ERR_PTR(-EOPNOTSUPP); 373 - if (!parent || parent->ops != mock_ops.default_domain_ops) 449 + if (!parent || !(parent->type & __IOMMU_DOMAIN_PAGING)) 374 450 return ERR_PTR(-EINVAL); 375 451 376 452 mock_parent = to_mock_domain(parent); ··· 383 459 return &mock_nested->domain; 384 460 } 385 461 462 + static void mock_domain_free(struct iommu_domain *domain) 463 + { 464 + struct mock_iommu_domain *mock = to_mock_domain(domain); 465 + 466 + pt_iommu_deinit(&mock->iommu); 467 + kfree(mock); 468 + } 469 + 470 + static void mock_iotlb_sync(struct iommu_domain *domain, 471 + struct iommu_iotlb_gather *gather) 472 + { 473 + iommu_put_pages_list(&gather->freelist); 474 + } 475 + 476 + static const struct iommu_domain_ops amdv1_mock_ops = { 477 + IOMMU_PT_DOMAIN_OPS(amdv1_mock), 478 + .free = mock_domain_free, 479 + .attach_dev = mock_domain_nop_attach, 480 + .set_dev_pasid = mock_domain_set_dev_pasid_nop, 481 + .iotlb_sync = &mock_iotlb_sync, 482 + }; 483 + 484 + static const struct iommu_domain_ops amdv1_mock_huge_ops = { 485 + IOMMU_PT_DOMAIN_OPS(amdv1_mock), 486 + .free = mock_domain_free, 487 + .attach_dev = mock_domain_nop_attach, 488 + .set_dev_pasid = mock_domain_set_dev_pasid_nop, 489 + .iotlb_sync = &mock_iotlb_sync, 490 + }; 491 + #undef pt_iommu_amdv1_mock_map_pages 492 + 493 + static const struct iommu_dirty_ops amdv1_mock_dirty_ops = { 494 + IOMMU_PT_DIRTY_OPS(amdv1_mock), 495 + .set_dirty_tracking = mock_domain_set_dirty_tracking, 496 + }; 497 + 498 + static const struct iommu_domain_ops amdv1_ops = { 499 + IOMMU_PT_DOMAIN_OPS(amdv1), 500 + .free = mock_domain_free, 501 + .attach_dev = mock_domain_nop_attach, 502 + .set_dev_pasid = mock_domain_set_dev_pasid_nop, 503 + .iotlb_sync = &mock_iotlb_sync, 504 + }; 505 + 506 + static const struct iommu_dirty_ops amdv1_dirty_ops = { 507 + IOMMU_PT_DIRTY_OPS(amdv1), 508 + .set_dirty_tracking = mock_domain_set_dirty_tracking, 
509 + }; 510 + 511 + static struct mock_iommu_domain * 512 + mock_domain_alloc_pgtable(struct device *dev, 513 + const struct iommu_hwpt_selftest *user_cfg, u32 flags) 514 + { 515 + struct mock_iommu_domain *mock; 516 + int rc; 517 + 518 + mock = kzalloc(sizeof(*mock), GFP_KERNEL); 519 + if (!mock) 520 + return ERR_PTR(-ENOMEM); 521 + mock->domain.type = IOMMU_DOMAIN_UNMANAGED; 522 + 523 + mock->amdv1.iommu.nid = NUMA_NO_NODE; 524 + 525 + switch (user_cfg->pagetable_type) { 526 + case MOCK_IOMMUPT_DEFAULT: 527 + case MOCK_IOMMUPT_HUGE: { 528 + struct pt_iommu_amdv1_cfg cfg = {}; 529 + 530 + /* The mock version has a 2k page size */ 531 + cfg.common.hw_max_vasz_lg2 = 56; 532 + cfg.common.hw_max_oasz_lg2 = 51; 533 + cfg.starting_level = 2; 534 + if (user_cfg->pagetable_type == MOCK_IOMMUPT_HUGE) 535 + mock->domain.ops = &amdv1_mock_huge_ops; 536 + else 537 + mock->domain.ops = &amdv1_mock_ops; 538 + rc = pt_iommu_amdv1_mock_init(&mock->amdv1, &cfg, GFP_KERNEL); 539 + if (rc) 540 + goto err_free; 541 + 542 + /* 543 + * In huge mode userspace should only provide huge pages, we 544 + * have to include PAGE_SIZE for the domain to be accepted by 545 + * iommufd. 
546 + */ 547 + if (user_cfg->pagetable_type == MOCK_IOMMUPT_HUGE) 548 + mock->domain.pgsize_bitmap = MOCK_HUGE_PAGE_SIZE | 549 + PAGE_SIZE; 550 + if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) 551 + mock->domain.dirty_ops = &amdv1_mock_dirty_ops; 552 + break; 553 + } 554 + 555 + case MOCK_IOMMUPT_AMDV1: { 556 + struct pt_iommu_amdv1_cfg cfg = {}; 557 + 558 + cfg.common.hw_max_vasz_lg2 = 64; 559 + cfg.common.hw_max_oasz_lg2 = 52; 560 + cfg.common.features = BIT(PT_FEAT_DYNAMIC_TOP) | 561 + BIT(PT_FEAT_AMDV1_ENCRYPT_TABLES) | 562 + BIT(PT_FEAT_AMDV1_FORCE_COHERENCE); 563 + cfg.starting_level = 2; 564 + mock->domain.ops = &amdv1_ops; 565 + rc = pt_iommu_amdv1_init(&mock->amdv1, &cfg, GFP_KERNEL); 566 + if (rc) 567 + goto err_free; 568 + if (flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) 569 + mock->domain.dirty_ops = &amdv1_dirty_ops; 570 + break; 571 + } 572 + default: 573 + rc = -EOPNOTSUPP; 574 + goto err_free; 575 + } 576 + 577 + /* 578 + * Override the real aperture to the MOCK aperture for test purposes. 
579 + */ 580 + if (user_cfg->pagetable_type == MOCK_IOMMUPT_DEFAULT) { 581 + WARN_ON(mock->domain.geometry.aperture_start != 0); 582 + WARN_ON(mock->domain.geometry.aperture_end < MOCK_APERTURE_LAST); 583 + 584 + mock->domain.geometry.aperture_start = MOCK_APERTURE_START; 585 + mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; 586 + } 587 + 588 + return mock; 589 + err_free: 590 + kfree(mock); 591 + return ERR_PTR(rc); 592 + } 593 + 386 594 static struct iommu_domain * 387 595 mock_domain_alloc_paging_flags(struct device *dev, u32 flags, 388 596 const struct iommu_user_data *user_data) ··· 525 469 IOMMU_HWPT_ALLOC_PASID; 526 470 struct mock_dev *mdev = to_mock_dev(dev); 527 471 bool no_dirty_ops = mdev->flags & MOCK_FLAGS_DEVICE_NO_DIRTY; 472 + struct iommu_hwpt_selftest user_cfg = {}; 528 473 struct mock_iommu_domain *mock; 474 + int rc; 529 475 530 - if (user_data) 531 - return ERR_PTR(-EOPNOTSUPP); 532 476 if ((flags & ~PAGING_FLAGS) || (has_dirty_flag && no_dirty_ops)) 533 477 return ERR_PTR(-EOPNOTSUPP); 534 478 535 - mock = kzalloc(sizeof(*mock), GFP_KERNEL); 536 - if (!mock) 537 - return ERR_PTR(-ENOMEM); 538 - mock->domain.geometry.aperture_start = MOCK_APERTURE_START; 539 - mock->domain.geometry.aperture_end = MOCK_APERTURE_LAST; 540 - mock->domain.pgsize_bitmap = MOCK_IO_PAGE_SIZE; 541 - if (dev && mdev->flags & MOCK_FLAGS_DEVICE_HUGE_IOVA) 542 - mock->domain.pgsize_bitmap |= MOCK_HUGE_PAGE_SIZE; 543 - mock->domain.ops = mock_ops.default_domain_ops; 544 - mock->domain.type = IOMMU_DOMAIN_UNMANAGED; 545 - xa_init(&mock->pfns); 479 + if (user_data && (user_data->type != IOMMU_HWPT_DATA_SELFTEST && 480 + user_data->type != IOMMU_HWPT_DATA_NONE)) 481 + return ERR_PTR(-EOPNOTSUPP); 546 482 547 - if (has_dirty_flag) 548 - mock->domain.dirty_ops = &dirty_ops; 483 + if (user_data) { 484 + rc = iommu_copy_struct_from_user( 485 + &user_cfg, user_data, IOMMU_HWPT_DATA_SELFTEST, iotlb); 486 + if (rc) 487 + return ERR_PTR(rc); 488 + } 489 + 490 + mock = 
mock_domain_alloc_pgtable(dev, &user_cfg, flags); 491 + if (IS_ERR(mock)) 492 + return ERR_CAST(mock); 549 493 return &mock->domain; 550 - } 551 - 552 - static void mock_domain_free(struct iommu_domain *domain) 553 - { 554 - struct mock_iommu_domain *mock = to_mock_domain(domain); 555 - 556 - WARN_ON(!xa_empty(&mock->pfns)); 557 - kfree(mock); 558 - } 559 - 560 - static int mock_domain_map_pages(struct iommu_domain *domain, 561 - unsigned long iova, phys_addr_t paddr, 562 - size_t pgsize, size_t pgcount, int prot, 563 - gfp_t gfp, size_t *mapped) 564 - { 565 - struct mock_iommu_domain *mock = to_mock_domain(domain); 566 - unsigned long flags = MOCK_PFN_START_IOVA; 567 - unsigned long start_iova = iova; 568 - 569 - /* 570 - * xarray does not reliably work with fault injection because it does a 571 - * retry allocation, so put our own failure point. 572 - */ 573 - if (iommufd_should_fail()) 574 - return -ENOENT; 575 - 576 - WARN_ON(iova % MOCK_IO_PAGE_SIZE); 577 - WARN_ON(pgsize % MOCK_IO_PAGE_SIZE); 578 - for (; pgcount; pgcount--) { 579 - size_t cur; 580 - 581 - for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) { 582 - void *old; 583 - 584 - if (pgcount == 1 && cur + MOCK_IO_PAGE_SIZE == pgsize) 585 - flags = MOCK_PFN_LAST_IOVA; 586 - if (pgsize != MOCK_IO_PAGE_SIZE) { 587 - flags |= MOCK_PFN_HUGE_IOVA; 588 - } 589 - old = xa_store(&mock->pfns, iova / MOCK_IO_PAGE_SIZE, 590 - xa_mk_value((paddr / MOCK_IO_PAGE_SIZE) | 591 - flags), 592 - gfp); 593 - if (xa_is_err(old)) { 594 - for (; start_iova != iova; 595 - start_iova += MOCK_IO_PAGE_SIZE) 596 - xa_erase(&mock->pfns, 597 - start_iova / 598 - MOCK_IO_PAGE_SIZE); 599 - return xa_err(old); 600 - } 601 - WARN_ON(old); 602 - iova += MOCK_IO_PAGE_SIZE; 603 - paddr += MOCK_IO_PAGE_SIZE; 604 - *mapped += MOCK_IO_PAGE_SIZE; 605 - flags = 0; 606 - } 607 - } 608 - return 0; 609 - } 610 - 611 - static size_t mock_domain_unmap_pages(struct iommu_domain *domain, 612 - unsigned long iova, size_t pgsize, 613 - size_t 
pgcount, 614 - struct iommu_iotlb_gather *iotlb_gather) 615 - { 616 - struct mock_iommu_domain *mock = to_mock_domain(domain); 617 - bool first = true; 618 - size_t ret = 0; 619 - void *ent; 620 - 621 - WARN_ON(iova % MOCK_IO_PAGE_SIZE); 622 - WARN_ON(pgsize % MOCK_IO_PAGE_SIZE); 623 - 624 - for (; pgcount; pgcount--) { 625 - size_t cur; 626 - 627 - for (cur = 0; cur != pgsize; cur += MOCK_IO_PAGE_SIZE) { 628 - ent = xa_erase(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); 629 - 630 - /* 631 - * iommufd generates unmaps that must be a strict 632 - * superset of the map's performend So every 633 - * starting/ending IOVA should have been an iova passed 634 - * to map. 635 - * 636 - * This simple logic doesn't work when the HUGE_PAGE is 637 - * turned on since the core code will automatically 638 - * switch between the two page sizes creating a break in 639 - * the unmap calls. The break can land in the middle of 640 - * contiguous IOVA. 641 - */ 642 - if (!(domain->pgsize_bitmap & MOCK_HUGE_PAGE_SIZE)) { 643 - if (first) { 644 - WARN_ON(ent && !(xa_to_value(ent) & 645 - MOCK_PFN_START_IOVA)); 646 - first = false; 647 - } 648 - if (pgcount == 1 && 649 - cur + MOCK_IO_PAGE_SIZE == pgsize) 650 - WARN_ON(ent && !(xa_to_value(ent) & 651 - MOCK_PFN_LAST_IOVA)); 652 - } 653 - 654 - iova += MOCK_IO_PAGE_SIZE; 655 - ret += MOCK_IO_PAGE_SIZE; 656 - } 657 - } 658 - return ret; 659 - } 660 - 661 - static phys_addr_t mock_domain_iova_to_phys(struct iommu_domain *domain, 662 - dma_addr_t iova) 663 - { 664 - struct mock_iommu_domain *mock = to_mock_domain(domain); 665 - void *ent; 666 - 667 - WARN_ON(iova % MOCK_IO_PAGE_SIZE); 668 - ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); 669 - WARN_ON(!ent); 670 - return (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE; 671 494 } 672 495 673 496 static bool mock_domain_capable(struct device *dev, enum iommu_cap cap) ··· 890 955 .user_pasid_table = true, 891 956 .get_viommu_size = mock_get_viommu_size, 892 957 .viommu_init = 
mock_viommu_init, 893 - .default_domain_ops = 894 - &(struct iommu_domain_ops){ 895 - .free = mock_domain_free, 896 - .attach_dev = mock_domain_nop_attach, 897 - .map_pages = mock_domain_map_pages, 898 - .unmap_pages = mock_domain_unmap_pages, 899 - .iova_to_phys = mock_domain_iova_to_phys, 900 - .set_dev_pasid = mock_domain_set_dev_pasid_nop, 901 - }, 902 958 }; 903 959 904 960 static void mock_domain_free_nested(struct iommu_domain *domain) ··· 973 1047 if (IS_ERR(hwpt)) 974 1048 return hwpt; 975 1049 if (hwpt->domain->type != IOMMU_DOMAIN_UNMANAGED || 976 - hwpt->domain->ops != mock_ops.default_domain_ops) { 1050 + hwpt->domain->owner != &mock_ops) { 977 1051 iommufd_put_object(ucmd->ictx, &hwpt->obj); 978 1052 return ERR_PTR(-EINVAL); 979 1053 } ··· 1014 1088 {}, 1015 1089 }; 1016 1090 const u32 valid_flags = MOCK_FLAGS_DEVICE_NO_DIRTY | 1017 - MOCK_FLAGS_DEVICE_HUGE_IOVA | 1018 1091 MOCK_FLAGS_DEVICE_PASID; 1019 1092 struct mock_dev *mdev; 1020 1093 int rc, i; ··· 1202 1277 { 1203 1278 struct iommufd_hw_pagetable *hwpt; 1204 1279 struct mock_iommu_domain *mock; 1280 + unsigned int page_size; 1205 1281 uintptr_t end; 1206 1282 int rc; 1207 - 1208 - if (iova % MOCK_IO_PAGE_SIZE || length % MOCK_IO_PAGE_SIZE || 1209 - (uintptr_t)uptr % MOCK_IO_PAGE_SIZE || 1210 - check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end)) 1211 - return -EINVAL; 1212 1283 1213 1284 hwpt = get_md_pagetable(ucmd, mockpt_id, &mock); 1214 1285 if (IS_ERR(hwpt)) 1215 1286 return PTR_ERR(hwpt); 1216 1287 1217 - for (; length; length -= MOCK_IO_PAGE_SIZE) { 1288 + page_size = 1 << __ffs(mock->domain.pgsize_bitmap); 1289 + if (iova % page_size || length % page_size || 1290 + (uintptr_t)uptr % page_size || 1291 + check_add_overflow((uintptr_t)uptr, (uintptr_t)length, &end)) 1292 + return -EINVAL; 1293 + 1294 + for (; length; length -= page_size) { 1218 1295 struct page *pages[1]; 1296 + phys_addr_t io_phys; 1219 1297 unsigned long pfn; 1220 1298 long npages; 1221 - void *ent; 1222 1299 
1223 1300 npages = get_user_pages_fast((uintptr_t)uptr & PAGE_MASK, 1, 0, 1224 1301 pages); ··· 1235 1308 pfn = page_to_pfn(pages[0]); 1236 1309 put_page(pages[0]); 1237 1310 1238 - ent = xa_load(&mock->pfns, iova / MOCK_IO_PAGE_SIZE); 1239 - if (!ent || 1240 - (xa_to_value(ent) & MOCK_PFN_MASK) * MOCK_IO_PAGE_SIZE != 1241 - pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) { 1311 + io_phys = mock->domain.ops->iova_to_phys(&mock->domain, iova); 1312 + if (io_phys != 1313 + pfn * PAGE_SIZE + ((uintptr_t)uptr % PAGE_SIZE)) { 1242 1314 rc = -EINVAL; 1243 1315 goto out_put; 1244 1316 } 1245 - iova += MOCK_IO_PAGE_SIZE; 1246 - uptr += MOCK_IO_PAGE_SIZE; 1317 + iova += page_size; 1318 + uptr += page_size; 1247 1319 } 1248 1320 rc = 0; 1249 1321 ··· 1721 1795 if (IS_ERR(hwpt)) 1722 1796 return PTR_ERR(hwpt); 1723 1797 1724 - if (!(mock->flags & MOCK_DIRTY_TRACK)) { 1798 + if (!(mock->flags & MOCK_DIRTY_TRACK) || !mock->iommu.ops->set_dirty) { 1725 1799 rc = -EINVAL; 1726 1800 goto out_put; 1727 1801 } ··· 1740 1814 } 1741 1815 1742 1816 for (i = 0; i < max; i++) { 1743 - unsigned long cur = iova + i * page_size; 1744 - void *ent, *old; 1745 - 1746 1817 if (!test_bit(i, (unsigned long *)tmp)) 1747 1818 continue; 1748 - 1749 - ent = xa_load(&mock->pfns, cur / page_size); 1750 - if (ent) { 1751 - unsigned long val; 1752 - 1753 - val = xa_to_value(ent) | MOCK_PFN_DIRTY_IOVA; 1754 - old = xa_store(&mock->pfns, cur / page_size, 1755 - xa_mk_value(val), GFP_KERNEL); 1756 - WARN_ON_ONCE(ent != old); 1757 - count++; 1758 - } 1819 + mock->iommu.ops->set_dirty(&mock->iommu, iova + i * page_size); 1820 + count++; 1759 1821 } 1760 1822 1761 1823 cmd->dirty.out_nr_dirty = count; ··· 2116 2202 platform_device_unregister(selftest_iommu_dev); 2117 2203 debugfs_remove_recursive(dbgfs_root); 2118 2204 } 2205 + 2206 + MODULE_IMPORT_NS("GENERIC_PT_IOMMU");
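With the fixed `MOCK_IO_PAGE_SIZE` gone, the selftest above derives its page size from the domain at runtime via `page_size = 1 << __ffs(mock->domain.pgsize_bitmap)`, i.e. the smallest bit set in the supported-size bitmap. A userspace equivalent, using the GCC/Clang builtin `__builtin_ctzl()` in place of the kernel's `__ffs()`:

```c
#include <assert.h>

/*
 * Userspace equivalent of the selftest's
 *   page_size = 1 << __ffs(domain->pgsize_bitmap);
 * __ffs() gives the index of the lowest set bit; __builtin_ctzl()
 * does the same here. The bitmap must be nonzero (the builtin is
 * undefined for 0, just as __ffs() is).
 */
static unsigned long smallest_pgsize(unsigned long pgsize_bitmap)
{
	return 1UL << __builtin_ctzl(pgsize_bitmap);
}
```

For the default mock format this yields the 2048-byte `MOCK_PAGE_SIZE`; in huge mode the bitmap is `MOCK_HUGE_PAGE_SIZE | PAGE_SIZE`, so the alignment checks fall back to the smaller `PAGE_SIZE` bit.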
+7 -5
drivers/iommu/ipmmu-vmsa.c
··· 590 590 } 591 591 592 592 static int ipmmu_attach_device(struct iommu_domain *io_domain, 593 - struct device *dev) 593 + struct device *dev, struct iommu_domain *old) 594 594 { 595 595 struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); 596 596 struct ipmmu_vmsa_device *mmu = to_ipmmu(dev); ··· 637 637 } 638 638 639 639 static int ipmmu_iommu_identity_attach(struct iommu_domain *identity_domain, 640 - struct device *dev) 640 + struct device *dev, 641 + struct iommu_domain *old) 641 642 { 642 - struct iommu_domain *io_domain = iommu_get_domain_for_dev(dev); 643 643 struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); 644 644 struct ipmmu_vmsa_domain *domain; 645 645 unsigned int i; 646 646 647 - if (io_domain == identity_domain || !io_domain) 647 + if (old == identity_domain || !old) 648 648 return 0; 649 649 650 - domain = to_vmsa_domain(io_domain); 650 + domain = to_vmsa_domain(old); 651 651 for (i = 0; i < fwspec->num_ids; ++i) 652 652 ipmmu_utlb_disable(domain, fwspec->ids[i]); 653 653 ··· 719 719 return -ENODEV; 720 720 721 721 dev_iommu_priv_set(dev, platform_get_drvdata(ipmmu_pdev)); 722 + 723 + put_device(&ipmmu_pdev->dev); 722 724 723 725 return 0; 724 726 }
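The `put_device()` additions in this ipmmu hunk (and the matching ones in mtk_iommu.c below) are the "device leak fixes" from the merge summary: lookup helpers such as `of_find_device_by_node()` return the device with its reference count elevated, and a probe path that only needed drvdata must drop that reference. A refcount-only userspace model of the fixed pattern (names are illustrative):

```c
#include <assert.h>

/*
 * Model of the device-leak fixes in this pull: the lookup helper
 * hands back an extra reference, and callers that only needed
 * drvdata must balance it with put_device(). Refcount model only.
 */
struct model_device {
	int refcount;
	void *drvdata;
};

/* Model of of_find_device_by_node(): caller now owns a reference. */
static struct model_device *model_find_device(struct model_device *dev)
{
	dev->refcount++;
	return dev;
}

static void model_put_device(struct model_device *dev)
{
	dev->refcount--;
}

/* Fixed probe path: grab drvdata, then drop the lookup reference. */
static void *model_probe_get_drvdata(struct model_device *dev)
{
	struct model_device *found = model_find_device(dev);
	void *data = found->drvdata;

	model_put_device(found);	/* the put_device() this series adds */
	return data;
}
```

Without the final put, every probe would pin the supplier device forever, which is harmless-looking until unbind/rebind cycles accumulate references.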
+6 -5
drivers/iommu/msm_iommu.c
··· 391 391 return &iommu->iommu; 392 392 } 393 393 394 - static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) 394 + static int msm_iommu_attach_dev(struct iommu_domain *domain, struct device *dev, 395 + struct iommu_domain *old) 395 396 { 396 397 int ret = 0; 397 398 unsigned long flags; ··· 442 441 } 443 442 444 443 static int msm_iommu_identity_attach(struct iommu_domain *identity_domain, 445 - struct device *dev) 444 + struct device *dev, 445 + struct iommu_domain *old) 446 446 { 447 - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); 448 447 struct msm_priv *priv; 449 448 unsigned long flags; 450 449 struct msm_iommu_dev *iommu; 451 450 struct msm_iommu_ctx_dev *master; 452 451 int ret = 0; 453 452 454 - if (domain == identity_domain || !domain) 453 + if (old == identity_domain || !old) 455 454 return 0; 456 455 457 - priv = to_msm_priv(domain); 456 + priv = to_msm_priv(old); 458 457 free_io_pgtable_ops(priv->iop); 459 458 460 459 spin_lock_irqsave(&msm_iommu_lock, flags);
+146 -28
drivers/iommu/mtk_iommu.c
··· 139 139 /* 2 bits: iommu type */ 140 140 #define MTK_IOMMU_TYPE_MM (0x0 << 13) 141 141 #define MTK_IOMMU_TYPE_INFRA (0x1 << 13) 142 + #define MTK_IOMMU_TYPE_APU (0x2 << 13) 142 143 #define MTK_IOMMU_TYPE_MASK (0x3 << 13) 143 144 /* PM and clock always on. e.g. infra iommu */ 144 145 #define PM_CLK_AO BIT(15) ··· 148 147 #define TF_PORT_TO_ADDR_MT8173 BIT(18) 149 148 #define INT_ID_PORT_WIDTH_6 BIT(19) 150 149 #define CFG_IFA_MASTER_IN_ATF BIT(20) 150 + #define DL_WITH_MULTI_LARB BIT(21) 151 151 152 152 #define MTK_IOMMU_HAS_FLAG_MASK(pdata, _x, mask) \ 153 153 ((((pdata)->flags) & (mask)) == (_x)) ··· 174 172 M4U_MT8183, 175 173 M4U_MT8186, 176 174 M4U_MT8188, 175 + M4U_MT8189, 177 176 M4U_MT8192, 178 177 M4U_MT8195, 179 178 M4U_MT8365, ··· 338 335 */ 339 336 #define MTK_IOMMU_4GB_MODE_REMAP_BASE 0x140000000UL 340 337 338 + static LIST_HEAD(apulist); /* List the apu iommu HWs */ 339 + static LIST_HEAD(infralist); /* List the iommu_infra HW */ 341 340 static LIST_HEAD(m4ulist); /* List all the M4U HWs */ 342 341 343 342 #define for_each_m4u(data, head) list_for_each_entry(data, head, list) ··· 354 349 355 350 #define MT8192_MULTI_REGION_NR (IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) ? 
\ 356 351 MT8192_MULTI_REGION_NR_MAX : 1) 352 + 353 + static const struct mtk_iommu_iova_region mt8189_multi_dom_apu[] = { 354 + { .iova_base = 0x200000ULL, .size = SZ_512M}, /* APU SECURE */ 355 + #if IS_ENABLED(CONFIG_ARCH_DMA_ADDR_T_64BIT) 356 + { .iova_base = SZ_1G, .size = 0xc0000000}, /* APU CODE */ 357 + { .iova_base = 0x70000000ULL, .size = 0x12600000}, /* APU VLM */ 358 + { .iova_base = SZ_4G, .size = SZ_4G * 3}, /* APU VPU */ 359 + #endif 360 + }; 357 361 358 362 static const struct mtk_iommu_iova_region mt8192_multi_dom[MT8192_MULTI_REGION_NR] = { 359 363 { .iova_base = 0x0, .size = MTK_IOMMU_IOVA_SZ_4G}, /* 0 ~ 4G, */ ··· 719 705 } 720 706 721 707 static int mtk_iommu_attach_device(struct iommu_domain *domain, 722 - struct device *dev) 708 + struct device *dev, struct iommu_domain *old) 723 709 { 724 710 struct mtk_iommu_data *data = dev_iommu_priv_get(dev), *frstdata; 725 711 struct mtk_iommu_domain *dom = to_mtk_domain(domain); ··· 787 773 } 788 774 789 775 static int mtk_iommu_identity_attach(struct iommu_domain *identity_domain, 790 - struct device *dev) 776 + struct device *dev, 777 + struct iommu_domain *old) 791 778 { 792 - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); 793 779 struct mtk_iommu_data *data = dev_iommu_priv_get(dev); 794 780 795 - if (domain == identity_domain || !domain) 781 + if (old == identity_domain || !old) 796 782 return 0; 797 783 798 784 mtk_iommu_config(data, dev, false, 0); ··· 879 865 struct mtk_iommu_data *data = dev_iommu_priv_get(dev); 880 866 struct device_link *link; 881 867 struct device *larbdev; 868 + unsigned long larbid_msk = 0; 882 869 unsigned int larbid, larbidx, i; 883 870 884 871 if (!MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) ··· 887 872 888 873 /* 889 874 * Link the consumer device with the smi-larb device(supplier). 890 - * The device that connects with each a larb is a independent HW. 891 - * All the ports in each a device should be in the same larbs. 
875 + * w/DL_WITH_MULTI_LARB: the master may connect with multi larbs, 876 + * we should create device link with each larb. 877 + * w/o DL_WITH_MULTI_LARB: the master must connect with one larb, 878 + * otherwise fail. 892 879 */ 893 880 larbid = MTK_M4U_TO_LARB(fwspec->ids[0]); 894 881 if (larbid >= MTK_LARB_NR_MAX) 895 882 return ERR_PTR(-EINVAL); 896 883 884 + larbid_msk |= BIT(larbid); 885 + 897 886 for (i = 1; i < fwspec->num_ids; i++) { 898 887 larbidx = MTK_M4U_TO_LARB(fwspec->ids[i]); 899 - if (larbid != larbidx) { 888 + if (MTK_IOMMU_HAS_FLAG(data->plat_data, DL_WITH_MULTI_LARB)) { 889 + larbid_msk |= BIT(larbidx); 890 + } else if (larbid != larbidx) { 900 891 dev_err(dev, "Can only use one larb. Fail@larb%d-%d.\n", 901 892 larbid, larbidx); 902 893 return ERR_PTR(-EINVAL); 903 894 } 904 895 } 905 - larbdev = data->larb_imu[larbid].dev; 906 - if (!larbdev) 907 - return ERR_PTR(-EINVAL); 908 896 909 - link = device_link_add(dev, larbdev, 910 - DL_FLAG_PM_RUNTIME | DL_FLAG_STATELESS); 911 - if (!link) 912 - dev_err(dev, "Unable to link %s\n", dev_name(larbdev)); 897 + for_each_set_bit(larbid, &larbid_msk, 32) { 898 + larbdev = data->larb_imu[larbid].dev; 899 + if (!larbdev) 900 + return ERR_PTR(-EINVAL); 901 + 902 + link = device_link_add(dev, larbdev, 903 + DL_FLAG_PM_RUNTIME | DL_FLAG_STATELESS); 904 + if (!link) { 905 + dev_err(dev, "Unable to link %s\n", dev_name(larbdev)); 906 + goto link_remove; 907 + } 908 + } 909 + 913 910 return &data->iommu; 911 + 912 + link_remove: 913 + for_each_set_bit(i, &larbid_msk, larbid) { 914 + larbdev = data->larb_imu[i].dev; 915 + device_link_remove(dev, larbdev); 916 + } 917 + 918 + return ERR_PTR(-ENODEV); 914 919 } 915 920 916 921 static void mtk_iommu_release_device(struct device *dev) ··· 938 903 struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); 939 904 struct mtk_iommu_data *data; 940 905 struct device *larbdev; 941 - unsigned int larbid; 906 + unsigned int larbid, i; 907 + unsigned long larbid_msk = 0; 942 
908 943 909 data = dev_iommu_priv_get(dev); 944 - if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) { 945 - larbid = MTK_M4U_TO_LARB(fwspec->ids[0]); 910 + if (!MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) 911 + return; 912 + 913 + for (i = 0; i < fwspec->num_ids; i++) { 914 + larbid = MTK_M4U_TO_LARB(fwspec->ids[i]); 915 + larbid_msk |= BIT(larbid); 916 + } 917 + 918 + for_each_set_bit(larbid, &larbid_msk, 32) { 946 919 larbdev = data->larb_imu[larbid].dev; 947 920 device_link_remove(dev, larbdev); 948 921 } ··· 1017 974 return -EINVAL; 1018 975 1019 976 dev_iommu_priv_set(dev, platform_get_drvdata(m4updev)); 977 + 978 + put_device(&m4updev->dev); 1020 979 } 1021 980 1022 981 return iommu_fwspec_add_ids(dev, args->args, 1); ··· 1256 1211 } 1257 1212 1258 1213 component_match_add(dev, match, component_compare_dev, &plarbdev->dev); 1259 - platform_device_put(plarbdev); 1260 1214 } 1261 1215 1262 - if (!frst_avail_smicomm_node) 1263 - return -EINVAL; 1216 + if (!frst_avail_smicomm_node) { 1217 + ret = -EINVAL; 1218 + goto err_larbdev_put; 1219 + } 1264 1220 1265 1221 pcommdev = of_find_device_by_node(frst_avail_smicomm_node); 1266 1222 of_node_put(frst_avail_smicomm_node); 1267 - if (!pcommdev) 1268 - return -ENODEV; 1223 + if (!pcommdev) { 1224 + ret = -ENODEV; 1225 + goto err_larbdev_put; 1226 + } 1269 1227 data->smicomm_dev = &pcommdev->dev; 1270 1228 1271 1229 link = device_link_add(data->smicomm_dev, dev, ··· 1276 1228 platform_device_put(pcommdev); 1277 1229 if (!link) { 1278 1230 dev_err(dev, "Unable to link %s.\n", dev_name(data->smicomm_dev)); 1279 - return -EINVAL; 1231 + ret = -EINVAL; 1232 + goto err_larbdev_put; 1280 1233 } 1281 1234 return 0; 1282 1235 1283 1236 err_larbdev_put: 1284 - for (i = MTK_LARB_NR_MAX - 1; i >= 0; i--) { 1285 - if (!data->larb_imu[i].dev) 1286 - continue; 1237 + /* id mapping may not be linear, loop the whole array */ 1238 + for (i = 0; i < MTK_LARB_NR_MAX; i++) 1287 1239 put_device(data->larb_imu[i].dev); 
1288 - } 1240 + 1289 1241 return ret; 1290 1242 } 1291 1243 ··· 1448 1400 iommu_device_sysfs_remove(&data->iommu); 1449 1401 out_list_del: 1450 1402 list_del(&data->list); 1451 - if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) 1403 + if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) { 1452 1404 device_link_remove(data->smicomm_dev, dev); 1405 + 1406 + for (i = 0; i < MTK_LARB_NR_MAX; i++) 1407 + put_device(data->larb_imu[i].dev); 1408 + } 1453 1409 out_runtime_disable: 1454 1410 pm_runtime_disable(dev); 1455 1411 return ret; ··· 1473 1421 if (MTK_IOMMU_IS_TYPE(data->plat_data, MTK_IOMMU_TYPE_MM)) { 1474 1422 device_link_remove(data->smicomm_dev, &pdev->dev); 1475 1423 component_master_del(&pdev->dev, &mtk_iommu_com_ops); 1424 + 1425 + for (i = 0; i < MTK_LARB_NR_MAX; i++) 1426 + put_device(data->larb_imu[i].dev); 1476 1427 } 1477 1428 pm_runtime_disable(&pdev->dev); 1478 1429 for (i = 0; i < data->plat_data->banks_num; i++) { ··· 1750 1695 27, 28 /* ccu0 */, MTK_INVALID_LARBID}, {4, 6}}, 1751 1696 }; 1752 1697 1698 + static const unsigned int mt8189_apu_region_msk[][MTK_LARB_NR_MAX] = { 1699 + [0] = {[0] = BIT(2)}, /* Region0: fake larb 0 APU_SECURE */ 1700 + [1] = {[0] = BIT(1)}, /* Region1: fake larb 0 APU_CODE */ 1701 + [2] = {[0] = BIT(3)}, /* Region2: fake larb 0 APU_VLM */ 1702 + [3] = {[0] = BIT(0)}, /* Region3: fake larb 0 APU_DATA */ 1703 + }; 1704 + 1705 + static const struct mtk_iommu_plat_data mt8189_data_apu = { 1706 + .m4u_plat = M4U_MT8189, 1707 + .flags = IOVA_34_EN | DCM_DISABLE | 1708 + MTK_IOMMU_TYPE_APU | PGTABLE_PA_35_EN, 1709 + .hw_list = &apulist, 1710 + .inv_sel_reg = REG_MMU_INV_SEL_GEN2, 1711 + .banks_num = 1, 1712 + .banks_enable = {true}, 1713 + .iova_region = mt8189_multi_dom_apu, 1714 + .iova_region_nr = ARRAY_SIZE(mt8189_multi_dom_apu), 1715 + .larbid_remap = {{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}}, 1716 + .iova_region_larb_msk = mt8189_apu_region_msk, 1717 + }; 1718 + 1719 + static const struct 
mtk_iommu_plat_data mt8189_data_infra = { 1720 + .m4u_plat = M4U_MT8189, 1721 + .flags = WR_THROT_EN | DCM_DISABLE | MTK_IOMMU_TYPE_INFRA | 1722 + CFG_IFA_MASTER_IN_ATF | SHARE_PGTABLE | PGTABLE_PA_35_EN, 1723 + .hw_list = &infralist, 1724 + .banks_num = 1, 1725 + .banks_enable = {true}, 1726 + .inv_sel_reg = REG_MMU_INV_SEL_GEN2, 1727 + .iova_region = single_domain, 1728 + .iova_region_nr = ARRAY_SIZE(single_domain), 1729 + }; 1730 + 1731 + static const u32 mt8189_larb_region_msk[MT8192_MULTI_REGION_NR_MAX][MTK_LARB_NR_MAX] = { 1732 + [0] = {~0, ~0, ~0, [22] = BIT(0)}, /* Region0: all ports for larb0/1/2 */ 1733 + [1] = {[3] = ~0, [4] = ~0}, /* Region1: all ports for larb4(3)/7(4) */ 1734 + [2] = {[5] = ~0, [6] = ~0, /* Region2: all ports for larb9(5)/11(6) */ 1735 + [7] = ~0, [8] = ~0, /* Region2: all ports for larb13(7)/14(8) */ 1736 + [9] = ~0, [10] = ~0, /* Region2: all ports for larb16(9)/17(10) */ 1737 + [11] = ~0, [12] = ~0, /* Region2: all ports for larb19(11)/20(12) */ 1738 + [21] = ~0}, /* Region2: larb21 fake GCE larb */ 1739 + }; 1740 + 1741 + static const struct mtk_iommu_plat_data mt8189_data_mm = { 1742 + .m4u_plat = M4U_MT8189, 1743 + .flags = HAS_BCLK | HAS_SUB_COMM_3BITS | OUT_ORDER_WR_EN | 1744 + WR_THROT_EN | IOVA_34_EN | MTK_IOMMU_TYPE_MM | 1745 + PGTABLE_PA_35_EN | DL_WITH_MULTI_LARB, 1746 + .hw_list = &m4ulist, 1747 + .inv_sel_reg = REG_MMU_INV_SEL_GEN2, 1748 + .banks_num = 5, 1749 + .banks_enable = {true, false, false, false, false}, 1750 + .iova_region = mt8192_multi_dom, 1751 + .iova_region_nr = ARRAY_SIZE(mt8192_multi_dom), 1752 + .iova_region_larb_msk = mt8189_larb_region_msk, 1753 + .larbid_remap = {{0}, {1}, {21/* GCE_D */, 21/* GCE_M */, 2}, 1754 + {19, 20, 9, 11}, {7}, {4}, 1755 + {13, 17}, {14, 16}}, 1756 + }; 1757 + 1753 1758 static const struct mtk_iommu_plat_data mt8192_data = { 1754 1759 .m4u_plat = M4U_MT8192, 1755 1760 .flags = HAS_BCLK | HAS_SUB_COMM_2BITS | OUT_ORDER_WR_EN | ··· 1911 1796 { .compatible = 
"mediatek,mt8188-iommu-infra", .data = &mt8188_data_infra}, 1912 1797 { .compatible = "mediatek,mt8188-iommu-vdo", .data = &mt8188_data_vdo}, 1913 1798 { .compatible = "mediatek,mt8188-iommu-vpp", .data = &mt8188_data_vpp}, 1799 + { .compatible = "mediatek,mt8189-iommu-apu", .data = &mt8189_data_apu}, 1800 + { .compatible = "mediatek,mt8189-iommu-infra", .data = &mt8189_data_infra}, 1801 + { .compatible = "mediatek,mt8189-iommu-mm", .data = &mt8189_data_mm}, 1914 1802 { .compatible = "mediatek,mt8192-m4u", .data = &mt8192_data}, 1915 1803 { .compatible = "mediatek,mt8195-iommu-infra", .data = &mt8195_data_infra}, 1916 1804 { .compatible = "mediatek,mt8195-iommu-vdo", .data = &mt8195_data_vdo},
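The DL_WITH_MULTI_LARB path above folds every larb a master references into one bitmask before creating device links, so duplicate larb IDs are linked only once. A minimal user-space sketch of that accumulation — `FAKE_M4U_TO_LARB` and the ID layout are stand-ins invented for this sketch, not the driver's real macros (the real `MTK_M4U_TO_LARB` lives in `mtk-memory-port.h`):

```c
#include <assert.h>
#include <stdint.h>

/* Stand-in for MTK_M4U_TO_LARB(): assume the larb index sits above a
 * 5-bit port field in each fwspec ID (hypothetical layout). */
#define FAKE_M4U_TO_LARB(id) (((unsigned long)(id) >> 5) & 0x1f)

/* Fold the set of larbs a master touches into one bitmask, mirroring
 * the larbid_msk accumulation used when DL_WITH_MULTI_LARB is set; the
 * caller then walks the set bits to create one device link per larb. */
static unsigned long larb_mask_from_ids(const uint32_t *ids, int num_ids)
{
	unsigned long msk = 0;

	for (int i = 0; i < num_ids; i++)
		msk |= 1UL << FAKE_M4U_TO_LARB(ids[i]);
	return msk;
}
```

With IDs naming larbs 1, 4, and 1 again, only bits 1 and 4 end up set, so the repeated larb is visited once by the `for_each_set_bit()` loop.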
+28 -7
drivers/iommu/mtk_iommu_v1.c
··· 303 303 kfree(to_mtk_domain(domain)); 304 304 } 305 305 306 - static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, struct device *dev) 306 + static int mtk_iommu_v1_attach_device(struct iommu_domain *domain, 307 + struct device *dev, 308 + struct iommu_domain *old) 307 309 { 308 310 struct mtk_iommu_v1_data *data = dev_iommu_priv_get(dev); 309 311 struct mtk_iommu_v1_domain *dom = to_mtk_domain(domain); ··· 331 329 } 332 330 333 331 static int mtk_iommu_v1_identity_attach(struct iommu_domain *identity_domain, 334 - struct device *dev) 332 + struct device *dev, 333 + struct iommu_domain *old) 335 334 { 336 335 struct mtk_iommu_v1_data *data = dev_iommu_priv_get(dev); 337 336 ··· 438 435 return -EINVAL; 439 436 440 437 dev_iommu_priv_set(dev, platform_get_drvdata(m4updev)); 438 + 439 + put_device(&m4updev->dev); 441 440 } 442 441 443 442 ret = iommu_fwspec_add_ids(dev, args->args, 1); ··· 646 641 if (larb_nr < 0) 647 642 return larb_nr; 648 643 644 + if (larb_nr > MTK_LARB_NR_MAX) 645 + return -EINVAL; 646 + 649 647 for (i = 0; i < larb_nr; i++) { 650 648 struct device_node *larbnode; 651 649 struct platform_device *plarbdev; 652 650 653 651 larbnode = of_parse_phandle(dev->of_node, "mediatek,larbs", i); 654 - if (!larbnode) 655 - return -EINVAL; 652 + if (!larbnode) { 653 + ret = -EINVAL; 654 + goto out_put_larbs; 655 + } 656 656 657 657 if (!of_device_is_available(larbnode)) { 658 658 of_node_put(larbnode); ··· 667 657 plarbdev = of_find_device_by_node(larbnode); 668 658 if (!plarbdev) { 669 659 of_node_put(larbnode); 670 - return -ENODEV; 660 + ret = -ENODEV; 661 + goto out_put_larbs; 671 662 } 672 663 if (!plarbdev->dev.driver) { 673 664 of_node_put(larbnode); 674 - return -EPROBE_DEFER; 665 + put_device(&plarbdev->dev); 666 + ret = -EPROBE_DEFER; 667 + goto out_put_larbs; 675 668 } 676 669 data->larb_imu[i].dev = &plarbdev->dev; 677 670 ··· 686 673 687 674 ret = mtk_iommu_v1_hw_init(data); 688 675 if (ret) 689 - return ret; 676 + goto 
out_put_larbs; 690 677 691 678 ret = iommu_device_sysfs_add(&data->iommu, &pdev->dev, NULL, 692 679 dev_name(&pdev->dev)); ··· 708 695 iommu_device_sysfs_remove(&data->iommu); 709 696 out_clk_unprepare: 710 697 clk_disable_unprepare(data->bclk); 698 + out_put_larbs: 699 + for (i = 0; i < MTK_LARB_NR_MAX; i++) 700 + put_device(data->larb_imu[i].dev); 701 + 711 702 return ret; 712 703 } 713 704 714 705 static void mtk_iommu_v1_remove(struct platform_device *pdev) 715 706 { 716 707 struct mtk_iommu_v1_data *data = platform_get_drvdata(pdev); 708 + int i; 717 709 718 710 iommu_device_sysfs_remove(&data->iommu); 719 711 iommu_device_unregister(&data->iommu); ··· 726 708 clk_disable_unprepare(data->bclk); 727 709 devm_free_irq(&pdev->dev, data->irq, data); 728 710 component_master_del(&pdev->dev, &mtk_iommu_v1_com_ops); 711 + 712 + for (i = 0; i < MTK_LARB_NR_MAX; i++) 713 + put_device(data->larb_imu[i].dev); 729 714 } 730 715 731 716 static int __maybe_unused mtk_iommu_v1_suspend(struct device *dev)
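Both error paths added above drop every `larb_imu[]` slot with `put_device()`, which in the kernel is a no-op for a NULL pointer, so slots that were never filled are safe to pass. A toy model of that unwind shape (`fake_dev`/`fake_put` are invented for the sketch):

```c
#include <assert.h>

#define FAKE_LARB_NR_MAX 8

struct fake_dev {
	int refs;
};

/* Like put_device(): silently ignores NULL. */
static void fake_put(struct fake_dev *d)
{
	if (d)
		d->refs--;
}

/* Unwind over the whole array, as the out_put_larbs label does;
 * unfilled (NULL) slots cost nothing, so no per-slot bookkeeping of
 * how far probing got is needed. */
static void put_all_larbs(struct fake_dev *larbs[FAKE_LARB_NR_MAX])
{
	for (int i = 0; i < FAKE_LARB_NR_MAX; i++)
		fake_put(larbs[i]);
}
```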
+8 -11
drivers/iommu/omap-iommu.c
··· 1431 1431 odomain->iommus = NULL; 1432 1432 } 1433 1433 1434 - static int 1435 - omap_iommu_attach_dev(struct iommu_domain *domain, struct device *dev) 1434 + static int omap_iommu_attach_dev(struct iommu_domain *domain, 1435 + struct device *dev, struct iommu_domain *old) 1436 1436 { 1437 1437 struct omap_iommu_arch_data *arch_data = dev_iommu_priv_get(dev); 1438 1438 struct omap_iommu_domain *omap_domain = to_omap_domain(domain); ··· 1536 1536 } 1537 1537 1538 1538 static int omap_iommu_identity_attach(struct iommu_domain *identity_domain, 1539 - struct device *dev) 1539 + struct device *dev, 1540 + struct iommu_domain *old) 1540 1541 { 1541 - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); 1542 1542 struct omap_iommu_domain *omap_domain; 1543 1543 1544 - if (domain == identity_domain || !domain) 1544 + if (old == identity_domain || !old) 1545 1545 return 0; 1546 1546 1547 - omap_domain = to_omap_domain(domain); 1547 + omap_domain = to_omap_domain(old); 1548 1548 spin_lock(&omap_domain->lock); 1549 1549 _omap_iommu_detach_dev(omap_domain, dev); 1550 1550 spin_unlock(&omap_domain->lock); ··· 1668 1668 } 1669 1669 1670 1670 pdev = of_find_device_by_node(np); 1671 + of_node_put(np); 1671 1672 if (!pdev) { 1672 - of_node_put(np); 1673 1673 kfree(arch_data); 1674 1674 return ERR_PTR(-ENODEV); 1675 1675 } 1676 1676 1677 1677 oiommu = platform_get_drvdata(pdev); 1678 + put_device(&pdev->dev); 1678 1679 if (!oiommu) { 1679 - of_node_put(np); 1680 1680 kfree(arch_data); 1681 1681 return ERR_PTR(-EINVAL); 1682 1682 } 1683 1683 1684 1684 tmp->iommu_dev = oiommu; 1685 - tmp->dev = &pdev->dev; 1686 - 1687 - of_node_put(np); 1688 1685 } 1689 1686 1690 1687 dev_iommu_priv_set(dev, arch_data);
-2
drivers/iommu/omap-iommu.h
··· 88 88 /** 89 89 * struct omap_iommu_arch_data - omap iommu private data 90 90 * @iommu_dev: handle of the OMAP iommu device 91 - * @dev: handle of the iommu device 92 91 * 93 92 * This is an omap iommu private data object, which binds an iommu user 94 93 * to its iommu device. This object should be placed at the iommu user's ··· 96 97 */ 97 98 struct omap_iommu_arch_data { 98 99 struct omap_iommu *iommu_dev; 99 - struct device *dev; 100 100 }; 101 101 102 102 struct cr_regs {
+6 -3
drivers/iommu/riscv/iommu.c
··· 1321 1321 } 1322 1322 1323 1323 static int riscv_iommu_attach_paging_domain(struct iommu_domain *iommu_domain, 1324 - struct device *dev) 1324 + struct device *dev, 1325 + struct iommu_domain *old) 1325 1326 { 1326 1327 struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain); 1327 1328 struct riscv_iommu_device *iommu = dev_to_iommu(dev); ··· 1427 1426 } 1428 1427 1429 1428 static int riscv_iommu_attach_blocking_domain(struct iommu_domain *iommu_domain, 1430 - struct device *dev) 1429 + struct device *dev, 1430 + struct iommu_domain *old) 1431 1431 { 1432 1432 struct riscv_iommu_device *iommu = dev_to_iommu(dev); 1433 1433 struct riscv_iommu_info *info = dev_iommu_priv_get(dev); ··· 1449 1447 }; 1450 1448 1451 1449 static int riscv_iommu_attach_identity_domain(struct iommu_domain *iommu_domain, 1452 - struct device *dev) 1450 + struct device *dev, 1451 + struct iommu_domain *old) 1453 1452 { 1454 1453 struct riscv_iommu_device *iommu = dev_to_iommu(dev); 1455 1454 struct riscv_iommu_info *info = dev_iommu_priv_get(dev);
+15 -5
drivers/iommu/rockchip-iommu.c
··· 960 960 } 961 961 962 962 static int rk_iommu_identity_attach(struct iommu_domain *identity_domain, 963 - struct device *dev) 963 + struct device *dev, 964 + struct iommu_domain *old) 964 965 { 965 966 struct rk_iommu *iommu; 966 967 struct rk_iommu_domain *rk_domain; ··· 1006 1005 }; 1007 1006 1008 1007 static int rk_iommu_attach_device(struct iommu_domain *domain, 1009 - struct device *dev) 1008 + struct device *dev, struct iommu_domain *old) 1010 1009 { 1011 1010 struct rk_iommu *iommu; 1012 1011 struct rk_iommu_domain *rk_domain = to_rk_domain(domain); ··· 1027 1026 if (iommu->domain == domain) 1028 1027 return 0; 1029 1028 1030 - ret = rk_iommu_identity_attach(&rk_identity_domain, dev); 1029 + ret = rk_iommu_identity_attach(&rk_identity_domain, dev, old); 1031 1030 if (ret) 1032 1031 return ret; 1033 1032 ··· 1042 1041 return 0; 1043 1042 1044 1043 ret = rk_iommu_enable(iommu); 1045 - if (ret) 1046 - WARN_ON(rk_iommu_identity_attach(&rk_identity_domain, dev)); 1044 + if (ret) { 1045 + /* 1046 + * Note rk_iommu_identity_attach() might fail before physically 1047 + * attaching the dev to iommu->domain, in which case the actual 1048 + * old domain for this revert should be rk_identity_domain v.s. 1049 + * iommu->domain. Since rk_iommu_identity_attach() does not care 1050 + * about the old domain argument for now, this is not a problem. 1051 + */ 1052 + WARN_ON(rk_iommu_identity_attach(&rk_identity_domain, dev, 1053 + iommu->domain)); 1054 + } 1047 1055 1048 1056 pm_runtime_put(iommu->dev); 1049 1057
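Every attach callback in this series grows a third argument: the core now hands the driver the previously attached domain, so drivers like omap and tegra stop re-deriving it with `iommu_get_domain_for_dev()`. A minimal model of that contract, with types invented for the sketch:

```c
#include <assert.h>
#include <stddef.h>

struct fake_domain { int id; };
struct fake_device { struct fake_domain *cur; };

/* identity_attach(identity, dev, old): the 'old' domain arrives as a
 * parameter, so the driver can tear it down directly instead of
 * looking up the currently attached domain itself. */
static int fake_identity_attach(struct fake_domain *identity,
				struct fake_device *dev,
				struct fake_domain *old)
{
	if (!old || old == identity)
		return 0;	/* nothing attached, or already identity */
	/* detach from 'old' would happen here */
	dev->cur = identity;
	return 0;
}
```

The rockchip comment in the hunk above notes the one wrinkle: on a revert path the nominal `old` may not match what is physically attached, which is harmless only while the callback ignores `old` for identity targets.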
+8 -5
drivers/iommu/s390-iommu.c
··· 670 670 } 671 671 672 672 static int blocking_domain_attach_device(struct iommu_domain *domain, 673 - struct device *dev) 673 + struct device *dev, 674 + struct iommu_domain *old) 674 675 { 675 676 struct zpci_dev *zdev = to_zpci_dev(dev); 676 677 struct s390_domain *s390_domain; ··· 695 694 } 696 695 697 696 static int s390_iommu_attach_device(struct iommu_domain *domain, 698 - struct device *dev) 697 + struct device *dev, 698 + struct iommu_domain *old) 699 699 { 700 700 struct s390_domain *s390_domain = to_s390_domain(domain); 701 701 struct zpci_dev *zdev = to_zpci_dev(dev); ··· 711 709 domain->geometry.aperture_end < zdev->start_dma)) 712 710 return -EINVAL; 713 711 714 - blocking_domain_attach_device(&blocking_domain, dev); 712 + blocking_domain_attach_device(&blocking_domain, dev, old); 715 713 716 714 /* If we fail now DMA remains blocked via blocking domain */ 717 715 cc = s390_iommu_domain_reg_ioat(zdev, domain, &status); ··· 1133 1131 subsys_initcall(s390_iommu_init); 1134 1132 1135 1133 static int s390_attach_dev_identity(struct iommu_domain *domain, 1136 - struct device *dev) 1134 + struct device *dev, 1135 + struct iommu_domain *old) 1137 1136 { 1138 1137 struct zpci_dev *zdev = to_zpci_dev(dev); 1139 1138 u8 status; 1140 1139 int cc; 1141 1140 1142 - blocking_domain_attach_device(&blocking_domain, dev); 1141 + blocking_domain_attach_device(&blocking_domain, dev, old); 1143 1142 1144 1143 /* If we fail now DMA remains blocked via blocking domain */ 1145 1144 cc = s390_iommu_domain_reg_ioat(zdev, domain, &status);
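The s390 flow above first drops the device into the blocking domain and only then registers the new translation; as the in-tree comment says, a failure after that point leaves DMA blocked rather than pointing at a stale domain. A toy of that fail-safe ordering (names invented):

```c
#include <assert.h>

enum fake_state { FAKE_BLOCKED, FAKE_TRANSLATED };

struct fake_zdev {
	enum fake_state state;
};

/* Stand-in for s390_iommu_domain_reg_ioat(). */
static int fake_reg_ioat(struct fake_zdev *zdev, int should_fail)
{
	if (should_fail)
		return -1;
	zdev->state = FAKE_TRANSLATED;
	return 0;
}

/* Block first, then try to install the new translation: on failure the
 * device is left in the safe blocked state, never a stale one. */
static int fake_attach(struct fake_zdev *zdev, int should_fail)
{
	zdev->state = FAKE_BLOCKED;	/* blocking_domain_attach_device() */
	return fake_reg_ioat(zdev, should_fail);
}
```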
+2 -1
drivers/iommu/sprd-iommu.c
··· 247 247 } 248 248 249 249 static int sprd_iommu_attach_device(struct iommu_domain *domain, 250 - struct device *dev) 250 + struct device *dev, 251 + struct iommu_domain *old) 251 252 { 252 253 struct sprd_iommu_device *sdev = dev_iommu_priv_get(dev); 253 254 struct sprd_iommu_domain *dom = to_sprd_domain(domain);
+7 -3
drivers/iommu/sun50i-iommu.c
··· 771 771 } 772 772 773 773 static int sun50i_iommu_identity_attach(struct iommu_domain *identity_domain, 774 - struct device *dev) 774 + struct device *dev, 775 + struct iommu_domain *old) 775 776 { 776 777 struct sun50i_iommu *iommu = dev_iommu_priv_get(dev); 777 778 struct sun50i_iommu_domain *sun50i_domain; ··· 798 797 }; 799 798 800 799 static int sun50i_iommu_attach_device(struct iommu_domain *domain, 801 - struct device *dev) 800 + struct device *dev, 801 + struct iommu_domain *old) 802 802 { 803 803 struct sun50i_iommu_domain *sun50i_domain = to_sun50i_domain(domain); 804 804 struct sun50i_iommu *iommu; ··· 815 813 if (iommu->domain == domain) 816 814 return 0; 817 815 818 - sun50i_iommu_identity_attach(&sun50i_iommu_identity_domain, dev); 816 + sun50i_iommu_identity_attach(&sun50i_iommu_identity_domain, dev, old); 819 817 820 818 sun50i_iommu_attach_domain(iommu, sun50i_domain); 821 819 ··· 840 838 unsigned id = args->args[0]; 841 839 842 840 dev_iommu_priv_set(dev, platform_get_drvdata(iommu_pdev)); 841 + 842 + put_device(&iommu_pdev->dev); 843 843 844 844 return iommu_fwspec_add_ids(dev, &id, 1); 845 845 }
+7 -8
drivers/iommu/tegra-smmu.c
··· 490 490 } 491 491 492 492 static int tegra_smmu_attach_dev(struct iommu_domain *domain, 493 - struct device *dev) 493 + struct device *dev, struct iommu_domain *old) 494 494 { 495 495 struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); 496 496 struct tegra_smmu *smmu = dev_iommu_priv_get(dev); ··· 524 524 } 525 525 526 526 static int tegra_smmu_identity_attach(struct iommu_domain *identity_domain, 527 - struct device *dev) 527 + struct device *dev, 528 + struct iommu_domain *old) 528 529 { 529 - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); 530 530 struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); 531 531 struct tegra_smmu_as *as; 532 532 struct tegra_smmu *smmu; ··· 535 535 if (!fwspec) 536 536 return -ENODEV; 537 537 538 - if (domain == identity_domain || !domain) 538 + if (old == identity_domain || !old) 539 539 return 0; 540 540 541 - as = to_smmu_as(domain); 541 + as = to_smmu_as(old); 542 542 smmu = as->smmu; 543 543 for (index = 0; index < fwspec->num_ids; index++) { 544 544 tegra_smmu_disable(smmu, fwspec->ids[index], as->id); ··· 830 830 return NULL; 831 831 832 832 mc = platform_get_drvdata(pdev); 833 - if (!mc) { 834 - put_device(&pdev->dev); 833 + put_device(&pdev->dev); 834 + if (!mc) 835 835 return NULL; 836 - } 837 836 838 837 return mc->smmu; 839 838 }
+4 -2
drivers/iommu/virtio-iommu.c
··· 730 730 return domain; 731 731 } 732 732 733 - static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev) 733 + static int viommu_attach_dev(struct iommu_domain *domain, struct device *dev, 734 + struct iommu_domain *old) 734 735 { 735 736 int ret = 0; 736 737 struct virtio_iommu_req_attach req; ··· 782 781 } 783 782 784 783 static int viommu_attach_identity_domain(struct iommu_domain *domain, 785 - struct device *dev) 784 + struct device *dev, 785 + struct iommu_domain *old) 786 786 { 787 787 int ret = 0; 788 788 struct virtio_iommu_req_attach req;
+283
include/dt-bindings/memory/mediatek,mt8189-memory-port.h
··· 1 + /* SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) */ 2 + /* 3 + * Copyright (c) 2025 MediaTek Inc. 4 + * Author: Zhengnan chen <zhengnan.chen@mediatek.com> 5 + */ 6 + #ifndef _DT_BINDINGS_MEMORY_MEDIATEK_MT8189_MEMORY_PORT_H_ 7 + #define _DT_BINDINGS_MEMORY_MEDIATEK_MT8189_MEMORY_PORT_H_ 8 + 9 + #include <dt-bindings/memory/mtk-memory-port.h> 10 + 11 + #define SMI_L0_ID (0) 12 + #define SMI_L1_ID (1) 13 + #define SMI_L2_ID (2) 14 + #define SMI_L4_ID (3) 15 + #define SMI_L7_ID (4) 16 + #define SMI_L9_ID (5) 17 + #define SMI_L11_ID (6) 18 + #define SMI_L13_ID (7) 19 + #define SMI_L14_ID (8) 20 + #define SMI_L16_ID (9) 21 + #define SMI_L17_ID (10) 22 + #define SMI_L19_ID (11) 23 + #define SMI_L20_ID (12) 24 + 25 + /* 26 + * MM IOMMU supports 16GB dma address. We separate it to four ranges: 27 + * 0 ~ 4G; 4G ~ 8G; 8G ~ 12G; 12G ~ 16G, we could adjust these masters 28 + * locate in anyone region. BUT: 29 + * a) Make sure all the ports inside a larb are in one range. 30 + * b) The iova of any master can NOT cross the 4G/8G/12G boundary. 31 + * 32 + * This is the suggested mapping in this SoC: 33 + * 34 + * modules dma-address-region larbs-ports 35 + * disp/mdp 0 ~ 4G larb0/1/2 36 + * vcodec 4G ~ 8G larb4/7 37 + * imgsys/cam/ipesys 8G ~ 12G the other larbs. 
38 + * N/A 12G ~ 16G 39 + */ 40 + 41 + /* Larb0 -- disp */ 42 + #define M4U_L0_P0_DISP_OVL0_4L_HDR MTK_M4U_ID(SMI_L0_ID, 0) 43 + #define M4U_L0_P1_DISP_OVL0_4L_RDMA0 MTK_M4U_ID(SMI_L0_ID, 1) 44 + #define M4U_L0_P2_DISP_OVL1_4L_RDMA1 MTK_M4U_ID(SMI_L0_ID, 2) 45 + #define M4U_L0_P3_DISP_OVL0_4L_RDMA2 MTK_M4U_ID(SMI_L0_ID, 3) 46 + #define M4U_L0_P4_DISP_OVL1_4L_RDMA3 MTK_M4U_ID(SMI_L0_ID, 4) 47 + #define M4U_L0_P5_DISP_RDMA0 MTK_M4U_ID(SMI_L0_ID, 5) 48 + #define M4U_L0_P6_DISP_WDMA0 MTK_M4U_ID(SMI_L0_ID, 6) 49 + #define M4U_L0_P7_DISP_FAKE_ENG0 MTK_M4U_ID(SMI_L0_ID, 7) 50 + 51 + /* Larb1 -- disp */ 52 + #define M4U_L1_P0_DISP_OVL1_4L_HDR MTK_M4U_ID(SMI_L1_ID, 0) 53 + #define M4U_L1_P1_DISP_OVL1_4L_RDMA0 MTK_M4U_ID(SMI_L1_ID, 1) 54 + #define M4U_L1_P2_DISP_OVL0_4L_RDMA1 MTK_M4U_ID(SMI_L1_ID, 2) 55 + #define M4U_L1_P3_DISP_OVL1_4L_RDMA2 MTK_M4U_ID(SMI_L1_ID, 3) 56 + #define M4U_L1_P4_DISP_OVL0_4L_RDMA3 MTK_M4U_ID(SMI_L1_ID, 4) 57 + #define M4U_L1_P5_DISP_RDMA1 MTK_M4U_ID(SMI_L1_ID, 5) 58 + #define M4U_L1_P6_DISP_WDMA1 MTK_M4U_ID(SMI_L1_ID, 6) 59 + #define M4U_L1_P7_DISP_FAKE_ENG1 MTK_M4U_ID(SMI_L1_ID, 7) 60 + 61 + /* Larb2 -- mmlsys(mdp) */ 62 + #define M4U_L2_P0_MDP_RDMA0 MTK_M4U_ID(SMI_L2_ID, 0) 63 + #define M4U_L2_P1_MDP_RDMA1 MTK_M4U_ID(SMI_L2_ID, 1) 64 + #define M4U_L2_P2_MDP_WROT0 MTK_M4U_ID(SMI_L2_ID, 2) 65 + #define M4U_L2_P3_MDP_WROT1 MTK_M4U_ID(SMI_L2_ID, 3) 66 + #define M4U_L2_P4_MDP_DUMMY0 MTK_M4U_ID(SMI_L2_ID, 4) 67 + #define M4U_L2_P5_MDP_DUMMY1 MTK_M4U_ID(SMI_L2_ID, 5) 68 + #define M4U_L2_P6_MDP_RDMA2 MTK_M4U_ID(SMI_L2_ID, 6) 69 + #define M4U_L2_P7_MDP_RDMA3 MTK_M4U_ID(SMI_L2_ID, 7) 70 + #define M4U_L2_P8_MDP_WROT2 MTK_M4U_ID(SMI_L2_ID, 8) 71 + #define M4U_L2_P9_MDP_WROT3 MTK_M4U_ID(SMI_L2_ID, 9) 72 + #define M4U_L2_P10_DISP_FAKE0 MTK_M4U_ID(SMI_L2_ID, 10) 73 + 74 + /* Larb3: null */ 75 + 76 + /* Larb4 -- vdec */ 77 + #define M4U_L4_P0_HW_VDEC_MC_EXT MTK_M4U_ID(SMI_L4_ID, 0) 78 + #define M4U_L4_P1_HW_VDEC_UFO_EXT MTK_M4U_ID(SMI_L4_ID, 1) 79 + #define 
M4U_L4_P2_HW_VDEC_PP_EXT MTK_M4U_ID(SMI_L4_ID, 2) 80 + #define M4U_L4_P3_HW_VDEC_PRED_RD_EXT MTK_M4U_ID(SMI_L4_ID, 3) 81 + #define M4U_L4_P4_HW_VDEC_PRED_WR_EXT MTK_M4U_ID(SMI_L4_ID, 4) 82 + #define M4U_L4_P5_HW_VDEC_PPWRAP_EXT MTK_M4U_ID(SMI_L4_ID, 5) 83 + #define M4U_L4_P6_HW_VDEC_TILE_EXT MTK_M4U_ID(SMI_L4_ID, 6) 84 + #define M4U_L4_P7_HW_VDEC_VLD_EXT MTK_M4U_ID(SMI_L4_ID, 7) 85 + #define M4U_L4_P8_HW_VDEC_VLD2_EXT MTK_M4U_ID(SMI_L4_ID, 8) 86 + #define M4U_L4_P9_HW_VDEC_AVC_MV_EXT MTK_M4U_ID(SMI_L4_ID, 9) 87 + #define M4U_L4_P10_HW_VDEC_RG_CTRL_DMA_EXT MTK_M4U_ID(SMI_L4_ID, 10) 88 + #define M4U_L4_P11_HW_VDEC_UFO_ENC_EXT MTK_M4U_ID(SMI_L4_ID, 11) 89 + 90 + /* Larb5: null */ 91 + 92 + /* Larb6: null */ 93 + 94 + /* Larb7 -- venc */ 95 + #define M4U_L7_P0_VENC_RCPU MTK_M4U_ID(SMI_L7_ID, 0) 96 + #define M4U_L7_P1_VENC_REC MTK_M4U_ID(SMI_L7_ID, 1) 97 + #define M4U_L7_P2_VENC_BSDMA MTK_M4U_ID(SMI_L7_ID, 2) 98 + #define M4U_L7_P3_VENC_SV_COMV MTK_M4U_ID(SMI_L7_ID, 3) 99 + #define M4U_L7_P4_VENC_RD_COMV MTK_M4U_ID(SMI_L7_ID, 4) 100 + #define M4U_L7_P5_JPGENC_Y_RDMA MTK_M4U_ID(SMI_L7_ID, 5) 101 + #define M4U_L7_P6_JPGENC_C_RDMA MTK_M4U_ID(SMI_L7_ID, 6) 102 + #define M4U_L7_P7_JPGENC_Q_RDMA MTK_M4U_ID(SMI_L7_ID, 7) 103 + #define M4U_L7_P8_VENC_SUB_W_LUMA MTK_M4U_ID(SMI_L7_ID, 8) 104 + #define M4U_L7_P9_JPGENC_BSDMA MTK_M4U_ID(SMI_L7_ID, 9) 105 + #define M4U_L7_P10_VENC_CUR_LUMA MTK_M4U_ID(SMI_L7_ID, 10) 106 + #define M4U_L7_P11_VENC_CUR_CHROMA MTK_M4U_ID(SMI_L7_ID, 11) 107 + #define M4U_L7_P12_VENC_REF_LUMA MTK_M4U_ID(SMI_L7_ID, 12) 108 + #define M4U_L7_P13_VENC_REF_CHROMA MTK_M4U_ID(SMI_L7_ID, 13) 109 + #define M4U_L7_P14_VENC_SUB_R_LUMA MTK_M4U_ID(SMI_L7_ID, 14) 110 + #define M4U_L7_P15_JPGDEC_WDMA MTK_M4U_ID(SMI_L7_ID, 15) 111 + #define M4U_L7_P16_JPGDEC_BSDMA MTK_M4U_ID(SMI_L7_ID, 16) 112 + #define M4U_L7_P17_JPGDEC_HUFF_OFFSET MTK_M4U_ID(SMI_L7_ID, 17) 113 + 114 + /* Larb8: null */ 115 + 116 + /* Larb9 --imgsys */ 117 + #define M4U_L9_P0_IMGI_D1 
MTK_M4U_ID(SMI_L9_ID, 0) 118 + #define M4U_L9_P1_IMGBI_D1 MTK_M4U_ID(SMI_L9_ID, 1) 119 + #define M4U_L9_P2_DMGI_D1 MTK_M4U_ID(SMI_L9_ID, 2) 120 + #define M4U_L9_P3_DEPI_D1 MTK_M4U_ID(SMI_L9_ID, 3) 121 + #define M4U_L9_P4_LCE_D1 MTK_M4U_ID(SMI_L9_ID, 4) 122 + #define M4U_L9_P5_SMTI_D1 MTK_M4U_ID(SMI_L9_ID, 5) 123 + #define M4U_L9_P6_SMTO_D2 MTK_M4U_ID(SMI_L9_ID, 6) 124 + #define M4U_L9_P7_SMTO_D1 MTK_M4U_ID(SMI_L9_ID, 7) 125 + #define M4U_L9_P8_CRZO_D1 MTK_M4U_ID(SMI_L9_ID, 8) 126 + #define M4U_L9_P9_IMG3O_D1 MTK_M4U_ID(SMI_L9_ID, 9) 127 + #define M4U_L9_P10_VIPI_D1 MTK_M4U_ID(SMI_L9_ID, 10) 128 + #define M4U_L9_P11_SMTI_D5 MTK_M4U_ID(SMI_L9_ID, 11) 129 + #define M4U_L9_P12_TIMGO_D1 MTK_M4U_ID(SMI_L9_ID, 12) 130 + #define M4U_L9_P13_UFBC_W0 MTK_M4U_ID(SMI_L9_ID, 13) 131 + #define M4U_L9_P14_UFBC_R0 MTK_M4U_ID(SMI_L9_ID, 14) 132 + #define M4U_L9_P15_WPE_RDMA1 MTK_M4U_ID(SMI_L9_ID, 15) 133 + #define M4U_L9_P16_WPE_RDMA0 MTK_M4U_ID(SMI_L9_ID, 16) 134 + #define M4U_L9_P17_WPE_WDMA MTK_M4U_ID(SMI_L9_ID, 17) 135 + #define M4U_L9_P18_MFB_RDMA0 MTK_M4U_ID(SMI_L9_ID, 18) 136 + #define M4U_L9_P19_MFB_RDMA1 MTK_M4U_ID(SMI_L9_ID, 19) 137 + #define M4U_L9_P20_MFB_RDMA2 MTK_M4U_ID(SMI_L9_ID, 20) 138 + #define M4U_L9_P21_MFB_RDMA3 MTK_M4U_ID(SMI_L9_ID, 21) 139 + #define M4U_L9_P22_MFB_RDMA4 MTK_M4U_ID(SMI_L9_ID, 22) 140 + #define M4U_L9_P23_MFB_RDMA5 MTK_M4U_ID(SMI_L9_ID, 23) 141 + #define M4U_L9_P24_MFB_WDMA0 MTK_M4U_ID(SMI_L9_ID, 24) 142 + #define M4U_L9_P25_MFB_WDMA1 MTK_M4U_ID(SMI_L9_ID, 25) 143 + #define M4U_L9_P26_RESERVE6 MTK_M4U_ID(SMI_L9_ID, 26) 144 + #define M4U_L9_P27_RESERVE7 MTK_M4U_ID(SMI_L9_ID, 27) 145 + #define M4U_L9_P28_RESERVE8 MTK_M4U_ID(SMI_L9_ID, 28) 146 + 147 + /* Larb10: null */ 148 + 149 + /* Larb11 -- imgsys */ 150 + #define M4U_L11_P0_IMGI_D1 MTK_M4U_ID(SMI_L11_ID, 0) 151 + #define M4U_L11_P1_IMGBI_D1 MTK_M4U_ID(SMI_L11_ID, 1) 152 + #define M4U_L11_P2_DMGI_D1 MTK_M4U_ID(SMI_L11_ID, 2) 153 + #define M4U_L11_P3_DEPI_D1 MTK_M4U_ID(SMI_L11_ID, 3) 154 + 
#define M4U_L11_P4_LCE_D1 MTK_M4U_ID(SMI_L11_ID, 4) 155 + #define M4U_L11_P5_SMTI_D1 MTK_M4U_ID(SMI_L11_ID, 5) 156 + #define M4U_L11_P6_SMTO_D2 MTK_M4U_ID(SMI_L11_ID, 6) 157 + #define M4U_L11_P7_SMTO_D1 MTK_M4U_ID(SMI_L11_ID, 7) 158 + #define M4U_L11_P8_CRZO_D1 MTK_M4U_ID(SMI_L11_ID, 8) 159 + #define M4U_L11_P9_IMG3O_D1 MTK_M4U_ID(SMI_L11_ID, 9) 160 + #define M4U_L11_P10_VIPI_D1 MTK_M4U_ID(SMI_L11_ID, 10) 161 + #define M4U_L11_P11_SMTI_D5 MTK_M4U_ID(SMI_L11_ID, 11) 162 + #define M4U_L11_P12_TIMGO_D1 MTK_M4U_ID(SMI_L11_ID, 12) 163 + #define M4U_L11_P13_UFBC_W0 MTK_M4U_ID(SMI_L11_ID, 13) 164 + #define M4U_L11_P14_UFBC_R0 MTK_M4U_ID(SMI_L11_ID, 14) 165 + #define M4U_L11_P15_WPE_RDMA1 MTK_M4U_ID(SMI_L11_ID, 15) 166 + #define M4U_L11_P16_WPE_RDMA0 MTK_M4U_ID(SMI_L11_ID, 16) 167 + #define M4U_L11_P17_WPE_WDMA MTK_M4U_ID(SMI_L11_ID, 17) 168 + #define M4U_L11_P18_MFB_RDMA0 MTK_M4U_ID(SMI_L11_ID, 18) 169 + #define M4U_L11_P19_MFB_RDMA1 MTK_M4U_ID(SMI_L11_ID, 19) 170 + #define M4U_L11_P20_MFB_RDMA2 MTK_M4U_ID(SMI_L11_ID, 20) 171 + #define M4U_L11_P21_MFB_RDMA3 MTK_M4U_ID(SMI_L11_ID, 21) 172 + #define M4U_L11_P22_MFB_RDMA4 MTK_M4U_ID(SMI_L11_ID, 22) 173 + #define M4U_L11_P23_MFB_RDMA5 MTK_M4U_ID(SMI_L11_ID, 23) 174 + #define M4U_L11_P24_MFB_WDMA0 MTK_M4U_ID(SMI_L11_ID, 24) 175 + #define M4U_L11_P25_MFB_WDMA1 MTK_M4U_ID(SMI_L11_ID, 25) 176 + #define M4U_L11_P26_RESERVE6 MTK_M4U_ID(SMI_L11_ID, 26) 177 + #define M4U_L11_P27_RESERVE7 MTK_M4U_ID(SMI_L11_ID, 27) 178 + #define M4U_L11_P28_RESERVE8 MTK_M4U_ID(SMI_L11_ID, 28) 179 + 180 + /* Larb12: null */ 181 + 182 + /* Larb13 -- cam */ 183 + #define M4U_L13_P0_MRAWI MTK_M4U_ID(SMI_L13_ID, 0) 184 + #define M4U_L13_P1_MRAWO_0 MTK_M4U_ID(SMI_L13_ID, 1) 185 + #define M4U_L13_P2_MRAWO_1 MTK_M4U_ID(SMI_L13_ID, 2) 186 + #define M4U_L13_P3_CAMSV_1 MTK_M4U_ID(SMI_L13_ID, 3) 187 + #define M4U_L13_P4_CAMSV_2 MTK_M4U_ID(SMI_L13_ID, 4) 188 + #define M4U_L13_P5_CAMSV_3 MTK_M4U_ID(SMI_L13_ID, 5) 189 + #define M4U_L13_P6_CAMSV_4 
MTK_M4U_ID(SMI_L13_ID, 6) 190 + #define M4U_L13_P7_CAMSV_5 MTK_M4U_ID(SMI_L13_ID, 7) 191 + #define M4U_L13_P8_CAMSV_6 MTK_M4U_ID(SMI_L13_ID, 8) 192 + #define M4U_L13_P9_CCUI MTK_M4U_ID(SMI_L13_ID, 9) 193 + #define M4U_L13_P10_CCUO MTK_M4U_ID(SMI_L13_ID, 10) 194 + #define M4U_L13_P11_FAKE MTK_M4U_ID(SMI_L13_ID, 11) 195 + #define M4U_L13_P12_PDAI_0 MTK_M4U_ID(SMI_L13_ID, 12) 196 + #define M4U_L13_P13_PDAI_1 MTK_M4U_ID(SMI_L13_ID, 13) 197 + #define M4U_L13_P14_PDAO MTK_M4U_ID(SMI_L13_ID, 14) 198 + 199 + /* Larb14 -- cam */ 200 + #define M4U_L14_P0_RESERVE MTK_M4U_ID(SMI_L14_ID, 0) 201 + #define M4U_L14_P1_RESERVE MTK_M4U_ID(SMI_L14_ID, 1) 202 + #define M4U_L14_P2_RESERVE MTK_M4U_ID(SMI_L14_ID, 2) 203 + #define M4U_L14_P3_CAMSV_0 MTK_M4U_ID(SMI_L14_ID, 3) 204 + #define M4U_L14_P4_CCUI MTK_M4U_ID(SMI_L14_ID, 4) 205 + #define M4U_L14_P5_CCUO MTK_M4U_ID(SMI_L14_ID, 5) 206 + #define M4U_L14_P6_CAMSV_7 MTK_M4U_ID(SMI_L14_ID, 6) 207 + #define M4U_L14_P7_CAMSV_8 MTK_M4U_ID(SMI_L14_ID, 7) 208 + #define M4U_L14_P8_CAMSV_9 MTK_M4U_ID(SMI_L14_ID, 8) 209 + #define M4U_L14_P9_CAMSV_10 MTK_M4U_ID(SMI_L14_ID, 9) 210 + 211 + /* Larb15: null */ 212 + 213 + /* Larb16 -- cam */ 214 + #define M4U_L16_P0_IMGO_R1_A MTK_M4U_ID(SMI_L16_ID, 0) 215 + #define M4U_L16_P1_RRZO_R1_A MTK_M4U_ID(SMI_L16_ID, 1) 216 + #define M4U_L16_P2_CQI_R1_A MTK_M4U_ID(SMI_L16_ID, 2) 217 + #define M4U_L16_P3_BPCI_R1_A MTK_M4U_ID(SMI_L16_ID, 3) 218 + #define M4U_L16_P4_YUVO_R1_A MTK_M4U_ID(SMI_L16_ID, 4) 219 + #define M4U_L16_P5_UFDI_R2_A MTK_M4U_ID(SMI_L16_ID, 5) 220 + #define M4U_L16_P6_RAWI_R2_A MTK_M4U_ID(SMI_L16_ID, 6) 221 + #define M4U_L16_P7_RAWI_R3_A MTK_M4U_ID(SMI_L16_ID, 7) 222 + #define M4U_L16_P8_AAO_R1_A MTK_M4U_ID(SMI_L16_ID, 8) 223 + #define M4U_L16_P9_AFO_R1_A MTK_M4U_ID(SMI_L16_ID, 9) 224 + #define M4U_L16_P10_FLKO_R1_A MTK_M4U_ID(SMI_L16_ID, 10) 225 + #define M4U_L16_P11_LCESO_R1_A MTK_M4U_ID(SMI_L16_ID, 11) 226 + #define M4U_L16_P12_CRZO_R1_A MTK_M4U_ID(SMI_L16_ID, 12) 227 + #define 
M4U_L16_P13_LTMSO_R1_A MTK_M4U_ID(SMI_L16_ID, 13) 228 + #define M4U_L16_P14_RSSO_R1_A MTK_M4U_ID(SMI_L16_ID, 14) 229 + #define M4U_L16_P15_AAHO_R1_A MTK_M4U_ID(SMI_L16_ID, 15) 230 + #define M4U_L16_P16_LSCI_R1_A MTK_M4U_ID(SMI_L16_ID, 16) 231 + 232 + /* Larb17 -- cam */ 233 + #define M4U_L17_P0_IMGO_R1_B MTK_M4U_ID(SMI_L17_ID, 0) 234 + #define M4U_L17_P1_RRZO_R1_B MTK_M4U_ID(SMI_L17_ID, 1) 235 + #define M4U_L17_P2_CQI_R1_B MTK_M4U_ID(SMI_L17_ID, 2) 236 + #define M4U_L17_P3_BPCI_R1_B MTK_M4U_ID(SMI_L17_ID, 3) 237 + #define M4U_L17_P4_YUVO_R1_B MTK_M4U_ID(SMI_L17_ID, 4) 238 + #define M4U_L17_P5_UFDI_R2_B MTK_M4U_ID(SMI_L17_ID, 5) 239 + #define M4U_L17_P6_RAWI_R2_B MTK_M4U_ID(SMI_L17_ID, 6) 240 + #define M4U_L17_P7_RAWI_R3_B MTK_M4U_ID(SMI_L17_ID, 7) 241 + #define M4U_L17_P8_AAO_R1_B MTK_M4U_ID(SMI_L17_ID, 8) 242 + #define M4U_L17_P9_AFO_R1_B MTK_M4U_ID(SMI_L17_ID, 9) 243 + #define M4U_L17_P10_FLKO_R1_B MTK_M4U_ID(SMI_L17_ID, 10) 244 + #define M4U_L17_P11_LCESO_R1_B MTK_M4U_ID(SMI_L17_ID, 11) 245 + #define M4U_L17_P12_CRZO_R1_B MTK_M4U_ID(SMI_L17_ID, 12) 246 + #define M4U_L17_P13_LTMSO_R1_B MTK_M4U_ID(SMI_L17_ID, 13) 247 + #define M4U_L17_P14_RSSO_R1_B MTK_M4U_ID(SMI_L17_ID, 14) 248 + #define M4U_L17_P15_AAHO_R1_B MTK_M4U_ID(SMI_L17_ID, 15) 249 + #define M4U_L17_P16_LSCI_R1_B MTK_M4U_ID(SMI_L17_ID, 16) 250 + 251 + /* Larb19 -- ipesys */ 252 + #define M4U_L19_P0_DVS_RDMA MTK_M4U_ID(SMI_L19_ID, 0) 253 + #define M4U_L19_P1_DVS_WDMA MTK_M4U_ID(SMI_L19_ID, 1) 254 + #define M4U_L19_P2_DVP_RDMA MTK_M4U_ID(SMI_L19_ID, 2) 255 + #define M4U_L19_P3_DVP_WDMA MTK_M4U_ID(SMI_L19_ID, 3) 256 + 257 + /* Larb20 -- ipesys */ 258 + #define M4U_L20_P0_FDVT_RDA_0 MTK_M4U_ID(SMI_L20_ID, 0) 259 + #define M4U_L20_P1_FDVT_RDB_0 MTK_M4U_ID(SMI_L20_ID, 1) 260 + #define M4U_L20_P2_FDVT_WRA_0 MTK_M4U_ID(SMI_L20_ID, 2) 261 + #define M4U_L20_P3_FDVT_WRB_0 MTK_M4U_ID(SMI_L20_ID, 3) 262 + #define M4U_L20_P4_RSC_RDMA MTK_M4U_ID(SMI_L20_ID, 4) 263 + #define M4U_L20_P5_RSC_WDMA MTK_M4U_ID(SMI_L20_ID, 5) 
264 + 265 + /* fake larb21 for gce */ 266 + #define M4U_L21_GCE_DM MTK_M4U_ID(21, 0) 267 + #define M4U_L21_GCE_MM MTK_M4U_ID(21, 1) 268 + 269 + /* fake larb & port for svp and dual svp and wfd */ 270 + #define M4U_PORT_SVP_HEAP MTK_M4U_ID(22, 0) 271 + #define M4U_PORT_DUAL_SVP_HEAP MTK_M4U_ID(22, 1) 272 + #define M4U_PORT_WFD_HEAP MTK_M4U_ID(22, 2) 273 + 274 + /* fake larb0 for apu */ 275 + #define M4U_L0_APU_DATA MTK_M4U_ID(0, 0) 276 + #define M4U_L0_APU_CODE MTK_M4U_ID(0, 1) 277 + #define M4U_L0_APU_SECURE MTK_M4U_ID(0, 2) 278 + #define M4U_L0_APU_VLM MTK_M4U_ID(0, 3) 279 + 280 + /* infra/peri */ 281 + #define IFR_IOMMU_PORT_PCIE_0 MTK_IFAIOMMU_PERI_ID(0, 26) 282 + 283 + #endif
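The `MTK_M4U_ID()` values throughout this header pack a larb index above a 5-bit port index; the real macros live in `dt-bindings/memory/mtk-memory-port.h`. Reproducing the shift/mask shape here lets the round trip be checked — the `SK_` prefix marks these as sketch copies, not the binding's names:

```c
#include <assert.h>

/* Same shape as MTK_M4U_ID()/MTK_M4U_TO_LARB()/MTK_M4U_TO_PORT():
 * port occupies the low 5 bits, larb the bits above it. */
#define SK_M4U_ID(larb, port)	(((larb) << 5) | (port))
#define SK_M4U_TO_LARB(id)	(((id) >> 5) & 0x1f)
#define SK_M4U_TO_PORT(id)	((id) & 0x1f)
```

For example, `M4U_L7_P17_JPGDEC_HUFF_OFFSET` above is larb index `SMI_L7_ID` (4) with port 17, and the 5-bit port field is why each larb tops out at 32 ports.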
+191
include/linux/generic_pt/common.h
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#ifndef __GENERIC_PT_COMMON_H
+#define __GENERIC_PT_COMMON_H
+
+#include <linux/types.h>
+#include <linux/build_bug.h>
+#include <linux/bits.h>
+
+/**
+ * DOC: Generic Radix Page Table
+ *
+ * Generic Radix Page Table is a set of functions and helpers to efficiently
+ * parse radix style page tables typically seen in HW implementations. The
+ * interface is built to deliver similar code generation as the mm's pte/pmd/etc
+ * system by fully inlining the exact code required to handle each table level.
+ *
+ * Like the mm subsystem, each format contributes its parsing implementation
+ * under common names and the common code implements the required algorithms.
+ *
+ * The system is divided into three logical levels:
+ *
+ * - The page table format and its manipulation functions
+ * - Generic helpers to give a consistent API regardless of underlying format
+ * - An algorithm implementation (e.g. IOMMU/DRM/KVM/MM)
+ *
+ * Multiple implementations are supported. The intention is to have the generic
+ * format code be re-usable for whatever specialized implementation is required.
+ * The generic code is solely about the format of the radix tree; it does not
+ * include memory allocation or higher level decisions, which are left to the
+ * implementation.
+ *
+ * The generic framework supports a superset of functions across many HW
+ * implementations:
+ *
+ * - Entries comprised of contiguous blocks of IO PTEs for larger page sizes
+ * - Multi-level tables, up to 6 levels. Runtime selected top level
+ * - Runtime variable table level size (ARM's concatenated tables)
+ * - Expandable top level allowing dynamic sizing of table levels
+ * - Optional leaf entries at any level
+ * - 32-bit/64-bit virtual and output addresses, using every address bit
+ * - Dirty tracking
+ * - Sign extended addressing
+ */
+
+/**
+ * struct pt_common - struct for all page table implementations
+ */
+struct pt_common {
+	/**
+	 * @top_of_table: Encodes the table top pointer and the top level in a
+	 * single value. Must use READ_ONCE/WRITE_ONCE to access it. The lower
+	 * bits of the aligned table pointer are used for the level.
+	 */
+	uintptr_t top_of_table;
+	/**
+	 * @max_oasz_lg2: Maximum number of bits the OA can contain. Upper bits
+	 * must be zero. This may be less than what the page table format
+	 * supports, but must not be more.
+	 */
+	u8 max_oasz_lg2;
+	/**
+	 * @max_vasz_lg2: Maximum number of bits the VA can contain. Upper bits
+	 * are 0 or 1 depending on pt_full_va_prefix(). This may be less than
+	 * what the page table format supports, but must not be more. When
+	 * PT_FEAT_DYNAMIC_TOP is set this reflects the maximum VA capability.
+	 */
+	u8 max_vasz_lg2;
+	/**
+	 * @features: Bitmap of `enum pt_features`
+	 */
+	unsigned int features;
+};
+
+/* Encoding parameters for top_of_table */
+enum {
+	PT_TOP_LEVEL_BITS = 3,
+	PT_TOP_LEVEL_MASK = GENMASK(PT_TOP_LEVEL_BITS - 1, 0),
+};
+
+/**
+ * enum pt_features - Features turned on in the table. Each symbol is a bit
+ * position.
+ */
+enum pt_features {
+	/**
+	 * @PT_FEAT_DMA_INCOHERENT: Cache flush page table memory before
+	 * assuming the HW can read it. Otherwise an SMP release is sufficient
+	 * for HW to read it.
+	 */
+	PT_FEAT_DMA_INCOHERENT,
+	/**
+	 * @PT_FEAT_FULL_VA: The table can span the full VA range from 0 to
+	 * PT_VADDR_MAX.
+	 */
+	PT_FEAT_FULL_VA,
+	/**
+	 * @PT_FEAT_DYNAMIC_TOP: The table's top level can be increased
+	 * dynamically during map. This requires HW support for atomically
+	 * setting both the table top pointer and the starting table level.
+	 */
+	PT_FEAT_DYNAMIC_TOP,
+	/**
+	 * @PT_FEAT_SIGN_EXTEND: The top most bit of the valid VA range sign
+	 * extends up to the full pt_vaddr_t. This divides the page table into
+	 * three VA ranges::
+	 *
+	 *   0 -> 2^N - 1              Lower
+	 *   2^N -> (MAX - 2^N - 1)    Non-Canonical
+	 *   MAX - 2^N -> MAX          Upper
+	 *
+	 * In this mode pt_common::max_vasz_lg2 includes the sign bit and the
+	 * upper bits that don't fall within the translation are just validated.
+	 *
+	 * If not set there is no sign extension and the valid VA goes from 0
+	 * to 2^N - 1.
+	 */
+	PT_FEAT_SIGN_EXTEND,
+	/**
+	 * @PT_FEAT_FLUSH_RANGE: IOTLB maintenance is done by flushing IOVA
+	 * ranges which will clean out any walk cache or any IOPTE fully
+	 * contained by the range. The optimization objective is to minimize
+	 * the number of flushes, even if ranges include IOVA gaps that do not
+	 * need to be flushed.
+	 */
+	PT_FEAT_FLUSH_RANGE,
+	/**
+	 * @PT_FEAT_FLUSH_RANGE_NO_GAPS: Like PT_FEAT_FLUSH_RANGE except that
+	 * the optimization objective is to only flush IOVA that has been
+	 * changed. This mode is suitable for cases like hypervisor shadowing,
+	 * where flushing unchanged ranges may cause the hypervisor to reparse
+	 * a significant amount of the page table.
+	 */
+	PT_FEAT_FLUSH_RANGE_NO_GAPS,
+	/* private: */
+	PT_FEAT_FMT_START,
+};
+
+struct pt_amdv1 {
+	struct pt_common common;
+};
+
+enum {
+	/*
+	 * The memory backing the tables is encrypted. Use __sme_set() to
+	 * adjust the page table pointers in the tree. This only works with
+	 * CONFIG_AMD_MEM_ENCRYPT.
+	 */
+	PT_FEAT_AMDV1_ENCRYPT_TABLES = PT_FEAT_FMT_START,
+	/*
+	 * The PTEs are set to prevent cache incoherent traffic, such as PCI
+	 * no-snoop. This is set either at creation time or before the first
+	 * map operation.
+	 */
+	PT_FEAT_AMDV1_FORCE_COHERENCE,
+};
+
+struct pt_vtdss {
+	struct pt_common common;
+};
+
+enum {
+	/*
+	 * The PTEs are set to prevent cache incoherent traffic, such as PCI
+	 * no-snoop. This is set either at creation time or before the first
+	 * map operation.
+	 */
+	PT_FEAT_VTDSS_FORCE_COHERENCE = PT_FEAT_FMT_START,
+	/*
+	 * Prevent creating read-only PTEs. Used to work around HW errata
+	 * ERRATA_772415_SPR17.
+	 */
+	PT_FEAT_VTDSS_FORCE_WRITEABLE,
+};
+
+struct pt_x86_64 {
+	struct pt_common common;
+};
+
+enum {
+	/*
+	 * The memory backing the tables is encrypted. Use __sme_set() to
+	 * adjust the page table pointers in the tree. This only works with
+	 * CONFIG_AMD_MEM_ENCRYPT.
+	 */
+	PT_FEAT_X86_64_AMD_ENCRYPT_TABLES = PT_FEAT_FMT_START,
+};
+
+#endif
+293
include/linux/generic_pt/iommu.h
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
+ */
+#ifndef __GENERIC_PT_IOMMU_H
+#define __GENERIC_PT_IOMMU_H
+
+#include <linux/generic_pt/common.h>
+#include <linux/iommu.h>
+#include <linux/mm_types.h>
+
+struct iommu_iotlb_gather;
+struct pt_iommu_ops;
+struct pt_iommu_driver_ops;
+struct iommu_dirty_bitmap;
+
+/**
+ * DOC: IOMMU Radix Page Table
+ *
+ * The IOMMU implementation of the Generic Page Table provides an ops struct
+ * that is useful to go with an iommu_domain to serve the DMA API, IOMMUFD and
+ * the generic map/unmap interface.
+ *
+ * This interface uses a caller provided locking approach. The caller must have
+ * a VA range lock concept that prevents concurrent threads from calling ops on
+ * the same VA. Generally the range lock must be at least as large as a single
+ * map call.
+ */
+
+/**
+ * struct pt_iommu - Base structure for IOMMU page tables
+ *
+ * The format-specific struct will include this as the first member.
+ */
+struct pt_iommu {
+	/**
+	 * @domain: The core IOMMU domain. The driver should use a union to
+	 * overlay this memory with its previously existing domain struct to
+	 * create an alias.
+	 */
+	struct iommu_domain domain;
+
+	/**
+	 * @ops: Function pointers to access the API
+	 */
+	const struct pt_iommu_ops *ops;
+
+	/**
+	 * @driver_ops: Function pointers provided by the HW driver to help
+	 * manage HW details like caches.
+	 */
+	const struct pt_iommu_driver_ops *driver_ops;
+
+	/**
+	 * @nid: Node ID to use for table memory allocations. The IOMMU driver
+	 * may want to set the NID to the device's NID, if there are multiple
+	 * table walkers.
+	 */
+	int nid;
+
+	/**
+	 * @iommu_device: Device pointer used for any DMA cache flushing when
+	 * PT_FEAT_DMA_INCOHERENT. This is the iommu device that created the
+	 * page table, which must have dma ops that perform cache flushing.
+	 */
+	struct device *iommu_device;
+};
+
+/**
+ * struct pt_iommu_info - Details about the IOMMU page table
+ *
+ * Returned from pt_iommu_ops->get_info()
+ */
+struct pt_iommu_info {
+	/**
+	 * @pgsize_bitmap: A bitmask where each set bit indicates
+	 * a page size that can be natively stored in the page table.
+	 */
+	u64 pgsize_bitmap;
+};
+
+struct pt_iommu_ops {
+	/**
+	 * @set_dirty: Make the iova write dirty
+	 * @iommu_table: Table to manipulate
+	 * @iova: IO virtual address to start
+	 *
+	 * This is only used by iommufd testing. It makes the iova dirty so
+	 * that read_and_clear_dirty() will see it as dirty. Unlike all the
+	 * other ops this one is safe to call without holding any locking. It
+	 * may return -EAGAIN if there is a race.
+	 */
+	int (*set_dirty)(struct pt_iommu *iommu_table, dma_addr_t iova);
+
+	/**
+	 * @get_info: Return the pt_iommu_info structure
+	 * @iommu_table: Table to query
+	 *
+	 * Return some basic static information about the page table.
+	 */
+	void (*get_info)(struct pt_iommu *iommu_table,
+			 struct pt_iommu_info *info);
+
+	/**
+	 * @deinit: Undo a format specific init operation
+	 * @iommu_table: Table to destroy
+	 *
+	 * Release all of the memory. The caller must have already removed the
+	 * table from all HW access and all caches.
+	 */
+	void (*deinit)(struct pt_iommu *iommu_table);
+};
+
+/**
+ * struct pt_iommu_driver_ops - HW IOTLB cache flushing operations
+ *
+ * The IOMMU driver should implement these using container_of(iommu_table) to
+ * get to its iommu_domain derived structure. All ops can be called in atomic
+ * contexts as they are buried under DMA API calls.
+ */
+struct pt_iommu_driver_ops {
+	/**
+	 * @change_top: Update the top of table pointer
+	 * @iommu_table: Table to operate on
+	 * @top_paddr: New CPU physical address of the top pointer
+	 * @top_level: IOMMU PT level of the new top
+	 *
+	 * Called under the get_top_lock() spinlock. The driver must update
+	 * all HW references to this domain with a new top address and
+	 * configuration. On return mappings placed in the new top must be
+	 * reachable by the HW.
+	 *
+	 * top_level encodes the level in IOMMU PT format, level 0 is the
+	 * smallest page size, increasing from there. This has to be translated
+	 * to any HW specific format. During this call the new top will not be
+	 * visible to any other API.
+	 *
+	 * This op is only used by PT_FEAT_DYNAMIC_TOP, and is required if
+	 * enabled.
+	 */
+	void (*change_top)(struct pt_iommu *iommu_table, phys_addr_t top_paddr,
+			   unsigned int top_level);
+
+	/**
+	 * @get_top_lock: Lock to hold when changing the table top
+	 * @iommu_table: Table to operate on
+	 *
+	 * Return a lock to hold when changing the table top that is stored in
+	 * HW. The lock will be held prior to calling change_top() and released
+	 * once the new top is fully visible.
+	 *
+	 * Typically this would be a lock that protects the iommu_domain's
+	 * attachment list.
+	 *
+	 * This op is only used by PT_FEAT_DYNAMIC_TOP, and is required if
+	 * enabled.
+	 */
+	spinlock_t *(*get_top_lock)(struct pt_iommu *iommu_table);
+};
+
+static inline void pt_iommu_deinit(struct pt_iommu *iommu_table)
+{
+	/*
+	 * It is safe to call pt_iommu_deinit() before an init, or if init
+	 * fails. The ops pointer will only become non-NULL if deinit needs to
+	 * be run.
+	 */
+	if (iommu_table->ops)
+		iommu_table->ops->deinit(iommu_table);
+}
+
+/**
+ * struct pt_iommu_cfg - Common configuration values for all formats
+ */
+struct pt_iommu_cfg {
+	/**
+	 * @features: Features required. Only these features will be turned on.
+	 * The feature list should reflect what the IOMMU HW is capable of.
+	 */
+	unsigned int features;
+	/**
+	 * @hw_max_vasz_lg2: Maximum VA the IOMMU HW can support. This will
+	 * imply the top level of the table.
+	 */
+	u8 hw_max_vasz_lg2;
+	/**
+	 * @hw_max_oasz_lg2: Maximum OA the IOMMU HW can support. The format
+	 * might select a lower maximum OA.
+	 */
+	u8 hw_max_oasz_lg2;
+};
+
+/* Generate the exported function signatures from iommu_pt.h */
+#define IOMMU_PROTOTYPES(fmt)                                                  \
+	phys_addr_t pt_iommu_##fmt##_iova_to_phys(struct iommu_domain *domain, \
+						  dma_addr_t iova);            \
+	int pt_iommu_##fmt##_map_pages(struct iommu_domain *domain,            \
+				       unsigned long iova, phys_addr_t paddr,  \
+				       size_t pgsize, size_t pgcount,          \
+				       int prot, gfp_t gfp, size_t *mapped);   \
+	size_t pt_iommu_##fmt##_unmap_pages(                                   \
+		struct iommu_domain *domain, unsigned long iova,               \
+		size_t pgsize, size_t pgcount,                                 \
+		struct iommu_iotlb_gather *iotlb_gather);                      \
+	int pt_iommu_##fmt##_read_and_clear_dirty(                             \
+		struct iommu_domain *domain, unsigned long iova, size_t size,  \
+		unsigned long flags, struct iommu_dirty_bitmap *dirty);        \
+	int pt_iommu_##fmt##_init(struct pt_iommu_##fmt *table,                \
+				  const struct pt_iommu_##fmt##_cfg *cfg,      \
+				  gfp_t gfp);                                  \
+	void pt_iommu_##fmt##_hw_info(struct pt_iommu_##fmt *table,            \
+				      struct pt_iommu_##fmt##_hw_info *info)
+#define IOMMU_FORMAT(fmt, member)              \
+	struct pt_iommu_##fmt {                \
+		struct pt_iommu iommu;         \
+		struct pt_##fmt member;        \
+	};                                     \
+	IOMMU_PROTOTYPES(fmt)
+
+/*
+ * A driver uses IOMMU_PT_DOMAIN_OPS to populate the iommu_domain_ops for the
+ * iommu_pt.
+ */
+#define IOMMU_PT_DOMAIN_OPS(fmt)                        \
+	.iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \
+	.map_pages = &pt_iommu_##fmt##_map_pages,       \
+	.unmap_pages = &pt_iommu_##fmt##_unmap_pages
+#define IOMMU_PT_DIRTY_OPS(fmt) \
+	.read_and_clear_dirty = &pt_iommu_##fmt##_read_and_clear_dirty
+
+/*
+ * The driver should set up its domain struct like:
+ *	union {
+ *		struct iommu_domain domain;
+ *		struct pt_iommu_xxx xx;
+ *	};
+ *	PT_IOMMU_CHECK_DOMAIN(struct mock_iommu_domain, xx.iommu, domain);
+ *
+ * Which creates an alias between driver_domain.domain and
+ * driver_domain.xx.iommu.domain. This is to avoid a mass rename of existing
+ * driver_domain.domain users.
+ */
+#define PT_IOMMU_CHECK_DOMAIN(s, pt_iommu_memb, domain_memb)  \
+	static_assert(offsetof(s, pt_iommu_memb.domain) ==    \
+		      offsetof(s, domain_memb))
+
+struct pt_iommu_amdv1_cfg {
+	struct pt_iommu_cfg common;
+	unsigned int starting_level;
+};
+
+struct pt_iommu_amdv1_hw_info {
+	u64 host_pt_root;
+	u8 mode;
+};
+
+IOMMU_FORMAT(amdv1, amdpt);
+
+/* amdv1_mock is used by the iommufd selftest */
+#define pt_iommu_amdv1_mock pt_iommu_amdv1
+#define pt_iommu_amdv1_mock_cfg pt_iommu_amdv1_cfg
+struct pt_iommu_amdv1_mock_hw_info;
+IOMMU_PROTOTYPES(amdv1_mock);
+
+struct pt_iommu_vtdss_cfg {
+	struct pt_iommu_cfg common;
+	/* 4 is a 57 bit 5 level table */
+	unsigned int top_level;
+};
+
+struct pt_iommu_vtdss_hw_info {
+	u64 ssptptr;
+	u8 aw;
+};
+
+IOMMU_FORMAT(vtdss, vtdss_pt);
+
+struct pt_iommu_x86_64_cfg {
+	struct pt_iommu_cfg common;
+	/* 4 is a 57 bit 5 level table */
+	unsigned int top_level;
+};
+
+struct pt_iommu_x86_64_hw_info {
+	u64 gcr3_pt;
+	u8 levels;
+};
+
+IOMMU_FORMAT(x86_64, x86_64_pt);
+
+#undef IOMMU_PROTOTYPES
+#undef IOMMU_FORMAT
+#endif
-2
include/linux/io-pgtable.h
···
 	ARM_64_LPAE_S2,
 	ARM_V7S,
 	ARM_MALI_LPAE,
-	AMD_IOMMU_V1,
-	AMD_IOMMU_V2,
 	APPLE_DART,
 	APPLE_DART2,
 	IO_PGTABLE_NUM_FMTS,
+2 -1
include/linux/iommu.h
···
  * @free: Release the domain after use.
  */
 struct iommu_domain_ops {
-	int (*attach_dev)(struct iommu_domain *domain, struct device *dev);
+	int (*attach_dev)(struct iommu_domain *domain, struct device *dev,
+			  struct iommu_domain *old);
 	int (*set_dev_pasid)(struct iommu_domain *domain, struct device *dev,
 			     ioasid_t pasid, struct iommu_domain *old);
 
+1 -2
include/linux/irqchip/riscv-imsic.h
···
 #include <linux/bitops.h>
 #include <linux/device.h>
 #include <linux/fwnode.h>
-#include <asm/csr.h>
 
 #define IMSIC_MMIO_PAGE_SHIFT	12
 #define IMSIC_MMIO_PAGE_SZ	BIT(IMSIC_MMIO_PAGE_SHIFT)
···
 
 #endif
 
-#ifdef CONFIG_ACPI
+#if IS_ENABLED(CONFIG_ACPI) && IS_ENABLED(CONFIG_RISCV_IMSIC)
 int imsic_platform_acpi_probe(struct fwnode_handle *fwnode);
 struct fwnode_handle *imsic_acpi_get_fwnode(struct device *dev);
 #else
+38 -22
tools/testing/selftests/iommu/iommufd.c
···
 
 static unsigned long HUGEPAGE_SIZE;
 
-#define MOCK_PAGE_SIZE (PAGE_SIZE / 2)
-#define MOCK_HUGE_PAGE_SIZE (512 * MOCK_PAGE_SIZE)
-
 static unsigned long get_huge_page_size(void)
 {
 	char buf[80];
···
 
 FIXTURE_SETUP(iommufd_dirty_tracking)
 {
+	struct iommu_option cmd = {
+		.size = sizeof(cmd),
+		.option_id = IOMMU_OPTION_HUGE_PAGES,
+		.op = IOMMU_OPTION_OP_SET,
+		.val64 = 0,
+	};
 	size_t mmap_buffer_size;
 	unsigned long size;
 	int mmap_flags;
···
 
 	if (variant->buffer_size < MOCK_PAGE_SIZE) {
 		SKIP(return,
-		     "Skipping buffer_size=%lu, less than MOCK_PAGE_SIZE=%lu",
+		     "Skipping buffer_size=%lu, less than MOCK_PAGE_SIZE=%u",
 		     variant->buffer_size, MOCK_PAGE_SIZE);
 	}
···
 	assert((uintptr_t)self->bitmap % PAGE_SIZE == 0);
 
 	test_ioctl_ioas_alloc(&self->ioas_id);
-	/* Enable 1M mock IOMMU hugepages */
-	if (variant->hugepages) {
-		test_cmd_mock_domain_flags(self->ioas_id,
-					   MOCK_FLAGS_DEVICE_HUGE_IOVA,
-					   &self->stdev_id, &self->hwpt_id,
-					   &self->idev_id);
-	} else {
-		test_cmd_mock_domain(self->ioas_id, &self->stdev_id,
-				     &self->hwpt_id, &self->idev_id);
-	}
+
+	/*
+	 * For dirty testing it is important that the page size fed into
+	 * the iommu page tables matches the size the dirty logic
+	 * expects, or set_dirty can touch too much stuff.
+	 */
+	cmd.object_id = self->ioas_id;
+	if (!variant->hugepages)
+		ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd));
+
+	test_cmd_mock_domain(self->ioas_id, &self->stdev_id, &self->hwpt_id,
+			     &self->idev_id);
 }
 
 FIXTURE_TEARDOWN(iommufd_dirty_tracking)
···
 TEST_F(iommufd_dirty_tracking, get_dirty_bitmap)
 {
 	uint32_t page_size = MOCK_PAGE_SIZE;
+	uint32_t ioas_id = self->ioas_id;
 	uint32_t hwpt_id;
-	uint32_t ioas_id;
 
 	if (variant->hugepages)
 		page_size = MOCK_HUGE_PAGE_SIZE;
 
-	test_ioctl_ioas_alloc(&ioas_id);
 	test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer,
 				     variant->buffer_size, MOCK_APERTURE_START);
 
-	test_cmd_hwpt_alloc(self->idev_id, ioas_id,
-			    IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id);
+	if (variant->hugepages)
+		test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id,
+					    IOMMU_HWPT_ALLOC_DIRTY_TRACKING,
+					    MOCK_IOMMUPT_HUGE, &hwpt_id);
+	else
+		test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id,
+					    IOMMU_HWPT_ALLOC_DIRTY_TRACKING,
+					    MOCK_IOMMUPT_DEFAULT, &hwpt_id);
 
 	test_cmd_set_dirty_tracking(hwpt_id, true);
···
 TEST_F(iommufd_dirty_tracking, get_dirty_bitmap_no_clear)
 {
 	uint32_t page_size = MOCK_PAGE_SIZE;
+	uint32_t ioas_id = self->ioas_id;
 	uint32_t hwpt_id;
-	uint32_t ioas_id;
 
 	if (variant->hugepages)
 		page_size = MOCK_HUGE_PAGE_SIZE;
 
-	test_ioctl_ioas_alloc(&ioas_id);
 	test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer,
 				     variant->buffer_size, MOCK_APERTURE_START);
 
-	test_cmd_hwpt_alloc(self->idev_id, ioas_id,
-			    IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id);
+	if (variant->hugepages)
+		test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id,
+					    IOMMU_HWPT_ALLOC_DIRTY_TRACKING,
+					    MOCK_IOMMUPT_HUGE, &hwpt_id);
+	else
+		test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id,
+					    IOMMU_HWPT_ALLOC_DIRTY_TRACKING,
+					    MOCK_IOMMUPT_DEFAULT, &hwpt_id);
 
 	test_cmd_set_dirty_tracking(hwpt_id, true);
 
+12
tools/testing/selftests/iommu/iommufd_utils.h
···
 	ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, 0, flags, \
 					  hwpt_id, IOMMU_HWPT_DATA_NONE, NULL, \
 					  0))
+#define test_cmd_hwpt_alloc_iommupt(device_id, pt_id, flags, iommupt_type, \
+				    hwpt_id)                               \
+	({                                                                 \
+		struct iommu_hwpt_selftest user_cfg = {                    \
+			.pagetable_type = iommupt_type                     \
+		};                                                         \
+									   \
+		ASSERT_EQ(0, _test_cmd_hwpt_alloc(                         \
+				     self->fd, device_id, pt_id, 0, flags, \
+				     hwpt_id, IOMMU_HWPT_DATA_SELFTEST,    \
+				     &user_cfg, sizeof(user_cfg)));        \
+	})
 #define test_err_hwpt_alloc(_errno, device_id, pt_id, flags, hwpt_id) \
 	EXPECT_ERRNO(_errno, _test_cmd_hwpt_alloc( \
 				     self->fd, device_id, pt_id, 0, flags, \