Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES
 */
5#ifndef __GENERIC_PT_COMMON_H
6#define __GENERIC_PT_COMMON_H
7
8#include <linux/types.h>
9#include <linux/build_bug.h>
10#include <linux/bits.h>
11
/**
 * DOC: Generic Radix Page Table
 *
 * Generic Radix Page Table is a set of functions and helpers to efficiently
 * parse radix style page tables typically seen in HW implementations. The
 * interface is built to deliver similar code generation as the mm's pte/pmd/etc
 * system by fully inlining the exact code required to handle each table level.
 *
 * Like the mm subsystem each format contributes its parsing implementation
 * under common names and the common code implements the required algorithms.
 *
 * The system is divided into three logical levels:
 *
 *  - The page table format and its manipulation functions
 *  - Generic helpers to give a consistent API regardless of underlying format
 *  - An algorithm implementation (e.g. IOMMU/DRM/KVM/MM)
 *
 * Multiple implementations are supported. The intention is to have the generic
 * format code be re-usable for whatever specialized implementation is required.
 * The generic code is solely about the format of the radix tree; it does not
 * include memory allocation or higher level decisions that are left for the
 * implementation.
 *
 * The generic framework supports a superset of functions across many HW
 * implementations:
 *
 *  - Entries composed of contiguous blocks of IO PTEs for larger page sizes
 *  - Multi-level tables, up to 6 levels. Runtime selected top level
 *  - Runtime variable table level size (ARM's concatenated tables)
 *  - Expandable top level allowing dynamic sizing of table levels
 *  - Optional leaf entries at any level
 *  - 32-bit/64-bit virtual and output addresses, using every address bit
 *  - Dirty tracking
 *  - Sign extended addressing
 */
47
48/**
49 * struct pt_common - struct for all page table implementations
50 */
51struct pt_common {
52 /**
53 * @top_of_table: Encodes the table top pointer and the top level in a
54 * single value. Must use READ_ONCE/WRITE_ONCE to access it. The lower
55 * bits of the aligned table pointer are used for the level.
56 */
57 uintptr_t top_of_table;
58 /**
59 * @max_oasz_lg2: Maximum number of bits the OA can contain. Upper bits
60 * must be zero. This may be less than what the page table format
61 * supports, but must not be more.
62 */
63 u8 max_oasz_lg2;
64 /**
65 * @max_vasz_lg2: Maximum number of bits the VA can contain. Upper bits
66 * are 0 or 1 depending on pt_full_va_prefix(). This may be less than
67 * what the page table format supports, but must not be more. When
68 * PT_FEAT_DYNAMIC_TOP is set this reflects the maximum VA capability.
69 */
70 u8 max_vasz_lg2;
71 /**
72 * @features: Bitmap of `enum pt_features`
73 */
74 unsigned int features;
75};
76
77/* Encoding parameters for top_of_table */
78enum {
79 PT_TOP_LEVEL_BITS = 3,
80 PT_TOP_LEVEL_MASK = GENMASK(PT_TOP_LEVEL_BITS - 1, 0),
81};
82
/**
 * enum pt_features - Features turned on in the table. Each symbol is a bit
 * position.
 *
 * The enumerators are bit numbers, not masks. The order of the enumerators
 * therefore fixes which bit each feature occupies.
 */
enum pt_features {
	/**
	 * @PT_FEAT_DMA_INCOHERENT: Cache flush page table memory before
	 * assuming the HW can read it. Otherwise a SMP release is sufficient
	 * for HW to read it.
	 */
	PT_FEAT_DMA_INCOHERENT,
	/**
	 * @PT_FEAT_FULL_VA: The table can span the full VA range from 0 to
	 * PT_VADDR_MAX.
	 */
	PT_FEAT_FULL_VA,
	/**
	 * @PT_FEAT_DYNAMIC_TOP: The table's top level can be increased
	 * dynamically during map. This requires HW support for atomically
	 * setting both the table top pointer and the starting table level.
	 */
	PT_FEAT_DYNAMIC_TOP,
	/**
	 * @PT_FEAT_SIGN_EXTEND: The top most bit of the valid VA range sign
	 * extends up to the full pt_vaddr_t. This divides the page table into
	 * three VA ranges::
	 *
	 *   0 -> 2^N - 1             Lower
	 *   2^N -> (MAX - 2^N - 1)   Non-Canonical
	 *   MAX - 2^N -> MAX         Upper
	 *
	 * In this mode pt_common::max_vasz_lg2 includes the sign bit and the
	 * upper bits that don't fall within the translation are just validated.
	 *
	 * If not set there is no sign extension and valid VA goes from 0 to 2^N
	 * - 1.
	 */
	PT_FEAT_SIGN_EXTEND,
	/**
	 * @PT_FEAT_FLUSH_RANGE: IOTLB maintenance is done by flushing IOVA
	 * ranges which will clean out any walk cache or any IOPTE fully
	 * contained by the range. The optimization objective is to minimize the
	 * number of flushes even if ranges include IOVA gaps that do not need
	 * to be flushed.
	 */
	PT_FEAT_FLUSH_RANGE,
	/**
	 * @PT_FEAT_FLUSH_RANGE_NO_GAPS: Like PT_FEAT_FLUSH_RANGE except that
	 * the optimization objective is to only flush IOVA that has been
	 * changed. This mode is suitable for cases like hypervisor shadowing
	 * where flushing unchanged ranges may cause the hypervisor to reparse
	 * significant amount of page table.
	 */
	PT_FEAT_FLUSH_RANGE_NO_GAPS,
	/* private: */
	/* First bit position used by the format specific feature enums */
	PT_FEAT_FMT_START,
};
140
141struct pt_amdv1 {
142 struct pt_common common;
143};
144
145enum {
146 /*
147 * The memory backing the tables is encrypted. Use __sme_set() to adjust
148 * the page table pointers in the tree. This only works with
149 * CONFIG_AMD_MEM_ENCRYPT.
150 */
151 PT_FEAT_AMDV1_ENCRYPT_TABLES = PT_FEAT_FMT_START,
152 /*
153 * The PTEs are set to prevent cache incoherent traffic, such as PCI no
154 * snoop. This is set either at creation time or before the first map
155 * operation.
156 */
157 PT_FEAT_AMDV1_FORCE_COHERENCE,
158};
159
160struct pt_vtdss {
161 struct pt_common common;
162};
163
164enum {
165 /*
166 * The PTEs are set to prevent cache incoherent traffic, such as PCI no
167 * snoop. This is set either at creation time or before the first map
168 * operation.
169 */
170 PT_FEAT_VTDSS_FORCE_COHERENCE = PT_FEAT_FMT_START,
171 /*
172 * Prevent creating read-only PTEs. Used to work around HW errata
173 * ERRATA_772415_SPR17.
174 */
175 PT_FEAT_VTDSS_FORCE_WRITEABLE,
176};
177
178struct pt_x86_64 {
179 struct pt_common common;
180};
181
182enum {
183 /*
184 * The memory backing the tables is encrypted. Use __sme_set() to adjust
185 * the page table pointers in the tree. This only works with
186 * CONFIG_AMD_MEM_ENCRYPT.
187 */
188 PT_FEAT_X86_64_AMD_ENCRYPT_TABLES = PT_FEAT_FMT_START,
189};
190
191#endif