Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel
os
linux
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Implement mseal() syscall.
4 *
5 * Copyright (c) 2023,2024 Google, Inc.
6 *
7 * Author: Jeff Xu <jeffxu@chromium.org>
8 */
9
10#include <linux/mempolicy.h>
11#include <linux/mman.h>
12#include <linux/mm.h>
13#include <linux/mm_inline.h>
14#include <linux/mmu_context.h>
15#include <linux/syscalls.h>
16#include <linux/sched.h>
17#include "internal.h"
18
19static inline bool vma_is_sealed(struct vm_area_struct *vma)
20{
21 return (vma->vm_flags & VM_SEALED);
22}
23
/* Mark @vma as sealed by setting VM_SEALED in its vm_flags. */
static inline void set_vma_sealed(struct vm_area_struct *vma)
{
	vm_flags_set(vma, VM_SEALED);
}
28
29/*
30 * check if a vma is sealed for modification.
31 * return true, if modification is allowed.
32 */
33static bool can_modify_vma(struct vm_area_struct *vma)
34{
35 if (unlikely(vma_is_sealed(vma)))
36 return false;
37
38 return true;
39}
40
41static bool is_madv_discard(int behavior)
42{
43 switch (behavior) {
44 case MADV_FREE:
45 case MADV_DONTNEED:
46 case MADV_DONTNEED_LOCKED:
47 case MADV_REMOVE:
48 case MADV_DONTFORK:
49 case MADV_WIPEONFORK:
50 return true;
51 }
52
53 return false;
54}
55
56static bool is_ro_anon(struct vm_area_struct *vma)
57{
58 /* check anonymous mapping. */
59 if (vma->vm_file || vma->vm_flags & VM_SHARED)
60 return false;
61
62 /*
63 * check for non-writable:
64 * PROT=RO or PKRU is not writeable.
65 */
66 if (!(vma->vm_flags & VM_WRITE) ||
67 !arch_vma_access_permitted(vma, true, false, false))
68 return true;
69
70 return false;
71}
72
73/*
74 * Check if the vmas of a memory range are allowed to be modified.
75 * the memory ranger can have a gap (unallocated memory).
76 * return true, if it is allowed.
77 */
78bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end)
79{
80 struct vm_area_struct *vma;
81
82 VMA_ITERATOR(vmi, mm, start);
83
84 /* going through each vma to check. */
85 for_each_vma_range(vmi, vma, end) {
86 if (unlikely(!can_modify_vma(vma)))
87 return false;
88 }
89
90 /* Allow by default. */
91 return true;
92}
93
94/*
95 * Check if the vmas of a memory range are allowed to be modified by madvise.
96 * the memory ranger can have a gap (unallocated memory).
97 * return true, if it is allowed.
98 */
99bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end,
100 int behavior)
101{
102 struct vm_area_struct *vma;
103
104 VMA_ITERATOR(vmi, mm, start);
105
106 if (!is_madv_discard(behavior))
107 return true;
108
109 /* going through each vma to check. */
110 for_each_vma_range(vmi, vma, end)
111 if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma)))
112 return false;
113
114 /* Allow by default. */
115 return true;
116}
117
/*
 * Apply @newflags (vm_flags with VM_SEALED set) to [start, end) of @vma,
 * splitting/merging the VMA as needed via vma_modify_flags().
 *
 * On success *prev is updated to the (possibly merged) VMA.  On error
 * *prev holds the ERR_PTR returned by vma_modify_flags(); the caller
 * (apply_mm_seal) bails out on a non-zero return and never uses it.
 */
static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct vm_area_struct **prev, unsigned long start,
		unsigned long end, vm_flags_t newflags)
{
	int ret = 0;
	vm_flags_t oldflags = vma->vm_flags;

	/* Already sealed (flags unchanged): nothing to do. */
	if (newflags == oldflags)
		goto out;

	/* Split/merge so that exactly [start, end) carries the new flags. */
	vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	set_vma_sealed(vma);
out:
	*prev = vma;
	return ret;
}
139
140/*
141 * Check for do_mseal:
142 * 1> start is part of a valid vma.
143 * 2> end is part of a valid vma.
144 * 3> No gap (unallocated address) between start and end.
145 * 4> map is sealable.
146 */
147static int check_mm_seal(unsigned long start, unsigned long end)
148{
149 struct vm_area_struct *vma;
150 unsigned long nstart = start;
151
152 VMA_ITERATOR(vmi, current->mm, start);
153
154 /* going through each vma to check. */
155 for_each_vma_range(vmi, vma, end) {
156 if (vma->vm_start > nstart)
157 /* unallocated memory found. */
158 return -ENOMEM;
159
160 if (vma->vm_end >= end)
161 return 0;
162
163 nstart = vma->vm_end;
164 }
165
166 return -ENOMEM;
167}
168
/*
 * Apply VM_SEALED to every VMA in [start, end).
 *
 * Must be called after check_mm_seal() has validated the range, so the
 * iterator is guaranteed to find a VMA at @start and no gaps up to @end.
 * Returns 0 on success or a negative errno from mseal_fixup().
 */
static int apply_mm_seal(unsigned long start, unsigned long end)
{
	unsigned long nstart;
	struct vm_area_struct *vma, *prev;

	VMA_ITERATOR(vmi, current->mm, start);

	vma = vma_iter_load(&vmi);
	/*
	 * check_mm_seal() has already rejected the ENOMEM cases, so vma
	 * is guaranteed non-NULL here (and for the rest of the walk).
	 */
	prev = vma_prev(&vmi);
	/* If start lies inside the first VMA, that VMA itself is "prev". */
	if (start > vma->vm_start)
		prev = vma;

	nstart = start;
	for_each_vma_range(vmi, vma, end) {
		int error;
		unsigned long tmp;
		vm_flags_t newflags;

		newflags = vma->vm_flags | VM_SEALED;
		/* Clamp to the requested end for the final (partial) VMA. */
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
		if (error)
			return error;
		nstart = vma_iter_end(&vmi);
	}

	return 0;
}
206
207/*
208 * mseal(2) seals the VM's meta data from
209 * selected syscalls.
210 *
211 * addr/len: VM address range.
212 *
213 * The address range by addr/len must meet:
214 * start (addr) must be in a valid VMA.
215 * end (addr + len) must be in a valid VMA.
216 * no gap (unallocated memory) between start and end.
217 * start (addr) must be page aligned.
218 *
219 * len: len will be page aligned implicitly.
220 *
221 * Below VMA operations are blocked after sealing.
222 * 1> Unmapping, moving to another location, and shrinking
223 * the size, via munmap() and mremap(), can leave an empty
224 * space, therefore can be replaced with a VMA with a new
225 * set of attributes.
226 * 2> Moving or expanding a different vma into the current location,
227 * via mremap().
228 * 3> Modifying a VMA via mmap(MAP_FIXED).
229 * 4> Size expansion, via mremap(), does not appear to pose any
230 * specific risks to sealed VMAs. It is included anyway because
231 * the use case is unclear. In any case, users can rely on
232 * merging to expand a sealed VMA.
233 * 5> mprotect and pkey_mprotect.
234 * 6> Some destructive madvice() behavior (e.g. MADV_DONTNEED)
235 * for anonymous memory, when users don't have write permission to the
236 * memory. Those behaviors can alter region contents by discarding pages,
237 * effectively a memset(0) for anonymous memory.
238 *
239 * flags: reserved.
240 *
241 * return values:
242 * zero: success.
243 * -EINVAL:
244 * invalid input flags.
245 * start address is not page aligned.
246 * Address arange (start + len) overflow.
247 * -ENOMEM:
248 * addr is not a valid address (not allocated).
249 * end (start + len) is not a valid address.
250 * a gap (unallocated memory) between start and end.
251 * -EPERM:
252 * - In 32 bit architecture, sealing is not supported.
253 * Note:
254 * user can call mseal(2) multiple times, adding a seal on an
255 * already sealed memory is a no-action (no error).
256 *
257 * unseal() is not supported.
258 */
259static int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
260{
261 size_t len;
262 int ret = 0;
263 unsigned long end;
264 struct mm_struct *mm = current->mm;
265
266 ret = can_do_mseal(flags);
267 if (ret)
268 return ret;
269
270 start = untagged_addr(start);
271 if (!PAGE_ALIGNED(start))
272 return -EINVAL;
273
274 len = PAGE_ALIGN(len_in);
275 /* Check to see whether len was rounded up from small -ve to zero. */
276 if (len_in && !len)
277 return -EINVAL;
278
279 end = start + len;
280 if (end < start)
281 return -EINVAL;
282
283 if (end == start)
284 return 0;
285
286 if (mmap_write_lock_killable(mm))
287 return -EINTR;
288
289 /*
290 * First pass, this helps to avoid
291 * partial sealing in case of error in input address range,
292 * e.g. ENOMEM error.
293 */
294 ret = check_mm_seal(start, end);
295 if (ret)
296 goto out;
297
298 /*
299 * Second pass, this should success, unless there are errors
300 * from vma_modify_flags, e.g. merge/split error, or process
301 * reaching the max supported VMAs, however, those cases shall
302 * be rare.
303 */
304 ret = apply_mm_seal(start, end);
305
306out:
307 mmap_write_unlock(current->mm);
308 return ret;
309}
310
/* mseal(2) syscall entry point; all work is done in do_mseal(). */
SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
		flags)
{
	return do_mseal(start, len, flags);
}