Linux kernel mirror (for testing)
git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
// SPDX-License-Identifier: GPL-2.0

// Copyright (C) 2025 Google LLC.

use kernel::{
    page::PAGE_SIZE,
    prelude::*,
    rbtree::{RBTree, RBTreeNode, RBTreeNodeReservation},
    seq_file::SeqFile,
    seq_print,
    task::Pid,
};

use crate::range_alloc::{DescriptorState, FreedRange, Range};

/// Keeps track of allocations in a process' mmap.
///
/// Each process has an mmap where the data for incoming transactions will be placed. This struct
/// keeps track of allocations made in the mmap. For each allocation, we store a descriptor that
/// has metadata related to the allocation. We also keep track of available free space.
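///
/// As an illustrative sketch (hypothetical numbers, not taken from a real process): if a mmap
/// contains a single allocation at offset 0x100 with size 0x200, then `tree` holds three
/// descriptors keyed by offset (a free range at 0, the allocation at 0x100, and a free range at
/// 0x300), while `free_tree` holds only the two free ranges, keyed by `(size, offset)` as
/// `(0x100, 0)` and `(mmap_size - 0x300, 0x300)`.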
pub(super) struct TreeRangeAllocator<T> {
    /// This collection contains descriptors for *both* ranges containing an allocation, *and* free
    /// ranges between allocations. The free ranges get merged, so there are never two free ranges
    /// next to each other.
    tree: RBTree<usize, Descriptor<T>>,
    /// Contains an entry for every free range in `self.tree`. This tree sorts the ranges by size,
    /// letting us look up the smallest range whose size is at least some lower bound.
    free_tree: RBTree<FreeKey, ()>,
    size: usize,
    free_oneway_space: usize,
}

impl<T> TreeRangeAllocator<T> {
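    /// Builds a tree-based allocator from the flat array representation.
    ///
    /// The ranges in `ranges` are drained in order; the gap computation `range.offset -
    /// free_offset` relies on them being sorted by offset and non-overlapping. For every gap
    /// before, between, or after the allocated ranges, a free descriptor is inserted into both
    /// `tree` and `free_tree`, so the invariant that two free ranges are never adjacent holds
    /// from the start. The node reservations in `alloc` are consumed as descriptors are created.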
    pub(crate) fn from_array(
        size: usize,
        ranges: &mut KVec<Range<T>>,
        alloc: &mut FromArrayAllocs<T>,
    ) -> Self {
        let mut tree = TreeRangeAllocator {
            tree: RBTree::new(),
            free_tree: RBTree::new(),
            size,
            free_oneway_space: size / 2,
        };

        let mut free_offset = 0;
        for range in ranges.drain_all() {
            let free_size = range.offset - free_offset;
            if free_size > 0 {
                let free_node = alloc.free_tree.pop().unwrap();
                tree.free_tree
                    .insert(free_node.into_node((free_size, free_offset), ()));
                let tree_node = alloc.tree.pop().unwrap();
                tree.tree.insert(
                    tree_node.into_node(free_offset, Descriptor::new(free_offset, free_size)),
                );
            }
            free_offset = range.endpoint();

            if range.state.is_oneway() {
                tree.free_oneway_space = tree.free_oneway_space.saturating_sub(range.size);
            }

            let free_res = alloc.free_tree.pop().unwrap();
            let tree_node = alloc.tree.pop().unwrap();
            let mut desc = Descriptor::new(range.offset, range.size);
            desc.state = Some((range.state, free_res));
            tree.tree.insert(tree_node.into_node(range.offset, desc));
        }

        // After the last range, we may need a free range.
        if free_offset < size {
            let free_size = size - free_offset;
            let free_node = alloc.free_tree.pop().unwrap();
            tree.free_tree
                .insert(free_node.into_node((free_size, free_offset), ()));
            let tree_node = alloc.tree.pop().unwrap();
            tree.tree
                .insert(tree_node.into_node(free_offset, Descriptor::new(free_offset, free_size)));
        }

        tree
    }

    pub(crate) fn is_empty(&self) -> bool {
        let mut tree_iter = self.tree.values();
        // There's always at least one range, because index zero is either the start of a free or
        // allocated range.
        let first_value = tree_iter.next().unwrap();
        if tree_iter.next().is_some() {
            // There are never two free ranges next to each other, so if there is more than one
            // descriptor, then at least one of them must hold an allocated range.
            return false;
        }
        // There is only one descriptor. Return true if it is for a free range.
        first_value.state.is_none()
    }

    pub(crate) fn total_size(&self) -> usize {
        self.size
    }

    pub(crate) fn free_oneway_space(&self) -> usize {
        self.free_oneway_space
    }

    pub(crate) fn count_buffers(&self) -> usize {
        self.tree
            .values()
            .filter(|desc| desc.state.is_some())
            .count()
    }

    pub(crate) fn debug_print(&self, m: &SeqFile) -> Result<()> {
        for desc in self.tree.values() {
            let state = match &desc.state {
                Some(state) => &state.0,
                None => continue,
            };
            seq_print!(
                m,
                " buffer: {} size {} pid {}",
                desc.offset,
                desc.size,
                state.pid(),
            );
            if state.is_oneway() {
                seq_print!(m, " oneway");
            }
            match state {
                DescriptorState::Reserved(_res) => {
                    seq_print!(m, " reserved\n");
                }
                DescriptorState::Allocated(_alloc) => {
                    seq_print!(m, " allocated\n");
                }
            }
        }
        Ok(())
    }

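    /// Finds the smallest free range that can hold `size` bytes (best-fit).
    ///
    /// Since `free_tree` is keyed by `(size, offset)`, the lower bound of `(size, 0)` is the
    /// entry for the smallest free range whose size is at least `size`; its offset is then used
    /// to look up the mutable descriptor in `tree`.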
    fn find_best_match(&mut self, size: usize) -> Option<&mut Descriptor<T>> {
        let free_cursor = self.free_tree.cursor_lower_bound(&(size, 0))?;
        let ((_, offset), ()) = free_cursor.current();
        self.tree.get_mut(offset)
    }

    /// Try to reserve a new buffer, using the provided allocation if necessary.
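    ///
    /// As a hypothetical example: if the best-fit free descriptor sits at offset 0x100 with size
    /// 0x200 and the caller asks for 0x80 bytes, that descriptor is shrunk into an allocated
    /// range of 0x80 bytes at 0x100, and the pre-allocated nodes from `alloc` become a new free
    /// descriptor at offset 0x180 with size 0x180 plus its `free_tree` entry keyed
    /// `(0x180, 0x180)`. On an exact fit, the leftover nodes are simply dropped.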
    pub(crate) fn reserve_new(
        &mut self,
        debug_id: usize,
        size: usize,
        is_oneway: bool,
        pid: Pid,
        alloc: ReserveNewTreeAlloc<T>,
    ) -> Result<(usize, bool)> {
        // Compute new value of free_oneway_space, which is set only on success.
        let new_oneway_space = if is_oneway {
            match self.free_oneway_space.checked_sub(size) {
                Some(new_oneway_space) => new_oneway_space,
                None => return Err(ENOSPC),
            }
        } else {
            self.free_oneway_space
        };

        // Start detecting spammers once we have less than 20%
        // of async space left (which is less than 10% of total
        // buffer size).
        //
        // (This will short-circuit, so `low_oneway_space` is
        // only called when necessary.)
        let oneway_spam_detected =
            is_oneway && new_oneway_space < self.size / 10 && self.low_oneway_space(pid);

        let (found_size, found_off, tree_node, free_tree_node) = match self.find_best_match(size) {
            None => {
                pr_warn!("ENOSPC from range_alloc.reserve_new - size: {}", size);
                return Err(ENOSPC);
            }
            Some(desc) => {
                let found_size = desc.size;
                let found_offset = desc.offset;

                // In case we need to break up the descriptor
                let new_desc = Descriptor::new(found_offset + size, found_size - size);
                let (tree_node, free_tree_node, desc_node_res) = alloc.initialize(new_desc);

                desc.state = Some((
                    DescriptorState::new(is_oneway, debug_id, pid),
                    desc_node_res,
                ));
                desc.size = size;

                (found_size, found_offset, tree_node, free_tree_node)
            }
        };
        self.free_oneway_space = new_oneway_space;
        self.free_tree.remove(&(found_size, found_off));

        if found_size != size {
            self.tree.insert(tree_node);
            self.free_tree.insert(free_tree_node);
        }

        Ok((found_off, oneway_spam_detected))
    }

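    /// Aborts the reservation at `offset`, turning the range back into free space.
    ///
    /// The freed descriptor is merged with the free ranges immediately before and after it (if
    /// any), preserving the invariant that two free ranges are never adjacent, and the merged
    /// range is re-inserted into `free_tree`. The returned [`FreedRange`] describes the whole
    /// pages that became entirely free as a result.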
    pub(crate) fn reservation_abort(&mut self, offset: usize) -> Result<FreedRange> {
        let mut cursor = self.tree.cursor_lower_bound_mut(&offset).ok_or_else(|| {
            pr_warn!(
                "EINVAL from range_alloc.reservation_abort - offset: {}",
                offset
            );
            EINVAL
        })?;

        let (_, desc) = cursor.current_mut();

        if desc.offset != offset {
            pr_warn!(
                "EINVAL from range_alloc.reservation_abort - offset: {}",
                offset
            );
            return Err(EINVAL);
        }

        let (reservation, free_node_res) = desc.try_change_state(|state| match state {
            Some((DescriptorState::Reserved(reservation), free_node_res)) => {
                (None, Ok((reservation, free_node_res)))
            }
            None => {
                pr_warn!(
                    "EINVAL from range_alloc.reservation_abort - offset: {}",
                    offset
                );
                (None, Err(EINVAL))
            }
            allocated => {
                pr_warn!(
                    "EPERM from range_alloc.reservation_abort - offset: {}",
                    offset
                );
                (allocated, Err(EPERM))
            }
        })?;

        let mut size = desc.size;
        let mut offset = desc.offset;
        let free_oneway_space_add = if reservation.is_oneway { size } else { 0 };

        self.free_oneway_space += free_oneway_space_add;

        let mut freed_range = FreedRange::interior_pages(offset, size);
        // Compute how large the next free region needs to be to include one more page in
        // the newly freed range.
        let add_next_page_needed = match (offset + size) % PAGE_SIZE {
            0 => usize::MAX,
            unalign => PAGE_SIZE - unalign,
        };
        // Compute how large the previous free region needs to be to include one more page
        // in the newly freed range.
        let add_prev_page_needed = match offset % PAGE_SIZE {
            0 => usize::MAX,
            unalign => unalign,
        };
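        // Worked example with hypothetical numbers, assuming PAGE_SIZE is 0x1000: freeing
        // offset = 0x2200 with size = 0x1600 ends at 0x3800, so add_next_page_needed is
        // 0x1000 - 0x800 = 0x800: if the free range after us is at least 0x800 bytes, the merged
        // free range reaches 0x4000 and the page [0x3000, 0x4000) becomes entirely free.
        // Likewise add_prev_page_needed is 0x200: if the free range before us is at least 0x200
        // bytes, the merged range starts at or before 0x2000 and the page [0x2000, 0x3000)
        // becomes entirely free.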

        // Merge next into current if next is free
        let remove_next = match cursor.peek_next() {
            Some((_, next)) if next.state.is_none() => {
                if next.size >= add_next_page_needed {
                    freed_range.end_page_idx += 1;
                }
                self.free_tree.remove(&(next.size, next.offset));
                size += next.size;
                true
            }
            _ => false,
        };

        if remove_next {
            let (_, desc) = cursor.current_mut();
            desc.size = size;
            cursor.remove_next();
        }

        // Merge current into prev if prev is free
        match cursor.peek_prev_mut() {
            Some((_, prev)) if prev.state.is_none() => {
                if prev.size >= add_prev_page_needed {
                    freed_range.start_page_idx -= 1;
                }
                // merge previous with current, remove current
                self.free_tree.remove(&(prev.size, prev.offset));
                offset = prev.offset;
                size += prev.size;
                prev.size = size;
                cursor.remove_current();
            }
            _ => {}
        };

        self.free_tree
            .insert(free_node_res.into_node((size, offset), ()));

        Ok(freed_range)
    }

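    /// Commits the reservation at `offset`, moving it from [`DescriptorState::Reserved`] to
    /// [`DescriptorState::Allocated`] and storing the value taken from `data` in the descriptor.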
    pub(crate) fn reservation_commit(&mut self, offset: usize, data: &mut Option<T>) -> Result {
        let desc = self.tree.get_mut(&offset).ok_or(ENOENT)?;

        desc.try_change_state(|state| match state {
            Some((DescriptorState::Reserved(reservation), free_node_res)) => (
                Some((
                    DescriptorState::Allocated(reservation.allocate(data.take())),
                    free_node_res,
                )),
                Ok(()),
            ),
            other => (other, Err(ENOENT)),
        })
    }

    /// Takes an entry at the given offset from [`DescriptorState::Allocated`] to
    /// [`DescriptorState::Reserved`].
    ///
    /// Returns the size of the existing entry, its debug id, and the data associated with it.
    pub(crate) fn reserve_existing(&mut self, offset: usize) -> Result<(usize, usize, Option<T>)> {
        let desc = self.tree.get_mut(&offset).ok_or_else(|| {
            pr_warn!(
                "ENOENT from range_alloc.reserve_existing - offset: {}",
                offset
            );
            ENOENT
        })?;

        let (debug_id, data) = desc.try_change_state(|state| match state {
            Some((DescriptorState::Allocated(allocation), free_node_res)) => {
                let (reservation, data) = allocation.deallocate();
                let debug_id = reservation.debug_id;
                (
                    Some((DescriptorState::Reserved(reservation), free_node_res)),
                    Ok((debug_id, data)),
                )
            }
            other => {
                pr_warn!(
                    "ENOENT from range_alloc.reserve_existing - offset: {}",
                    offset
                );
                (other, Err(ENOENT))
            }
        })?;

        Ok((desc.size, debug_id, data))
    }

    /// Call the provided callback at every allocated region.
    ///
    /// This destroys the range allocator. Used only during shutdown.
    pub(crate) fn take_for_each<F: Fn(usize, usize, usize, Option<T>)>(&mut self, callback: F) {
        for (_, desc) in self.tree.iter_mut() {
            if let Some((DescriptorState::Allocated(allocation), _)) = &mut desc.state {
                callback(
                    desc.offset,
                    desc.size,
                    allocation.debug_id(),
                    allocation.take(),
                );
            }
        }
    }

    /// Checks whether the calling process holds a suspiciously large share of the oneway space,
    /// by counting the number and total size of the oneway buffers it has allocated.
    ///
    /// The idea is that once we cross the threshold, whoever is responsible
    /// for the low async space is likely to try to send another async transaction,
    /// and at some point we'll catch them in the act. This is more efficient
    /// than keeping a map per pid.
    fn low_oneway_space(&self, calling_pid: Pid) -> bool {
        let mut total_alloc_size = 0;
        let mut num_buffers = 0;
        for (_, desc) in self.tree.iter() {
            if let Some((state, _)) = &desc.state {
                if state.is_oneway() && state.pid() == calling_pid {
                    total_alloc_size += desc.size;
                    num_buffers += 1;
                }
            }
        }

        // Warn if this pid has more than 50 transactions, or more than 50% of
        // async space (which is 25% of total buffer size). Oneway spam is only
        // detected when the threshold is exceeded.
        num_buffers > 50 || total_alloc_size > self.size / 4
    }
}

type TreeDescriptorState<T> = (DescriptorState<T>, FreeNodeRes);
struct Descriptor<T> {
    size: usize,
    offset: usize,
    state: Option<TreeDescriptorState<T>>,
}

impl<T> Descriptor<T> {
    fn new(offset: usize, size: usize) -> Self {
        Self {
            size,
            offset,
            state: None,
        }
    }

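    /// Applies `f` to the current state and stores whatever state `f` returns.
    ///
    /// The state is taken out of the descriptor and passed to `f` by value, together with the
    /// obligation to hand back the state that should be stored afterwards. This lets `f` reject
    /// a transition by returning the original state alongside an error.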
    fn try_change_state<F, Data>(&mut self, f: F) -> Result<Data>
    where
        F: FnOnce(Option<TreeDescriptorState<T>>) -> (Option<TreeDescriptorState<T>>, Result<Data>),
    {
        let (new_state, result) = f(self.state.take());
        self.state = new_state;
        result
    }
}

// (Descriptor.size, Descriptor.offset)
type FreeKey = (usize, usize);
type FreeNodeRes = RBTreeNodeReservation<FreeKey, ()>;

/// An allocation for use by `reserve_new`.
pub(crate) struct ReserveNewTreeAlloc<T> {
    tree_node_res: RBTreeNodeReservation<usize, Descriptor<T>>,
    free_tree_node_res: FreeNodeRes,
    desc_node_res: FreeNodeRes,
}

impl<T> ReserveNewTreeAlloc<T> {
    pub(crate) fn try_new() -> Result<Self> {
        let tree_node_res = RBTreeNodeReservation::new(GFP_KERNEL)?;
        let free_tree_node_res = RBTreeNodeReservation::new(GFP_KERNEL)?;
        let desc_node_res = RBTreeNodeReservation::new(GFP_KERNEL)?;
        Ok(Self {
            tree_node_res,
            free_tree_node_res,
            desc_node_res,
        })
    }

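    /// Converts the reservations into the nodes needed to split off a new free range.
    ///
    /// Returns the `tree` node and `free_tree` node describing `desc` (the leftover free range
    /// after the requested size is carved out), plus the remaining reservation, which
    /// `reserve_new` stores in the allocated descriptor's state so that `reservation_abort` can
    /// later re-insert the range into `free_tree` without allocating.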
    fn initialize(
        self,
        desc: Descriptor<T>,
    ) -> (
        RBTreeNode<usize, Descriptor<T>>,
        RBTreeNode<FreeKey, ()>,
        FreeNodeRes,
    ) {
        let size = desc.size;
        let offset = desc.offset;
        (
            self.tree_node_res.into_node(offset, desc),
            self.free_tree_node_res.into_node((size, offset), ()),
            self.desc_node_res,
        )
    }
}

/// An allocation for creating a tree from an `ArrayRangeAllocator`.
pub(crate) struct FromArrayAllocs<T> {
    tree: KVec<RBTreeNodeReservation<usize, Descriptor<T>>>,
    free_tree: KVec<RBTreeNodeReservation<FreeKey, ()>>,
}

impl<T> FromArrayAllocs<T> {
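    /// Preallocates enough node reservations to rebuild the tree from an array allocator holding
    /// `len` allocated ranges.
    ///
    /// With `len` allocations there are at most `len + 1` free gaps around them, so at most
    /// `2 * len + 1` descriptors in total. The same bound covers the `free_tree` reservations:
    /// one per free gap, plus one stored in the state of each allocated descriptor.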
    pub(crate) fn try_new(len: usize) -> Result<Self> {
        let num_descriptors = 2 * len + 1;

        let mut tree = KVec::with_capacity(num_descriptors, GFP_KERNEL)?;
        for _ in 0..num_descriptors {
            tree.push(RBTreeNodeReservation::new(GFP_KERNEL)?, GFP_KERNEL)?;
        }

        let mut free_tree = KVec::with_capacity(num_descriptors, GFP_KERNEL)?;
        for _ in 0..num_descriptors {
            free_tree.push(RBTreeNodeReservation::new(GFP_KERNEL)?, GFP_KERNEL)?;
        }

        Ok(Self { tree, free_tree })
    }
}