Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

pidfs: lookup pid through rbtree

The new pid inode number allocation scheme is neat but I overlooked a
possible, even though unlikely, attack that can be used to trigger an
overflow on both 32bit and 64bit.

An unique 64 bit identifier was constructed for each struct pid by two
combining a 32 bit idr with a 32 bit generation number. A 32bit number
was allocated using the idr_alloc_cyclic() infrastructure. When the idr
wrapped around a 32 bit wraparound counter was incremented. The 32 bit
wraparound counter served as the upper 32 bits and the allocated idr
number as the lower 32 bits.

Since the idr can only allocate up to INT_MAX entries everytime a
wraparound happens INT_MAX - 1 entries are lost (Ignoring that numbering
always starts at 2 to avoid theoretical collisions with the root inode
number.).

If userspace fully populates the idr such that and puts itself into
control of two entries such that one entry is somewhere in the middle
and the other entry is the INT_MAX entry then it is possible to overflow
the wraparound counter. That is probably difficult to pull off but the
mere possibility is annoying.

The problem could be contained to 32 bit by switching to a data
structure such as the maple tree that allows allocating 64 bit numbers
on 64 bit machines. That would leave 32 bit in a lurch but that probably
doesn't matter that much. The other problem is that removing entries
form the maple tree is somewhat non-trivial because the removal code can
be called under the irq write lock of tasklist_lock and
irq{save,restore} code.

Instead, allocate unique identifiers for struct pid by simply
incrementing a 64 bit counter and insert each struct pid into the rbtree
so it can be looked up to decode file handles avoiding to leak actual
pids across pid namespaces in file handles.

On both 64 bit and 32 bit the same 64 bit identifier is used to lookup
struct pid in the rbtree. On 64 bit the unique identifier for struct pid
simply becomes the inode number. Comparing two pidfds continues to be as
simple as comparing inode numbers.

On 32 bit the 64 bit number assigned to struct pid is split into two 32
bit numbers. The lower 32 bits are used as the inode number and the
upper 32 bits are used as the inode generation number. Whenever a
wraparound happens on 32 bit the 64 bit number will be incremented by 2
so inode numbering starts at 2 again.

When a wraparound happens on 32 bit multiple pidfds with the same inode
number are likely to exist. This isn't a problem since before pidfs
pidfds used the anonymous inode meaning all pidfds had the same inode
number. On 32 bit sserspace can thus reconstruct the 64 bit identifier
by retrieving both the inode number and the inode generation number to
compare, or use file handles. This gives the same guarantees on both 32
bit and 64 bit.

Link: https://lore.kernel.org/r/20241214-gekoppelt-erdarbeiten-a1f9a982a5a6@brauner
Signed-off-by: Christian Brauner <brauner@kernel.org>

+84 -51
+78 -47
fs/pidfs.c
··· 24 24 #include "internal.h" 25 25 #include "mount.h" 26 26 27 - static DEFINE_IDR(pidfs_ino_idr); 28 - 29 - static u32 pidfs_ino_upper_32_bits = 0; 27 + static struct rb_root pidfs_ino_tree = RB_ROOT; 30 28 31 29 #if BITS_PER_LONG == 32 32 - /* 33 - * On 32 bit systems the lower 32 bits are the inode number and 34 - * the higher 32 bits are the generation number. The starting 35 - * value for the inode number and the generation number is one. 36 - */ 37 - static u32 pidfs_ino_lower_32_bits = 1; 38 - 39 30 static inline unsigned long pidfs_ino(u64 ino) 40 31 { 41 32 return lower_32_bits(ino); ··· 40 49 41 50 #else 42 51 43 - static u32 pidfs_ino_lower_32_bits = 0; 44 - 45 52 /* On 64 bit simply return ino. */ 46 53 static inline unsigned long pidfs_ino(u64 ino) 47 54 { 48 55 return ino; 49 56 } 50 57 51 - /* On 64 bit the generation number is 1. */ 58 + /* On 64 bit the generation number is 0. */ 52 59 static inline u32 pidfs_gen(u64 ino) 53 60 { 54 - return 1; 61 + return 0; 55 62 } 56 63 #endif 57 64 58 - /* 59 - * Construct an inode number for struct pid in a way that we can use the 60 - * lower 32bit to lookup struct pid independent of any pid numbers that 61 - * could be leaked into userspace (e.g., via file handle encoding). 62 - */ 63 - int pidfs_add_pid(struct pid *pid) 65 + static int pidfs_ino_cmp(struct rb_node *a, const struct rb_node *b) 64 66 { 65 - u32 upper; 66 - int lower; 67 + struct pid *pid_a = rb_entry(a, struct pid, pidfs_node); 68 + struct pid *pid_b = rb_entry(b, struct pid, pidfs_node); 69 + u64 pid_ino_a = pid_a->ino; 70 + u64 pid_ino_b = pid_b->ino; 67 71 68 - /* 69 - * Inode numbering for pidfs start at 2. This avoids collisions 70 - * with the root inode which is 1 for pseudo filesystems. 71 - */ 72 - lower = idr_alloc_cyclic(&pidfs_ino_idr, pid, 2, 0, GFP_ATOMIC); 73 - if (lower >= 0 && lower < pidfs_ino_lower_32_bits) 74 - pidfs_ino_upper_32_bits++; 75 - upper = pidfs_ino_upper_32_bits; 76 - pidfs_ino_lower_32_bits = lower; 77 - if (lower < 0) 78 - return lower; 79 - 80 - pid->ino = ((u64)upper << 32) | lower; 81 - pid->stashed = NULL; 72 + if (pid_ino_a < pid_ino_b) 73 + return -1; 74 + if (pid_ino_a > pid_ino_b) 75 + return 1; 82 76 return 0; 83 77 } 84 78 85 - /* The idr number to remove is the lower 32 bits of the inode. */ 79 + void pidfs_add_pid(struct pid *pid) 80 + { 81 + static u64 pidfs_ino_nr = 2; 82 + 83 + /* 84 + * On 64 bit nothing special happens. The 64bit number assigned 85 + * to struct pid is the inode number. 86 + * 87 + * On 32 bit the 64 bit number assigned to struct pid is split 88 + * into two 32 bit numbers. The lower 32 bits are used as the 89 + * inode number and the upper 32 bits are used as the inode 90 + * generation number. 91 + * 92 + * On 32 bit pidfs_ino() will return the lower 32 bit. When 93 + * pidfs_ino() returns zero a wrap around happened. When a 94 + * wraparound happens the 64 bit number will be incremented by 2 95 + * so inode numbering starts at 2 again. 96 + * 97 + * On 64 bit comparing two pidfds is as simple as comparing 98 + * inode numbers. 99 + * 100 + * When a wraparound happens on 32 bit multiple pidfds with the 101 + * same inode number are likely to exist (This isn't a problem 102 + * since before pidfs pidfds used the anonymous inode meaning 103 + * all pidfds had the same inode number.). Userspace can 104 + * reconstruct the 64 bit identifier by retrieving both the 105 + * inode number and the inode generation number to compare or 106 + * use file handles. 107 + */ 108 + if (pidfs_ino(pidfs_ino_nr) == 0) 109 + pidfs_ino_nr += 2; 110 + 111 + pid->ino = pidfs_ino_nr; 112 + pid->stashed = NULL; 113 + pidfs_ino_nr++; 114 + 115 + write_seqcount_begin(&pidmap_lock_seq); 116 + rb_find_add_rcu(&pid->pidfs_node, &pidfs_ino_tree, pidfs_ino_cmp); 117 + write_seqcount_end(&pidmap_lock_seq); 118 + } 119 + 86 120 void pidfs_remove_pid(struct pid *pid) 87 121 { 88 - idr_remove(&pidfs_ino_idr, lower_32_bits(pid->ino)); 122 + write_seqcount_begin(&pidmap_lock_seq); 123 + rb_erase(&pid->pidfs_node, &pidfs_ino_tree); 124 + write_seqcount_end(&pidmap_lock_seq); 89 125 } 90 126 91 127 #ifdef CONFIG_PROC_FS ··· 531 513 return FILEID_KERNFS; 532 514 } 533 515 516 + static int pidfs_ino_find(const void *key, const struct rb_node *node) 517 + { 518 + const u64 pid_ino = *(u64 *)key; 519 + const struct pid *pid = rb_entry(node, struct pid, pidfs_node); 520 + 521 + if (pid_ino < pid->ino) 522 + return -1; 523 + if (pid_ino > pid->ino) 524 + return 1; 525 + return 0; 526 + } 527 + 534 528 /* Find a struct pid based on the inode number. */ 535 529 static struct pid *pidfs_ino_get_pid(u64 ino) 536 530 { 537 - unsigned long pid_ino = pidfs_ino(ino); 538 - u32 gen = pidfs_gen(ino); 539 531 struct pid *pid; 532 + struct rb_node *node; 533 + unsigned int seq; 540 534 541 535 guard(rcu)(); 536 + do { 537 + seq = read_seqcount_begin(&pidmap_lock_seq); 538 + node = rb_find_rcu(&ino, &pidfs_ino_tree, pidfs_ino_find); 539 + if (node) 540 + break; 541 + } while (read_seqcount_retry(&pidmap_lock_seq, seq)); 542 542 543 - pid = idr_find(&pidfs_ino_idr, lower_32_bits(pid_ino)); 544 - if (!pid) 543 + if (!node) 545 544 return NULL; 546 545 547 - if (pidfs_ino(pid->ino) != pid_ino) 548 - return NULL; 549 - 550 - if (pidfs_gen(pid->ino) != gen) 551 - return NULL; 546 + pid = rb_entry(node, struct pid, pidfs_node); 552 547 553 548 /* Within our pid namespace hierarchy? */ 554 549 if (pid_vnr(pid) == 0)
+2
include/linux/pid.h
··· 59 59 spinlock_t lock; 60 60 struct dentry *stashed; 61 61 u64 ino; 62 + struct rb_node pidfs_node; 62 63 /* lists of tasks that use this pid */ 63 64 struct hlist_head tasks[PIDTYPE_MAX]; 64 65 struct hlist_head inodes; ··· 69 68 struct upid numbers[]; 70 69 }; 71 70 71 + extern seqcount_spinlock_t pidmap_lock_seq; 72 72 extern struct pid init_struct_pid; 73 73 74 74 struct file;
+1 -1
include/linux/pidfs.h
··· 4 4 5 5 struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags); 6 6 void __init pidfs_init(void); 7 - int pidfs_add_pid(struct pid *pid); 7 + void pidfs_add_pid(struct pid *pid); 8 8 void pidfs_remove_pid(struct pid *pid); 9 9 10 10 #endif /* _LINUX_PID_FS_H */
+3 -3
kernel/pid.c
··· 43 43 #include <linux/sched/task.h> 44 44 #include <linux/idr.h> 45 45 #include <linux/pidfs.h> 46 + #include <linux/seqlock.h> 46 47 #include <net/sock.h> 47 48 #include <uapi/linux/pidfd.h> 48 49 ··· 104 103 */ 105 104 106 105 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); 106 + seqcount_spinlock_t pidmap_lock_seq = SEQCNT_SPINLOCK_ZERO(pidmap_lock_seq, &pidmap_lock); 107 107 108 108 void put_pid(struct pid *pid) 109 109 { ··· 275 273 spin_lock_irq(&pidmap_lock); 276 274 if (!(ns->pid_allocated & PIDNS_ADDING)) 277 275 goto out_unlock; 278 - retval = pidfs_add_pid(pid); 279 - if (retval) 280 - goto out_unlock; 276 + pidfs_add_pid(pid); 281 277 for ( ; upid >= pid->numbers; --upid) { 282 278 /* Make the PID visible to find_pid_ns. */ 283 279 idr_replace(&upid->ns->idr, pid, upid->nr);