Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

rbtree: Make lockless searches non-fatal

Change the insert and erase code such that lockless searches are
non-fatal.

In and of itself an rbtree cannot be correctly searched while
in-modification, we can however provide weaker guarantees that will
allow the rbtree to be used in conjunction with other techniques, such
as latches; see 9b0fd802e8c0 ("seqcount: Add raw_write_seqcount_latch()").

For this to work we need the following guarantees from the rbtree
code:

1) a lockless reader must not see partial stores, this would allow it
to observe nodes that are invalid memory.

2) there must not be (temporary) loops in the tree structure in the
modifier's program order, this would cause a lookup which
interrupts the modifier to get stuck indefinitely.

For 1) we must use WRITE_ONCE() for all updates to the tree structure;
in particular this patch only does rb_{left,right} as those are the
only element required for simple searches.

It generates slightly worse code, probably because volatile. But in
pointer chasing heavy code a few instructions more should not matter.

For 2) I have carefully audited the code and drawn every intermediate
link state and not found a loop.

Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Woodhouse <David.Woodhouse@intel.com>
Cc: Rik van Riel <riel@redhat.com>
Reviewed-by: Michel Lespinasse <walken@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

authored by

Peter Zijlstra and committed by
Rusty Russell
d72da4a4 0be964be

+81 -32
+13 -3
include/linux/rbtree.h
··· 31 31 32 32 #include <linux/kernel.h> 33 33 #include <linux/stddef.h> 34 + #include <linux/rcupdate.h> 34 35 35 36 struct rb_node { 36 37 unsigned long __rb_parent_color; ··· 74 73 extern struct rb_node *rb_next_postorder(const struct rb_node *); 75 74 76 75 /* Fast replacement of a single node without remove/rebalance/add/rebalance */ 77 - extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, 76 + extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, 78 77 struct rb_root *root); 79 78 80 - static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, 81 - struct rb_node ** rb_link) 79 + static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, 80 + struct rb_node **rb_link) 82 81 { 83 82 node->__rb_parent_color = (unsigned long)parent; 84 83 node->rb_left = node->rb_right = NULL; 85 84 86 85 *rb_link = node; 86 + } 87 + 88 + static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent, 89 + struct rb_node **rb_link) 90 + { 91 + node->__rb_parent_color = (unsigned long)parent; 92 + node->rb_left = node->rb_right = NULL; 93 + 94 + rcu_assign_pointer(*rb_link, node); 87 95 } 88 96 89 97 #define rb_entry_safe(ptr, type, member) \
+14 -7
include/linux/rbtree_augmented.h
··· 123 123 { 124 124 if (parent) { 125 125 if (parent->rb_left == old) 126 - parent->rb_left = new; 126 + WRITE_ONCE(parent->rb_left, new); 127 127 else 128 - parent->rb_right = new; 128 + WRITE_ONCE(parent->rb_right, new); 129 129 } else 130 - root->rb_node = new; 130 + WRITE_ONCE(root->rb_node, new); 131 131 } 132 132 133 133 extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, ··· 137 137 __rb_erase_augmented(struct rb_node *node, struct rb_root *root, 138 138 const struct rb_augment_callbacks *augment) 139 139 { 140 - struct rb_node *child = node->rb_right, *tmp = node->rb_left; 140 + struct rb_node *child = node->rb_right; 141 + struct rb_node *tmp = node->rb_left; 141 142 struct rb_node *parent, *rebalance; 142 143 unsigned long pc; 143 144 ··· 168 167 tmp = parent; 169 168 } else { 170 169 struct rb_node *successor = child, *child2; 170 + 171 171 tmp = child->rb_left; 172 172 if (!tmp) { 173 173 /* ··· 182 180 */ 183 181 parent = successor; 184 182 child2 = successor->rb_right; 183 + 185 184 augment->copy(node, successor); 186 185 } else { 187 186 /* ··· 204 201 successor = tmp; 205 202 tmp = tmp->rb_left; 206 203 } while (tmp); 207 - parent->rb_left = child2 = successor->rb_right; 208 - successor->rb_right = child; 204 + child2 = successor->rb_right; 205 + WRITE_ONCE(parent->rb_left, child2); 206 + WRITE_ONCE(successor->rb_right, child); 209 207 rb_set_parent(child, successor); 208 + 210 209 augment->copy(node, successor); 211 210 augment->propagate(parent, successor); 212 211 } 213 212 214 - successor->rb_left = tmp = node->rb_left; 213 + tmp = node->rb_left; 214 + WRITE_ONCE(successor->rb_left, tmp); 215 215 rb_set_parent(tmp, successor); 216 216 217 217 pc = node->__rb_parent_color; 218 218 tmp = __rb_parent(pc); 219 219 __rb_change_child(node, successor, tmp, root); 220 + 220 221 if (child2) { 221 222 successor->__rb_parent_color = pc; 222 223 rb_set_parent_color(child2, parent, RB_BLACK);
+54 -22
lib/rbtree.c
··· 44 44 * parentheses and have some accompanying text comment. 45 45 */ 46 46 47 + /* 48 + * Notes on lockless lookups: 49 + * 50 + * All stores to the tree structure (rb_left and rb_right) must be done using 51 + * WRITE_ONCE(). And we must not inadvertently cause (temporary) loops in the 52 + * tree structure as seen in program order. 53 + * 54 + * These two requirements will allow lockless iteration of the tree -- not 55 + * correct iteration mind you, tree rotations are not atomic so a lookup might 56 + * miss entire subtrees. 57 + * 58 + * But they do guarantee that any such traversal will only see valid elements 59 + * and that it will indeed complete -- does not get stuck in a loop. 60 + * 61 + * It also guarantees that if the lookup returns an element it is the 'correct' 62 + * one. But not returning an element does _NOT_ mean it's not present. 63 + * 64 + * NOTE: 65 + * 66 + * Stores to __rb_parent_color are not important for simple lookups so those 67 + * are left undone as of now. Nor did I check for loops involving parent 68 + * pointers. 69 + */ 70 + 47 71 static inline void rb_set_black(struct rb_node *rb) 48 72 { 49 73 rb->__rb_parent_color |= RB_BLACK; ··· 153 129 * This still leaves us in violation of 4), the 154 130 * continuation into Case 3 will fix that. 155 131 */ 156 - parent->rb_right = tmp = node->rb_left; 157 - node->rb_left = parent; 132 + tmp = node->rb_left; 133 + WRITE_ONCE(parent->rb_right, tmp); 134 + WRITE_ONCE(node->rb_left, parent); 158 135 if (tmp) 159 136 rb_set_parent_color(tmp, parent, 160 137 RB_BLACK); ··· 174 149 * / \ 175 150 * n U 176 151 */ 177 - gparent->rb_left = tmp; /* == parent->rb_right */ 178 - parent->rb_right = gparent; 152 + WRITE_ONCE(gparent->rb_left, tmp); /* == parent->rb_right */ 153 + WRITE_ONCE(parent->rb_right, gparent); 179 154 if (tmp) 180 155 rb_set_parent_color(tmp, gparent, RB_BLACK); 181 156 __rb_rotate_set_parents(gparent, parent, root, RB_RED); ··· 196 171 tmp = parent->rb_left; 197 172 if (node == tmp) { 198 173 /* Case 2 - right rotate at parent */ 199 - parent->rb_left = tmp = node->rb_right; 200 - node->rb_right = parent; 174 + tmp = node->rb_right; 175 + WRITE_ONCE(parent->rb_left, tmp); 176 + WRITE_ONCE(node->rb_right, parent); 201 177 if (tmp) 202 178 rb_set_parent_color(tmp, parent, 203 179 RB_BLACK); ··· 209 183 } 210 184 211 185 /* Case 3 - left rotate at gparent */ 212 - gparent->rb_right = tmp; /* == parent->rb_left */ 213 - parent->rb_left = gparent; 186 + WRITE_ONCE(gparent->rb_right, tmp); /* == parent->rb_left */ 187 + WRITE_ONCE(parent->rb_left, gparent); 214 188 if (tmp) 215 189 rb_set_parent_color(tmp, gparent, RB_BLACK); 216 190 __rb_rotate_set_parents(gparent, parent, root, RB_RED); ··· 250 224 * / \ / \ 251 225 * Sl Sr N Sl 252 226 */ 253 - parent->rb_right = tmp1 = sibling->rb_left; 254 - sibling->rb_left = parent; 227 + tmp1 = sibling->rb_left; 228 + WRITE_ONCE(parent->rb_right, tmp1); 229 + WRITE_ONCE(sibling->rb_left, parent); 255 230 rb_set_parent_color(tmp1, parent, RB_BLACK); 256 231 __rb_rotate_set_parents(parent, sibling, root, 257 232 RB_RED); ··· 302 275 * \ 303 276 * Sr 304 277 */ 305 - sibling->rb_left = tmp1 = tmp2->rb_right; 306 - tmp2->rb_right = sibling; 307 - parent->rb_right = tmp2; 278 + tmp1 = tmp2->rb_right; 279 + WRITE_ONCE(sibling->rb_left, tmp1); 280 + WRITE_ONCE(tmp2->rb_right, sibling); 281 + WRITE_ONCE(parent->rb_right, tmp2); 308 282 if (tmp1) 309 283 rb_set_parent_color(tmp1, sibling, 310 284 RB_BLACK); ··· 325 297 * / \ / \ 326 298 * (sl) sr N (sl) 327 299 */ 328 - parent->rb_right = tmp2 = sibling->rb_left; 329 - sibling->rb_left = parent; 300 + tmp2 = sibling->rb_left; 301 + WRITE_ONCE(parent->rb_right, tmp2); 302 + WRITE_ONCE(sibling->rb_left, parent); 330 303 rb_set_parent_color(tmp1, sibling, RB_BLACK); 331 304 if (tmp2) 332 305 rb_set_parent(tmp2, parent); ··· 339 310 sibling = parent->rb_left; 340 311 if (rb_is_red(sibling)) { 341 312 /* Case 1 - right rotate at parent */ 342 - parent->rb_left = tmp1 = sibling->rb_right; 343 - sibling->rb_right = parent; 313 + tmp1 = sibling->rb_right; 314 + WRITE_ONCE(parent->rb_left, tmp1); 315 + WRITE_ONCE(sibling->rb_right, parent); 344 316 rb_set_parent_color(tmp1, parent, RB_BLACK); 345 317 __rb_rotate_set_parents(parent, sibling, root, 346 318 RB_RED); ··· 366 336 break; 367 337 } 368 338 /* Case 3 - right rotate at sibling */ 369 - sibling->rb_right = tmp1 = tmp2->rb_left; 370 - tmp2->rb_left = sibling; 371 - parent->rb_left = tmp2; 339 + tmp1 = tmp2->rb_left; 340 + WRITE_ONCE(sibling->rb_right, tmp1); 341 + WRITE_ONCE(tmp2->rb_left, sibling); 342 + WRITE_ONCE(parent->rb_left, tmp2); 372 343 if (tmp1) 373 344 rb_set_parent_color(tmp1, sibling, 374 345 RB_BLACK); ··· 378 347 sibling = tmp2; 379 348 } 380 349 /* Case 4 - left rotate at parent + color flips */ 381 - parent->rb_left = tmp2 = sibling->rb_right; 382 - sibling->rb_right = parent; 350 + tmp2 = sibling->rb_right; 351 + WRITE_ONCE(parent->rb_left, tmp2); 352 + WRITE_ONCE(sibling->rb_right, parent); 383 353 rb_set_parent_color(tmp1, sibling, RB_BLACK); 384 354 if (tmp2) 385 355 rb_set_parent(tmp2, parent);