Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

rbtree: Implement generic latch_tree

Implement a latched RB-tree in order to get unconditional RCU/lockless
lookups.

Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Michel Lespinasse <walken@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Woodhouse <David.Woodhouse@intel.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>

authored by

Peter Zijlstra and committed by
Rusty Russell
ade3f510 7fc26327

+212
+212
include/linux/rbtree_latch.h
··· 1 + /* 2 + * Latched RB-trees 3 + * 4 + * Copyright (C) 2015 Intel Corp., Peter Zijlstra <peterz@infradead.org> 5 + * 6 + * Since RB-trees have non-atomic modifications they're not immediately suited 7 + * for RCU/lockless queries. Even though we made RB-tree lookups non-fatal for 8 + * lockless lookups; we cannot guarantee they return a correct result. 9 + * 10 + * The simplest solution is a seqlock + RB-tree, this will allow lockless 11 + * lookups; but has the constraint (inherent to the seqlock) that read sides 12 + * cannot nest in write sides. 13 + * 14 + * If we need to allow unconditional lookups (say as required for NMI context 15 + * usage) we need a more complex setup; this data structure provides this by 16 + * employing the latch technique -- see @raw_write_seqcount_latch -- to 17 + * implement a latched RB-tree which does allow for unconditional lookups by 18 + * virtue of always having (at least) one stable copy of the tree. 19 + * 20 + * However, while we have the guarantee that there is at all times one stable 21 + * copy, this does not guarantee an iteration will not observe modifications. 22 + * What might have been a stable copy at the start of the iteration, need not 23 + * remain so for the duration of the iteration. 24 + * 25 + * Therefore, this does require a lockless RB-tree iteration to be non-fatal; 26 + * see the comment in lib/rbtree.c. Note however that we only require the first 27 + * condition -- not seeing partial stores -- because the latch thing isolates 28 + * us from loops. If we were to interrupt a modification the lookup would be 29 + * pointed at the stable tree and complete while the modification was halted. 
30 + */ 31 + 32 + #ifndef RB_TREE_LATCH_H 33 + #define RB_TREE_LATCH_H 34 + 35 + #include <linux/rbtree.h> 36 + #include <linux/seqlock.h> 37 + 38 + struct latch_tree_node { 39 + struct rb_node node[2]; /* node[i] links this element into tree[i] of the root */ 40 + }; 41 + 42 + struct latch_tree_root { 43 + seqcount_t seq; /* latch sequence; latch_tree_find() derives the tree index from it */ 44 + struct rb_root tree[2]; /* the two copies of the tree */ 45 + }; 46 + 47 + /** 48 + * latch_tree_ops - operators to define the tree order 49 + * @less: used for insertion; provides the (partial) order between two elements. 50 + * @comp: used for lookups; provides the order between the search key and an element. 51 + * 52 + * The operators are related like: 53 + * 54 + * comp(a->key,b) < 0 := less(a,b) 55 + * comp(a->key,b) > 0 := less(b,a) 56 + * comp(a->key,b) == 0 := !less(a,b) && !less(b,a) 57 + * 58 + * If these operators define a partial order on the elements we make no 59 + * guarantee on which of the elements matching the key is found. See 60 + * latch_tree_find(). 61 + */ 62 + struct latch_tree_ops { 63 + bool (*less)(struct latch_tree_node *a, struct latch_tree_node *b); 64 + int (*comp)(void *key, struct latch_tree_node *b); 65 + }; 66 + 67 + static __always_inline struct latch_tree_node * 68 + __lt_from_rb(struct rb_node *node, int idx) 69 + { 70 + return container_of(node, struct latch_tree_node, node[idx]); /* @node is the idx-th rb_node embedded in the result */ 71 + } 72 + 73 + static __always_inline void 74 + __lt_insert(struct latch_tree_node *ltn, struct latch_tree_root *ltr, int idx, 75 + bool (*less)(struct latch_tree_node *a, struct latch_tree_node *b)) 76 + { 77 + struct rb_root *root = &ltr->tree[idx]; 78 + struct rb_node **link = &root->rb_node; 79 + struct rb_node *node = &ltn->node[idx]; 80 + struct rb_node *parent = NULL; 81 + struct latch_tree_node *ltp; 82 + 83 + while (*link) { /* descend tree copy idx until an empty child slot is reached */ 84 + parent = *link; 85 + ltp = __lt_from_rb(parent, idx); 86 + 87 + if (less(ltn, ltp)) 88 + link = &parent->rb_left; 89 + else 90 + link = &parent->rb_right; /* !less(): elements that compare equal also go right */ 91 + } 92 + 93 + rb_link_node_rcu(node, parent, link); /* link into the slot found above, then rebalance */ 94 + rb_insert_color(node, root); 95 + } 96 + 97 + static 
__always_inline void 98 + __lt_erase(struct latch_tree_node *ltn, struct latch_tree_root *ltr, int idx) 99 + { 100 + rb_erase(&ltn->node[idx], &ltr->tree[idx]); /* unlinks only from tree copy idx */ 101 + } 102 + 103 + static __always_inline struct latch_tree_node * 104 + __lt_find(void *key, struct latch_tree_root *ltr, int idx, 105 + int (*comp)(void *key, struct latch_tree_node *node)) 106 + { 107 + struct rb_node *node = rcu_dereference_raw(ltr->tree[idx].rb_node); 108 + struct latch_tree_node *ltn; 109 + int c; 110 + 111 + while (node) { /* comp() < 0: go left, > 0: go right, == 0: match */ 112 + ltn = __lt_from_rb(node, idx); 113 + c = comp(key, ltn); 114 + 115 + if (c < 0) 116 + node = rcu_dereference_raw(node->rb_left); 117 + else if (c > 0) 118 + node = rcu_dereference_raw(node->rb_right); 119 + else 120 + return ltn; 121 + } 122 + 123 + return NULL; /* ran off the tree without a match */ 124 + } 125 + 126 + /** 127 + * latch_tree_insert() - insert @node into the trees @root 128 + * @node: nodes to insert 129 + * @root: trees to insert @node into 130 + * @ops: operators defining the node order 131 + * 132 + * It inserts @node into @root in an ordered fashion such that we can always 133 + * observe one complete tree. See the comment for raw_write_seqcount_latch(). 134 + * 135 + * The inserts use rcu_assign_pointer() to publish the element such that the 136 + * tree structure is stored before we can observe the new @node. 137 + * 138 + * All modifications (latch_tree_insert, latch_tree_erase) are assumed to be 139 + * serialized. 
140 + */ 141 + static __always_inline void 142 + latch_tree_insert(struct latch_tree_node *node, 143 + struct latch_tree_root *root, 144 + const struct latch_tree_ops *ops) 145 + { 146 + raw_write_seqcount_latch(&root->seq); 147 + __lt_insert(node, root, 0, ops->less); /* tree copy 0 is updated first */ 148 + raw_write_seqcount_latch(&root->seq); 149 + __lt_insert(node, root, 1, ops->less); /* then tree copy 1 */ 150 + } 151 + 152 + /** 153 + * latch_tree_erase() - removes @node from the trees @root 154 + * @node: nodes to remove 155 + * @root: trees to remove @node from 156 + * @ops: operators defining the node order; not referenced by this function 157 + * 158 + * Removes @node from the trees @root in an ordered fashion such that we can 159 + * always observe one complete tree. See the comment for 160 + * raw_write_seqcount_latch(). 161 + * 162 + * It is assumed that @node will observe one RCU quiescent state before being 163 + * reused or freed. 164 + * 165 + * All modifications (latch_tree_insert, latch_tree_erase) are assumed to be 166 + * serialized. 167 + */ 168 + static __always_inline void 169 + latch_tree_erase(struct latch_tree_node *node, 170 + struct latch_tree_root *root, 171 + const struct latch_tree_ops *ops) 172 + { 173 + raw_write_seqcount_latch(&root->seq); 174 + __lt_erase(node, root, 0); /* tree copy 0 is updated first */ 175 + raw_write_seqcount_latch(&root->seq); 176 + __lt_erase(node, root, 1); /* then tree copy 1 */ 177 + } 178 + 179 + /** 180 + * latch_tree_find() - find the node matching @key in the trees @root 181 + * @key: search key 182 + * @root: trees to search for @key 183 + * @ops: operators defining the node order 184 + * 185 + * Does a lockless lookup in the trees @root for the node matching @key. 186 + * 187 + * It is assumed that this is called while holding the appropriate RCU read 188 + * side lock. 189 + * 190 + * If the operators define a partial order on the elements (there are multiple 191 + * elements which have the same key value) it is undefined which of these 192 + * elements will be found. 
Nor is it possible to iterate the tree to find 193 + * further elements with the same key value. 194 + * 195 + * Returns: a pointer to the node matching @key or NULL. 196 + */ 197 + static __always_inline struct latch_tree_node * 198 + latch_tree_find(void *key, struct latch_tree_root *root, 199 + const struct latch_tree_ops *ops) 200 + { 201 + struct latch_tree_node *node; 202 + unsigned int seq; 203 + 204 + do { 205 + seq = raw_read_seqcount_latch(&root->seq); 206 + node = __lt_find(key, root, seq & 1, ops->comp); /* low bit of seq selects the tree copy to search */ 207 + } while (read_seqcount_retry(&root->seq, seq)); /* redo the lookup whenever read_seqcount_retry() asks for it */ 208 + 209 + return node; 210 + } 211 + 212 + #endif /* RB_TREE_LATCH_H */