Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

netfilter: replace list_head with single linked list

The netfilter hook list never uses the prev pointer, and so can be trimmed to
be a simple singly-linked list.

In addition to having a more lightweight structure for hook traversal,
struct net becomes 5568 bytes (down from 6400) and struct net_device becomes
2176 bytes (down from 2240).

Signed-off-by: Aaron Conole <aconole@bytheb.org>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

authored by

Aaron Conole and committed by
Pablo Neira Ayuso
e3b37f11 54f17bbc

+169 -118
+1 -1
include/linux/netdevice.h
··· 1783 1783 #endif 1784 1784 struct netdev_queue __rcu *ingress_queue; 1785 1785 #ifdef CONFIG_NETFILTER_INGRESS 1786 - struct list_head nf_hooks_ingress; 1786 + struct nf_hook_entry __rcu *nf_hooks_ingress; 1787 1787 #endif 1788 1788 1789 1789 unsigned char broadcast[MAX_ADDR_LEN];
+34 -29
include/linux/netfilter.h
··· 55 55 struct net_device *out; 56 56 struct sock *sk; 57 57 struct net *net; 58 - struct list_head *hook_list; 58 + struct nf_hook_entry __rcu *hook_entries; 59 59 int (*okfn)(struct net *, struct sock *, struct sk_buff *); 60 60 }; 61 61 62 + typedef unsigned int nf_hookfn(void *priv, 63 + struct sk_buff *skb, 64 + const struct nf_hook_state *state); 65 + struct nf_hook_ops { 66 + struct list_head list; 67 + 68 + /* User fills in from here down. */ 69 + nf_hookfn *hook; 70 + struct net_device *dev; 71 + void *priv; 72 + u_int8_t pf; 73 + unsigned int hooknum; 74 + /* Hooks are ordered in ascending priority. */ 75 + int priority; 76 + }; 77 + 78 + struct nf_hook_entry { 79 + struct nf_hook_entry __rcu *next; 80 + struct nf_hook_ops ops; 81 + const struct nf_hook_ops *orig_ops; 82 + }; 83 + 62 84 static inline void nf_hook_state_init(struct nf_hook_state *p, 63 - struct list_head *hook_list, 85 + struct nf_hook_entry *hook_entry, 64 86 unsigned int hook, 65 87 int thresh, u_int8_t pf, 66 88 struct net_device *indev, ··· 98 76 p->out = outdev; 99 77 p->sk = sk; 100 78 p->net = net; 101 - p->hook_list = hook_list; 79 + RCU_INIT_POINTER(p->hook_entries, hook_entry); 102 80 p->okfn = okfn; 103 81 } 104 82 105 - typedef unsigned int nf_hookfn(void *priv, 106 - struct sk_buff *skb, 107 - const struct nf_hook_state *state); 108 83 109 - struct nf_hook_ops { 110 - struct list_head list; 111 - 112 - /* User fills in from here down. */ 113 - nf_hookfn *hook; 114 - struct net_device *dev; 115 - void *priv; 116 - u_int8_t pf; 117 - unsigned int hooknum; 118 - /* Hooks are ordered in ascending priority. 
*/ 119 - int priority; 120 - }; 121 84 122 85 struct nf_sockopt_ops { 123 86 struct list_head list; ··· 168 161 int (*okfn)(struct net *, struct sock *, struct sk_buff *), 169 162 int thresh) 170 163 { 171 - struct list_head *hook_list; 164 + struct nf_hook_entry *hook_head; 165 + int ret = 1; 172 166 173 167 #ifdef HAVE_JUMP_LABEL 174 168 if (__builtin_constant_p(pf) && ··· 178 170 return 1; 179 171 #endif 180 172 181 - hook_list = &net->nf.hooks[pf][hook]; 182 - 183 - if (!list_empty(hook_list)) { 173 + rcu_read_lock(); 174 + hook_head = rcu_dereference(net->nf.hooks[pf][hook]); 175 + if (hook_head) { 184 176 struct nf_hook_state state; 185 - int ret; 186 177 187 - /* We may already have this, but read-locks nest anyway */ 188 - rcu_read_lock(); 189 - nf_hook_state_init(&state, hook_list, hook, thresh, 178 + nf_hook_state_init(&state, hook_head, hook, thresh, 190 179 pf, indev, outdev, sk, net, okfn); 191 180 192 181 ret = nf_hook_slow(skb, &state); 193 - rcu_read_unlock(); 194 - return ret; 195 182 } 196 - return 1; 183 + rcu_read_unlock(); 184 + 185 + return ret; 197 186 } 198 187 199 188 static inline int nf_hook(u_int8_t pf, unsigned int hook, struct net *net,
+12 -5
include/linux/netfilter_ingress.h
··· 11 11 if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS])) 12 12 return false; 13 13 #endif 14 - return !list_empty(&skb->dev->nf_hooks_ingress); 14 + return rcu_access_pointer(skb->dev->nf_hooks_ingress); 15 15 } 16 16 17 17 /* caller must hold rcu_read_lock */ 18 18 static inline int nf_hook_ingress(struct sk_buff *skb) 19 19 { 20 + struct nf_hook_entry *e = rcu_dereference(skb->dev->nf_hooks_ingress); 20 21 struct nf_hook_state state; 21 22 22 - nf_hook_state_init(&state, &skb->dev->nf_hooks_ingress, 23 - NF_NETDEV_INGRESS, INT_MIN, NFPROTO_NETDEV, 24 - skb->dev, NULL, NULL, dev_net(skb->dev), NULL); 23 + /* Must recheck the ingress hook head, in the event it became NULL 24 + * after the check in nf_hook_ingress_active evaluated to true. 25 + */ 26 + if (unlikely(!e)) 27 + return 0; 28 + 29 + nf_hook_state_init(&state, e, NF_NETDEV_INGRESS, INT_MIN, 30 + NFPROTO_NETDEV, skb->dev, NULL, NULL, 31 + dev_net(skb->dev), NULL); 25 32 return nf_hook_slow(skb, &state); 26 33 } 27 34 28 35 static inline void nf_hook_ingress_init(struct net_device *dev) 29 36 { 30 - INIT_LIST_HEAD(&dev->nf_hooks_ingress); 37 + RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL); 31 38 } 32 39 #else /* CONFIG_NETFILTER_INGRESS */ 33 40 static inline int nf_hook_ingress_active(struct sk_buff *skb)
+1 -2
include/net/netfilter/nf_queue.h
··· 11 11 struct sk_buff *skb; 12 12 unsigned int id; 13 13 14 - struct nf_hook_ops *elem; 15 14 struct nf_hook_state state; 16 15 u16 size; /* sizeof(entry) + saved route keys */ 17 16 ··· 24 25 int (*outfn)(struct nf_queue_entry *entry, 25 26 unsigned int queuenum); 26 27 void (*nf_hook_drop)(struct net *net, 27 - struct nf_hook_ops *ops); 28 + const struct nf_hook_entry *hooks); 28 29 }; 29 30 30 31 void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh);
+1 -1
include/net/netns/netfilter.h
··· 16 16 #ifdef CONFIG_SYSCTL 17 17 struct ctl_table_header *nf_log_dir_header; 18 18 #endif 19 - struct list_head hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; 19 + struct nf_hook_entry __rcu *hooks[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; 20 20 }; 21 21 #endif
+6 -13
net/bridge/br_netfilter_hooks.c
··· 1002 1002 int (*okfn)(struct net *, struct sock *, 1003 1003 struct sk_buff *)) 1004 1004 { 1005 - struct nf_hook_ops *elem; 1005 + struct nf_hook_entry *elem; 1006 1006 struct nf_hook_state state; 1007 - struct list_head *head; 1008 1007 int ret; 1009 1008 1010 - head = &net->nf.hooks[NFPROTO_BRIDGE][hook]; 1009 + elem = rcu_dereference(net->nf.hooks[NFPROTO_BRIDGE][hook]); 1011 1010 1012 - list_for_each_entry_rcu(elem, head, list) { 1013 - struct nf_hook_ops *next; 1011 + while (elem && (elem->ops.priority <= NF_BR_PRI_BRNF)) 1012 + elem = rcu_dereference(elem->next); 1014 1013 1015 - next = list_entry_rcu(list_next_rcu(&elem->list), 1016 - struct nf_hook_ops, list); 1017 - if (next->priority <= NF_BR_PRI_BRNF) 1018 - continue; 1019 - } 1020 - 1021 - if (&elem->list == head) 1014 + if (!elem) 1022 1015 return okfn(net, sk, skb); 1023 1016 1024 1017 /* We may already have this, but read-locks nest anyway */ 1025 1018 rcu_read_lock(); 1026 - nf_hook_state_init(&state, head, hook, NF_BR_PRI_BRNF + 1, 1019 + nf_hook_state_init(&state, elem, hook, NF_BR_PRI_BRNF + 1, 1027 1020 NFPROTO_BRIDGE, indev, outdev, sk, net, okfn); 1028 1021 1029 1022 ret = nf_hook_slow(skb, &state);
+94 -51
net/netfilter/core.c
··· 22 22 #include <linux/proc_fs.h> 23 23 #include <linux/mutex.h> 24 24 #include <linux/slab.h> 25 + #include <linux/rcupdate.h> 25 26 #include <net/net_namespace.h> 26 27 #include <net/sock.h> 27 28 ··· 62 61 #endif 63 62 64 63 static DEFINE_MUTEX(nf_hook_mutex); 64 + #define nf_entry_dereference(e) \ 65 + rcu_dereference_protected(e, lockdep_is_held(&nf_hook_mutex)) 65 66 66 - static struct list_head *nf_find_hook_list(struct net *net, 67 - const struct nf_hook_ops *reg) 67 + static struct nf_hook_entry *nf_hook_entry_head(struct net *net, 68 + const struct nf_hook_ops *reg) 68 69 { 69 - struct list_head *hook_list = NULL; 70 + struct nf_hook_entry *hook_head = NULL; 70 71 71 72 if (reg->pf != NFPROTO_NETDEV) 72 - hook_list = &net->nf.hooks[reg->pf][reg->hooknum]; 73 + hook_head = nf_entry_dereference(net->nf.hooks[reg->pf] 74 + [reg->hooknum]); 73 75 else if (reg->hooknum == NF_NETDEV_INGRESS) { 74 76 #ifdef CONFIG_NETFILTER_INGRESS 75 77 if (reg->dev && dev_net(reg->dev) == net) 76 - hook_list = &reg->dev->nf_hooks_ingress; 78 + hook_head = 79 + nf_entry_dereference( 80 + reg->dev->nf_hooks_ingress); 77 81 #endif 78 82 } 79 - return hook_list; 83 + return hook_head; 80 84 } 81 85 82 - struct nf_hook_entry { 83 - const struct nf_hook_ops *orig_ops; 84 - struct nf_hook_ops ops; 85 - }; 86 + /* must hold nf_hook_mutex */ 87 + static void nf_set_hooks_head(struct net *net, const struct nf_hook_ops *reg, 88 + struct nf_hook_entry *entry) 89 + { 90 + switch (reg->pf) { 91 + case NFPROTO_NETDEV: 92 + /* We already checked in nf_register_net_hook() that this is 93 + * used from ingress. 
94 + */ 95 + rcu_assign_pointer(reg->dev->nf_hooks_ingress, entry); 96 + break; 97 + default: 98 + rcu_assign_pointer(net->nf.hooks[reg->pf][reg->hooknum], 99 + entry); 100 + break; 101 + } 102 + } 86 103 87 104 int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) 88 105 { 89 - struct list_head *hook_list; 106 + struct nf_hook_entry *hooks_entry; 90 107 struct nf_hook_entry *entry; 91 - struct nf_hook_ops *elem; 92 108 93 109 if (reg->pf == NFPROTO_NETDEV && 94 110 (reg->hooknum != NF_NETDEV_INGRESS || ··· 118 100 119 101 entry->orig_ops = reg; 120 102 entry->ops = *reg; 121 - 122 - hook_list = nf_find_hook_list(net, reg); 123 - if (!hook_list) { 124 - kfree(entry); 125 - return -ENOENT; 126 - } 103 + entry->next = NULL; 127 104 128 105 mutex_lock(&nf_hook_mutex); 129 - list_for_each_entry(elem, hook_list, list) { 130 - if (reg->priority < elem->priority) 131 - break; 106 + hooks_entry = nf_hook_entry_head(net, reg); 107 + 108 + if (hooks_entry && hooks_entry->orig_ops->priority > reg->priority) { 109 + /* This is the case where we need to insert at the head */ 110 + entry->next = hooks_entry; 111 + hooks_entry = NULL; 132 112 } 133 - list_add_rcu(&entry->ops.list, elem->list.prev); 113 + 114 + while (hooks_entry && 115 + reg->priority >= hooks_entry->orig_ops->priority && 116 + nf_entry_dereference(hooks_entry->next)) { 117 + hooks_entry = nf_entry_dereference(hooks_entry->next); 118 + } 119 + 120 + if (hooks_entry) { 121 + entry->next = nf_entry_dereference(hooks_entry->next); 122 + rcu_assign_pointer(hooks_entry->next, entry); 123 + } else { 124 + nf_set_hooks_head(net, reg, entry); 125 + } 126 + 134 127 mutex_unlock(&nf_hook_mutex); 135 128 #ifdef CONFIG_NETFILTER_INGRESS 136 129 if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) ··· 156 127 157 128 void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) 158 129 { 159 - struct list_head *hook_list; 160 - struct nf_hook_entry *entry; 161 - struct 
nf_hook_ops *elem; 162 - 163 - hook_list = nf_find_hook_list(net, reg); 164 - if (!hook_list) 165 - return; 130 + struct nf_hook_entry *hooks_entry; 166 131 167 132 mutex_lock(&nf_hook_mutex); 168 - list_for_each_entry(elem, hook_list, list) { 169 - entry = container_of(elem, struct nf_hook_entry, ops); 170 - if (entry->orig_ops == reg) { 171 - list_del_rcu(&entry->ops.list); 172 - break; 173 - } 133 + hooks_entry = nf_hook_entry_head(net, reg); 134 + if (hooks_entry->orig_ops == reg) { 135 + nf_set_hooks_head(net, reg, 136 + nf_entry_dereference(hooks_entry->next)); 137 + goto unlock; 174 138 } 139 + while (hooks_entry && nf_entry_dereference(hooks_entry->next)) { 140 + struct nf_hook_entry *next = 141 + nf_entry_dereference(hooks_entry->next); 142 + struct nf_hook_entry *nnext; 143 + 144 + if (next->orig_ops != reg) { 145 + hooks_entry = next; 146 + continue; 147 + } 148 + nnext = nf_entry_dereference(next->next); 149 + rcu_assign_pointer(hooks_entry->next, nnext); 150 + hooks_entry = next; 151 + break; 152 + } 153 + 154 + unlock: 175 155 mutex_unlock(&nf_hook_mutex); 176 - if (&elem->list == hook_list) { 156 + if (!hooks_entry) { 177 157 WARN(1, "nf_unregister_net_hook: hook not found!\n"); 178 158 return; 179 159 } ··· 194 156 static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); 195 157 #endif 196 158 synchronize_net(); 197 - nf_queue_nf_hook_drop(net, &entry->ops); 159 + nf_queue_nf_hook_drop(net, hooks_entry); 198 160 /* other cpu might still process nfqueue verdict that used reg */ 199 161 synchronize_net(); 200 - kfree(entry); 162 + kfree(hooks_entry); 201 163 } 202 164 EXPORT_SYMBOL(nf_unregister_net_hook); 203 165 ··· 296 258 } 297 259 EXPORT_SYMBOL(nf_unregister_hooks); 298 260 299 - unsigned int nf_iterate(struct list_head *head, 300 - struct sk_buff *skb, 261 + unsigned int nf_iterate(struct sk_buff *skb, 301 262 struct nf_hook_state *state, 302 - struct nf_hook_ops **elemp) 263 + struct nf_hook_entry **entryp) 303 264 { 304 265 unsigned int 
verdict; 305 266 ··· 306 269 * The caller must not block between calls to this 307 270 * function because of risk of continuing from deleted element. 308 271 */ 309 - list_for_each_entry_continue_rcu((*elemp), head, list) { 310 - if (state->thresh > (*elemp)->priority) 272 + while (*entryp) { 273 + if (state->thresh > (*entryp)->ops.priority) { 274 + *entryp = rcu_dereference((*entryp)->next); 311 275 continue; 276 + } 312 277 313 278 /* Optimization: we don't need to hold module 314 279 reference here, since function can't sleep. --RR */ 315 280 repeat: 316 - verdict = (*elemp)->hook((*elemp)->priv, skb, state); 281 + verdict = (*entryp)->ops.hook((*entryp)->ops.priv, skb, state); 317 282 if (verdict != NF_ACCEPT) { 318 283 #ifdef CONFIG_NETFILTER_DEBUG 319 284 if (unlikely((verdict & NF_VERDICT_MASK) 320 285 > NF_MAX_VERDICT)) { 321 286 NFDEBUG("Evil return from %p(%u).\n", 322 - (*elemp)->hook, state->hook); 287 + (*entryp)->ops.hook, state->hook); 288 + *entryp = rcu_dereference((*entryp)->next); 323 289 continue; 324 290 } 325 291 #endif ··· 330 290 return verdict; 331 291 goto repeat; 332 292 } 293 + *entryp = rcu_dereference((*entryp)->next); 333 294 } 334 295 return NF_ACCEPT; 335 296 } ··· 340 299 * -EPERM for NF_DROP, 0 otherwise. Caller must hold rcu_read_lock. 
*/ 341 300 int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state) 342 301 { 343 - struct nf_hook_ops *elem; 302 + struct nf_hook_entry *entry; 344 303 unsigned int verdict; 345 304 int ret = 0; 346 305 347 - elem = list_entry_rcu(state->hook_list, struct nf_hook_ops, list); 306 + entry = rcu_dereference(state->hook_entries); 348 307 next_hook: 349 - verdict = nf_iterate(state->hook_list, skb, state, &elem); 308 + verdict = nf_iterate(skb, state, &entry); 350 309 if (verdict == NF_ACCEPT || verdict == NF_STOP) { 351 310 ret = 1; 352 311 } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) { ··· 355 314 if (ret == 0) 356 315 ret = -EPERM; 357 316 } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) { 358 - int err = nf_queue(skb, elem, state, 359 - verdict >> NF_VERDICT_QBITS); 317 + int err; 318 + 319 + RCU_INIT_POINTER(state->hook_entries, entry); 320 + err = nf_queue(skb, state, verdict >> NF_VERDICT_QBITS); 360 321 if (err < 0) { 361 322 if (err == -ESRCH && 362 323 (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) ··· 485 442 486 443 for (i = 0; i < ARRAY_SIZE(net->nf.hooks); i++) { 487 444 for (h = 0; h < NF_MAX_HOOKS; h++) 488 - INIT_LIST_HEAD(&net->nf.hooks[i][h]); 445 + RCU_INIT_POINTER(net->nf.hooks[i][h], NULL); 489 446 } 490 447 491 448 #ifdef CONFIG_PROC_FS
+5 -5
net/netfilter/nf_internals.h
··· 13 13 14 14 15 15 /* core.c */ 16 - unsigned int nf_iterate(struct list_head *head, struct sk_buff *skb, 17 - struct nf_hook_state *state, struct nf_hook_ops **elemp); 16 + unsigned int nf_iterate(struct sk_buff *skb, struct nf_hook_state *state, 17 + struct nf_hook_entry **entryp); 18 18 19 19 /* nf_queue.c */ 20 - int nf_queue(struct sk_buff *skb, struct nf_hook_ops *elem, 21 - struct nf_hook_state *state, unsigned int queuenum); 22 - void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops); 20 + int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, 21 + unsigned int queuenum); 22 + void nf_queue_nf_hook_drop(struct net *net, const struct nf_hook_entry *entry); 23 23 int __init netfilter_queue_init(void); 24 24 25 25 /* nf_log.c */
+10 -8
net/netfilter/nf_queue.c
··· 96 96 } 97 97 EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); 98 98 99 - void nf_queue_nf_hook_drop(struct net *net, struct nf_hook_ops *ops) 99 + void nf_queue_nf_hook_drop(struct net *net, const struct nf_hook_entry *entry) 100 100 { 101 101 const struct nf_queue_handler *qh; 102 102 103 103 rcu_read_lock(); 104 104 qh = rcu_dereference(net->nf.queue_handler); 105 105 if (qh) 106 - qh->nf_hook_drop(net, ops); 106 + qh->nf_hook_drop(net, entry); 107 107 rcu_read_unlock(); 108 108 } 109 109 ··· 112 112 * through nf_reinject(). 113 113 */ 114 114 int nf_queue(struct sk_buff *skb, 115 - struct nf_hook_ops *elem, 116 115 struct nf_hook_state *state, 117 116 unsigned int queuenum) 118 117 { ··· 140 141 141 142 *entry = (struct nf_queue_entry) { 142 143 .skb = skb, 143 - .elem = elem, 144 144 .state = *state, 145 145 .size = sizeof(*entry) + afinfo->route_key_size, 146 146 }; ··· 163 165 164 166 void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) 165 167 { 168 + struct nf_hook_entry *hook_entry; 166 169 struct sk_buff *skb = entry->skb; 167 - struct nf_hook_ops *elem = entry->elem; 168 170 const struct nf_afinfo *afinfo; 171 + struct nf_hook_ops *elem; 169 172 int err; 173 + 174 + hook_entry = rcu_dereference(entry->state.hook_entries); 175 + elem = &hook_entry->ops; 170 176 171 177 nf_queue_entry_release_refs(entry); 172 178 ··· 188 186 189 187 if (verdict == NF_ACCEPT) { 190 188 next_hook: 191 - verdict = nf_iterate(entry->state.hook_list, 192 - skb, &entry->state, &elem); 189 + verdict = nf_iterate(skb, &entry->state, &hook_entry); 193 190 } 194 191 195 192 switch (verdict & NF_VERDICT_MASK) { ··· 199 198 local_bh_enable(); 200 199 break; 201 200 case NF_QUEUE: 202 - err = nf_queue(skb, elem, &entry->state, 201 + RCU_INIT_POINTER(entry->state.hook_entries, hook_entry); 202 + err = nf_queue(skb, &entry->state, 203 203 verdict >> NF_VERDICT_QBITS); 204 204 if (err < 0) { 205 205 if (err == -ESRCH &&