Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

netfilter: ipset: Prepare the ipset core to use RCU at set level

Replace rwlock_t with spinlock_t in "struct ip_set" and change the locking
accordingly. Convert the comment extension into an RCU-aware object. Also,
simplify the timeout routines.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>

+66 -50
+7 -2
include/linux/netfilter/ipset/ip_set.h
··· 108 108 atomic64_t packets; 109 109 }; 110 110 111 + struct ip_set_comment_rcu { 112 + struct rcu_head rcu; 113 + char str[0]; 114 + }; 115 + 111 116 struct ip_set_comment { 112 - char *str; 117 + struct ip_set_comment_rcu __rcu *c; 113 118 }; 114 119 115 120 struct ip_set_skbinfo { ··· 231 226 /* The name of the set */ 232 227 char name[IPSET_MAXNAMELEN]; 233 228 /* Lock protecting the set data */ 234 - rwlock_t lock; 229 + spinlock_t lock; 235 230 /* References to the set */ 236 231 u32 ref; 237 232 /* The core set type */
+27 -11
include/linux/netfilter/ipset/ip_set_comment.h
··· 16 16 return nla_data(tb); 17 17 } 18 18 19 + /* Called from uadd only, protected by the set spinlock. 20 + * The kadt functions don't use the comment extensions in any way. 21 + */ 19 22 static inline void 20 23 ip_set_init_comment(struct ip_set_comment *comment, 21 24 const struct ip_set_ext *ext) 22 25 { 26 + struct ip_set_comment_rcu *c = rcu_dereference_protected(comment->c, 1); 23 27 size_t len = ext->comment ? strlen(ext->comment) : 0; 24 28 25 - if (unlikely(comment->str)) { 26 - kfree(comment->str); 27 - comment->str = NULL; 29 + if (unlikely(c)) { 30 + kfree_rcu(c, rcu); 31 + rcu_assign_pointer(comment->c, NULL); 28 32 } 29 33 if (!len) 30 34 return; 31 35 if (unlikely(len > IPSET_MAX_COMMENT_SIZE)) 32 36 len = IPSET_MAX_COMMENT_SIZE; 33 - comment->str = kzalloc(len + 1, GFP_ATOMIC); 34 - if (unlikely(!comment->str)) 37 + c = kzalloc(sizeof(*c) + len + 1, GFP_ATOMIC); 38 + if (unlikely(!c)) 35 39 return; 36 - strlcpy(comment->str, ext->comment, len + 1); 40 + strlcpy(c->str, ext->comment, len + 1); 41 + rcu_assign_pointer(comment->c, c); 37 42 } 38 43 44 + /* Used only when dumping a set, protected by rcu_read_lock_bh() */ 39 45 static inline int 40 46 ip_set_put_comment(struct sk_buff *skb, struct ip_set_comment *comment) 41 47 { 42 - if (!comment->str) 48 + struct ip_set_comment_rcu *c = rcu_dereference_bh(comment->c); 49 + 50 + if (!c) 43 51 return 0; 44 - return nla_put_string(skb, IPSET_ATTR_COMMENT, comment->str); 52 + return nla_put_string(skb, IPSET_ATTR_COMMENT, c->str); 45 53 } 46 54 55 + /* Called from uadd/udel, flush or the garbage collectors protected 56 + * by the set spinlock. 57 + * Called when the set is destroyed and when there can't be any user 58 + * of the set data anymore. 
59 + */ 47 60 static inline void 48 61 ip_set_comment_free(struct ip_set_comment *comment) 49 62 { 50 - if (unlikely(!comment->str)) 63 + struct ip_set_comment_rcu *c; 64 + 65 + c = rcu_dereference_protected(comment->c, 1); 66 + if (unlikely(!c)) 51 67 return; 52 - kfree(comment->str); 53 - comment->str = NULL; 68 + kfree_rcu(c, rcu); 69 + rcu_assign_pointer(comment->c, NULL); 54 70 } 55 71 56 72 #endif
+10 -15
include/linux/netfilter/ipset/ip_set_timeout.h
··· 40 40 } 41 41 42 42 static inline bool 43 - ip_set_timeout_test(unsigned long timeout) 43 + ip_set_timeout_expired(unsigned long *t) 44 44 { 45 - return timeout == IPSET_ELEM_PERMANENT || 46 - time_is_after_jiffies(timeout); 47 - } 48 - 49 - static inline bool 50 - ip_set_timeout_expired(unsigned long *timeout) 51 - { 52 - return *timeout != IPSET_ELEM_PERMANENT && 53 - time_is_before_jiffies(*timeout); 45 + return *t != IPSET_ELEM_PERMANENT && time_is_before_jiffies(*t); 54 46 } 55 47 56 48 static inline void 57 - ip_set_timeout_set(unsigned long *timeout, u32 t) 49 + ip_set_timeout_set(unsigned long *timeout, u32 value) 58 50 { 59 - if (!t) { 51 + unsigned long t; 52 + 53 + if (!value) { 60 54 *timeout = IPSET_ELEM_PERMANENT; 61 55 return; 62 56 } 63 57 64 - *timeout = msecs_to_jiffies(t * MSEC_PER_SEC) + jiffies; 65 - if (*timeout == IPSET_ELEM_PERMANENT) 58 + t = msecs_to_jiffies(value * MSEC_PER_SEC) + jiffies; 59 + if (t == IPSET_ELEM_PERMANENT) 66 60 /* Bingo! :-) */ 67 - (*timeout)--; 61 + t--; 62 + *timeout = t; 68 63 } 69 64 70 65 static inline u32
+22 -22
net/netfilter/ipset/ip_set_core.c
··· 209 209 pr_warn("ip_set type %s, family %s with revision min %u already registered!\n", 210 210 type->name, family_name(type->family), 211 211 type->revision_min); 212 - ret = -EINVAL; 213 - goto unlock; 212 + ip_set_type_unlock(); 213 + return -EINVAL; 214 214 } 215 215 list_add_rcu(&type->list, &ip_set_type_list); 216 216 pr_debug("type %s, family %s, revision %u:%u registered.\n", 217 217 type->name, family_name(type->family), 218 218 type->revision_min, type->revision_max); 219 - unlock: 220 219 ip_set_type_unlock(); 220 + 221 221 return ret; 222 222 } 223 223 EXPORT_SYMBOL_GPL(ip_set_type_register); ··· 231 231 pr_warn("ip_set type %s, family %s with revision min %u not registered\n", 232 232 type->name, family_name(type->family), 233 233 type->revision_min); 234 - goto unlock; 234 + ip_set_type_unlock(); 235 + return; 235 236 } 236 237 list_del_rcu(&type->list); 237 238 pr_debug("type %s, family %s with revision min %u unregistered.\n", 238 239 type->name, family_name(type->family), type->revision_min); 239 - unlock: 240 240 ip_set_type_unlock(); 241 241 242 242 synchronize_rcu(); ··· 531 531 !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) 532 532 return 0; 533 533 534 - read_lock_bh(&set->lock); 534 + rcu_read_lock_bh(); 535 535 ret = set->variant->kadt(set, skb, par, IPSET_TEST, opt); 536 - read_unlock_bh(&set->lock); 536 + rcu_read_unlock_bh(); 537 537 538 538 if (ret == -EAGAIN) { 539 539 /* Type requests element to be completed */ 540 540 pr_debug("element must be completed, ADD is triggered\n"); 541 - write_lock_bh(&set->lock); 541 + spin_lock_bh(&set->lock); 542 542 set->variant->kadt(set, skb, par, IPSET_ADD, opt); 543 - write_unlock_bh(&set->lock); 543 + spin_unlock_bh(&set->lock); 544 544 ret = 1; 545 545 } else { 546 546 /* --return-nomatch: invert matched element */ ··· 570 570 !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) 571 571 return -IPSET_ERR_TYPE_MISMATCH; 572 572 573 - write_lock_bh(&set->lock); 573 + 
spin_lock_bh(&set->lock); 574 574 ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt); 575 - write_unlock_bh(&set->lock); 575 + spin_unlock_bh(&set->lock); 576 576 577 577 return ret; 578 578 } ··· 593 593 !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) 594 594 return -IPSET_ERR_TYPE_MISMATCH; 595 595 596 - write_lock_bh(&set->lock); 596 + spin_lock_bh(&set->lock); 597 597 ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt); 598 - write_unlock_bh(&set->lock); 598 + spin_unlock_bh(&set->lock); 599 599 600 600 return ret; 601 601 } ··· 880 880 set = kzalloc(sizeof(struct ip_set), GFP_KERNEL); 881 881 if (!set) 882 882 return -ENOMEM; 883 - rwlock_init(&set->lock); 883 + spin_lock_init(&set->lock); 884 884 strlcpy(set->name, name, IPSET_MAXNAMELEN); 885 885 set->family = family; 886 886 set->revision = revision; ··· 1062 1062 { 1063 1063 pr_debug("set: %s\n", set->name); 1064 1064 1065 - write_lock_bh(&set->lock); 1065 + spin_lock_bh(&set->lock); 1066 1066 set->variant->flush(set); 1067 - write_unlock_bh(&set->lock); 1067 + spin_unlock_bh(&set->lock); 1068 1068 } 1069 1069 1070 1070 static int ··· 1377 1377 set->variant->uref(set, cb, true); 1378 1378 /* Fall through and add elements */ 1379 1379 default: 1380 - read_lock_bh(&set->lock); 1380 + rcu_read_lock_bh(); 1381 1381 ret = set->variant->list(set, skb, cb); 1382 - read_unlock_bh(&set->lock); 1382 + rcu_read_unlock_bh(); 1383 1383 if (!cb->args[IPSET_CB_ARG0]) 1384 1384 /* Set is done, proceed with next one */ 1385 1385 goto next_set; ··· 1462 1462 bool eexist = flags & IPSET_FLAG_EXIST, retried = false; 1463 1463 1464 1464 do { 1465 - write_lock_bh(&set->lock); 1465 + spin_lock_bh(&set->lock); 1466 1466 ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried); 1467 - write_unlock_bh(&set->lock); 1467 + spin_unlock_bh(&set->lock); 1468 1468 retried = true; 1469 1469 } while (ret == -EAGAIN && 1470 1470 set->variant->resize && ··· 1644 1644 set->type->adt_policy)) 1645 1645 return 
-IPSET_ERR_PROTOCOL; 1646 1646 1647 - read_lock_bh(&set->lock); 1647 + rcu_read_lock_bh(); 1648 1648 ret = set->variant->uadt(set, tb, IPSET_TEST, NULL, 0, 0); 1649 - read_unlock_bh(&set->lock); 1649 + rcu_read_unlock_bh(); 1650 1650 /* Userspace can't trigger element to be re-added */ 1651 1651 if (ret == -EAGAIN) 1652 1652 ret = 1;