Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

netfilter: ipset: fix race condition in ipset save, swap and delete

This fix adds a new reference counter (ref_netlink) for the struct ip_set.
The other reference counter (ref) can be swapped out by ip_set_swap and we
need a separate counter to keep track of references for netlink events
like dump. Using the same ref counter for dump causes a race condition
which can be demonstrated by the following script:

ipset create hash_ip1 hash:ip family inet hashsize 1024 maxelem 500000 \
counters
ipset create hash_ip2 hash:ip family inet hashsize 300000 maxelem 500000 \
counters
ipset create hash_ip3 hash:ip family inet hashsize 1024 maxelem 500000 \
counters

ipset save &

ipset swap hash_ip3 hash_ip2
ipset destroy hash_ip3 /* will crash the machine */

Swap will exchange the values of ref so destroy will see ref = 0 instead of
ref = 1. With this fix in place swap will not succeed because ipset save
still has ref_netlink on the set (ip_set_swap doesn't swap ref_netlink).

Both delete and swap will error out if ref_netlink != 0 on the set.

Note: The changes to *_head functions is because previously we would
increment ref whenever we called these functions, we don't do that
anymore.

Reviewed-by: Joshua Hunt <johunt@akamai.com>
Signed-off-by: Vishwanath Pai <vpai@akamai.com>
Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

authored by

Vishwanath Pai and committed by
Pablo Neira Ayuso
596cf3fe d7be81a5

+35 -8
+4
include/linux/netfilter/ipset/ip_set.h
··· 234 234 spinlock_t lock; 235 235 /* References to the set */ 236 236 u32 ref; 237 + /* References to the set for netlink events like dump, 238 + * ref can be swapped out by ip_set_swap 239 + */ 240 + u32 ref_netlink; 237 241 /* The core set type */ 238 242 struct ip_set_type *type; 239 243 /* The type variant doing the real job */
+1 -1
net/netfilter/ipset/ip_set_bitmap_gen.h
··· 95 95 if (!nested) 96 96 goto nla_put_failure; 97 97 if (mtype_do_head(skb, map) || 98 - nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || 98 + nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || 99 99 nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize))) 100 100 goto nla_put_failure; 101 101 if (unlikely(ip_set_put_flags(skb, set)))
+28 -5
net/netfilter/ipset/ip_set_core.c
··· 497 497 write_unlock_bh(&ip_set_ref_lock); 498 498 } 499 499 500 + /* set->ref can be swapped out by ip_set_swap, netlink events (like dump) need 501 + * a separate reference counter 502 + */ 503 + static inline void 504 + __ip_set_get_netlink(struct ip_set *set) 505 + { 506 + write_lock_bh(&ip_set_ref_lock); 507 + set->ref_netlink++; 508 + write_unlock_bh(&ip_set_ref_lock); 509 + } 510 + 511 + static inline void 512 + __ip_set_put_netlink(struct ip_set *set) 513 + { 514 + write_lock_bh(&ip_set_ref_lock); 515 + BUG_ON(set->ref_netlink == 0); 516 + set->ref_netlink--; 517 + write_unlock_bh(&ip_set_ref_lock); 518 + } 519 + 500 520 /* Add, del and test set entries from kernel. 501 521 * 502 522 * The set behind the index must exist and must be referenced ··· 1022 1002 if (!attr[IPSET_ATTR_SETNAME]) { 1023 1003 for (i = 0; i < inst->ip_set_max; i++) { 1024 1004 s = ip_set(inst, i); 1025 - if (s && s->ref) { 1005 + if (s && (s->ref || s->ref_netlink)) { 1026 1006 ret = -IPSET_ERR_BUSY; 1027 1007 goto out; 1028 1008 } ··· 1044 1024 if (!s) { 1045 1025 ret = -ENOENT; 1046 1026 goto out; 1047 - } else if (s->ref) { 1027 + } else if (s->ref || s->ref_netlink) { 1048 1028 ret = -IPSET_ERR_BUSY; 1049 1029 goto out; 1050 1030 } ··· 1191 1171 from->family == to->family)) 1192 1172 return -IPSET_ERR_TYPE_MISMATCH; 1193 1173 1174 + if (from->ref_netlink || to->ref_netlink) 1175 + return -EBUSY; 1176 + 1194 1177 strncpy(from_name, from->name, IPSET_MAXNAMELEN); 1195 1178 strncpy(from->name, to->name, IPSET_MAXNAMELEN); 1196 1179 strncpy(to->name, from_name, IPSET_MAXNAMELEN); ··· 1229 1206 if (set->variant->uref) 1230 1207 set->variant->uref(set, cb, false); 1231 1208 pr_debug("release set %s\n", set->name); 1232 - __ip_set_put_byindex(inst, index); 1209 + __ip_set_put_netlink(set); 1233 1210 } 1234 1211 return 0; 1235 1212 } ··· 1351 1328 if (!cb->args[IPSET_CB_ARG0]) { 1352 1329 /* Start listing: make sure set won't be destroyed */ 1353 1330 pr_debug("reference set\n"); 1354 - set->ref++; 1331 + set->ref_netlink++; 1355 1332 } 1356 1333 write_unlock_bh(&ip_set_ref_lock); 1357 1334 nlh = start_msg(skb, NETLINK_CB(cb->skb).portid, ··· 1419 1396 if (set->variant->uref) 1420 1397 set->variant->uref(set, cb, false); 1421 1398 pr_debug("release set %s\n", set->name); 1422 - __ip_set_put_byindex(inst, index); 1399 + __ip_set_put_netlink(set); 1423 1400 cb->args[IPSET_CB_ARG0] = 0; 1424 1401 } 1425 1402 out:
+1 -1
net/netfilter/ipset/ip_set_hash_gen.h
··· 1082 1082 if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask)) 1083 1083 goto nla_put_failure; 1084 1084 #endif 1085 - if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || 1085 + if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || 1086 1086 nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize))) 1087 1087 goto nla_put_failure; 1088 1088 if (unlikely(ip_set_put_flags(skb, set)))
+1 -1
net/netfilter/ipset/ip_set_list_set.c
··· 458 458 if (!nested) 459 459 goto nla_put_failure; 460 460 if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) || 461 - nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) || 461 + nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || 462 462 nla_put_net32(skb, IPSET_ATTR_MEMSIZE, 463 463 htonl(sizeof(*map) + n * set->dsize))) 464 464 goto nla_put_failure;