netfilter: ipset: References are protected by rwlock instead of mutex

The timeout variant of the list:set type must reference the member sets.
However, its garbage collector runs in timer-interrupt context, so protecting
the references with a mutex is not an option. Therefore the reference
protection is converted to an rwlock.

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>

authored by Jozsef Kadlecsik and committed by Patrick McHardy 6604271c 49a341f2

+73 -56
+1 -1
include/linux/netfilter/ipset/ip_set.h
··· 293 293 /* Lock protecting the set data */ 294 294 rwlock_t lock; 295 295 /* References to the set */ 296 - atomic_t ref; 296 + u32 ref; 297 297 /* The core set type */ 298 298 struct ip_set_type *type; 299 299 /* The type variant doing the real job */
+1 -2
include/linux/netfilter/ipset/ip_set_ahash.h
··· 515 515 if (h->netmask != HOST_MASK) 516 516 NLA_PUT_U8(skb, IPSET_ATTR_NETMASK, h->netmask); 517 517 #endif 518 - NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, 519 - htonl(atomic_read(&set->ref) - 1)); 518 + NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)); 520 519 NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)); 521 520 if (with_timeout(h->timeout)) 522 521 NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(h->timeout));
+1 -2
net/netfilter/ipset/ip_set_bitmap_ip.c
··· 338 338 NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)); 339 339 if (map->netmask != 32) 340 340 NLA_PUT_U8(skb, IPSET_ATTR_NETMASK, map->netmask); 341 - NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, 342 - htonl(atomic_read(&set->ref) - 1)); 341 + NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)); 343 342 NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE, 344 343 htonl(sizeof(*map) + map->memsize)); 345 344 if (with_timeout(map->timeout))
+1 -2
net/netfilter/ipset/ip_set_bitmap_ipmac.c
··· 434 434 goto nla_put_failure; 435 435 NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, htonl(map->first_ip)); 436 436 NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip)); 437 - NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, 438 - htonl(atomic_read(&set->ref) - 1)); 437 + NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)); 439 438 NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE, 440 439 htonl(sizeof(*map) 441 440 + (map->last_ip - map->first_ip + 1) * map->dsize));
+1 -2
net/netfilter/ipset/ip_set_bitmap_port.c
··· 320 320 goto nla_put_failure; 321 321 NLA_PUT_NET16(skb, IPSET_ATTR_PORT, htons(map->first_port)); 322 322 NLA_PUT_NET16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port)); 323 - NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, 324 - htonl(atomic_read(&set->ref) - 1)); 323 + NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)); 325 324 NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE, 326 325 htonl(sizeof(*map) + map->memsize)); 327 326 if (with_timeout(map->timeout))
+66 -43
net/netfilter/ipset/ip_set_core.c
··· 26 26 27 27 static LIST_HEAD(ip_set_type_list); /* all registered set types */ 28 28 static DEFINE_MUTEX(ip_set_type_mutex); /* protects ip_set_type_list */ 29 + static DEFINE_RWLOCK(ip_set_ref_lock); /* protects the set refs */ 29 30 30 31 static struct ip_set **ip_set_list; /* all individual sets */ 31 32 static ip_set_id_t ip_set_max = CONFIG_IP_SET_MAX; /* max number of sets */ ··· 302 301 static inline void 303 302 __ip_set_get(ip_set_id_t index) 304 303 { 305 - atomic_inc(&ip_set_list[index]->ref); 304 + write_lock_bh(&ip_set_ref_lock); 305 + ip_set_list[index]->ref++; 306 + write_unlock_bh(&ip_set_ref_lock); 306 307 } 307 308 308 309 static inline void 309 310 __ip_set_put(ip_set_id_t index) 310 311 { 311 - atomic_dec(&ip_set_list[index]->ref); 312 + write_lock_bh(&ip_set_ref_lock); 313 + BUG_ON(ip_set_list[index]->ref == 0); 314 + ip_set_list[index]->ref--; 315 + write_unlock_bh(&ip_set_ref_lock); 312 316 } 313 317 314 318 /* ··· 330 324 struct ip_set *set = ip_set_list[index]; 331 325 int ret = 0; 332 326 333 - BUG_ON(set == NULL || atomic_read(&set->ref) == 0); 327 + BUG_ON(set == NULL); 334 328 pr_debug("set %s, index %u\n", set->name, index); 335 329 336 330 if (dim < set->type->dimension || ··· 362 356 struct ip_set *set = ip_set_list[index]; 363 357 int ret; 364 358 365 - BUG_ON(set == NULL || atomic_read(&set->ref) == 0); 359 + BUG_ON(set == NULL); 366 360 pr_debug("set %s, index %u\n", set->name, index); 367 361 368 362 if (dim < set->type->dimension || ··· 384 378 struct ip_set *set = ip_set_list[index]; 385 379 int ret = 0; 386 380 387 - BUG_ON(set == NULL || atomic_read(&set->ref) == 0); 381 + BUG_ON(set == NULL); 388 382 pr_debug("set %s, index %u\n", set->name, index); 389 383 390 384 if (dim < set->type->dimension || ··· 403 397 * Find set by name, reference it once. The reference makes sure the 404 398 * thing pointed to, does not go away under our feet. 405 399 * 406 - * The nfnl mutex must already be activated. 
407 400 */ 408 401 ip_set_id_t 409 402 ip_set_get_byname(const char *name, struct ip_set **set) ··· 428 423 * reference count by 1. The caller shall not assume the index 429 424 * to be valid, after calling this function. 430 425 * 431 - * The nfnl mutex must already be activated. 432 426 */ 433 427 void 434 428 ip_set_put_byindex(ip_set_id_t index) 435 429 { 436 - if (ip_set_list[index] != NULL) { 437 - BUG_ON(atomic_read(&ip_set_list[index]->ref) == 0); 430 + if (ip_set_list[index] != NULL) 438 431 __ip_set_put(index); 439 - } 440 432 } 441 433 EXPORT_SYMBOL_GPL(ip_set_put_byindex); 442 434 ··· 443 441 * can't be destroyed. The set cannot be renamed due to 444 442 * the referencing either. 445 443 * 446 - * The nfnl mutex must already be activated. 447 444 */ 448 445 const char * 449 446 ip_set_name_byindex(ip_set_id_t index) ··· 450 449 const struct ip_set *set = ip_set_list[index]; 451 450 452 451 BUG_ON(set == NULL); 453 - BUG_ON(atomic_read(&set->ref) == 0); 452 + BUG_ON(set->ref == 0); 454 453 455 454 /* Referenced, so it's safe */ 456 455 return set->name; ··· 516 515 ip_set_nfnl_put(ip_set_id_t index) 517 516 { 518 517 nfnl_lock(); 519 - if (ip_set_list[index] != NULL) { 520 - BUG_ON(atomic_read(&ip_set_list[index]->ref) == 0); 521 - __ip_set_put(index); 522 - } 518 + ip_set_put_byindex(index); 523 519 nfnl_unlock(); 524 520 } 525 521 EXPORT_SYMBOL_GPL(ip_set_nfnl_put); ··· 524 526 /* 525 527 * Communication protocol with userspace over netlink. 526 528 * 527 - * We already locked by nfnl_lock. 529 + * The commands are serialized by the nfnl mutex. 528 530 */ 529 531 530 532 static inline bool ··· 655 657 return -ENOMEM; 656 658 rwlock_init(&set->lock); 657 659 strlcpy(set->name, name, IPSET_MAXNAMELEN); 658 - atomic_set(&set->ref, 0); 659 660 set->family = family; 660 661 661 662 /* ··· 687 690 688 691 /* 689 692 * Here, we have a valid, constructed set and we are protected 690 - * by nfnl_lock. 
Find the first free index in ip_set_list and 691 - * check clashing. 693 + * by the nfnl mutex. Find the first free index in ip_set_list 694 + * and check clashing. 692 695 */ 693 696 if ((ret = find_free_id(set->name, &index, &clash)) != 0) { 694 697 /* If this is the same set and requested, ignore error */ ··· 748 751 const struct nlattr * const attr[]) 749 752 { 750 753 ip_set_id_t i; 754 + int ret = 0; 751 755 752 756 if (unlikely(protocol_failed(attr))) 753 757 return -IPSET_ERR_PROTOCOL; 754 758 755 - /* References are protected by the nfnl mutex */ 759 + /* Commands are serialized and references are 760 + * protected by the ip_set_ref_lock. 761 + * External systems (i.e. xt_set) must call 762 + * ip_set_put|get_nfnl_* functions, that way we 763 + * can safely check references here. 764 + * 765 + * list:set timer can only decrement the reference 766 + * counter, so if it's already zero, we can proceed 767 + * without holding the lock. 768 + */ 769 + read_lock_bh(&ip_set_ref_lock); 756 770 if (!attr[IPSET_ATTR_SETNAME]) { 757 771 for (i = 0; i < ip_set_max; i++) { 758 - if (ip_set_list[i] != NULL && 759 - (atomic_read(&ip_set_list[i]->ref))) 760 - return -IPSET_ERR_BUSY; 772 + if (ip_set_list[i] != NULL && ip_set_list[i]->ref) { 773 + ret = IPSET_ERR_BUSY; 774 + goto out; 775 + } 761 776 } 777 + read_unlock_bh(&ip_set_ref_lock); 762 778 for (i = 0; i < ip_set_max; i++) { 763 779 if (ip_set_list[i] != NULL) 764 780 ip_set_destroy_set(i); 765 781 } 766 782 } else { 767 783 i = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME])); 768 - if (i == IPSET_INVALID_ID) 769 - return -ENOENT; 770 - else if (atomic_read(&ip_set_list[i]->ref)) 771 - return -IPSET_ERR_BUSY; 784 + if (i == IPSET_INVALID_ID) { 785 + ret = -ENOENT; 786 + goto out; 787 + } else if (ip_set_list[i]->ref) { 788 + ret = -IPSET_ERR_BUSY; 789 + goto out; 790 + } 791 + read_unlock_bh(&ip_set_ref_lock); 772 792 773 793 ip_set_destroy_set(i); 774 794 } 775 795 return 0; 796 + out: 797 + 
read_unlock_bh(&ip_set_ref_lock); 798 + return ret; 776 799 } 777 800 778 801 /* Flush sets */ ··· 851 834 struct ip_set *set; 852 835 const char *name2; 853 836 ip_set_id_t i; 837 + int ret = 0; 854 838 855 839 if (unlikely(protocol_failed(attr) || 856 840 attr[IPSET_ATTR_SETNAME] == NULL || ··· 861 843 set = find_set(nla_data(attr[IPSET_ATTR_SETNAME])); 862 844 if (set == NULL) 863 845 return -ENOENT; 864 - if (atomic_read(&set->ref) != 0) 865 - return -IPSET_ERR_REFERENCED; 846 + 847 + read_lock_bh(&ip_set_ref_lock); 848 + if (set->ref != 0) { 849 + ret = -IPSET_ERR_REFERENCED; 850 + goto out; 851 + } 866 852 867 853 name2 = nla_data(attr[IPSET_ATTR_SETNAME2]); 868 854 for (i = 0; i < ip_set_max; i++) { 869 855 if (ip_set_list[i] != NULL && 870 - STREQ(ip_set_list[i]->name, name2)) 871 - return -IPSET_ERR_EXIST_SETNAME2; 856 + STREQ(ip_set_list[i]->name, name2)) { 857 + ret = -IPSET_ERR_EXIST_SETNAME2; 858 + goto out; 859 + } 872 860 } 873 861 strncpy(set->name, name2, IPSET_MAXNAMELEN); 874 862 875 - return 0; 863 + out: 864 + read_unlock_bh(&ip_set_ref_lock); 865 + return ret; 876 866 } 877 867 878 868 /* Swap two sets so that name/index points to the other. 879 869 * References and set names are also swapped. 880 870 * 881 - * We are protected by the nfnl mutex and references are 882 - * manipulated only by holding the mutex. The kernel interfaces 871 + * The commands are serialized by the nfnl mutex and references are 872 + * protected by the ip_set_ref_lock. The kernel interfaces 883 873 * do not hold the mutex but the pointer settings are atomic 884 874 * so the ip_set_list always contains valid pointers to the sets. 
885 875 */ ··· 900 874 struct ip_set *from, *to; 901 875 ip_set_id_t from_id, to_id; 902 876 char from_name[IPSET_MAXNAMELEN]; 903 - u32 from_ref; 904 877 905 878 if (unlikely(protocol_failed(attr) || 906 879 attr[IPSET_ATTR_SETNAME] == NULL || ··· 924 899 from->type->family == to->type->family)) 925 900 return -IPSET_ERR_TYPE_MISMATCH; 926 901 927 - /* No magic here: ref munging protected by the nfnl_lock */ 928 902 strncpy(from_name, from->name, IPSET_MAXNAMELEN); 929 - from_ref = atomic_read(&from->ref); 930 - 931 903 strncpy(from->name, to->name, IPSET_MAXNAMELEN); 932 - atomic_set(&from->ref, atomic_read(&to->ref)); 933 904 strncpy(to->name, from_name, IPSET_MAXNAMELEN); 934 - atomic_set(&to->ref, from_ref); 935 905 906 + write_lock_bh(&ip_set_ref_lock); 907 + swap(from->ref, to->ref); 936 908 ip_set_list[from_id] = to; 937 909 ip_set_list[to_id] = from; 910 + write_unlock_bh(&ip_set_ref_lock); 938 911 939 912 return 0; 940 913 } ··· 949 926 { 950 927 if (cb->args[2]) { 951 928 pr_debug("release set %s\n", ip_set_list[cb->args[1]]->name); 952 - __ip_set_put((ip_set_id_t) cb->args[1]); 929 + ip_set_put_byindex((ip_set_id_t) cb->args[1]); 953 930 } 954 931 return 0; 955 932 } ··· 1091 1068 /* If there was an error or set is done, release set */ 1092 1069 if (ret || !cb->args[2]) { 1093 1070 pr_debug("release set %s\n", ip_set_list[index]->name); 1094 - __ip_set_put(index); 1071 + ip_set_put_byindex(index); 1095 1072 } 1096 1073 1097 1074 /* If we dump all sets, continue with dumping last ones */
+2 -4
net/netfilter/ipset/ip_set_list_set.c
··· 366 366 NLA_PUT_NET32(skb, IPSET_ATTR_SIZE, htonl(map->size)); 367 367 if (with_timeout(map->timeout)) 368 368 NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout)); 369 - NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, 370 - htonl(atomic_read(&set->ref) - 1)); 369 + NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)); 371 370 NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE, 372 371 htonl(sizeof(*map) + map->size * map->dsize)); 373 372 ipset_nest_end(skb, nested); ··· 456 457 struct list_set *map = set->data; 457 458 struct set_telem *e; 458 459 u32 i; 459 - 460 - /* nfnl_lock should be called */ 460 + 461 461 write_lock_bh(&set->lock); 462 462 for (i = 0; i < map->size; i++) { 463 463 e = list_set_telem(map, i);