netfilter: ipset: references are protected by rwlock instead of mutex

The timeout variant of the list:set type must reference its member sets.
However, its garbage collector runs from a timer, i.e. in softirq
context, so a mutex cannot be used to protect the references. Therefore
the reference protection is converted to an rwlock.
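
To illustrate the resulting pattern, here is a minimal sketch with
illustrative names (ref_lock/ref_get/ref_put are not the patched
identifiers): the counter becomes a plain u32 guarded by an rwlock
taken with the _bh variants, which also disable bottom halves and so
safely exclude the timer-driven garbage collector:

	#include <linux/types.h>
	#include <linux/spinlock.h>
	#include <linux/bug.h>

	static DEFINE_RWLOCK(ref_lock);	/* protects "ref" */
	static u32 ref;			/* plain counter instead of atomic_t */

	static inline void ref_get(void)
	{
		write_lock_bh(&ref_lock);
		ref++;
		write_unlock_bh(&ref_lock);
	}

	static inline void ref_put(void)
	{
		write_lock_bh(&ref_lock);
		BUG_ON(ref == 0);
		ref--;		/* a timer (softirq) callback may do this */
		write_unlock_bh(&ref_lock);
	}

Paths that only check the counters (the busy tests in destroy and
rename below) take read_lock_bh() for a consistent snapshot.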

Signed-off-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Patrick McHardy <kaber@trash.net>

Authored by Jozsef Kadlecsik, committed by Patrick McHardy (2f9f28b2, 512d06b5)

7 files changed, 73 insertions(+), 56 deletions(-)
include/linux/netfilter/ipset/ip_set.h (+1 -1)
···
 	/* Lock protecting the set data */
 	rwlock_t lock;
 	/* References to the set */
-	atomic_t ref;
+	u32 ref;
 	/* The core set type */
 	struct ip_set_type *type;
 	/* The type variant doing the real job */
include/linux/netfilter/ipset/ip_set_ahash.h (+1 -2)
···
 	if (h->netmask != HOST_MASK)
 		NLA_PUT_U8(skb, IPSET_ATTR_NETMASK, h->netmask);
 #endif
-	NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES,
-		      htonl(atomic_read(&set->ref) - 1));
+	NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1));
 	NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize));
 	if (with_timeout(h->timeout))
 		NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(h->timeout));
net/netfilter/ipset/ip_set_bitmap_ip.c (+1 -2)
···
 	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip));
 	if (map->netmask != 32)
 		NLA_PUT_U8(skb, IPSET_ATTR_NETMASK, map->netmask);
-	NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES,
-		      htonl(atomic_read(&set->ref) - 1));
+	NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1));
 	NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE,
 		      htonl(sizeof(*map) + map->memsize));
 	if (with_timeout(map->timeout))
net/netfilter/ipset/ip_set_bitmap_ipmac.c (+1 -2)
···
 		goto nla_put_failure;
 	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP, htonl(map->first_ip));
 	NLA_PUT_IPADDR4(skb, IPSET_ATTR_IP_TO, htonl(map->last_ip));
-	NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES,
-		      htonl(atomic_read(&set->ref) - 1));
+	NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1));
 	NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE,
 		      htonl(sizeof(*map)
 			    + (map->last_ip - map->first_ip + 1) * map->dsize));
net/netfilter/ipset/ip_set_bitmap_port.c (+1 -2)
···
 		goto nla_put_failure;
 	NLA_PUT_NET16(skb, IPSET_ATTR_PORT, htons(map->first_port));
 	NLA_PUT_NET16(skb, IPSET_ATTR_PORT_TO, htons(map->last_port));
-	NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES,
-		      htonl(atomic_read(&set->ref) - 1));
+	NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1));
 	NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE,
 		      htonl(sizeof(*map) + map->memsize));
 	if (with_timeout(map->timeout))
net/netfilter/ipset/ip_set_core.c (+66 -43)
···
 
 static LIST_HEAD(ip_set_type_list);	/* all registered set types */
 static DEFINE_MUTEX(ip_set_type_mutex);	/* protects ip_set_type_list */
+static DEFINE_RWLOCK(ip_set_ref_lock);	/* protects the set refs */
 
 static struct ip_set **ip_set_list;	/* all individual sets */
 static ip_set_id_t ip_set_max = CONFIG_IP_SET_MAX; /* max number of sets */
···
 static inline void
 __ip_set_get(ip_set_id_t index)
 {
-	atomic_inc(&ip_set_list[index]->ref);
+	write_lock_bh(&ip_set_ref_lock);
+	ip_set_list[index]->ref++;
+	write_unlock_bh(&ip_set_ref_lock);
 }
 
 static inline void
 __ip_set_put(ip_set_id_t index)
 {
-	atomic_dec(&ip_set_list[index]->ref);
+	write_lock_bh(&ip_set_ref_lock);
+	BUG_ON(ip_set_list[index]->ref == 0);
+	ip_set_list[index]->ref--;
+	write_unlock_bh(&ip_set_ref_lock);
 }
 
 /*
···
 	struct ip_set *set = ip_set_list[index];
 	int ret = 0;
 
-	BUG_ON(set == NULL || atomic_read(&set->ref) == 0);
+	BUG_ON(set == NULL);
 	pr_debug("set %s, index %u\n", set->name, index);
 
 	if (dim < set->type->dimension ||
···
 	struct ip_set *set = ip_set_list[index];
 	int ret;
 
-	BUG_ON(set == NULL || atomic_read(&set->ref) == 0);
+	BUG_ON(set == NULL);
 	pr_debug("set %s, index %u\n", set->name, index);
 
 	if (dim < set->type->dimension ||
···
 	struct ip_set *set = ip_set_list[index];
 	int ret = 0;
 
-	BUG_ON(set == NULL || atomic_read(&set->ref) == 0);
+	BUG_ON(set == NULL);
 	pr_debug("set %s, index %u\n", set->name, index);
 
 	if (dim < set->type->dimension ||
···
  * Find set by name, reference it once. The reference makes sure the
  * thing pointed to, does not go away under our feet.
  *
- * The nfnl mutex must already be activated.
  */
 ip_set_id_t
 ip_set_get_byname(const char *name, struct ip_set **set)
···
  * reference count by 1. The caller shall not assume the index
  * to be valid, after calling this function.
  *
- * The nfnl mutex must already be activated.
  */
 void
 ip_set_put_byindex(ip_set_id_t index)
 {
-	if (ip_set_list[index] != NULL) {
-		BUG_ON(atomic_read(&ip_set_list[index]->ref) == 0);
+	if (ip_set_list[index] != NULL)
 		__ip_set_put(index);
-	}
 }
 EXPORT_SYMBOL_GPL(ip_set_put_byindex);
 
···
  * can't be destroyed. The set cannot be renamed due to
  * the referencing either.
  *
- * The nfnl mutex must already be activated.
  */
 const char *
 ip_set_name_byindex(ip_set_id_t index)
···
 	const struct ip_set *set = ip_set_list[index];
 
 	BUG_ON(set == NULL);
-	BUG_ON(atomic_read(&set->ref) == 0);
+	BUG_ON(set->ref == 0);
 
 	/* Referenced, so it's safe */
 	return set->name;
···
 ip_set_nfnl_put(ip_set_id_t index)
 {
 	nfnl_lock();
-	if (ip_set_list[index] != NULL) {
-		BUG_ON(atomic_read(&ip_set_list[index]->ref) == 0);
-		__ip_set_put(index);
-	}
+	ip_set_put_byindex(index);
 	nfnl_unlock();
 }
 EXPORT_SYMBOL_GPL(ip_set_nfnl_put);
···
 /*
  * Communication protocol with userspace over netlink.
  *
- * We already locked by nfnl_lock.
+ * The commands are serialized by the nfnl mutex.
  */
 
 static inline bool
···
 		return -ENOMEM;
 	rwlock_init(&set->lock);
 	strlcpy(set->name, name, IPSET_MAXNAMELEN);
-	atomic_set(&set->ref, 0);
 	set->family = family;
 
 	/*
···
 
 	/*
 	 * Here, we have a valid, constructed set and we are protected
-	 * by nfnl_lock. Find the first free index in ip_set_list and
-	 * check clashing.
+	 * by the nfnl mutex. Find the first free index in ip_set_list
+	 * and check clashing.
 	 */
 	if ((ret = find_free_id(set->name, &index, &clash)) != 0) {
 		/* If this is the same set and requested, ignore error */
···
 		       const struct nlattr * const attr[])
 {
 	ip_set_id_t i;
+	int ret = 0;
 
 	if (unlikely(protocol_failed(attr)))
 		return -IPSET_ERR_PROTOCOL;
 
-	/* References are protected by the nfnl mutex */
+	/* Commands are serialized and references are
+	 * protected by the ip_set_ref_lock.
+	 * External systems (i.e. xt_set) must call
+	 * ip_set_put|get_nfnl_* functions, that way we
+	 * can safely check references here.
+	 *
+	 * list:set timer can only decrement the reference
+	 * counter, so if it's already zero, we can proceed
+	 * without holding the lock.
+	 */
+	read_lock_bh(&ip_set_ref_lock);
 	if (!attr[IPSET_ATTR_SETNAME]) {
 		for (i = 0; i < ip_set_max; i++) {
-			if (ip_set_list[i] != NULL &&
-			    (atomic_read(&ip_set_list[i]->ref)))
-				return -IPSET_ERR_BUSY;
+			if (ip_set_list[i] != NULL && ip_set_list[i]->ref) {
+				ret = -IPSET_ERR_BUSY;
+				goto out;
+			}
 		}
+		read_unlock_bh(&ip_set_ref_lock);
 		for (i = 0; i < ip_set_max; i++) {
 			if (ip_set_list[i] != NULL)
 				ip_set_destroy_set(i);
 		}
 	} else {
 		i = find_set_id(nla_data(attr[IPSET_ATTR_SETNAME]));
-		if (i == IPSET_INVALID_ID)
-			return -ENOENT;
-		else if (atomic_read(&ip_set_list[i]->ref))
-			return -IPSET_ERR_BUSY;
+		if (i == IPSET_INVALID_ID) {
+			ret = -ENOENT;
+			goto out;
+		} else if (ip_set_list[i]->ref) {
+			ret = -IPSET_ERR_BUSY;
+			goto out;
+		}
+		read_unlock_bh(&ip_set_ref_lock);
 
 		ip_set_destroy_set(i);
 	}
 	return 0;
+out:
+	read_unlock_bh(&ip_set_ref_lock);
+	return ret;
 }
 
 /* Flush sets */
···
 	struct ip_set *set;
 	const char *name2;
 	ip_set_id_t i;
+	int ret = 0;
 
 	if (unlikely(protocol_failed(attr) ||
 		     attr[IPSET_ATTR_SETNAME] == NULL ||
···
 	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
 	if (set == NULL)
 		return -ENOENT;
-	if (atomic_read(&set->ref) != 0)
-		return -IPSET_ERR_REFERENCED;
+
+	read_lock_bh(&ip_set_ref_lock);
+	if (set->ref != 0) {
+		ret = -IPSET_ERR_REFERENCED;
+		goto out;
+	}
 
 	name2 = nla_data(attr[IPSET_ATTR_SETNAME2]);
 	for (i = 0; i < ip_set_max; i++) {
 		if (ip_set_list[i] != NULL &&
-		    STREQ(ip_set_list[i]->name, name2))
-			return -IPSET_ERR_EXIST_SETNAME2;
+		    STREQ(ip_set_list[i]->name, name2)) {
+			ret = -IPSET_ERR_EXIST_SETNAME2;
+			goto out;
+		}
 	}
 	strncpy(set->name, name2, IPSET_MAXNAMELEN);
 
-	return 0;
+out:
+	read_unlock_bh(&ip_set_ref_lock);
+	return ret;
 }
 
 /* Swap two sets so that name/index points to the other.
  * References and set names are also swapped.
  *
- * We are protected by the nfnl mutex and references are
- * manipulated only by holding the mutex. The kernel interfaces
+ * The commands are serialized by the nfnl mutex and references are
+ * protected by the ip_set_ref_lock. The kernel interfaces
  * do not hold the mutex but the pointer settings are atomic
  * so the ip_set_list always contains valid pointers to the sets.
  */
···
 	struct ip_set *from, *to;
 	ip_set_id_t from_id, to_id;
 	char from_name[IPSET_MAXNAMELEN];
-	u32 from_ref;
 
 	if (unlikely(protocol_failed(attr) ||
 		     attr[IPSET_ATTR_SETNAME] == NULL ||
···
 		      from->type->family == to->type->family))
 		return -IPSET_ERR_TYPE_MISMATCH;
 
-	/* No magic here: ref munging protected by the nfnl_lock */
 	strncpy(from_name, from->name, IPSET_MAXNAMELEN);
-	from_ref = atomic_read(&from->ref);
-
 	strncpy(from->name, to->name, IPSET_MAXNAMELEN);
-	atomic_set(&from->ref, atomic_read(&to->ref));
 	strncpy(to->name, from_name, IPSET_MAXNAMELEN);
-	atomic_set(&to->ref, from_ref);
 
+	write_lock_bh(&ip_set_ref_lock);
+	swap(from->ref, to->ref);
 	ip_set_list[from_id] = to;
 	ip_set_list[to_id] = from;
+	write_unlock_bh(&ip_set_ref_lock);
 
 	return 0;
 }
···
 {
 	if (cb->args[2]) {
 		pr_debug("release set %s\n", ip_set_list[cb->args[1]]->name);
-		__ip_set_put((ip_set_id_t) cb->args[1]);
+		ip_set_put_byindex((ip_set_id_t) cb->args[1]);
 	}
 	return 0;
 }
···
 	/* If there was an error or set is done, release set */
 	if (ret || !cb->args[2]) {
 		pr_debug("release set %s\n", ip_set_list[index]->name);
-		__ip_set_put(index);
+		ip_set_put_byindex(index);
 	}
 
 	/* If we dump all sets, continue with dumping last ones */
net/netfilter/ipset/ip_set_list_set.c (+2 -4)
···
 	NLA_PUT_NET32(skb, IPSET_ATTR_SIZE, htonl(map->size));
 	if (with_timeout(map->timeout))
 		NLA_PUT_NET32(skb, IPSET_ATTR_TIMEOUT, htonl(map->timeout));
-	NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES,
-		      htonl(atomic_read(&set->ref) - 1));
+	NLA_PUT_NET32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1));
 	NLA_PUT_NET32(skb, IPSET_ATTR_MEMSIZE,
 		      htonl(sizeof(*map) + map->size * map->dsize));
 	ipset_nest_end(skb, nested);
···
 	struct list_set *map = set->data;
 	struct set_telem *e;
 	u32 i;
-
-	/* nfnl_lock should be called */
+
 	write_lock_bh(&set->lock);
 	for (i = 0; i < map->size; i++) {
 		e = list_set_telem(map, i);