netfilter: revised locking for x_tables

The x_tables are organized with a table structure and per-cpu copies
of the counters and rules. On older kernels there was a reader/writer
lock per table which was a performance bottleneck. In 2.6.30-rc, this
was converted to use RCU for the counters/rules, which solved the
performance problems for the *_do_table packet-processing path but made
replacing rules much slower because of the necessary RCU grace period.

This version uses a per-cpu set of spinlocks and counters to allow
table processing to proceed without the cache thrashing of a global
reader lock, and keeps the same performance for table updates.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>

Authored by Stephen Hemminger and committed by David S. Miller (942e4a2b, bf368e4e)

+204 -296
+68 -5
include/linux/netfilter/x_tables.h
···
         /* What hooks you will enter on */
         unsigned int valid_hooks;
 
-        /* Lock for the curtain */
-        struct mutex lock;
-
         /* Man behind the curtain... */
         struct xt_table_info *private;
 
···
 
 extern struct xt_table_info *xt_alloc_table_info(unsigned int size);
 extern void xt_free_table_info(struct xt_table_info *info);
-extern void xt_table_entry_swap_rcu(struct xt_table_info *old,
-                                    struct xt_table_info *new);
+
+/*
+ * Per-CPU spinlock associated with per-cpu table entries, and
+ * with a counter for the "reading" side that allows a recursive
+ * reader to avoid taking the lock and deadlocking.
+ *
+ * "reading" is used by ip/arp/ip6 tables rule processing which runs per-cpu.
+ * It needs to ensure that the rules are not being changed while the packet
+ * is being processed. In some cases, the read lock will be acquired
+ * twice on the same CPU; this is okay because of the count.
+ *
+ * "writing" is used when reading counters.
+ * During replace any readers that are using the old tables have to complete
+ * before freeing the old table. This is handled by the write locking
+ * necessary for reading the counters.
+ */
+struct xt_info_lock {
+        spinlock_t lock;
+        unsigned char readers;
+};
+DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks);
+
+/*
+ * Note: we need to ensure that preemption is disabled before acquiring
+ * the per-cpu-variable, so we do it as a two step process rather than
+ * using "spin_lock_bh()".
+ *
+ * We _also_ need to disable bottom half processing before updating our
+ * nesting count, to make sure that the only kind of re-entrancy is this
+ * code being called by itself: since the count+lock is not an atomic
+ * operation, we can allow no races.
+ *
+ * _Only_ that special combination of being per-cpu and never getting
+ * re-entered asynchronously means that the count is safe.
+ */
+static inline void xt_info_rdlock_bh(void)
+{
+        struct xt_info_lock *lock;
+
+        local_bh_disable();
+        lock = &__get_cpu_var(xt_info_locks);
+        if (!lock->readers++)
+                spin_lock(&lock->lock);
+}
+
+static inline void xt_info_rdunlock_bh(void)
+{
+        struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks);
+
+        if (!--lock->readers)
+                spin_unlock(&lock->lock);
+        local_bh_enable();
+}
+
+/*
+ * The "writer" side needs to get exclusive access to the lock,
+ * regardless of readers. This must be called with bottom half
+ * processing (and thus also preemption) disabled.
+ */
+static inline void xt_info_wrlock(unsigned int cpu)
+{
+        spin_lock(&per_cpu(xt_info_locks, cpu).lock);
+}
+
+static inline void xt_info_wrunlock(unsigned int cpu)
+{
+        spin_unlock(&per_cpu(xt_info_locks, cpu).lock);
+}
 
 /*
  * This helper is performance critical and must be inlined
+36 -89
net/ipv4/netfilter/arp_tables.c
···
         indev = in ? in->name : nulldevname;
         outdev = out ? out->name : nulldevname;
 
-        rcu_read_lock_bh();
-        private = rcu_dereference(table->private);
-        table_base = rcu_dereference(private->entries[smp_processor_id()]);
+        xt_info_rdlock_bh();
+        private = table->private;
+        table_base = private->entries[smp_processor_id()];
 
         e = get_entry(table_base, private->hook_entry[hook]);
         back = get_entry(table_base, private->underflow[hook]);
···
 
                         hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
                                 (2 * skb->dev->addr_len);
+
                         ADD_COUNTER(e->counters, hdr_len, 1);
 
                         t = arpt_get_target(e);
···
                         e = (void *)e + e->next_offset;
                 }
         } while (!hotdrop);
-
-        rcu_read_unlock_bh();
+        xt_info_rdunlock_bh();
 
         if (hotdrop)
                 return NF_DROP;
···
         /* Instead of clearing (by a previous call to memset())
          * the counters and using adds, we set the counters
          * with data used by 'current' CPU
-         * We dont care about preemption here.
+         *
+         * Bottom half has to be disabled to prevent deadlock
+         * if new softirq were to run and call ipt_do_table
          */
-        curcpu = raw_smp_processor_id();
+        local_bh_disable();
+        curcpu = smp_processor_id();
 
         i = 0;
         ARPT_ENTRY_ITERATE(t->entries[curcpu],
···
                 if (cpu == curcpu)
                         continue;
                 i = 0;
+                xt_info_wrlock(cpu);
                 ARPT_ENTRY_ITERATE(t->entries[cpu],
                                    t->size,
                                    add_entry_to_counter,
                                    counters,
                                    &i);
+                xt_info_wrunlock(cpu);
         }
-}
-
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct arpt_entry *e,
-                     const struct xt_counters addme[],
-                     unsigned int *i)
-{
-        ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-        (*i)++;
-        return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-                         const struct xt_counters counters[])
-{
-        unsigned int i, cpu;
-
-        local_bh_disable();
-        cpu = smp_processor_id();
-        i = 0;
-        ARPT_ENTRY_ITERATE(t->entries[cpu],
-                           t->size,
-                           add_counter_to_entry,
-                           counters,
-                           &i);
         local_bh_enable();
-}
-
-static inline int
-zero_entry_counter(struct arpt_entry *e, void *arg)
-{
-        e->counters.bcnt = 0;
-        e->counters.pcnt = 0;
-        return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-        unsigned int cpu;
-        const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-        memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-        for_each_possible_cpu(cpu) {
-                memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-                ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-                                   zero_entry_counter, NULL);
-        }
 }
 
 static struct xt_counters *alloc_counters(struct xt_table *table)
···
         unsigned int countersize;
         struct xt_counters *counters;
         struct xt_table_info *private = table->private;
-        struct xt_table_info *info;
 
         /* We need atomic snapshot of counters: rest doesn't change
          * (other than comefrom, which userspace doesn't care
···
         counters = vmalloc_node(countersize, numa_node_id());
 
         if (counters == NULL)
-                goto nomem;
+                return ERR_PTR(-ENOMEM);
 
-        info = xt_alloc_table_info(private->size);
-        if (!info)
-                goto free_counters;
-
-        clone_counters(info, private);
-
-        mutex_lock(&table->lock);
-        xt_table_entry_swap_rcu(private, info);
-        synchronize_net();      /* Wait until smoke has cleared */
-
-        get_counters(info, counters);
-        put_counters(private, counters);
-        mutex_unlock(&table->lock);
-
-        xt_free_table_info(info);
+        get_counters(private, counters);
 
         return counters;
-
- free_counters:
-        vfree(counters);
- nomem:
-        return ERR_PTR(-ENOMEM);
 }
 
 static int copy_entries_to_user(unsigned int total_size,
···
             (newinfo->number <= oldinfo->initial_entries))
                 module_put(t->me);
 
-        /* Get the old counters. */
+        /* Get the old counters, and synchronize with replace */
         get_counters(oldinfo, counters);
+
         /* Decrease module usage counts and free resource */
         loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
         ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
···
         return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct arpt_entry *e,
+                     const struct xt_counters addme[],
+                     unsigned int *i)
+{
+        ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+        (*i)++;
+        return 0;
+}
+
 static int do_add_counters(struct net *net, void __user *user, unsigned int len,
                            int compat)
 {
-        unsigned int i;
+        unsigned int i, curcpu;
         struct xt_counters_info tmp;
         struct xt_counters *paddc;
         unsigned int num_counters;
···
                 goto free;
         }
 
-        mutex_lock(&t->lock);
+        local_bh_disable();
         private = t->private;
         if (private->number != num_counters) {
                 ret = -EINVAL;
                 goto unlock_up_free;
         }
 
-        preempt_disable();
         i = 0;
         /* Choose the copy that is on our node */
-        loc_cpu_entry = private->entries[smp_processor_id()];
+        curcpu = smp_processor_id();
+        loc_cpu_entry = private->entries[curcpu];
+        xt_info_wrlock(curcpu);
         ARPT_ENTRY_ITERATE(loc_cpu_entry,
                            private->size,
                            add_counter_to_entry,
                            paddc,
                            &i);
-        preempt_enable();
+        xt_info_wrunlock(curcpu);
 unlock_up_free:
-        mutex_unlock(&t->lock);
-
+        local_bh_enable();
         xt_table_unlock(t);
         module_put(t->me);
  free:
+35 -91
net/ipv4/netfilter/ip_tables.c
···
         tgpar.hooknum = hook;
 
         IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-
-        rcu_read_lock_bh();
-        private = rcu_dereference(table->private);
-        table_base = rcu_dereference(private->entries[smp_processor_id()]);
+        xt_info_rdlock_bh();
+        private = table->private;
+        table_base = private->entries[smp_processor_id()];
 
         e = get_entry(table_base, private->hook_entry[hook]);
 
···
                         e = (void *)e + e->next_offset;
                 }
         } while (!hotdrop);
-
-        rcu_read_unlock_bh();
+        xt_info_rdunlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
         return NF_ACCEPT;
···
 
         /* Instead of clearing (by a previous call to memset())
          * the counters and using adds, we set the counters
-         * with data used by 'current' CPU
-         * We dont care about preemption here.
+         * with data used by 'current' CPU.
+         *
+         * Bottom half has to be disabled to prevent deadlock
+         * if new softirq were to run and call ipt_do_table
          */
-        curcpu = raw_smp_processor_id();
+        local_bh_disable();
+        curcpu = smp_processor_id();
 
         i = 0;
         IPT_ENTRY_ITERATE(t->entries[curcpu],
···
                 if (cpu == curcpu)
                         continue;
                 i = 0;
+                xt_info_wrlock(cpu);
                 IPT_ENTRY_ITERATE(t->entries[cpu],
                                   t->size,
                                   add_entry_to_counter,
                                   counters,
                                   &i);
+                xt_info_wrunlock(cpu);
         }
-
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
-                     const struct xt_counters addme[],
-                     unsigned int *i)
-{
-        ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-        (*i)++;
-        return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-                         const struct xt_counters counters[])
-{
-        unsigned int i, cpu;
-
-        local_bh_disable();
-        cpu = smp_processor_id();
-        i = 0;
-        IPT_ENTRY_ITERATE(t->entries[cpu],
-                          t->size,
-                          add_counter_to_entry,
-                          counters,
-                          &i);
         local_bh_enable();
-}
-
-
-static inline int
-zero_entry_counter(struct ipt_entry *e, void *arg)
-{
-        e->counters.bcnt = 0;
-        e->counters.pcnt = 0;
-        return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-        unsigned int cpu;
-        const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-        memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-        for_each_possible_cpu(cpu) {
-                memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-                IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-                                  zero_entry_counter, NULL);
-        }
 }
 
 static struct xt_counters * alloc_counters(struct xt_table *table)
···
         unsigned int countersize;
         struct xt_counters *counters;
         struct xt_table_info *private = table->private;
-        struct xt_table_info *info;
 
         /* We need atomic snapshot of counters: rest doesn't change
            (other than comefrom, which userspace doesn't care
···
         counters = vmalloc_node(countersize, numa_node_id());
 
         if (counters == NULL)
-                goto nomem;
+                return ERR_PTR(-ENOMEM);
 
-        info = xt_alloc_table_info(private->size);
-        if (!info)
-                goto free_counters;
-
-        clone_counters(info, private);
-
-        mutex_lock(&table->lock);
-        xt_table_entry_swap_rcu(private, info);
-        synchronize_net();      /* Wait until smoke has cleared */
-
-        get_counters(info, counters);
-        put_counters(private, counters);
-        mutex_unlock(&table->lock);
-
-        xt_free_table_info(info);
+        get_counters(private, counters);
 
         return counters;
-
- free_counters:
-        vfree(counters);
- nomem:
-        return ERR_PTR(-ENOMEM);
 }
 
 static int
···
             (newinfo->number <= oldinfo->initial_entries))
                 module_put(t->me);
 
-        /* Get the old counters. */
+        /* Get the old counters, and synchronize with replace */
         get_counters(oldinfo, counters);
+
         /* Decrease module usage counts and free resource */
         loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
         IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
···
         return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ipt_entry *e,
+                     const struct xt_counters addme[],
+                     unsigned int *i)
+{
+        ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+        (*i)++;
+        return 0;
+}
 
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
 {
-        unsigned int i;
+        unsigned int i, curcpu;
         struct xt_counters_info tmp;
         struct xt_counters *paddc;
         unsigned int num_counters;
···
                 goto free;
         }
 
-        mutex_lock(&t->lock);
+        local_bh_disable();
         private = t->private;
         if (private->number != num_counters) {
                 ret = -EINVAL;
                 goto unlock_up_free;
         }
 
-        preempt_disable();
         i = 0;
         /* Choose the copy that is on our node */
-        loc_cpu_entry = private->entries[raw_smp_processor_id()];
+        curcpu = smp_processor_id();
+        loc_cpu_entry = private->entries[curcpu];
+        xt_info_wrlock(curcpu);
         IPT_ENTRY_ITERATE(loc_cpu_entry,
                           private->size,
                           add_counter_to_entry,
                           paddc,
                           &i);
-        preempt_enable();
+        xt_info_wrunlock(curcpu);
 unlock_up_free:
-        mutex_unlock(&t->lock);
+        local_bh_enable();
         xt_table_unlock(t);
         module_put(t->me);
  free:
+37 -86
net/ipv6/netfilter/ip6_tables.c
···
 
         IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 
-        rcu_read_lock_bh();
-        private = rcu_dereference(table->private);
-        table_base = rcu_dereference(private->entries[smp_processor_id()]);
+        xt_info_rdlock_bh();
+        private = table->private;
+        table_base = private->entries[smp_processor_id()];
 
         e = get_entry(table_base, private->hook_entry[hook]);
 
···
 #ifdef CONFIG_NETFILTER_DEBUG
         ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
 #endif
-        rcu_read_unlock_bh();
+        xt_info_rdunlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
         return NF_ACCEPT;
···
         /* Instead of clearing (by a previous call to memset())
          * the counters and using adds, we set the counters
          * with data used by 'current' CPU
-         * We dont care about preemption here.
+         *
+         * Bottom half has to be disabled to prevent deadlock
+         * if new softirq were to run and call ipt_do_table
          */
-        curcpu = raw_smp_processor_id();
+        local_bh_disable();
+        curcpu = smp_processor_id();
 
         i = 0;
         IP6T_ENTRY_ITERATE(t->entries[curcpu],
···
                 if (cpu == curcpu)
                         continue;
                 i = 0;
+                xt_info_wrlock(cpu);
                 IP6T_ENTRY_ITERATE(t->entries[cpu],
                                    t->size,
                                    add_entry_to_counter,
                                    counters,
                                    &i);
+                xt_info_wrunlock(cpu);
         }
-}
-
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ip6t_entry *e,
-                     const struct xt_counters addme[],
-                     unsigned int *i)
-{
-        ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
-        (*i)++;
-        return 0;
-}
-
-/* Take values from counters and add them back onto the current cpu */
-static void put_counters(struct xt_table_info *t,
-                         const struct xt_counters counters[])
-{
-        unsigned int i, cpu;
-
-        local_bh_disable();
-        cpu = smp_processor_id();
-        i = 0;
-        IP6T_ENTRY_ITERATE(t->entries[cpu],
-                           t->size,
-                           add_counter_to_entry,
-                           counters,
-                           &i);
         local_bh_enable();
-}
-
-static inline int
-zero_entry_counter(struct ip6t_entry *e, void *arg)
-{
-        e->counters.bcnt = 0;
-        e->counters.pcnt = 0;
-        return 0;
-}
-
-static void
-clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info)
-{
-        unsigned int cpu;
-        const void *loc_cpu_entry = info->entries[raw_smp_processor_id()];
-
-        memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
-        for_each_possible_cpu(cpu) {
-                memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size);
-                IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size,
-                                   zero_entry_counter, NULL);
-        }
 }
 
 static struct xt_counters *alloc_counters(struct xt_table *table)
···
         unsigned int countersize;
         struct xt_counters *counters;
         struct xt_table_info *private = table->private;
-        struct xt_table_info *info;
 
         /* We need atomic snapshot of counters: rest doesn't change
            (other than comefrom, which userspace doesn't care
···
         counters = vmalloc_node(countersize, numa_node_id());
 
         if (counters == NULL)
-                goto nomem;
+                return ERR_PTR(-ENOMEM);
 
-        info = xt_alloc_table_info(private->size);
-        if (!info)
-                goto free_counters;
-
-        clone_counters(info, private);
-
-        mutex_lock(&table->lock);
-        xt_table_entry_swap_rcu(private, info);
-        synchronize_net();      /* Wait until smoke has cleared */
-
-        get_counters(info, counters);
-        put_counters(private, counters);
-        mutex_unlock(&table->lock);
-
-        xt_free_table_info(info);
+        get_counters(private, counters);
 
         return counters;
-
- free_counters:
-        vfree(counters);
- nomem:
-        return ERR_PTR(-ENOMEM);
 }
 
 static int
···
             (newinfo->number <= oldinfo->initial_entries))
                 module_put(t->me);
 
-        /* Get the old counters. */
+        /* Get the old counters, and synchronize with replace */
         get_counters(oldinfo, counters);
+
         /* Decrease module usage counts and free resource */
         loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
         IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
···
         return ret;
 }
 
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ip6t_entry *e,
+                     const struct xt_counters addme[],
+                     unsigned int *i)
+{
+        ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+        (*i)++;
+        return 0;
+}
+
 static int
 do_add_counters(struct net *net, void __user *user, unsigned int len,
                 int compat)
 {
-        unsigned int i;
+        unsigned int i, curcpu;
         struct xt_counters_info tmp;
         struct xt_counters *paddc;
         unsigned int num_counters;
···
                 goto free;
         }
 
-        mutex_lock(&t->lock);
+
+        local_bh_disable();
         private = t->private;
         if (private->number != num_counters) {
                 ret = -EINVAL;
                 goto unlock_up_free;
         }
 
-        preempt_disable();
         i = 0;
         /* Choose the copy that is on our node */
-        loc_cpu_entry = private->entries[raw_smp_processor_id()];
+        curcpu = smp_processor_id();
+        xt_info_wrlock(curcpu);
+        loc_cpu_entry = private->entries[curcpu];
         IP6T_ENTRY_ITERATE(loc_cpu_entry,
                            private->size,
                            add_counter_to_entry,
                            paddc,
                            &i);
-        preempt_enable();
+        xt_info_wrunlock(curcpu);
+
 unlock_up_free:
-        mutex_unlock(&t->lock);
+        local_bh_enable();
         xt_table_unlock(t);
         module_put(t->me);
  free:
+28 -25
net/netfilter/x_tables.c
···
 }
 EXPORT_SYMBOL(xt_free_table_info);
 
-void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo,
-                             struct xt_table_info *newinfo)
-{
-        unsigned int cpu;
-
-        for_each_possible_cpu(cpu) {
-                void *p = oldinfo->entries[cpu];
-                rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]);
-                newinfo->entries[cpu] = p;
-        }
-
-}
-EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu);
-
 /* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */
 struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
                                     const char *name)
···
 EXPORT_SYMBOL_GPL(xt_compat_unlock);
 #endif
 
+DEFINE_PER_CPU(struct xt_info_lock, xt_info_locks);
+EXPORT_PER_CPU_SYMBOL_GPL(xt_info_locks);
+
+
 struct xt_table_info *
 xt_replace_table(struct xt_table *table,
               unsigned int num_counters,
               struct xt_table_info *newinfo,
               int *error)
 {
-        struct xt_table_info *oldinfo, *private;
+        struct xt_table_info *private;
 
         /* Do the substitution. */
-        mutex_lock(&table->lock);
+        local_bh_disable();
         private = table->private;
+
         /* Check inside lock: is the old number correct? */
         if (num_counters != private->number) {
                 duprintf("num_counters != table->private->number (%u/%u)\n",
                          num_counters, private->number);
-                mutex_unlock(&table->lock);
+                local_bh_enable();
                 *error = -EAGAIN;
                 return NULL;
         }
-        oldinfo = private;
-        rcu_assign_pointer(table->private, newinfo);
-        newinfo->initial_entries = oldinfo->initial_entries;
-        mutex_unlock(&table->lock);
 
-        synchronize_net();
-        return oldinfo;
+        table->private = newinfo;
+        newinfo->initial_entries = private->initial_entries;
+
+        /*
+         * Even though table entries have now been swapped, other CPU's
+         * may still be using the old entries. This is okay, because
+         * resynchronization happens because of the locking done
+         * during the get_counters() routine.
+         */
+        local_bh_enable();
+
+        return private;
 }
 EXPORT_SYMBOL_GPL(xt_replace_table);
 
···
 
         /* Simplifies replace_table code. */
         table->private = bootstrap;
-        mutex_init(&table->lock);
 
         if (!xt_replace_table(table, 0, newinfo, &ret))
                 goto unlock;
···
 
 static int __init xt_init(void)
 {
-        int i, rv;
+        unsigned int i;
+        int rv;
+
+        for_each_possible_cpu(i) {
+                struct xt_info_lock *lock = &per_cpu(xt_info_locks, i);
+                spin_lock_init(&lock->lock);
+                lock->readers = 0;
+        }
 
         xt = kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL);
         if (!xt)