Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

netfilter: ipset: fix performance regression in swap operation

The patch "netfilter: ipset: fix race condition between swap/destroy
and kernel side add/del/test", commit 28628fa9 fixes a race condition.
But the synchronize_rcu() added to the swap function unnecessarily slows
it down: it can safely be moved to destroy and use call_rcu() instead.

Eric Dumazet pointed out that simply calling the destroy functions as
rcu callback does not work: sets with timeout use garbage collectors
which need cancelling at destroy which can wait. Therefore the destroy
functions are split into two: cancelling garbage collectors safely at
executing the command received by netlink and moving the remaining
part only into the rcu callback.

Link: https://lore.kernel.org/lkml/C0829B10-EAA6-4809-874E-E1E9C05A8D84@automattic.com/
Fixes: 28628fa952fe ("netfilter: ipset: fix race condition between swap/destroy and kernel side add/del/test")
Reported-by: Ale Crismani <ale.crismani@automattic.com>
Reported-by: David Wang <00107082@163.com>
Tested-by: David Wang <00107082@163.com>
Signed-off-by: Jozsef Kadlecsik <kadlec@netfilter.org>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

Authored by Jozsef Kadlecsik and committed by Pablo Neira Ayuso
97f7cf1c 6e348067

+65 -18
+4
include/linux/netfilter/ipset/ip_set.h
··· 186 186 /* Return true if "b" set is the same as "a" 187 187 * according to the create set parameters */ 188 188 bool (*same_set)(const struct ip_set *a, const struct ip_set *b); 189 + /* Cancel ongoing garbage collectors before destroying the set */ 190 + void (*cancel_gc)(struct ip_set *set); 189 191 /* Region-locking is used */ 190 192 bool region_lock; 191 193 }; ··· 244 242 245 243 /* A generic IP set */ 246 244 struct ip_set { 245 + /* For call_rcu in destroy */ 246 + struct rcu_head rcu; 247 247 /* The name of the set */ 248 248 char name[IPSET_MAXNAMELEN]; 249 249 /* Lock protecting the set data */
+11 -3
net/netfilter/ipset/ip_set_bitmap_gen.h
··· 30 30 #define mtype_del IPSET_TOKEN(MTYPE, _del) 31 31 #define mtype_list IPSET_TOKEN(MTYPE, _list) 32 32 #define mtype_gc IPSET_TOKEN(MTYPE, _gc) 33 + #define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc) 33 34 #define mtype MTYPE 34 35 35 36 #define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id))) ··· 59 58 mtype_destroy(struct ip_set *set) 60 59 { 61 60 struct mtype *map = set->data; 62 - 63 - if (SET_WITH_TIMEOUT(set)) 64 - del_timer_sync(&map->gc); 65 61 66 62 if (set->dsize && set->extensions & IPSET_EXT_DESTROY) 67 63 mtype_ext_cleanup(set); ··· 288 290 add_timer(&map->gc); 289 291 } 290 292 293 + static void 294 + mtype_cancel_gc(struct ip_set *set) 295 + { 296 + struct mtype *map = set->data; 297 + 298 + if (SET_WITH_TIMEOUT(set)) 299 + del_timer_sync(&map->gc); 300 + } 301 + 291 302 static const struct ip_set_type_variant mtype = { 292 303 .kadt = mtype_kadt, 293 304 .uadt = mtype_uadt, ··· 310 303 .head = mtype_head, 311 304 .list = mtype_list, 312 305 .same_set = mtype_same_set, 306 + .cancel_gc = mtype_cancel_gc, 313 307 }; 314 308 315 309 #endif /* __IP_SET_BITMAP_IP_GEN_H */
+28 -9
net/netfilter/ipset/ip_set_core.c
··· 1182 1182 kfree(set); 1183 1183 } 1184 1184 1185 + static void 1186 + ip_set_destroy_set_rcu(struct rcu_head *head) 1187 + { 1188 + struct ip_set *set = container_of(head, struct ip_set, rcu); 1189 + 1190 + ip_set_destroy_set(set); 1191 + } 1192 + 1185 1193 static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, 1186 1194 const struct nlattr * const attr[]) 1187 1195 { ··· 1201 1193 if (unlikely(protocol_min_failed(attr))) 1202 1194 return -IPSET_ERR_PROTOCOL; 1203 1195 1204 - /* Must wait for flush to be really finished in list:set */ 1205 - rcu_barrier(); 1206 1196 1207 1197 /* Commands are serialized and references are 1208 1198 * protected by the ip_set_ref_lock. ··· 1212 1206 * counter, so if it's already zero, we can proceed 1213 1207 * without holding the lock. 1214 1208 */ 1215 - read_lock_bh(&ip_set_ref_lock); 1216 1209 if (!attr[IPSET_ATTR_SETNAME]) { 1210 + /* Must wait for flush to be really finished in list:set */ 1211 + rcu_barrier(); 1212 + read_lock_bh(&ip_set_ref_lock); 1217 1213 for (i = 0; i < inst->ip_set_max; i++) { 1218 1214 s = ip_set(inst, i); 1219 1215 if (s && (s->ref || s->ref_netlink)) { ··· 1229 1221 s = ip_set(inst, i); 1230 1222 if (s) { 1231 1223 ip_set(inst, i) = NULL; 1224 + /* Must cancel garbage collectors */ 1225 + s->variant->cancel_gc(s); 1232 1226 ip_set_destroy_set(s); 1233 1227 } 1234 1228 } ··· 1238 1228 inst->is_destroyed = false; 1239 1229 } else { 1240 1230 u32 flags = flag_exist(info->nlh); 1231 + u16 features = 0; 1232 + 1233 + read_lock_bh(&ip_set_ref_lock); 1241 1234 s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), 1242 1235 &i); 1243 1236 if (!s) { ··· 1251 1238 ret = -IPSET_ERR_BUSY; 1252 1239 goto out; 1253 1240 } 1241 + features = s->type->features; 1254 1242 ip_set(inst, i) = NULL; 1255 1243 read_unlock_bh(&ip_set_ref_lock); 1256 - 1257 - ip_set_destroy_set(s); 1244 + if (features & IPSET_TYPE_NAME) { 1245 + /* Must wait for flush to be really finished */ 1246 + rcu_barrier(); 1247 + } 1248 + /* Must cancel garbage collectors */ 1249 + s->variant->cancel_gc(s); 1250 + call_rcu(&s->rcu, ip_set_destroy_set_rcu); 1258 1251 } 1259 1252 return 0; 1260 1253 out: ··· 1412 1393 ip_set(inst, from_id) = to; 1413 1394 ip_set(inst, to_id) = from; 1414 1395 write_unlock_bh(&ip_set_ref_lock); 1415 - 1416 - /* Make sure all readers of the old set pointers are completed. */ 1417 - synchronize_rcu(); 1418 1396 1419 1397 return 0; 1420 1398 } ··· 2425 2409 { 2426 2410 nf_unregister_sockopt(&so_set); 2427 2411 nfnetlink_subsys_unregister(&ip_set_netlink_subsys); 2428 - 2429 2412 unregister_pernet_subsys(&ip_set_net_ops); 2413 + 2414 + /* Wait for call_rcu() in destroy */ 2415 + rcu_barrier(); 2416 + 2430 2417 pr_debug("these are the famous last words\n"); 2431 2418 } 2432 2419
+12 -3
net/netfilter/ipset/ip_set_hash_gen.h
··· 222 222 #undef mtype_gc_do 223 223 #undef mtype_gc 224 224 #undef mtype_gc_init 225 + #undef mtype_cancel_gc 225 226 #undef mtype_variant 226 227 #undef mtype_data_match 227 228 ··· 267 266 #define mtype_gc_do IPSET_TOKEN(MTYPE, _gc_do) 268 267 #define mtype_gc IPSET_TOKEN(MTYPE, _gc) 269 268 #define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) 269 + #define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc) 270 270 #define mtype_variant IPSET_TOKEN(MTYPE, _variant) 271 271 #define mtype_data_match IPSET_TOKEN(MTYPE, _data_match) 272 272 ··· 452 450 struct htype *h = set->data; 453 451 struct list_head *l, *lt; 454 452 455 - if (SET_WITH_TIMEOUT(set)) 456 - cancel_delayed_work_sync(&h->gc.dwork); 457 - 458 453 mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true); 459 454 list_for_each_safe(l, lt, &h->ad) { 460 455 list_del(l); ··· 596 597 { 597 598 INIT_DEFERRABLE_WORK(&gc->dwork, mtype_gc); 598 599 queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ); 600 + } 601 + 602 + static void 603 + mtype_cancel_gc(struct ip_set *set) 604 + { 605 + struct htype *h = set->data; 606 + 607 + if (SET_WITH_TIMEOUT(set)) 608 + cancel_delayed_work_sync(&h->gc.dwork); 599 609 } 600 610 601 611 static int ··· 1449 1441 .uref = mtype_uref, 1450 1442 .resize = mtype_resize, 1451 1443 .same_set = mtype_same_set, 1444 + .cancel_gc = mtype_cancel_gc, 1452 1445 .region_lock = true, 1453 1446 }; 1454 1447
+10 -3
net/netfilter/ipset/ip_set_list_set.c
··· 426 426 struct list_set *map = set->data; 427 427 struct set_elem *e, *n; 428 428 429 - if (SET_WITH_TIMEOUT(set)) 430 - timer_shutdown_sync(&map->gc); 431 - 432 429 list_for_each_entry_safe(e, n, &map->members, list) { 433 430 list_del(&e->list); 434 431 ip_set_put_byindex(map->net, e->id); ··· 542 545 a->extensions == b->extensions; 543 546 } 544 547 548 + static void 549 + list_set_cancel_gc(struct ip_set *set) 550 + { 551 + struct list_set *map = set->data; 552 + 553 + if (SET_WITH_TIMEOUT(set)) 554 + timer_shutdown_sync(&map->gc); 555 + } 556 + 545 557 static const struct ip_set_type_variant set_variant = { 546 558 .kadt = list_set_kadt, 547 559 .uadt = list_set_uadt, ··· 564 558 .head = list_set_head, 565 559 .list = list_set_list, 566 560 .same_set = list_set_same_set, 561 + .cancel_gc = list_set_cancel_gc, 567 562 }; 568 563 569 564 static void