Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

netfilter: nf_tables: coalesce multiple notifications into one skbuff

On x86_64, each notification results in one skbuff allocation which
consumes at least 768 bytes due to the skbuff overhead.

This patch coalesces several notifications into one single skbuff, so
each notification consumes at least ~211 bytes, that is ~3.5 times less
memory consumption. As a result, this reduces the chances of exhausting
the netlink socket receive buffer.

The rule of thumb is that each notification batch only contains netlink
messages whose report flag is the same; nfnetlink_send() requires this
to do appropriate delivery to userspace, either via unicast (echo
mode) or multicast (monitor mode).

The skbuff control buffer is used to annotate the report flag for later
handling at the new coalescing routine.

The batch skbuff notification size is NLMSG_GOODSIZE. Using a larger
skbuff would allow for more socket receive buffer savings (to amortize
the cost of the skbuff even more); however, going over that size might
break userspace applications, so let's be conservative and stick to
NLMSG_GOODSIZE.

Reported-by: Phil Sutter <phil@nwl.cc>
Acked-by: Phil Sutter <phil@nwl.cc>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>

+58 -13
+1
include/net/netns/nftables.h
··· 8 8 struct list_head tables; 9 9 struct list_head commit_list; 10 10 struct list_head module_list; 11 + struct list_head notify_list; 11 12 struct mutex commit_mutex; 12 13 unsigned int base_seq; 13 14 u8 gencursor;
+57 -13
net/netfilter/nf_tables_api.c
··· 684 684 return -1; 685 685 } 686 686 687 + struct nftnl_skb_parms { 688 + bool report; 689 + }; 690 + #define NFT_CB(skb) (*(struct nftnl_skb_parms*)&((skb)->cb)) 691 + 692 + static void nft_notify_enqueue(struct sk_buff *skb, bool report, 693 + struct list_head *notify_list) 694 + { 695 + NFT_CB(skb).report = report; 696 + list_add_tail(&skb->list, notify_list); 697 + } 698 + 687 699 static void nf_tables_table_notify(const struct nft_ctx *ctx, int event) 688 700 { 689 701 struct sk_buff *skb; ··· 727 715 goto err; 728 716 } 729 717 730 - nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, 731 - ctx->report, GFP_KERNEL); 718 + nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list); 732 719 return; 733 720 err: 734 721 nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); ··· 1479 1468 goto err; 1480 1469 } 1481 1470 1482 - nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, 1483 - ctx->report, GFP_KERNEL); 1471 + nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list); 1484 1472 return; 1485 1473 err: 1486 1474 nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); ··· 2817 2807 goto err; 2818 2808 } 2819 2809 2820 - nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, 2821 - ctx->report, GFP_KERNEL); 2810 + nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list); 2822 2811 return; 2823 2812 err: 2824 2813 nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); ··· 3846 3837 goto err; 3847 3838 } 3848 3839 3849 - nfnetlink_send(skb, ctx->net, portid, NFNLGRP_NFTABLES, ctx->report, 3850 - gfp_flags); 3840 + nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list); 3851 3841 return; 3852 3842 err: 3853 3843 nfnetlink_set_err(ctx->net, portid, NFNLGRP_NFTABLES, -ENOBUFS); ··· 4967 4959 goto err; 4968 4960 } 4969 4961 4970 - nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, ctx->report, 4971 - GFP_KERNEL); 4962 + nft_notify_enqueue(skb, ctx->report, 
&ctx->net->nft.notify_list); 4972 4963 return; 4973 4964 err: 4974 4965 nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS); ··· 6282 6275 goto err; 6283 6276 } 6284 6277 6285 - nfnetlink_send(skb, net, portid, NFNLGRP_NFTABLES, report, gfp); 6278 + nft_notify_enqueue(skb, report, &net->nft.notify_list); 6286 6279 return; 6287 6280 err: 6288 6281 nfnetlink_set_err(net, portid, NFNLGRP_NFTABLES, -ENOBUFS); ··· 7092 7085 goto err; 7093 7086 } 7094 7087 7095 - nfnetlink_send(skb, ctx->net, ctx->portid, NFNLGRP_NFTABLES, 7096 - ctx->report, GFP_KERNEL); 7088 + nft_notify_enqueue(skb, ctx->report, &ctx->net->nft.notify_list); 7097 7089 return; 7098 7090 err: 7099 7091 nfnetlink_set_err(ctx->net, ctx->portid, NFNLGRP_NFTABLES, -ENOBUFS); ··· 7701 7695 mutex_unlock(&net->nft.commit_mutex); 7702 7696 } 7703 7697 7698 + static void nft_commit_notify(struct net *net, u32 portid) 7699 + { 7700 + struct sk_buff *batch_skb = NULL, *nskb, *skb; 7701 + unsigned char *data; 7702 + int len; 7703 + 7704 + list_for_each_entry_safe(skb, nskb, &net->nft.notify_list, list) { 7705 + if (!batch_skb) { 7706 + new_batch: 7707 + batch_skb = skb; 7708 + len = NLMSG_GOODSIZE - skb->len; 7709 + list_del(&skb->list); 7710 + continue; 7711 + } 7712 + len -= skb->len; 7713 + if (len > 0 && NFT_CB(skb).report == NFT_CB(batch_skb).report) { 7714 + data = skb_put(batch_skb, skb->len); 7715 + memcpy(data, skb->data, skb->len); 7716 + list_del(&skb->list); 7717 + kfree_skb(skb); 7718 + continue; 7719 + } 7720 + nfnetlink_send(batch_skb, net, portid, NFNLGRP_NFTABLES, 7721 + NFT_CB(batch_skb).report, GFP_KERNEL); 7722 + goto new_batch; 7723 + } 7724 + 7725 + if (batch_skb) { 7726 + nfnetlink_send(batch_skb, net, portid, NFNLGRP_NFTABLES, 7727 + NFT_CB(batch_skb).report, GFP_KERNEL); 7728 + } 7729 + 7730 + WARN_ON_ONCE(!list_empty(&net->nft.notify_list)); 7731 + } 7732 + 7704 7733 static int nf_tables_commit(struct net *net, struct sk_buff *skb) 7705 7734 { 7706 7735 struct nft_trans *trans, 
*next; ··· 7938 7897 } 7939 7898 } 7940 7899 7900 + nft_commit_notify(net, NETLINK_CB(skb).portid); 7941 7901 nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); 7942 7902 nf_tables_commit_release(net); 7943 7903 ··· 8763 8721 INIT_LIST_HEAD(&net->nft.tables); 8764 8722 INIT_LIST_HEAD(&net->nft.commit_list); 8765 8723 INIT_LIST_HEAD(&net->nft.module_list); 8724 + INIT_LIST_HEAD(&net->nft.notify_list); 8766 8725 mutex_init(&net->nft.commit_mutex); 8767 8726 net->nft.base_seq = 1; 8768 8727 net->nft.validate_state = NFT_VALIDATE_SKIP; ··· 8780 8737 mutex_unlock(&net->nft.commit_mutex); 8781 8738 WARN_ON_ONCE(!list_empty(&net->nft.tables)); 8782 8739 WARN_ON_ONCE(!list_empty(&net->nft.module_list)); 8740 + WARN_ON_ONCE(!list_empty(&net->nft.notify_list)); 8783 8741 } 8784 8742 8785 8743 static struct pernet_operations nf_tables_net_ops = {