Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

xfrm: policy: store inexact policies in an rhashtable

Switch packet-path lookups for inexact policies to rhashtable.

In this initial version, we no longer need to search policies whose
address family or type does not match.

Next patch will add the if_id as well so lookups from the xfrm interface
driver only need to search inexact policies for that device.

Future patches will augment the hlist in each rhash bucket with a tree
and pre-sort policies according to daddr/prefix.

A single rhashtable is used. In order to avoid a full rhashtable walk on
netns exit, the bins get placed on a pernet list, i.e. we add almost no
cost for network namespaces that had no xfrm policies.

The inexact lists are kept in place, and policies are added to both the
per-rhash-inexact list and a pernet one.

The latter is needed for the control plane to handle migrate requests --
such requests do not consider the if_id, so if we'd remove the inexact_list
now we would have to search all hash buckets and then figure
out which matching policy candidate is the most recent one -- this appears
a bit harder than just keeping the 'old' inexact list for this purpose.

Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>

authored by

Florian Westphal and committed by
Steffen Klassert
24969fac cc1bb845

+335 -18
+2
include/net/netns/xfrm.h
··· 5 5 #include <linux/list.h> 6 6 #include <linux/wait.h> 7 7 #include <linux/workqueue.h> 8 + #include <linux/rhashtable-types.h> 8 9 #include <linux/xfrm.h> 9 10 #include <net/dst_ops.h> 10 11 ··· 54 53 unsigned int policy_count[XFRM_POLICY_MAX * 2]; 55 54 struct work_struct policy_hash_work; 56 55 struct xfrm_policy_hthresh policy_hthresh; 56 + struct list_head inexact_bins; 57 57 58 58 59 59 struct sock *nlsk;
+1
include/net/xfrm.h
··· 596 596 u16 family; 597 597 struct xfrm_sec_ctx *security; 598 598 struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH]; 599 + struct hlist_node bydst_inexact_list; 599 600 struct rcu_head rcu; 600 601 }; 601 602
+332 -18
net/xfrm/xfrm_policy.c
··· 26 26 #include <linux/cache.h> 27 27 #include <linux/cpu.h> 28 28 #include <linux/audit.h> 29 + #include <linux/rhashtable.h> 29 30 #include <net/dst.h> 30 31 #include <net/flow.h> 31 32 #include <net/xfrm.h> ··· 46 45 u8 flags; 47 46 }; 48 47 48 + struct xfrm_pol_inexact_key { 49 + possible_net_t net; 50 + u16 family; 51 + u8 dir, type; 52 + }; 53 + 54 + struct xfrm_pol_inexact_bin { 55 + struct xfrm_pol_inexact_key k; 56 + struct rhash_head head; 57 + struct hlist_head hhead; 58 + 59 + /* slow path below */ 60 + struct list_head inexact_bins; 61 + struct rcu_head rcu; 62 + }; 63 + 49 64 static DEFINE_SPINLOCK(xfrm_if_cb_lock); 50 65 static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly; 51 66 ··· 72 55 static struct kmem_cache *xfrm_dst_cache __ro_after_init; 73 56 static __read_mostly seqcount_t xfrm_policy_hash_generation; 74 57 58 + static struct rhashtable xfrm_policy_inexact_table; 59 + static const struct rhashtable_params xfrm_pol_inexact_params; 60 + 75 61 static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr); 76 62 static int stale_bundle(struct dst_entry *dst); 77 63 static int xfrm_bundle_ok(struct xfrm_dst *xdst); ··· 83 63 static void __xfrm_policy_link(struct xfrm_policy *pol, int dir); 84 64 static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, 85 65 int dir); 66 + 67 + static struct xfrm_pol_inexact_bin * 68 + xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir); 69 + 70 + static struct xfrm_pol_inexact_bin * 71 + xfrm_policy_inexact_lookup_rcu(struct net *net, 72 + u8 type, u16 family, u8 dir); 73 + static struct xfrm_policy * 74 + xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy, 75 + bool excl); 76 + static void xfrm_policy_insert_inexact_list(struct hlist_head *chain, 77 + struct xfrm_policy *policy); 86 78 87 79 static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy) 88 80 { ··· 301 269 if (policy) { 302 270 write_pnet(&policy->xp_net, net); 303 
271 INIT_LIST_HEAD(&policy->walk.all); 272 + INIT_HLIST_NODE(&policy->bydst_inexact_list); 304 273 INIT_HLIST_NODE(&policy->bydst); 305 274 INIT_HLIST_NODE(&policy->byidx); 306 275 rwlock_init(&policy->lock); ··· 596 563 mutex_unlock(&hash_resize_mutex); 597 564 } 598 565 566 + static void xfrm_hash_reset_inexact_table(struct net *net) 567 + { 568 + struct xfrm_pol_inexact_bin *b; 569 + 570 + lockdep_assert_held(&net->xfrm.xfrm_policy_lock); 571 + 572 + list_for_each_entry(b, &net->xfrm.inexact_bins, inexact_bins) 573 + INIT_HLIST_HEAD(&b->hhead); 574 + } 575 + 576 + /* Make sure *pol can be inserted into fastbin. 577 + * Useful to check that later insert requests will be sucessful 578 + * (provided xfrm_policy_lock is held throughout). 579 + */ 580 + static struct xfrm_pol_inexact_bin * 581 + xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir) 582 + { 583 + struct xfrm_pol_inexact_bin *bin, *prev; 584 + struct xfrm_pol_inexact_key k = { 585 + .family = pol->family, 586 + .type = pol->type, 587 + .dir = dir, 588 + }; 589 + struct net *net = xp_net(pol); 590 + 591 + lockdep_assert_held(&net->xfrm.xfrm_policy_lock); 592 + 593 + write_pnet(&k.net, net); 594 + bin = rhashtable_lookup_fast(&xfrm_policy_inexact_table, &k, 595 + xfrm_pol_inexact_params); 596 + if (bin) 597 + return bin; 598 + 599 + bin = kzalloc(sizeof(*bin), GFP_ATOMIC); 600 + if (!bin) 601 + return NULL; 602 + 603 + bin->k = k; 604 + INIT_HLIST_HEAD(&bin->hhead); 605 + 606 + prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table, 607 + &bin->k, &bin->head, 608 + xfrm_pol_inexact_params); 609 + if (!prev) { 610 + list_add(&bin->inexact_bins, &net->xfrm.inexact_bins); 611 + return bin; 612 + } 613 + 614 + kfree(bin); 615 + 616 + return IS_ERR(prev) ? 
NULL : prev; 617 + } 618 + 619 + static void xfrm_policy_inexact_delete_bin(struct net *net, 620 + struct xfrm_pol_inexact_bin *b) 621 + { 622 + lockdep_assert_held(&net->xfrm.xfrm_policy_lock); 623 + 624 + if (!hlist_empty(&b->hhead)) 625 + return; 626 + 627 + if (rhashtable_remove_fast(&xfrm_policy_inexact_table, &b->head, 628 + xfrm_pol_inexact_params) == 0) { 629 + list_del(&b->inexact_bins); 630 + kfree_rcu(b, rcu); 631 + } 632 + } 633 + 634 + static void __xfrm_policy_inexact_flush(struct net *net) 635 + { 636 + struct xfrm_pol_inexact_bin *bin; 637 + 638 + lockdep_assert_held(&net->xfrm.xfrm_policy_lock); 639 + 640 + list_for_each_entry(bin, &net->xfrm.inexact_bins, inexact_bins) 641 + xfrm_policy_inexact_delete_bin(net, bin); 642 + } 643 + 644 + static struct xfrm_policy * 645 + xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl) 646 + { 647 + struct xfrm_pol_inexact_bin *bin; 648 + struct xfrm_policy *delpol; 649 + struct hlist_head *chain; 650 + struct net *net; 651 + 652 + bin = xfrm_policy_inexact_alloc_bin(policy, dir); 653 + if (!bin) 654 + return ERR_PTR(-ENOMEM); 655 + 656 + delpol = xfrm_policy_insert_list(&bin->hhead, policy, excl); 657 + if (delpol && excl) 658 + return ERR_PTR(-EEXIST); 659 + 660 + net = xp_net(policy); 661 + chain = &net->xfrm.policy_inexact[dir]; 662 + xfrm_policy_insert_inexact_list(chain, policy); 663 + 664 + return delpol; 665 + } 666 + 599 667 static void xfrm_hash_rebuild(struct work_struct *work) 600 668 { 601 669 struct net *net = container_of(work, struct net, ··· 726 592 727 593 spin_lock_bh(&net->xfrm.xfrm_policy_lock); 728 594 595 + /* make sure that we can insert the indirect policies again before 596 + * we start with destructive action. 
597 + */ 598 + list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) { 599 + u8 dbits, sbits; 600 + 601 + dir = xfrm_policy_id2dir(policy->index); 602 + if (policy->walk.dead || dir >= XFRM_POLICY_MAX) 603 + continue; 604 + 605 + if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) { 606 + if (policy->family == AF_INET) { 607 + dbits = rbits4; 608 + sbits = lbits4; 609 + } else { 610 + dbits = rbits6; 611 + sbits = lbits6; 612 + } 613 + } else { 614 + if (policy->family == AF_INET) { 615 + dbits = lbits4; 616 + sbits = rbits4; 617 + } else { 618 + dbits = lbits6; 619 + sbits = rbits6; 620 + } 621 + } 622 + 623 + if (policy->selector.prefixlen_d < dbits || 624 + policy->selector.prefixlen_s < sbits) 625 + continue; 626 + 627 + if (!xfrm_policy_inexact_alloc_bin(policy, dir)) 628 + goto out_unlock; 629 + } 630 + 729 631 /* reset the bydst and inexact table in all directions */ 632 + xfrm_hash_reset_inexact_table(net); 633 + 730 634 for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { 731 635 INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]); 732 636 hmask = net->xfrm.policy_bydst[dir].hmask; ··· 797 625 chain = policy_hash_bysel(net, &policy->selector, 798 626 policy->family, 799 627 xfrm_policy_id2dir(policy->index)); 800 - if (!chain) 801 - chain = &net->xfrm.policy_inexact[dir]; 628 + if (!chain) { 629 + void *p = xfrm_policy_inexact_insert(policy, dir, 0); 630 + 631 + WARN_ONCE(IS_ERR(p), "reinsert: %ld\n", PTR_ERR(p)); 632 + continue; 633 + } 634 + 802 635 hlist_for_each_entry(pol, chain, bydst) { 803 636 if (policy->priority >= pol->priority) 804 637 newpos = &pol->bydst; ··· 816 639 hlist_add_head_rcu(&policy->bydst, chain); 817 640 } 818 641 642 + out_unlock: 819 643 spin_unlock_bh(&net->xfrm.xfrm_policy_lock); 820 644 821 645 mutex_unlock(&hash_resize_mutex); ··· 920 742 return false; 921 743 } 922 744 745 + static u32 xfrm_pol_bin_key(const void *data, u32 len, u32 seed) 746 + { 747 + const struct xfrm_pol_inexact_key *k = data; 748 + u32 a = k->type << 24 | 
k->dir << 16 | k->family; 749 + 750 + return jhash_2words(a, net_hash_mix(read_pnet(&k->net)), seed); 751 + } 752 + 753 + static u32 xfrm_pol_bin_obj(const void *data, u32 len, u32 seed) 754 + { 755 + const struct xfrm_pol_inexact_bin *b = data; 756 + 757 + return xfrm_pol_bin_key(&b->k, 0, seed); 758 + } 759 + 760 + static int xfrm_pol_bin_cmp(struct rhashtable_compare_arg *arg, 761 + const void *ptr) 762 + { 763 + const struct xfrm_pol_inexact_key *key = arg->key; 764 + const struct xfrm_pol_inexact_bin *b = ptr; 765 + int ret; 766 + 767 + if (!net_eq(read_pnet(&b->k.net), read_pnet(&key->net))) 768 + return -1; 769 + 770 + ret = b->k.dir ^ key->dir; 771 + if (ret) 772 + return ret; 773 + 774 + ret = b->k.type ^ key->type; 775 + if (ret) 776 + return ret; 777 + 778 + ret = b->k.family ^ key->family; 779 + if (ret) 780 + return ret; 781 + 782 + return 0; 783 + } 784 + 785 + static const struct rhashtable_params xfrm_pol_inexact_params = { 786 + .head_offset = offsetof(struct xfrm_pol_inexact_bin, head), 787 + .hashfn = xfrm_pol_bin_key, 788 + .obj_hashfn = xfrm_pol_bin_obj, 789 + .obj_cmpfn = xfrm_pol_bin_cmp, 790 + .automatic_shrinking = true, 791 + }; 792 + 793 + static void xfrm_policy_insert_inexact_list(struct hlist_head *chain, 794 + struct xfrm_policy *policy) 795 + { 796 + struct xfrm_policy *pol, *delpol = NULL; 797 + struct hlist_node *newpos = NULL; 798 + 799 + hlist_for_each_entry(pol, chain, bydst_inexact_list) { 800 + if (pol->type == policy->type && 801 + pol->if_id == policy->if_id && 802 + !selector_cmp(&pol->selector, &policy->selector) && 803 + xfrm_policy_mark_match(policy, pol) && 804 + xfrm_sec_ctx_match(pol->security, policy->security) && 805 + !WARN_ON(delpol)) { 806 + delpol = pol; 807 + if (policy->priority > pol->priority) 808 + continue; 809 + } else if (policy->priority >= pol->priority) { 810 + newpos = &pol->bydst_inexact_list; 811 + continue; 812 + } 813 + if (delpol) 814 + break; 815 + } 816 + 817 + if (newpos) 818 + 
hlist_add_behind_rcu(&policy->bydst_inexact_list, newpos); 819 + else 820 + hlist_add_head_rcu(&policy->bydst_inexact_list, chain); 821 + } 822 + 923 823 static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain, 924 824 struct xfrm_policy *policy, 925 825 bool excl) ··· 1023 767 if (delpol) 1024 768 break; 1025 769 } 770 + 1026 771 if (newpos) 1027 772 hlist_add_behind_rcu(&policy->bydst, &newpos->bydst); 1028 773 else ··· 1040 783 1041 784 spin_lock_bh(&net->xfrm.xfrm_policy_lock); 1042 785 chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); 1043 - if (chain) { 786 + if (chain) 1044 787 delpol = xfrm_policy_insert_list(chain, policy, excl); 1045 - } else { 1046 - chain = &net->xfrm.policy_inexact[dir]; 1047 - delpol = xfrm_policy_insert_list(chain, policy, excl); 1048 - } 788 + else 789 + delpol = xfrm_policy_inexact_insert(policy, dir, excl); 1049 790 1050 791 if (IS_ERR(delpol)) { 1051 792 spin_unlock_bh(&net->xfrm.xfrm_policy_lock); ··· 1085 830 struct xfrm_sec_ctx *ctx, int delete, 1086 831 int *err) 1087 832 { 1088 - struct xfrm_policy *pol, *ret; 833 + struct xfrm_pol_inexact_bin *bin = NULL; 834 + struct xfrm_policy *pol, *ret = NULL; 1089 835 struct hlist_head *chain; 1090 836 1091 837 *err = 0; 1092 838 spin_lock_bh(&net->xfrm.xfrm_policy_lock); 1093 839 chain = policy_hash_bysel(net, sel, sel->family, dir); 1094 - if (!chain) 1095 - chain = &net->xfrm.policy_inexact[dir]; 840 + if (!chain) { 841 + bin = xfrm_policy_inexact_lookup(net, type, 842 + sel->family, dir); 843 + if (!bin) { 844 + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); 845 + return NULL; 846 + } 847 + 848 + chain = &bin->hhead; 849 + } 850 + 1096 851 ret = NULL; 1097 852 hlist_for_each_entry(pol, chain, bydst) { 1098 853 if (pol->type == type && ··· 1119 854 return pol; 1120 855 } 1121 856 __xfrm_policy_unlink(pol, dir); 857 + xfrm_policy_inexact_delete_bin(net, bin); 1122 858 } 1123 859 ret = pol; 1124 860 break; ··· 1230 964 
spin_lock_bh(&net->xfrm.xfrm_policy_lock); 1231 965 goto again; 1232 966 } 1233 - if (!cnt) 967 + if (cnt) 968 + __xfrm_policy_inexact_flush(net); 969 + else 1234 970 err = -ESRCH; 1235 971 out: 1236 972 spin_unlock_bh(&net->xfrm.xfrm_policy_lock); ··· 1331 1063 if (match) 1332 1064 ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid, 1333 1065 dir); 1334 - 1335 1066 return ret; 1067 + } 1068 + 1069 + static struct xfrm_pol_inexact_bin * 1070 + xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family, u8 dir) 1071 + { 1072 + struct xfrm_pol_inexact_key k = { 1073 + .family = family, 1074 + .type = type, 1075 + .dir = dir, 1076 + }; 1077 + 1078 + write_pnet(&k.net, net); 1079 + 1080 + return rhashtable_lookup(&xfrm_policy_inexact_table, &k, 1081 + xfrm_pol_inexact_params); 1082 + } 1083 + 1084 + static struct xfrm_pol_inexact_bin * 1085 + xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir) 1086 + { 1087 + struct xfrm_pol_inexact_bin *bin; 1088 + 1089 + lockdep_assert_held(&net->xfrm.xfrm_policy_lock); 1090 + 1091 + rcu_read_lock(); 1092 + bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir); 1093 + rcu_read_unlock(); 1094 + 1095 + return bin; 1336 1096 } 1337 1097 1338 1098 static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, ··· 1368 1072 u16 family, u8 dir, 1369 1073 u32 if_id) 1370 1074 { 1371 - int err; 1372 - struct xfrm_policy *pol, *ret; 1373 1075 const xfrm_address_t *daddr, *saddr; 1076 + struct xfrm_pol_inexact_bin *bin; 1077 + struct xfrm_policy *pol, *ret; 1374 1078 struct hlist_head *chain; 1375 1079 unsigned int sequence; 1376 1080 u32 priority; 1081 + int err; 1377 1082 1378 1083 daddr = xfrm_flowi_daddr(fl, family); 1379 1084 saddr = xfrm_flowi_saddr(fl, family); ··· 1405 1108 break; 1406 1109 } 1407 1110 } 1408 - chain = &net->xfrm.policy_inexact[dir]; 1111 + bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir); 1112 + if (!bin) 1113 + goto skip_inexact; 1114 + 
chain = &bin->hhead; 1409 1115 hlist_for_each_entry_rcu(pol, chain, bydst) { 1410 1116 if ((pol->priority >= priority) && ret) 1411 1117 break; ··· 1427 1127 } 1428 1128 } 1429 1129 1130 + skip_inexact: 1430 1131 if (read_seqcount_retry(&xfrm_policy_hash_generation, sequence)) 1431 1132 goto retry; 1432 1133 ··· 1519 1218 /* Socket policies are not hashed. */ 1520 1219 if (!hlist_unhashed(&pol->bydst)) { 1521 1220 hlist_del_rcu(&pol->bydst); 1221 + hlist_del_init(&pol->bydst_inexact_list); 1522 1222 hlist_del(&pol->byidx); 1523 1223 } 1524 1224 ··· 3097 2795 static int __net_init xfrm_policy_init(struct net *net) 3098 2796 { 3099 2797 unsigned int hmask, sz; 3100 - int dir; 2798 + int dir, err; 3101 2799 3102 - if (net_eq(net, &init_net)) 2800 + if (net_eq(net, &init_net)) { 3103 2801 xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache", 3104 2802 sizeof(struct xfrm_dst), 3105 2803 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, 3106 2804 NULL); 2805 + err = rhashtable_init(&xfrm_policy_inexact_table, 2806 + &xfrm_pol_inexact_params); 2807 + BUG_ON(err); 2808 + } 3107 2809 3108 2810 hmask = 8 - 1; 3109 2811 sz = (hmask+1) * sizeof(struct hlist_head); ··· 3142 2836 seqlock_init(&net->xfrm.policy_hthresh.lock); 3143 2837 3144 2838 INIT_LIST_HEAD(&net->xfrm.policy_all); 2839 + INIT_LIST_HEAD(&net->xfrm.inexact_bins); 3145 2840 INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize); 3146 2841 INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild); 3147 2842 return 0; ··· 3161 2854 3162 2855 static void xfrm_policy_fini(struct net *net) 3163 2856 { 2857 + struct xfrm_pol_inexact_bin *bin, *tmp; 3164 2858 unsigned int sz; 3165 2859 int dir; 3166 2860 ··· 3187 2879 sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head); 3188 2880 WARN_ON(!hlist_empty(net->xfrm.policy_byidx)); 3189 2881 xfrm_hash_free(net->xfrm.policy_byidx, sz); 2882 + 2883 + list_for_each_entry_safe(bin, tmp, &net->xfrm.inexact_bins, 2884 + inexact_bins) { 2885 + WARN_ON(!hlist_empty(&bin->hhead)); 
2886 + xfrm_policy_inexact_delete_bin(net, bin); 2887 + } 3190 2888 } 3191 2889 3192 2890 static int __net_init xfrm_net_init(struct net *net) ··· 3358 3044 } 3359 3045 } 3360 3046 chain = &net->xfrm.policy_inexact[dir]; 3361 - hlist_for_each_entry(pol, chain, bydst) { 3047 + hlist_for_each_entry(pol, chain, bydst_inexact_list) { 3362 3048 if ((pol->priority >= priority) && ret) 3363 3049 break; 3364 3050