Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

xfrm: Add support for per cpu xfrm state handling.

Currently all flows for a given SA must be processed by the same
cpu to avoid packet reordering and contention on the xfrm state
lock.

To get rid of this limitation, the IETF standardized per cpu SAs
in RFC 9611. This patch implements the xfrm part of it.

We add the cpu as a lookup key for xfrm states and a config option
to generate acquire messages for each cpu.

With that, we can have an SA with an identical traffic selector on
each cpu, so that flows can be processed in parallel on all cpus.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
Tested-by: Antony Antony <antony.antony@secunet.com>
Tested-by: Tobias Brunner <tobias@strongswan.org>

+112 -22
+3 -2
include/net/xfrm.h
··· 188 188 refcount_t refcnt; 189 189 spinlock_t lock; 190 190 191 + u32 pcpu_num; 191 192 struct xfrm_id id; 192 193 struct xfrm_selector sel; 193 194 struct xfrm_mark mark; ··· 1685 1684 u32 spdhmcnt; 1686 1685 }; 1687 1686 1688 - struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq); 1687 + struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num); 1689 1688 int xfrm_state_delete(struct xfrm_state *x); 1690 1689 int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync); 1691 1690 int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid); ··· 1797 1796 int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi, 1798 1797 struct netlink_ext_ack *extack); 1799 1798 struct xfrm_state *xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, 1800 - u8 mode, u32 reqid, u32 if_id, u8 proto, 1799 + u8 mode, u32 reqid, u32 if_id, u32 pcpu_num, u8 proto, 1801 1800 const xfrm_address_t *daddr, 1802 1801 const xfrm_address_t *saddr, int create, 1803 1802 unsigned short family);
+2
include/uapi/linux/xfrm.h
··· 322 322 XFRMA_MTIMER_THRESH, /* __u32 in seconds for input SA */ 323 323 XFRMA_SA_DIR, /* __u8 */ 324 324 XFRMA_NAT_KEEPALIVE_INTERVAL, /* __u32 in seconds for NAT keepalive */ 325 + XFRMA_SA_PCPU, /* __u32 */ 325 326 __XFRMA_MAX 326 327 327 328 #define XFRMA_OUTPUT_MARK XFRMA_SET_MARK /* Compatibility */ ··· 438 437 #define XFRM_POLICY_LOCALOK 1 /* Allow user to override global policy */ 439 438 /* Automatically expand selector to include matching ICMP payloads. */ 440 439 #define XFRM_POLICY_ICMP 2 440 + #define XFRM_POLICY_CPU_ACQUIRE 4 441 441 __u8 share; 442 442 }; 443 443
+4 -3
net/key/af_key.c
··· 1354 1354 } 1355 1355 1356 1356 if (hdr->sadb_msg_seq) { 1357 - x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq); 1357 + x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq, UINT_MAX); 1358 1358 if (x && !xfrm_addr_equal(&x->id.daddr, xdaddr, family)) { 1359 1359 xfrm_state_put(x); 1360 1360 x = NULL; ··· 1362 1362 } 1363 1363 1364 1364 if (!x) 1365 - x = xfrm_find_acq(net, &dummy_mark, mode, reqid, 0, proto, xdaddr, xsaddr, 1, family); 1365 + x = xfrm_find_acq(net, &dummy_mark, mode, reqid, 0, UINT_MAX, 1366 + proto, xdaddr, xsaddr, 1, family); 1366 1367 1367 1368 if (x == NULL) 1368 1369 return -ENOENT; ··· 1418 1417 if (hdr->sadb_msg_seq == 0 || hdr->sadb_msg_errno == 0) 1419 1418 return 0; 1420 1419 1421 - x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq); 1420 + x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq, UINT_MAX); 1422 1421 if (x == NULL) 1423 1422 return 0; 1424 1423
+4 -2
net/xfrm/xfrm_compat.c
··· 132 132 [XFRMA_MTIMER_THRESH] = { .type = NLA_U32 }, 133 133 [XFRMA_SA_DIR] = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT), 134 134 [XFRMA_NAT_KEEPALIVE_INTERVAL] = { .type = NLA_U32 }, 135 + [XFRMA_SA_PCPU] = { .type = NLA_U32 }, 135 136 }; 136 137 137 138 static struct nlmsghdr *xfrm_nlmsg_put_compat(struct sk_buff *skb, ··· 283 282 case XFRMA_MTIMER_THRESH: 284 283 case XFRMA_SA_DIR: 285 284 case XFRMA_NAT_KEEPALIVE_INTERVAL: 285 + case XFRMA_SA_PCPU: 286 286 return xfrm_nla_cpy(dst, src, nla_len(src)); 287 287 default: 288 - BUILD_BUG_ON(XFRMA_MAX != XFRMA_NAT_KEEPALIVE_INTERVAL); 288 + BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_PCPU); 289 289 pr_warn_once("unsupported nla_type %d\n", src->nla_type); 290 290 return -EOPNOTSUPP; 291 291 } ··· 441 439 int err; 442 440 443 441 if (type > XFRMA_MAX) { 444 - BUILD_BUG_ON(XFRMA_MAX != XFRMA_NAT_KEEPALIVE_INTERVAL); 442 + BUILD_BUG_ON(XFRMA_MAX != XFRMA_SA_PCPU); 445 443 NL_SET_ERR_MSG(extack, "Bad attribute"); 446 444 return -EOPNOTSUPP; 447 445 }
+47 -11
net/xfrm/xfrm_state.c
··· 679 679 x->lft.hard_packet_limit = XFRM_INF; 680 680 x->replay_maxage = 0; 681 681 x->replay_maxdiff = 0; 682 + x->pcpu_num = UINT_MAX; 682 683 spin_lock_init(&x->lock); 683 684 } 684 685 return x; ··· 1156 1155 struct xfrm_state **best, int *acq_in_progress, 1157 1156 int *error) 1158 1157 { 1158 + /* We need the cpu id just as a lookup key, 1159 + * we don't require it to be stable. 1160 + */ 1161 + unsigned int pcpu_id = get_cpu(); 1162 + put_cpu(); 1163 + 1159 1164 /* Resolution logic: 1160 1165 * 1. There is a valid state with matching selector. Done. 1161 1166 * 2. Valid state with inappropriate selector. Skip. ··· 1181 1174 &fl->u.__fl_common)) 1182 1175 return; 1183 1176 1177 + if (x->pcpu_num != UINT_MAX && x->pcpu_num != pcpu_id) 1178 + return; 1179 + 1184 1180 if (!*best || 1181 + ((*best)->pcpu_num == UINT_MAX && x->pcpu_num == pcpu_id) || 1185 1182 (*best)->km.dying > x->km.dying || 1186 1183 ((*best)->km.dying == x->km.dying && 1187 1184 (*best)->curlft.add_time < x->curlft.add_time)) 1188 1185 *best = x; 1189 1186 } else if (x->km.state == XFRM_STATE_ACQ) { 1190 - *acq_in_progress = 1; 1187 + if (!*best || x->pcpu_num == pcpu_id) 1188 + *acq_in_progress = 1; 1191 1189 } else if (x->km.state == XFRM_STATE_ERROR || 1192 1190 x->km.state == XFRM_STATE_EXPIRED) { 1193 1191 if ((!x->sel.family || ··· 1221 1209 unsigned short encap_family = tmpl->encap_family; 1222 1210 unsigned int sequence; 1223 1211 struct km_event c; 1212 + unsigned int pcpu_id; 1213 + 1214 + /* We need the cpu id just as a lookup key, 1215 + * we don't require it to be stable. 
1216 + */ 1217 + pcpu_id = get_cpu(); 1218 + put_cpu(); 1224 1219 1225 1220 to_put = NULL; 1226 1221 ··· 1301 1282 } 1302 1283 1303 1284 found: 1304 - x = best; 1285 + if (!(pol->flags & XFRM_POLICY_CPU_ACQUIRE) || 1286 + (best && (best->pcpu_num == pcpu_id))) 1287 + x = best; 1288 + 1305 1289 if (!x && !error && !acquire_in_progress) { 1306 1290 if (tmpl->id.spi && 1307 1291 (x0 = __xfrm_state_lookup_all(net, mark, daddr, ··· 1336 1314 xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family); 1337 1315 memcpy(&x->mark, &pol->mark, sizeof(x->mark)); 1338 1316 x->if_id = if_id; 1317 + if ((pol->flags & XFRM_POLICY_CPU_ACQUIRE) && best) 1318 + x->pcpu_num = pcpu_id; 1339 1319 1340 1320 error = security_xfrm_state_alloc_acquire(x, pol->security, fl->flowi_secid); 1341 1321 if (error) { ··· 1416 1392 x = NULL; 1417 1393 error = -ESRCH; 1418 1394 } 1395 + 1396 + /* Use the already installed 'fallback' while the CPU-specific 1397 + * SA acquire is handled*/ 1398 + if (best) 1399 + x = best; 1419 1400 } 1420 1401 out: 1421 1402 if (x) { ··· 1553 1524 unsigned int h; 1554 1525 u32 mark = xnew->mark.v & xnew->mark.m; 1555 1526 u32 if_id = xnew->if_id; 1527 + u32 cpu_id = xnew->pcpu_num; 1556 1528 1557 1529 h = xfrm_dst_hash(net, &xnew->id.daddr, &xnew->props.saddr, reqid, family); 1558 1530 hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { 1559 1531 if (x->props.family == family && 1560 1532 x->props.reqid == reqid && 1561 1533 x->if_id == if_id && 1534 + x->pcpu_num == cpu_id && 1562 1535 (mark & x->mark.m) == x->mark.v && 1563 1536 xfrm_addr_equal(&x->id.daddr, &xnew->id.daddr, family) && 1564 1537 xfrm_addr_equal(&x->props.saddr, &xnew->props.saddr, family)) ··· 1583 1552 static struct xfrm_state *__find_acq_core(struct net *net, 1584 1553 const struct xfrm_mark *m, 1585 1554 unsigned short family, u8 mode, 1586 - u32 reqid, u32 if_id, u8 proto, 1555 + u32 reqid, u32 if_id, u32 pcpu_num, u8 proto, 1587 1556 const xfrm_address_t *daddr, 1588 1557 const 
xfrm_address_t *saddr, 1589 1558 int create) ··· 1600 1569 x->id.spi != 0 || 1601 1570 x->id.proto != proto || 1602 1571 (mark & x->mark.m) != x->mark.v || 1572 + x->pcpu_num != pcpu_num || 1603 1573 !xfrm_addr_equal(&x->id.daddr, daddr, family) || 1604 1574 !xfrm_addr_equal(&x->props.saddr, saddr, family)) 1605 1575 continue; ··· 1634 1602 break; 1635 1603 } 1636 1604 1605 + x->pcpu_num = pcpu_num; 1637 1606 x->km.state = XFRM_STATE_ACQ; 1638 1607 x->id.proto = proto; 1639 1608 x->props.family = family; ··· 1663 1630 return x; 1664 1631 } 1665 1632 1666 - static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq); 1633 + static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num); 1667 1634 1668 1635 int xfrm_state_add(struct xfrm_state *x) 1669 1636 { ··· 1689 1656 } 1690 1657 1691 1658 if (use_spi && x->km.seq) { 1692 - x1 = __xfrm_find_acq_byseq(net, mark, x->km.seq); 1659 + x1 = __xfrm_find_acq_byseq(net, mark, x->km.seq, x->pcpu_num); 1693 1660 if (x1 && ((x1->id.proto != x->id.proto) || 1694 1661 !xfrm_addr_equal(&x1->id.daddr, &x->id.daddr, family))) { 1695 1662 to_put = x1; ··· 1699 1666 1700 1667 if (use_spi && !x1) 1701 1668 x1 = __find_acq_core(net, &x->mark, family, x->props.mode, 1702 - x->props.reqid, x->if_id, x->id.proto, 1669 + x->props.reqid, x->if_id, x->pcpu_num, x->id.proto, 1703 1670 &x->id.daddr, &x->props.saddr, 0); 1704 1671 1705 1672 __xfrm_state_bump_genids(x); ··· 1824 1791 x->props.flags = orig->props.flags; 1825 1792 x->props.extra_flags = orig->props.extra_flags; 1826 1793 1794 + x->pcpu_num = orig->pcpu_num; 1827 1795 x->if_id = orig->if_id; 1828 1796 x->tfcpad = orig->tfcpad; 1829 1797 x->replay_maxdiff = orig->replay_maxdiff; ··· 2100 2066 2101 2067 struct xfrm_state * 2102 2068 xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid, 2103 - u32 if_id, u8 proto, const xfrm_address_t *daddr, 2069 + u32 if_id, u32 pcpu_num, u8 proto, const 
xfrm_address_t *daddr, 2104 2070 const xfrm_address_t *saddr, int create, unsigned short family) 2105 2071 { 2106 2072 struct xfrm_state *x; 2107 2073 2108 2074 spin_lock_bh(&net->xfrm.xfrm_state_lock); 2109 - x = __find_acq_core(net, mark, family, mode, reqid, if_id, proto, daddr, saddr, create); 2075 + x = __find_acq_core(net, mark, family, mode, reqid, if_id, pcpu_num, 2076 + proto, daddr, saddr, create); 2110 2077 spin_unlock_bh(&net->xfrm.xfrm_state_lock); 2111 2078 2112 2079 return x; ··· 2242 2207 2243 2208 /* Silly enough, but I'm lazy to build resolution list */ 2244 2209 2245 - static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq) 2210 + static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num) 2246 2211 { 2247 2212 unsigned int h = xfrm_seq_hash(net, seq); 2248 2213 struct xfrm_state *x; ··· 2250 2215 hlist_for_each_entry_rcu(x, net->xfrm.state_byseq + h, byseq) { 2251 2216 if (x->km.seq == seq && 2252 2217 (mark & x->mark.m) == x->mark.v && 2218 + x->pcpu_num == pcpu_num && 2253 2219 x->km.state == XFRM_STATE_ACQ) { 2254 2220 xfrm_state_hold(x); 2255 2221 return x; ··· 2260 2224 return NULL; 2261 2225 } 2262 2226 2263 - struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq) 2227 + struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq, u32 pcpu_num) 2264 2228 { 2265 2229 struct xfrm_state *x; 2266 2230 2267 2231 spin_lock_bh(&net->xfrm.xfrm_state_lock); 2268 - x = __xfrm_find_acq_byseq(net, mark, seq); 2232 + x = __xfrm_find_acq_byseq(net, mark, seq, pcpu_num); 2269 2233 spin_unlock_bh(&net->xfrm.xfrm_state_lock); 2270 2234 return x; 2271 2235 }
+52 -4
net/xfrm/xfrm_user.c
··· 460 460 } 461 461 } 462 462 463 + if (!sa_dir && attrs[XFRMA_SA_PCPU]) { 464 + NL_SET_ERR_MSG(extack, "SA_PCPU only supported with SA_DIR"); 465 + err = -EINVAL; 466 + goto out; 467 + } 468 + 463 469 out: 464 470 return err; 465 471 } ··· 846 840 if (attrs[XFRMA_NAT_KEEPALIVE_INTERVAL]) 847 841 x->nat_keepalive_interval = 848 842 nla_get_u32(attrs[XFRMA_NAT_KEEPALIVE_INTERVAL]); 843 + 844 + if (attrs[XFRMA_SA_PCPU]) { 845 + x->pcpu_num = nla_get_u32(attrs[XFRMA_SA_PCPU]); 846 + if (x->pcpu_num >= num_possible_cpus()) 847 + goto error; 848 + } 849 849 850 850 err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV], extack); 851 851 if (err) ··· 1308 1296 if (ret) 1309 1297 goto out; 1310 1298 } 1299 + if (x->pcpu_num != UINT_MAX) { 1300 + ret = nla_put_u32(skb, XFRMA_SA_PCPU, x->pcpu_num); 1301 + if (ret) 1302 + goto out; 1303 + } 1311 1304 if (x->dir) 1312 1305 ret = nla_put_u8(skb, XFRMA_SA_DIR, x->dir); 1313 1306 ··· 1717 1700 u32 mark; 1718 1701 struct xfrm_mark m; 1719 1702 u32 if_id = 0; 1703 + u32 pcpu_num = UINT_MAX; 1720 1704 1721 1705 p = nlmsg_data(nlh); 1722 1706 err = verify_spi_info(p->info.id.proto, p->min, p->max, extack); ··· 1734 1716 if (attrs[XFRMA_IF_ID]) 1735 1717 if_id = nla_get_u32(attrs[XFRMA_IF_ID]); 1736 1718 1719 + if (attrs[XFRMA_SA_PCPU]) { 1720 + pcpu_num = nla_get_u32(attrs[XFRMA_SA_PCPU]); 1721 + if (pcpu_num >= num_possible_cpus()) { 1722 + err = -EINVAL; 1723 + goto out_noput; 1724 + } 1725 + } 1726 + 1737 1727 if (p->info.seq) { 1738 - x = xfrm_find_acq_byseq(net, mark, p->info.seq); 1728 + x = xfrm_find_acq_byseq(net, mark, p->info.seq, pcpu_num); 1739 1729 if (x && !xfrm_addr_equal(&x->id.daddr, daddr, family)) { 1740 1730 xfrm_state_put(x); 1741 1731 x = NULL; ··· 1752 1726 1753 1727 if (!x) 1754 1728 x = xfrm_find_acq(net, &m, p->info.mode, p->info.reqid, 1755 - if_id, p->info.id.proto, daddr, 1729 + if_id, pcpu_num, p->info.id.proto, daddr, 1756 1730 &p->info.saddr, 1, 1757 1731 family); 1758 1732 err = -ENOENT; ··· 
2552 2526 + nla_total_size(sizeof(struct xfrm_mark)) 2553 2527 + nla_total_size(4) /* XFRM_AE_RTHR */ 2554 2528 + nla_total_size(4) /* XFRM_AE_ETHR */ 2555 - + nla_total_size(sizeof(x->dir)); /* XFRMA_SA_DIR */ 2529 + + nla_total_size(sizeof(x->dir)) /* XFRMA_SA_DIR */ 2530 + + nla_total_size(4); /* XFRMA_SA_PCPU */ 2556 2531 } 2557 2532 2558 2533 static int build_aevent(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c) ··· 2609 2582 err = xfrm_if_id_put(skb, x->if_id); 2610 2583 if (err) 2611 2584 goto out_cancel; 2585 + if (x->pcpu_num != UINT_MAX) 2586 + err = nla_put_u32(skb, XFRMA_SA_PCPU, x->pcpu_num); 2612 2587 2613 2588 if (x->dir) { 2614 2589 err = nla_put_u8(skb, XFRMA_SA_DIR, x->dir); ··· 2880 2851 goto nomem; 2881 2852 2882 2853 xfrm_mark_get(attrs, &mark); 2854 + 2855 + if (attrs[XFRMA_SA_PCPU]) { 2856 + x->pcpu_num = nla_get_u32(attrs[XFRMA_SA_PCPU]); 2857 + err = -EINVAL; 2858 + if (x->pcpu_num >= num_possible_cpus()) 2859 + goto free_state; 2860 + } 2883 2861 2884 2862 err = verify_newpolicy_info(&ua->policy, extack); 2885 2863 if (err) ··· 3218 3182 [XFRMA_MTIMER_THRESH] = { .type = NLA_U32 }, 3219 3183 [XFRMA_SA_DIR] = NLA_POLICY_RANGE(NLA_U8, XFRM_SA_DIR_IN, XFRM_SA_DIR_OUT), 3220 3184 [XFRMA_NAT_KEEPALIVE_INTERVAL] = { .type = NLA_U32 }, 3185 + [XFRMA_SA_PCPU] = { .type = NLA_U32 }, 3221 3186 }; 3222 3187 EXPORT_SYMBOL_GPL(xfrma_policy); 3223 3188 ··· 3385 3348 { 3386 3349 return NLMSG_ALIGN(sizeof(struct xfrm_user_expire)) + 3387 3350 nla_total_size(sizeof(struct xfrm_mark)) + 3388 - nla_total_size(sizeof_field(struct xfrm_state, dir)); 3351 + nla_total_size(sizeof_field(struct xfrm_state, dir)) + 3352 + nla_total_size(4); /* XFRMA_SA_PCPU */ 3389 3353 } 3390 3354 3391 3355 static int build_expire(struct sk_buff *skb, struct xfrm_state *x, const struct km_event *c) ··· 3412 3374 err = xfrm_if_id_put(skb, x->if_id); 3413 3375 if (err) 3414 3376 return err; 3377 + if (x->pcpu_num != UINT_MAX) { 3378 + err = nla_put_u32(skb, 
XFRMA_SA_PCPU, x->pcpu_num); 3379 + if (err) 3380 + return err; 3381 + } 3415 3382 3416 3383 if (x->dir) { 3417 3384 err = nla_put_u8(skb, XFRMA_SA_DIR, x->dir); ··· 3524 3481 } 3525 3482 if (x->if_id) 3526 3483 l += nla_total_size(sizeof(x->if_id)); 3484 + if (x->pcpu_num) 3485 + l += nla_total_size(sizeof(x->pcpu_num)); 3527 3486 3528 3487 /* Must count x->lastused as it may become non-zero behind our back. */ 3529 3488 l += nla_total_size_64bit(sizeof(u64)); ··· 3632 3587 + nla_total_size(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr) 3633 3588 + nla_total_size(sizeof(struct xfrm_mark)) 3634 3589 + nla_total_size(xfrm_user_sec_ctx_size(x->security)) 3590 + + nla_total_size(4) /* XFRMA_SA_PCPU */ 3635 3591 + userpolicy_type_attrsize(); 3636 3592 } 3637 3593 ··· 3669 3623 err = xfrm_if_id_put(skb, xp->if_id); 3670 3624 if (!err && xp->xdo.dev) 3671 3625 err = copy_user_offload(&xp->xdo, skb); 3626 + if (!err && x->pcpu_num != UINT_MAX) 3627 + err = nla_put_u32(skb, XFRMA_SA_PCPU, x->pcpu_num); 3672 3628 if (err) { 3673 3629 nlmsg_cancel(skb, nlh); 3674 3630 return err;