Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'tls-rx-nopad-and-backlog-flushing'

Jakub Kicinski says:

====================
tls: rx: nopad and backlog flushing

This small series contains the two changes I've been working
towards over the previous ~50 patches from a couple of months ago.

The first major change is the optional "nopad" optimization.
Currently TLS 1.3 Rx performs quite poorly because it does
not support the "zero-copy" or rather direct decrypt to a user
space buffer. Because of TLS 1.3 record padding we don't
know if a record contains data or a control message until
we decrypt it. Most records will contain data, though, so the
optimization is to try the decryption hoping it's data and
retry if it wasn't.

The performance gain from doing that is significant (~40%)
but, if I'm completely honest, the major reason is that we
call skb_cow_data() on the non-"zc" path. The next series
will remove the CoW, dropping the gain to only ~10%.

The second change is to flush the backlog every 128kB.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

+191 -17
+18
Documentation/networking/tls.rst
··· 239 239 this will look like TLS records had been tampered with and will result 240 240 in record authentication failures. 241 241 242 + TLS_RX_EXPECT_NO_PAD 243 + ~~~~~~~~~~~~~~~~~~~~ 244 + 245 + TLS 1.3 only. Expect the sender to not pad records. This allows the data 246 + to be decrypted directly into user space buffers with TLS 1.3. 247 + 248 + This optimization is safe to enable only if the remote end is trusted, 249 + otherwise it is an attack vector for doubling the TLS processing cost. 250 + 251 + If the record decrypted turns out to have been padded or is not a data 252 + record it will be decrypted again into a kernel buffer without zero copy. 253 + Such events are counted in the ``TlsDecryptRetry`` statistic. 254 + 242 255 Statistics 243 256 ========== 244 257 ··· 277 264 278 265 - ``TlsDeviceRxResync`` - 279 266 number of RX resyncs sent to NICs handling cryptography 267 + 268 + - ``TlsDecryptRetry`` - 269 + number of RX records which had to be re-decrypted due to 270 + ``TLS_RX_EXPECT_NO_PAD`` mis-prediction. Note that this counter will 271 + also increment for non-data records.
+8
include/linux/sockptr.h
··· 102 102 return strncpy_from_user(dst, src.user, count); 103 103 } 104 104 105 + static inline int check_zeroed_sockptr(sockptr_t src, size_t offset, 106 + size_t size) 107 + { 108 + if (!sockptr_is_kernel(src)) 109 + return check_zeroed_user(src.user + offset, size); 110 + return memchr_inv(src.kernel + offset, 0, size) == NULL; 111 + } 112 + 105 113 #endif /* _LINUX_SOCKPTR_H */
+3
include/net/tls.h
··· 149 149 150 150 struct sk_buff *recv_pkt; 151 151 u8 async_capable:1; 152 + u8 zc_capable:1; 152 153 atomic_t decrypt_pending; 153 154 /* protect crypto_wait with decrypt_pending*/ 154 155 spinlock_t decrypt_compl_lock; ··· 240 239 u8 tx_conf:3; 241 240 u8 rx_conf:3; 242 241 u8 zerocopy_sendfile:1; 242 + u8 rx_no_pad:1; 243 243 244 244 int (*push_pending_record)(struct sock *sk, int flags); 245 245 void (*sk_write_space)(struct sock *sk); ··· 360 358 void tls_err_abort(struct sock *sk, int err); 361 359 362 360 int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx); 361 + void tls_update_rx_zc_capable(struct tls_context *tls_ctx); 363 362 void tls_sw_strparser_arm(struct sock *sk, struct tls_context *ctx); 364 363 void tls_sw_strparser_done(struct tls_context *tls_ctx); 365 364 int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
+1
include/uapi/linux/snmp.h
··· 344 344 LINUX_MIB_TLSRXDEVICE, /* TlsRxDevice */ 345 345 LINUX_MIB_TLSDECRYPTERROR, /* TlsDecryptError */ 346 346 LINUX_MIB_TLSRXDEVICERESYNC, /* TlsRxDeviceResync */ 347 + LINUX_MIN_TLSDECRYPTRETRY, /* TlsDecryptRetry */ 347 348 __LINUX_MIB_TLSMAX 348 349 }; 349 350
+2
include/uapi/linux/tls.h
··· 40 40 #define TLS_TX 1 /* Set transmit parameters */ 41 41 #define TLS_RX 2 /* Set receive parameters */ 42 42 #define TLS_TX_ZEROCOPY_RO 3 /* TX zerocopy (only sendfile now) */ 43 + #define TLS_RX_EXPECT_NO_PAD 4 /* Attempt opportunistic zero-copy */ 43 44 44 45 /* Supported versions */ 45 46 #define TLS_VERSION_MINOR(ver) ((ver) & 0xFF) ··· 163 162 TLS_INFO_TXCONF, 164 163 TLS_INFO_RXCONF, 165 164 TLS_INFO_ZC_RO_TX, 165 + TLS_INFO_RX_NO_PAD, 166 166 __TLS_INFO_MAX, 167 167 }; 168 168 #define TLS_INFO_MAX (__TLS_INFO_MAX - 1)
+1
net/core/sock.c
··· 2870 2870 __release_sock(sk); 2871 2871 spin_unlock_bh(&sk->sk_lock.slock); 2872 2872 } 2873 + EXPORT_SYMBOL_GPL(__sk_flush_backlog); 2873 2874 2874 2875 /** 2875 2876 * sk_wait_data - wait for data to arrive at sk_receive_queue
+75
net/tls/tls_main.c
··· 533 533 return 0; 534 534 } 535 535 536 + static int do_tls_getsockopt_no_pad(struct sock *sk, char __user *optval, 537 + int __user *optlen) 538 + { 539 + struct tls_context *ctx = tls_get_ctx(sk); 540 + unsigned int value; 541 + int err, len; 542 + 543 + if (ctx->prot_info.version != TLS_1_3_VERSION) 544 + return -EINVAL; 545 + 546 + if (get_user(len, optlen)) 547 + return -EFAULT; 548 + if (len < sizeof(value)) 549 + return -EINVAL; 550 + 551 + lock_sock(sk); 552 + err = -EINVAL; 553 + if (ctx->rx_conf == TLS_SW || ctx->rx_conf == TLS_HW) 554 + value = ctx->rx_no_pad; 555 + release_sock(sk); 556 + if (err) 557 + return err; 558 + 559 + if (put_user(sizeof(value), optlen)) 560 + return -EFAULT; 561 + if (copy_to_user(optval, &value, sizeof(value))) 562 + return -EFAULT; 563 + 564 + return 0; 565 + } 566 + 536 567 static int do_tls_getsockopt(struct sock *sk, int optname, 537 568 char __user *optval, int __user *optlen) 538 569 { ··· 577 546 break; 578 547 case TLS_TX_ZEROCOPY_RO: 579 548 rc = do_tls_getsockopt_tx_zc(sk, optval, optlen); 549 + break; 550 + case TLS_RX_EXPECT_NO_PAD: 551 + rc = do_tls_getsockopt_no_pad(sk, optval, optlen); 580 552 break; 581 553 default: 582 554 rc = -ENOPROTOOPT; ··· 752 718 return 0; 753 719 } 754 720 721 + static int do_tls_setsockopt_no_pad(struct sock *sk, sockptr_t optval, 722 + unsigned int optlen) 723 + { 724 + struct tls_context *ctx = tls_get_ctx(sk); 725 + u32 val; 726 + int rc; 727 + 728 + if (ctx->prot_info.version != TLS_1_3_VERSION || 729 + sockptr_is_null(optval) || optlen < sizeof(val)) 730 + return -EINVAL; 731 + 732 + rc = copy_from_sockptr(&val, optval, sizeof(val)); 733 + if (rc) 734 + return -EFAULT; 735 + if (val > 1) 736 + return -EINVAL; 737 + rc = check_zeroed_sockptr(optval, sizeof(val), optlen - sizeof(val)); 738 + if (rc < 1) 739 + return rc == 0 ? 
-EINVAL : rc; 740 + 741 + lock_sock(sk); 742 + rc = -EINVAL; 743 + if (ctx->rx_conf == TLS_SW || ctx->rx_conf == TLS_HW) { 744 + ctx->rx_no_pad = val; 745 + tls_update_rx_zc_capable(ctx); 746 + rc = 0; 747 + } 748 + release_sock(sk); 749 + 750 + return rc; 751 + } 752 + 755 753 static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, 756 754 unsigned int optlen) 757 755 { ··· 801 735 lock_sock(sk); 802 736 rc = do_tls_setsockopt_tx_zc(sk, optval, optlen); 803 737 release_sock(sk); 738 + break; 739 + case TLS_RX_EXPECT_NO_PAD: 740 + rc = do_tls_setsockopt_no_pad(sk, optval, optlen); 804 741 break; 805 742 default: 806 743 rc = -ENOPROTOOPT; ··· 1045 976 if (err) 1046 977 goto nla_failure; 1047 978 } 979 + if (ctx->rx_no_pad) { 980 + err = nla_put_flag(skb, TLS_INFO_RX_NO_PAD); 981 + if (err) 982 + goto nla_failure; 983 + } 1048 984 1049 985 rcu_read_unlock(); 1050 986 nla_nest_end(skb, start); ··· 1071 997 nla_total_size(sizeof(u16)) + /* TLS_INFO_RXCONF */ 1072 998 nla_total_size(sizeof(u16)) + /* TLS_INFO_TXCONF */ 1073 999 nla_total_size(0) + /* TLS_INFO_ZC_RO_TX */ 1000 + nla_total_size(0) + /* TLS_INFO_RX_NO_PAD */ 1074 1001 0; 1075 1002 1076 1003 return size;
+1
net/tls/tls_proc.c
··· 18 18 SNMP_MIB_ITEM("TlsRxDevice", LINUX_MIB_TLSRXDEVICE), 19 19 SNMP_MIB_ITEM("TlsDecryptError", LINUX_MIB_TLSDECRYPTERROR), 20 20 SNMP_MIB_ITEM("TlsRxDeviceResync", LINUX_MIB_TLSRXDEVICERESYNC), 21 + SNMP_MIB_ITEM("TlsDecryptRetry", LINUX_MIN_TLSDECRYPTRETRY), 21 22 SNMP_MIB_SENTINEL 22 23 }; 23 24
+67 -17
net/tls/tls_sw.c
··· 47 47 struct tls_decrypt_arg { 48 48 bool zc; 49 49 bool async; 50 + u8 tail; 50 51 }; 51 52 52 53 noinline void tls_err_abort(struct sock *sk, int err) ··· 134 133 return __skb_nsg(skb, offset, len, 0); 135 134 } 136 135 137 - static int padding_length(struct tls_prot_info *prot, struct sk_buff *skb) 136 + static int tls_padding_length(struct tls_prot_info *prot, struct sk_buff *skb, 137 + struct tls_decrypt_arg *darg) 138 138 { 139 139 struct strp_msg *rxm = strp_msg(skb); 140 140 struct tls_msg *tlm = tls_msg(skb); ··· 144 142 /* Determine zero-padding length */ 145 143 if (prot->version == TLS_1_3_VERSION) { 146 144 int offset = rxm->full_len - TLS_TAG_SIZE - 1; 147 - char content_type = 0; 145 + char content_type = darg->zc ? darg->tail : 0; 148 146 int err; 149 147 150 148 while (content_type == 0) { ··· 1420 1418 struct strp_msg *rxm = strp_msg(skb); 1421 1419 struct tls_msg *tlm = tls_msg(skb); 1422 1420 int n_sgin, n_sgout, nsg, mem_size, aead_size, err, pages = 0; 1421 + u8 *aad, *iv, *tail, *mem = NULL; 1423 1422 struct aead_request *aead_req; 1424 1423 struct sk_buff *unused; 1425 - u8 *aad, *iv, *mem = NULL; 1426 1424 struct scatterlist *sgin = NULL; 1427 1425 struct scatterlist *sgout = NULL; 1428 - const int data_len = rxm->full_len - prot->overhead_size + 1429 - prot->tail_size; 1426 + const int data_len = rxm->full_len - prot->overhead_size; 1427 + int tail_pages = !!prot->tail_size; 1430 1428 int iv_offset = 0; 1431 1429 1432 1430 if (darg->zc && (out_iov || out_sg)) { 1433 1431 if (out_iov) 1434 - n_sgout = 1 + 1432 + n_sgout = 1 + tail_pages + 1435 1433 iov_iter_npages_cap(out_iov, INT_MAX, data_len); 1436 1434 else 1437 1435 n_sgout = sg_nents(out_sg); ··· 1455 1453 mem_size = aead_size + (nsg * sizeof(struct scatterlist)); 1456 1454 mem_size = mem_size + prot->aad_size; 1457 1455 mem_size = mem_size + MAX_IV_SIZE; 1456 + mem_size = mem_size + prot->tail_size; 1458 1457 1459 1458 /* Allocate a single block of memory which contains 1460 - * 
aead_req || sgin[] || sgout[] || aad || iv. 1459 + * aead_req || sgin[] || sgout[] || aad || iv || tail. 1461 1460 * This order achieves correct alignment for aead_req, sgin, sgout. 1462 1461 */ 1463 1462 mem = kmalloc(mem_size, sk->sk_allocation); ··· 1471 1468 sgout = sgin + n_sgin; 1472 1469 aad = (u8 *)(sgout + n_sgout); 1473 1470 iv = aad + prot->aad_size; 1471 + tail = iv + MAX_IV_SIZE; 1474 1472 1475 1473 /* For CCM based ciphers, first byte of nonce+iv is a constant */ 1476 1474 switch (prot->cipher_type) { ··· 1525 1521 1526 1522 err = tls_setup_from_iter(out_iov, data_len, 1527 1523 &pages, &sgout[1], 1528 - (n_sgout - 1)); 1524 + (n_sgout - 1 - tail_pages)); 1529 1525 if (err < 0) 1530 1526 goto fallback_to_reg_recv; 1527 + 1528 + if (prot->tail_size) { 1529 + sg_unmark_end(&sgout[pages]); 1530 + sg_set_buf(&sgout[pages + 1], tail, 1531 + prot->tail_size); 1532 + sg_mark_end(&sgout[pages + 1]); 1533 + } 1531 1534 } else if (out_sg) { 1532 1535 memcpy(sgout, out_sg, n_sgout * sizeof(*sgout)); 1533 1536 } else { ··· 1549 1538 1550 1539 /* Prepare and submit AEAD request */ 1551 1540 err = tls_do_decryption(sk, skb, sgin, sgout, iv, 1552 - data_len, aead_req, darg); 1541 + data_len + prot->tail_size, aead_req, darg); 1553 1542 if (darg->async) 1554 1543 return 0; 1544 + 1545 + if (prot->tail_size) 1546 + darg->tail = *tail; 1555 1547 1556 1548 /* Release the pages in case iov was mapped to pages */ 1557 1549 for (; pages > 0; pages--) ··· 1597 1583 return err; 1598 1584 if (darg->async) 1599 1585 goto decrypt_next; 1586 + /* If opportunistic TLS 1.3 ZC failed retry without ZC */ 1587 + if (unlikely(darg->zc && prot->version == TLS_1_3_VERSION && 1588 + darg->tail != TLS_RECORD_TYPE_DATA)) { 1589 + darg->zc = false; 1590 + TLS_INC_STATS(sock_net(sk), LINUX_MIN_TLSDECRYPTRETRY); 1591 + return decrypt_skb_update(sk, skb, dest, darg); 1592 + } 1600 1593 1601 1594 decrypt_done: 1602 - pad = padding_length(prot, skb); 1595 + pad = tls_padding_length(prot, skb, 
darg); 1603 1596 if (pad < 0) 1604 1597 return pad; 1605 1598 ··· 1738 1717 return copied ? : err; 1739 1718 } 1740 1719 1720 + static void 1721 + tls_read_flush_backlog(struct sock *sk, struct tls_prot_info *prot, 1722 + size_t len_left, size_t decrypted, ssize_t done, 1723 + size_t *flushed_at) 1724 + { 1725 + size_t max_rec; 1726 + 1727 + if (len_left <= decrypted) 1728 + return; 1729 + 1730 + max_rec = prot->overhead_size - prot->tail_size + TLS_MAX_PAYLOAD_SIZE; 1731 + if (done - *flushed_at < SZ_128K && tcp_inq(sk) > max_rec) 1732 + return; 1733 + 1734 + *flushed_at = done; 1735 + sk_flush_backlog(sk); 1736 + } 1737 + 1741 1738 int tls_sw_recvmsg(struct sock *sk, 1742 1739 struct msghdr *msg, 1743 1740 size_t len, ··· 1768 1729 struct sk_psock *psock; 1769 1730 unsigned char control = 0; 1770 1731 ssize_t decrypted = 0; 1732 + size_t flushed_at = 0; 1771 1733 struct strp_msg *rxm; 1772 1734 struct tls_msg *tlm; 1773 1735 struct sk_buff *skb; ··· 1807 1767 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); 1808 1768 1809 1769 zc_capable = !bpf_strp_enabled && !is_kvec && !is_peek && 1810 - prot->version != TLS_1_3_VERSION; 1770 + ctx->zc_capable; 1811 1771 decrypted = 0; 1812 1772 while (len && (decrypted + copied < target || ctx->recv_pkt)) { 1813 1773 struct tls_decrypt_arg darg = {}; ··· 1857 1817 err = tls_record_content_type(msg, tlm, &control); 1858 1818 if (err <= 0) 1859 1819 goto recv_end; 1820 + 1821 + /* periodically flush backlog, and feed strparser */ 1822 + tls_read_flush_backlog(sk, prot, len, to_decrypt, 1823 + decrypted + copied, &flushed_at); 1860 1824 1861 1825 ctx->recv_pkt = NULL; 1862 1826 __strp_unpause(&ctx->strp); ··· 2293 2249 strp_check_rcv(&rx_ctx->strp); 2294 2250 } 2295 2251 2252 + void tls_update_rx_zc_capable(struct tls_context *tls_ctx) 2253 + { 2254 + struct tls_sw_context_rx *rx_ctx = tls_sw_ctx_rx(tls_ctx); 2255 + 2256 + rx_ctx->zc_capable = tls_ctx->rx_no_pad || 2257 + tls_ctx->prot_info.version != TLS_1_3_VERSION; 2258 + } 
2259 + 2296 2260 int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) 2297 2261 { 2298 2262 struct tls_context *tls_ctx = tls_get_ctx(sk); ··· 2536 2484 if (sw_ctx_rx) { 2537 2485 tfm = crypto_aead_tfm(sw_ctx_rx->aead_recv); 2538 2486 2539 - if (crypto_info->version == TLS_1_3_VERSION) 2540 - sw_ctx_rx->async_capable = 0; 2541 - else 2542 - sw_ctx_rx->async_capable = 2543 - !!(tfm->__crt_alg->cra_flags & 2544 - CRYPTO_ALG_ASYNC); 2487 + tls_update_rx_zc_capable(ctx); 2488 + sw_ctx_rx->async_capable = 2489 + crypto_info->version != TLS_1_3_VERSION && 2490 + !!(tfm->__crt_alg->cra_flags & CRYPTO_ALG_ASYNC); 2545 2491 2546 2492 /* Set up strparser */ 2547 2493 memset(&cb, 0, sizeof(cb));
+15
tools/testing/selftests/net/tls.c
··· 235 235 { 236 236 uint16_t tls_version; 237 237 uint16_t cipher_type; 238 + bool nopad; 238 239 }; 239 240 240 241 FIXTURE_VARIANT_ADD(tls, 12_aes_gcm) ··· 298 297 .cipher_type = TLS_CIPHER_AES_GCM_256, 299 298 }; 300 299 300 + FIXTURE_VARIANT_ADD(tls, 13_nopad) 301 + { 302 + .tls_version = TLS_1_3_VERSION, 303 + .cipher_type = TLS_CIPHER_AES_GCM_128, 304 + .nopad = true, 305 + }; 306 + 301 307 FIXTURE_SETUP(tls) 302 308 { 303 309 struct tls_crypto_info_keys tls12; 310 + int one = 1; 304 311 int ret; 305 312 306 313 tls_crypto_info_init(variant->tls_version, variant->cipher_type, ··· 324 315 325 316 ret = setsockopt(self->cfd, SOL_TLS, TLS_RX, &tls12, tls12.len); 326 317 ASSERT_EQ(ret, 0); 318 + 319 + if (variant->nopad) { 320 + ret = setsockopt(self->cfd, SOL_TLS, TLS_RX_EXPECT_NO_PAD, 321 + (void *)&one, sizeof(one)); 322 + ASSERT_EQ(ret, 0); 323 + } 327 324 } 328 325 329 326 FIXTURE_TEARDOWN(tls)