Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-uring-ubufops' into HEAD

Pavel Begunkov says:

====================
implement io_uring notification (ubuf_info) stacking (net part)

To have per-request buffer notifications, each zerocopy io_uring send
request allocates a new ubuf_info. However, as an skb can carry only
one uarg, it may force the stack to create many small skbs, hurting
performance in many ways.

The patchset implements notification (i.e. io_uring's ubuf_info
extension) stacking. It links ubuf_info's into a list, making it
possible to have multiple of them per skb.

liburing/examples/send-zerocopy shows up to a 6x performance improvement
for TCP with 4KB per send, bringing it level with MSG_ZEROCOPY. Without
the patchset, much larger sends are required to utilise the full potential.

bytes | before | after (Kqps)
1200 | 195 | 1023
4000 | 193 | 1386
8000 | 154 | 1058
====================

Link: https://lore.kernel.org/all/cover.1713369317.git.asml.silence@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

+69 -36
+1 -1
drivers/net/tap.c
··· 754 754 skb_zcopy_init(skb, msg_control); 755 755 } else if (msg_control) { 756 756 struct ubuf_info *uarg = msg_control; 757 - uarg->callback(NULL, uarg, false); 757 + uarg->ops->complete(NULL, uarg, false); 758 758 } 759 759 760 760 dev_queue_xmit(skb);
+1 -1
drivers/net/tun.c
··· 1906 1906 skb_zcopy_init(skb, msg_control); 1907 1907 } else if (msg_control) { 1908 1908 struct ubuf_info *uarg = msg_control; 1909 - uarg->callback(NULL, uarg, false); 1909 + uarg->ops->complete(NULL, uarg, false); 1910 1910 } 1911 1911 1912 1912 skb_reset_network_header(skb);
+2 -3
drivers/net/xen-netback/common.h
··· 390 390 391 391 void xenvif_carrier_on(struct xenvif *vif); 392 392 393 - /* Callback from stack when TX packet can be released */ 394 - void xenvif_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *ubuf, 395 - bool zerocopy_success); 393 + /* Callbacks from stack when TX packet can be released */ 394 + extern const struct ubuf_info_ops xenvif_ubuf_ops; 396 395 397 396 static inline pending_ring_idx_t nr_pending_reqs(struct xenvif_queue *queue) 398 397 {
+1 -1
drivers/net/xen-netback/interface.c
··· 593 593 594 594 for (i = 0; i < MAX_PENDING_REQS; i++) { 595 595 queue->pending_tx_info[i].callback_struct = (struct ubuf_info_msgzc) 596 - { { .callback = xenvif_zerocopy_callback }, 596 + { { .ops = &xenvif_ubuf_ops }, 597 597 { { .ctx = NULL, 598 598 .desc = i } } }; 599 599 queue->grant_tx_handle[i] = NETBACK_INVALID_HANDLE;
+8 -3
drivers/net/xen-netback/netback.c
··· 1157 1157 uarg = skb_shinfo(skb)->destructor_arg; 1158 1158 /* increase inflight counter to offset decrement in callback */ 1159 1159 atomic_inc(&queue->inflight_packets); 1160 - uarg->callback(NULL, uarg, true); 1160 + uarg->ops->complete(NULL, uarg, true); 1161 1161 skb_shinfo(skb)->destructor_arg = NULL; 1162 1162 1163 1163 /* Fill the skb with the new (local) frags. */ ··· 1279 1279 return work_done; 1280 1280 } 1281 1281 1282 - void xenvif_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *ubuf_base, 1283 - bool zerocopy_success) 1282 + static void xenvif_zerocopy_callback(struct sk_buff *skb, 1283 + struct ubuf_info *ubuf_base, 1284 + bool zerocopy_success) 1284 1285 { 1285 1286 unsigned long flags; 1286 1287 pending_ring_idx_t index; ··· 1313 1312 queue->stats.tx_zerocopy_fail++; 1314 1313 xenvif_skb_zerocopy_complete(queue); 1315 1314 } 1315 + 1316 + const struct ubuf_info_ops xenvif_ubuf_ops = { 1317 + .complete = xenvif_zerocopy_callback, 1318 + }; 1316 1319 1317 1320 static inline void xenvif_tx_dealloc_action(struct xenvif_queue *queue) 1318 1321 {
+6 -2
drivers/vhost/net.c
··· 380 380 } 381 381 } 382 382 383 - static void vhost_zerocopy_callback(struct sk_buff *skb, 383 + static void vhost_zerocopy_complete(struct sk_buff *skb, 384 384 struct ubuf_info *ubuf_base, bool success) 385 385 { 386 386 struct ubuf_info_msgzc *ubuf = uarg_to_msgzc(ubuf_base); ··· 407 407 408 408 rcu_read_unlock_bh(); 409 409 } 410 + 411 + static const struct ubuf_info_ops vhost_ubuf_ops = { 412 + .complete = vhost_zerocopy_complete, 413 + }; 410 414 411 415 static inline unsigned long busy_clock(void) 412 416 { ··· 883 879 vq->heads[nvq->upend_idx].len = VHOST_DMA_IN_PROGRESS; 884 880 ubuf->ctx = nvq->ubufs; 885 881 ubuf->desc = nvq->upend_idx; 886 - ubuf->ubuf.callback = vhost_zerocopy_callback; 882 + ubuf->ubuf.ops = &vhost_ubuf_ops; 887 883 ubuf->ubuf.flags = SKBFL_ZEROCOPY_FRAG; 888 884 refcount_set(&ubuf->ubuf.refcnt, 1); 889 885 msg.msg_control = &ctl;
+13 -8
include/linux/skbuff.h
··· 527 527 #define SKBFL_ALL_ZEROCOPY (SKBFL_ZEROCOPY_FRAG | SKBFL_PURE_ZEROCOPY | \ 528 528 SKBFL_DONT_ORPHAN | SKBFL_MANAGED_FRAG_REFS) 529 529 530 + struct ubuf_info_ops { 531 + void (*complete)(struct sk_buff *, struct ubuf_info *, 532 + bool zerocopy_success); 533 + /* has to be compatible with skb_zcopy_set() */ 534 + int (*link_skb)(struct sk_buff *skb, struct ubuf_info *uarg); 535 + }; 536 + 530 537 /* 531 538 * The callback notifies userspace to release buffers when skb DMA is done in 532 539 * lower device, the skb last reference should be 0 when calling this. ··· 543 536 * The desc field is used to track userspace buffer index. 544 537 */ 545 538 struct ubuf_info { 546 - void (*callback)(struct sk_buff *, struct ubuf_info *, 547 - bool zerocopy_success); 539 + const struct ubuf_info_ops *ops; 548 540 refcount_t refcnt; 549 541 u8 flags; 550 542 }; ··· 1677 1671 } 1678 1672 #endif 1679 1673 1674 + extern const struct ubuf_info_ops msg_zerocopy_ubuf_ops; 1675 + 1680 1676 struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, 1681 1677 struct ubuf_info *uarg); 1682 1678 1683 1679 void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref); 1684 - 1685 - void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, 1686 - bool success); 1687 1680 1688 1681 int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, 1689 1682 struct sk_buff *skb, struct iov_iter *from, ··· 1771 1766 static inline void net_zcopy_put(struct ubuf_info *uarg) 1772 1767 { 1773 1768 if (uarg) 1774 - uarg->callback(NULL, uarg, true); 1769 + uarg->ops->complete(NULL, uarg, true); 1775 1770 } 1776 1771 1777 1772 static inline void net_zcopy_put_abort(struct ubuf_info *uarg, bool have_uref) 1778 1773 { 1779 1774 if (uarg) { 1780 - if (uarg->callback == msg_zerocopy_callback) 1775 + if (uarg->ops == &msg_zerocopy_ubuf_ops) 1781 1776 msg_zerocopy_put_abort(uarg, have_uref); 1782 1777 else if (have_uref) 1783 1778 net_zcopy_put(uarg); ··· 1791 1786 
1792 1787 if (uarg) { 1793 1788 if (!skb_zcopy_is_nouarg(skb)) 1794 - uarg->callback(skb, uarg, zerocopy_success); 1789 + uarg->ops->complete(skb, uarg, zerocopy_success); 1795 1790 1796 1791 skb_shinfo(skb)->flags &= ~SKBFL_ALL_ZEROCOPY; 1797 1792 }
+13 -5
io_uring/notif.c
··· 24 24 io_req_task_complete(notif, ts); 25 25 } 26 26 27 - static void io_tx_ubuf_callback(struct sk_buff *skb, struct ubuf_info *uarg, 27 + static void io_tx_ubuf_complete(struct sk_buff *skb, struct ubuf_info *uarg, 28 28 bool success) 29 29 { 30 30 struct io_notif_data *nd = container_of(uarg, struct io_notif_data, uarg); ··· 45 45 else if (!success && !nd->zc_copied) 46 46 WRITE_ONCE(nd->zc_copied, true); 47 47 } 48 - io_tx_ubuf_callback(skb, uarg, success); 48 + io_tx_ubuf_complete(skb, uarg, success); 49 49 } 50 + 51 + static const struct ubuf_info_ops io_ubuf_ops = { 52 + .complete = io_tx_ubuf_complete, 53 + }; 54 + 55 + static const struct ubuf_info_ops io_ubuf_ops_ext = { 56 + .complete = io_tx_ubuf_callback_ext, 57 + }; 50 58 51 59 void io_notif_set_extended(struct io_kiocb *notif) 52 60 { 53 61 struct io_notif_data *nd = io_notif_to_data(notif); 54 62 55 - if (nd->uarg.callback != io_tx_ubuf_callback_ext) { 63 + if (nd->uarg.ops != &io_ubuf_ops_ext) { 56 64 nd->account_pages = 0; 57 65 nd->zc_report = false; 58 66 nd->zc_used = false; 59 67 nd->zc_copied = false; 60 - nd->uarg.callback = io_tx_ubuf_callback_ext; 68 + nd->uarg.ops = &io_ubuf_ops_ext; 61 69 notif->io_task_work.func = io_notif_complete_tw_ext; 62 70 } 63 71 } ··· 88 80 89 81 nd = io_notif_to_data(notif); 90 82 nd->uarg.flags = IO_NOTIF_UBUF_FLAGS; 91 - nd->uarg.callback = io_tx_ubuf_callback; 83 + nd->uarg.ops = &io_ubuf_ops; 92 84 refcount_set(&nd->uarg.refcnt, 1); 93 85 return notif; 94 86 }
+24 -12
net/core/skbuff.c
··· 1652 1652 return NULL; 1653 1653 } 1654 1654 1655 - uarg->ubuf.callback = msg_zerocopy_callback; 1655 + uarg->ubuf.ops = &msg_zerocopy_ubuf_ops; 1656 1656 uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; 1657 1657 uarg->len = 1; 1658 1658 uarg->bytelen = size; ··· 1678 1678 u32 bytelen, next; 1679 1679 1680 1680 /* there might be non MSG_ZEROCOPY users */ 1681 - if (uarg->callback != msg_zerocopy_callback) 1681 + if (uarg->ops != &msg_zerocopy_ubuf_ops) 1682 1682 return NULL; 1683 1683 1684 1684 /* realloc only when socket is locked (TCP, UDP cork), ··· 1789 1789 sock_put(sk); 1790 1790 } 1791 1791 1792 - void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, 1793 - bool success) 1792 + static void msg_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg, 1793 + bool success) 1794 1794 { 1795 1795 struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg); 1796 1796 ··· 1799 1799 if (refcount_dec_and_test(&uarg->refcnt)) 1800 1800 __msg_zerocopy_callback(uarg_zc); 1801 1801 } 1802 - EXPORT_SYMBOL_GPL(msg_zerocopy_callback); 1803 1802 1804 1803 void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) 1805 1804 { ··· 1808 1809 uarg_to_msgzc(uarg)->len--; 1809 1810 1810 1811 if (have_uref) 1811 - msg_zerocopy_callback(NULL, uarg, true); 1812 + msg_zerocopy_complete(NULL, uarg, true); 1812 1813 } 1813 1814 EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort); 1815 + 1816 + const struct ubuf_info_ops msg_zerocopy_ubuf_ops = { 1817 + .complete = msg_zerocopy_complete, 1818 + }; 1819 + EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops); 1814 1820 1815 1821 int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, 1816 1822 struct msghdr *msg, int len, ··· 1824 1820 struct ubuf_info *orig_uarg = skb_zcopy(skb); 1825 1821 int err, orig_len = skb->len; 1826 1822 1827 - /* An skb can only point to one uarg. This edge case happens when 1828 - * TCP appends to an skb, but zerocopy_realloc triggered a new alloc. 
1829 - */ 1830 - if (orig_uarg && uarg != orig_uarg) 1831 - return -EEXIST; 1823 + if (uarg->ops->link_skb) { 1824 + err = uarg->ops->link_skb(skb, uarg); 1825 + if (err) 1826 + return err; 1827 + } else { 1828 + /* An skb can only point to one uarg. This edge case happens 1829 + * when TCP appends to an skb, but zerocopy_realloc triggered 1830 + * a new alloc. 1831 + */ 1832 + if (orig_uarg && uarg != orig_uarg) 1833 + return -EEXIST; 1834 + } 1832 1835 1833 1836 err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len); 1834 1837 if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { ··· 1849 1838 return err; 1850 1839 } 1851 1840 1852 - skb_zcopy_set(skb, uarg, NULL); 1841 + if (!uarg->ops->link_skb) 1842 + skb_zcopy_set(skb, uarg, NULL); 1853 1843 return skb->len - orig_len; 1854 1844 } 1855 1845 EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream);