Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ceph: Use sendmsg(MSG_SPLICE_PAGES) rather than sendpage()

Use sendmsg() and MSG_SPLICE_PAGES rather than sendpage() in ceph when
transmitting data. For the moment, this can only transmit one page at a
time because of the architecture of net/ceph/, but if
write_partial_message_data() can be given a bvec[] at a time by the
iteration code, this would allow pages to be sent in a batch.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Ilya Dryomov <idryomov@gmail.com>
cc: Xiubo Li <xiubli@redhat.com>
cc: Jeff Layton <jlayton@kernel.org>
cc: Jens Axboe <axboe@kernel.dk>
cc: Matthew Wilcox <willy@infradead.org>
Link: https://lore.kernel.org/r/20230623225513.2732256-5-dhowells@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>

Authored by David Howells and committed by Jakub Kicinski
fa094cca 40a8c17a

+19 -72
+19 -72
net/ceph/messenger_v2.c
··· 117 117 return ret; 118 118 } 119 119 120 - static int do_sendmsg(struct socket *sock, struct iov_iter *it) 121 - { 122 - struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS }; 123 - int ret; 124 - 125 - msg.msg_iter = *it; 126 - while (iov_iter_count(it)) { 127 - ret = sock_sendmsg(sock, &msg); 128 - if (ret <= 0) { 129 - if (ret == -EAGAIN) 130 - ret = 0; 131 - return ret; 132 - } 133 - 134 - iov_iter_advance(it, ret); 135 - } 136 - 137 - WARN_ON(msg_data_left(&msg)); 138 - return 1; 139 - } 140 - 141 - static int do_try_sendpage(struct socket *sock, struct iov_iter *it) 142 - { 143 - struct msghdr msg = { .msg_flags = CEPH_MSG_FLAGS }; 144 - struct bio_vec bv; 145 - int ret; 146 - 147 - if (WARN_ON(!iov_iter_is_bvec(it))) 148 - return -EINVAL; 149 - 150 - while (iov_iter_count(it)) { 151 - /* iov_iter_iovec() for ITER_BVEC */ 152 - bvec_set_page(&bv, it->bvec->bv_page, 153 - min(iov_iter_count(it), 154 - it->bvec->bv_len - it->iov_offset), 155 - it->bvec->bv_offset + it->iov_offset); 156 - 157 - /* 158 - * sendpage cannot properly handle pages with 159 - * page_count == 0, we need to fall back to sendmsg if 160 - * that's the case. 161 - * 162 - * Same goes for slab pages: skb_can_coalesce() allows 163 - * coalescing neighboring slab objects into a single frag 164 - * which triggers one of hardened usercopy checks. 165 - */ 166 - if (sendpage_ok(bv.bv_page)) { 167 - ret = sock->ops->sendpage(sock, bv.bv_page, 168 - bv.bv_offset, bv.bv_len, 169 - CEPH_MSG_FLAGS); 170 - } else { 171 - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bv, 1, bv.bv_len); 172 - ret = sock_sendmsg(sock, &msg); 173 - } 174 - if (ret <= 0) { 175 - if (ret == -EAGAIN) 176 - ret = 0; 177 - return ret; 178 - } 179 - 180 - iov_iter_advance(it, ret); 181 - } 182 - 183 - return 1; 184 - } 185 - 186 120 /* 187 121 * Write as much as possible. The socket is expected to be corked, 188 - * so we don't bother with MSG_MORE/MSG_SENDPAGE_NOTLAST here. 122 + * so we don't bother with MSG_MORE here. 
189 123 * 190 124 * Return: 191 - * 1 - done, nothing (else) to write 125 + * >0 - done, nothing (else) to write 192 126 * 0 - socket is full, need to wait 193 127 * <0 - error 194 128 */ 195 129 static int ceph_tcp_send(struct ceph_connection *con) 196 130 { 131 + struct msghdr msg = { 132 + .msg_iter = con->v2.out_iter, 133 + .msg_flags = CEPH_MSG_FLAGS, 134 + }; 197 135 int ret; 136 + 137 + if (WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter))) 138 + return -EINVAL; 139 + 140 + if (con->v2.out_iter_sendpage) 141 + msg.msg_flags |= MSG_SPLICE_PAGES; 198 142 199 143 dout("%s con %p have %zu try_sendpage %d\n", __func__, con, 200 144 iov_iter_count(&con->v2.out_iter), con->v2.out_iter_sendpage); 201 - if (con->v2.out_iter_sendpage) 202 - ret = do_try_sendpage(con->sock, &con->v2.out_iter); 203 - else 204 - ret = do_sendmsg(con->sock, &con->v2.out_iter); 145 + 146 + ret = sock_sendmsg(con->sock, &msg); 147 + if (ret > 0) 148 + iov_iter_advance(&con->v2.out_iter, ret); 149 + else if (ret == -EAGAIN) 150 + ret = 0; 151 + 205 152 dout("%s con %p ret %d left %zu\n", __func__, con, ret, 206 153 iov_iter_count(&con->v2.out_iter)); 207 154 return ret;