Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

libceph: just wait for more data to be available on the socket

A short read may occur while reading the message footer from the
socket. Later, when the socket is ready for another read, the
messenger invokes all read_partial_*() handlers, including
read_partial_sparse_msg_data(). The expectation is that
read_partial_sparse_msg_data() would bail, allowing the messenger to
invoke read_partial() for the footer and pick up where it left off.

However, read_partial_sparse_msg_data() violates that and ends up
calling into the state machine in the OSD client. The sparse-read
state machine assumes that it's a new op and interprets some piece of
the footer as the sparse-read header and returns bogus extents/data
length, etc.

To determine whether read_partial_sparse_msg_data() should bail, let's
reuse cursor->total_resid. Once it reaches zero, all the extents and
data have been successfully received in the last read; otherwise, the
function may have broken out while partially reading one of the
extents or its data, and osd_sparse_read() can then continue where it
left off.

[ idryomov: changelog ]

Link: https://tracker.ceph.com/issues/63586
Fixes: d396f89db39a ("libceph: add sparse read support to msgr1")
Signed-off-by: Xiubo Li <xiubli@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>

authored by

Xiubo Li and committed by
Ilya Dryomov
8e46a2d0 ee97302f

+19 -21
+1 -1
include/linux/ceph/messenger.h
··· 283 283 struct kref kref; 284 284 bool more_to_follow; 285 285 bool needs_out_seq; 286 - bool sparse_read; 286 + u64 sparse_read_total; 287 287 int front_alloc_len; 288 288 289 289 struct ceph_msgpool *pool;
+13 -12
net/ceph/messenger_v1.c
··· 160 160 static void prepare_message_data(struct ceph_msg *msg, u32 data_len) 161 161 { 162 162 /* Initialize data cursor if it's not a sparse read */ 163 - if (!msg->sparse_read) 164 - ceph_msg_data_cursor_init(&msg->cursor, msg, data_len); 163 + u64 len = msg->sparse_read_total ? : data_len; 164 + 165 + ceph_msg_data_cursor_init(&msg->cursor, msg, len); 165 166 } 166 167 167 168 /* ··· 1037 1036 if (do_datacrc) 1038 1037 crc = con->in_data_crc; 1039 1038 1040 - do { 1039 + while (cursor->total_resid) { 1041 1040 if (con->v1.in_sr_kvec.iov_base) 1042 1041 ret = read_partial_message_chunk(con, 1043 1042 &con->v1.in_sr_kvec, ··· 1045 1044 &crc); 1046 1045 else if (cursor->sr_resid > 0) 1047 1046 ret = read_partial_sparse_msg_extent(con, &crc); 1048 - 1049 - if (ret <= 0) { 1050 - if (do_datacrc) 1051 - con->in_data_crc = crc; 1052 - return ret; 1053 - } 1047 + if (ret <= 0) 1048 + break; 1054 1049 1055 1050 memset(&con->v1.in_sr_kvec, 0, sizeof(con->v1.in_sr_kvec)); 1056 1051 ret = con->ops->sparse_read(con, cursor, 1057 1052 (char **)&con->v1.in_sr_kvec.iov_base); 1053 + if (ret <= 0) { 1054 + ret = ret ? ret : 1; /* must return > 0 to indicate success */ 1055 + break; 1056 + } 1058 1057 con->v1.in_sr_len = ret; 1059 - } while (ret > 0); 1058 + } 1060 1059 1061 1060 if (do_datacrc) 1062 1061 con->in_data_crc = crc; 1063 1062 1064 - return ret < 0 ? ret : 1; /* must return > 0 to indicate success */ 1063 + return ret; 1065 1064 } 1066 1065 1067 1066 static int read_partial_msg_data(struct ceph_connection *con) ··· 1254 1253 if (!m->num_data_items) 1255 1254 return -EIO; 1256 1255 1257 - if (m->sparse_read) 1256 + if (m->sparse_read_total) 1258 1257 ret = read_partial_sparse_msg_data(con); 1259 1258 else if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) 1260 1259 ret = read_partial_msg_data_bounce(con);
+2 -2
net/ceph/messenger_v2.c
··· 1128 1128 struct sg_table enc_sgt = {}; 1129 1129 struct sg_table sgt = {}; 1130 1130 struct page **pages = NULL; 1131 - bool sparse = con->in_msg->sparse_read; 1131 + bool sparse = !!con->in_msg->sparse_read_total; 1132 1132 int dpos = 0; 1133 1133 int tail_len; 1134 1134 int ret; ··· 2060 2060 } 2061 2061 2062 2062 if (data_len(msg)) { 2063 - if (msg->sparse_read) 2063 + if (msg->sparse_read_total) 2064 2064 con->v2.in_state = IN_S_PREPARE_SPARSE_DATA; 2065 2065 else 2066 2066 con->v2.in_state = IN_S_PREPARE_READ_DATA;
+3 -6
net/ceph/osd_client.c
··· 5510 5510 } 5511 5511 5512 5512 m = ceph_msg_get(req->r_reply); 5513 - m->sparse_read = (bool)srlen; 5513 + m->sparse_read_total = srlen; 5514 5514 5515 5515 dout("get_reply tid %lld %p\n", tid, m); 5516 5516 ··· 5777 5777 } 5778 5778 5779 5779 if (o->o_sparse_op_idx < 0) { 5780 - u64 srlen = sparse_data_requested(req); 5781 - 5782 - dout("%s: [%d] starting new sparse read req. srlen=0x%llx\n", 5783 - __func__, o->o_osd, srlen); 5784 - ceph_msg_data_cursor_init(cursor, con->in_msg, srlen); 5780 + dout("%s: [%d] starting new sparse read req\n", 5781 + __func__, o->o_osd); 5785 5782 } else { 5786 5783 u64 end; 5787 5784