Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

libceph: move away from global osd_req_flags

osd_req_flags is overly general and doesn't suit its only user
(read_from_replica option) well:

- applying osd_req_flags in account_request() affects all OSD
  requests, including linger (i.e. watch and notify). However,
  linger requests should always go to the primary even though
  some of them are reads (e.g. notify has side effects but it
  is a read because it doesn't result in mutation on the OSDs).

- calls to class methods that are reads are allowed to go to
  the replica, but most such calls issued for "rbd map" and/or
  exclusive lock transitions are requested to be resent to the
  primary via EAGAIN, doubling the latency.

Get rid of global osd_req_flags and set read_from_replica flag
only on specific OSD requests instead.

Fixes: 8ad44d5e0d1e ("libceph: read_from_replica option")
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>

+13 -16
+3 -1
drivers/block/rbd.c
··· 1451 1451 static void rbd_osd_format_read(struct ceph_osd_request *osd_req) 1452 1452 { 1453 1453 struct rbd_obj_request *obj_request = osd_req->r_priv; 1454 + struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev; 1455 + struct ceph_options *opt = rbd_dev->rbd_client->client->options; 1454 1456 1455 - osd_req->r_flags = CEPH_OSD_FLAG_READ; 1457 + osd_req->r_flags = CEPH_OSD_FLAG_READ | opt->read_from_replica; 1456 1458 osd_req->r_snapid = obj_request->img_request->snap_id; 1457 1459 } 1458 1460
+2 -2
include/linux/ceph/libceph.h
··· 52 52 unsigned long osd_idle_ttl; /* jiffies */ 53 53 unsigned long osd_keepalive_timeout; /* jiffies */ 54 54 unsigned long osd_request_timeout; /* jiffies */ 55 - 56 - u32 osd_req_flags; /* CEPH_OSD_FLAG_*, applied to each OSD request */ 55 + u32 read_from_replica; /* CEPH_OSD_FLAG_BALANCE/LOCALIZE_READS */ 57 56 58 57 /* 59 58 * any type that can't be simply compared or doesn't need ··· 75 76 #define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000) 76 77 #define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000) 77 78 #define CEPH_OSD_REQUEST_TIMEOUT_DEFAULT 0 /* no timeout */ 79 + #define CEPH_READ_FROM_REPLICA_DEFAULT 0 /* read from primary */ 78 80 79 81 #define CEPH_MONC_HUNT_INTERVAL msecs_to_jiffies(3 * 1000) 80 82 #define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000)
+6 -8
net/ceph/ceph_common.c
··· 332 332 opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; 333 333 opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; 334 334 opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT; 335 + opt->read_from_replica = CEPH_READ_FROM_REPLICA_DEFAULT; 335 336 return opt; 336 337 } 337 338 EXPORT_SYMBOL(ceph_alloc_options); ··· 491 490 case Opt_read_from_replica: 492 491 switch (result.uint_32) { 493 492 case Opt_read_from_replica_no: 494 - opt->osd_req_flags &= ~(CEPH_OSD_FLAG_BALANCE_READS | 495 - CEPH_OSD_FLAG_LOCALIZE_READS); 493 + opt->read_from_replica = 0; 496 494 break; 497 495 case Opt_read_from_replica_balance: 498 - opt->osd_req_flags |= CEPH_OSD_FLAG_BALANCE_READS; 499 - opt->osd_req_flags &= ~CEPH_OSD_FLAG_LOCALIZE_READS; 496 + opt->read_from_replica = CEPH_OSD_FLAG_BALANCE_READS; 500 497 break; 501 498 case Opt_read_from_replica_localize: 502 - opt->osd_req_flags |= CEPH_OSD_FLAG_LOCALIZE_READS; 503 - opt->osd_req_flags &= ~CEPH_OSD_FLAG_BALANCE_READS; 499 + opt->read_from_replica = CEPH_OSD_FLAG_LOCALIZE_READS; 504 500 break; 505 501 default: 506 502 BUG(); ··· 611 613 } 612 614 seq_putc(m, ','); 613 615 } 614 - if (opt->osd_req_flags & CEPH_OSD_FLAG_BALANCE_READS) { 616 + if (opt->read_from_replica == CEPH_OSD_FLAG_BALANCE_READS) { 615 617 seq_puts(m, "read_from_replica=balance,"); 616 - } else if (opt->osd_req_flags & CEPH_OSD_FLAG_LOCALIZE_READS) { 618 + } else if (opt->read_from_replica == CEPH_OSD_FLAG_LOCALIZE_READS) { 617 619 seq_puts(m, "read_from_replica=localize,"); 618 620 } 619 621
+2 -5
net/ceph/osd_client.c
··· 1117 1117 truncate_size, truncate_seq); 1118 1118 } 1119 1119 1120 - req->r_flags = flags; 1121 1120 req->r_base_oloc.pool = layout->pool_id; 1122 1121 req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns); 1123 1122 ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum); 1123 + req->r_flags = flags | osdc->client->options->read_from_replica; 1124 1124 1125 1125 req->r_snapid = vino.snap; 1126 1126 if (flags & CEPH_OSD_FLAG_WRITE) ··· 2431 2431 2432 2432 static void account_request(struct ceph_osd_request *req) 2433 2433 { 2434 - struct ceph_osd_client *osdc = req->r_osdc; 2435 - 2436 2434 WARN_ON(req->r_flags & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK)); 2437 2435 WARN_ON(!(req->r_flags & (CEPH_OSD_FLAG_READ | CEPH_OSD_FLAG_WRITE))); 2438 2436 2439 2437 req->r_flags |= CEPH_OSD_FLAG_ONDISK; 2440 - req->r_flags |= osdc->client->options->osd_req_flags; 2441 - atomic_inc(&osdc->num_requests); 2438 + atomic_inc(&req->r_osdc->num_requests); 2442 2439 2443 2440 req->r_start_stamp = jiffies; 2444 2441 req->r_start_latency = ktime_get();