Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

libceph: support for balanced and localized reads

OSD-side issues with reads from replica have been resolved in
Octopus. Reading from replica should be safe wrt. unstable or
uncommitted state now, so add support for balanced and localized
reads.

There are two cases in which a read from a replica can't be served:

- OSD may silently drop the request, expecting the client to
notice that the acting set has changed and resend via the usual
means (handled with t->used_replica)

- OSD may return EAGAIN, expecting the client to resend to the
primary, ignoring replica read flags (see handle_reply())

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>

+193 -6
+1
include/linux/ceph/osd_client.h
··· 165 165 bool recovery_deletes; 166 166 167 167 unsigned int flags; /* CEPH_OSD_FLAG_* */ 168 + bool used_replica; 168 169 bool paused; 169 170 170 171 u32 epoch;
+3
include/linux/ceph/osdmap.h
··· 317 317 int ceph_compare_crush_locs(struct rb_root *locs1, struct rb_root *locs2); 318 318 void ceph_clear_crush_locs(struct rb_root *locs); 319 319 320 + int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id, 321 + struct rb_root *locs); 322 + 320 323 extern struct ceph_pg_pool_info *ceph_pg_pool_by_id(struct ceph_osdmap *map, 321 324 u64 id); 322 325 extern const char *ceph_pg_pool_name_by_id(struct ceph_osdmap *map, u64 id);
+4 -2
net/ceph/debugfs.c
··· 81 81 u32 state = map->osd_state[i]; 82 82 char sb[64]; 83 83 84 - seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n", 84 + seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\t%2d\n", 85 85 i, ceph_pr_addr(addr), 86 86 ((map->osd_weight[i]*100) >> 16), 87 87 ceph_osdmap_state_str(sb, sizeof(sb), state), 88 - ((ceph_get_primary_affinity(map, i)*100) >> 16)); 88 + ((ceph_get_primary_affinity(map, i)*100) >> 16), 89 + ceph_get_crush_locality(map, i, 90 + &client->options->crush_locs)); 89 91 } 90 92 for (n = rb_first(&map->pg_temp); n; n = rb_next(n)) { 91 93 struct ceph_pg_mapping *pg =
+83 -4
net/ceph/osd_client.c
··· 1497 1497 (osdc->osdmap->epoch < osdc->epoch_barrier); 1498 1498 } 1499 1499 1500 + static int pick_random_replica(const struct ceph_osds *acting) 1501 + { 1502 + int i = prandom_u32() % acting->size; 1503 + 1504 + dout("%s picked osd%d, primary osd%d\n", __func__, 1505 + acting->osds[i], acting->primary); 1506 + return i; 1507 + } 1508 + 1509 + /* 1510 + * Picks the closest replica based on client's location given by 1511 + * crush_location option. Prefers the primary if the locality is 1512 + * the same. 1513 + */ 1514 + static int pick_closest_replica(struct ceph_osd_client *osdc, 1515 + const struct ceph_osds *acting) 1516 + { 1517 + struct ceph_options *opt = osdc->client->options; 1518 + int best_i, best_locality; 1519 + int i = 0, locality; 1520 + 1521 + do { 1522 + locality = ceph_get_crush_locality(osdc->osdmap, 1523 + acting->osds[i], 1524 + &opt->crush_locs); 1525 + if (i == 0 || 1526 + (locality >= 0 && best_locality < 0) || 1527 + (locality >= 0 && best_locality >= 0 && 1528 + locality < best_locality)) { 1529 + best_i = i; 1530 + best_locality = locality; 1531 + } 1532 + } while (++i < acting->size); 1533 + 1534 + dout("%s picked osd%d with locality %d, primary osd%d\n", __func__, 1535 + acting->osds[best_i], best_locality, acting->primary); 1536 + return best_i; 1537 + } 1538 + 1500 1539 enum calc_target_result { 1501 1540 CALC_TARGET_NO_ACTION = 0, 1502 1541 CALC_TARGET_NEED_RESEND, ··· 1549 1510 struct ceph_pg_pool_info *pi; 1550 1511 struct ceph_pg pgid, last_pgid; 1551 1512 struct ceph_osds up, acting; 1513 + bool is_read = t->flags & CEPH_OSD_FLAG_READ; 1514 + bool is_write = t->flags & CEPH_OSD_FLAG_WRITE; 1552 1515 bool force_resend = false; 1553 1516 bool unpaused = false; 1554 1517 bool legacy_change = false; ··· 1581 1540 ceph_oid_copy(&t->target_oid, &t->base_oid); 1582 1541 ceph_oloc_copy(&t->target_oloc, &t->base_oloc); 1583 1542 if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) { 1584 - if (t->flags & CEPH_OSD_FLAG_READ && 
pi->read_tier >= 0) 1543 + if (is_read && pi->read_tier >= 0) 1585 1544 t->target_oloc.pool = pi->read_tier; 1586 - if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0) 1545 + if (is_write && pi->write_tier >= 0) 1587 1546 t->target_oloc.pool = pi->write_tier; 1588 1547 1589 1548 pi = ceph_pg_pool_by_id(osdc->osdmap, t->target_oloc.pool); ··· 1622 1581 unpaused = true; 1623 1582 } 1624 1583 legacy_change = ceph_pg_compare(&t->pgid, &pgid) || 1625 - ceph_osds_changed(&t->acting, &acting, any_change); 1584 + ceph_osds_changed(&t->acting, &acting, 1585 + t->used_replica || any_change); 1626 1586 if (t->pg_num) 1627 1587 split = ceph_pg_is_split(&last_pgid, t->pg_num, pi->pg_num); 1628 1588 ··· 1639 1597 t->sort_bitwise = sort_bitwise; 1640 1598 t->recovery_deletes = recovery_deletes; 1641 1599 1642 - t->osd = acting.primary; 1600 + if ((t->flags & (CEPH_OSD_FLAG_BALANCE_READS | 1601 + CEPH_OSD_FLAG_LOCALIZE_READS)) && 1602 + !is_write && pi->type == CEPH_POOL_TYPE_REP && 1603 + acting.size > 1) { 1604 + int pos; 1605 + 1606 + WARN_ON(!is_read || acting.osds[0] != acting.primary); 1607 + if (t->flags & CEPH_OSD_FLAG_BALANCE_READS) { 1608 + pos = pick_random_replica(&acting); 1609 + } else { 1610 + pos = pick_closest_replica(osdc, &acting); 1611 + } 1612 + t->osd = acting.osds[pos]; 1613 + t->used_replica = pos > 0; 1614 + } else { 1615 + t->osd = acting.primary; 1616 + t->used_replica = false; 1617 + } 1643 1618 } 1644 1619 1645 1620 if (unpaused || legacy_change || force_resend || split) ··· 3714 3655 req->r_flags |= CEPH_OSD_FLAG_REDIRECTED | 3715 3656 CEPH_OSD_FLAG_IGNORE_OVERLAY | 3716 3657 CEPH_OSD_FLAG_IGNORE_CACHE; 3658 + req->r_tid = 0; 3659 + __submit_request(req, false); 3660 + goto out_unlock_osdc; 3661 + } 3662 + 3663 + if (m.result == -EAGAIN) { 3664 + dout("req %p tid %llu EAGAIN\n", req, req->r_tid); 3665 + unlink_request(osd, req); 3666 + mutex_unlock(&osd->lock); 3667 + 3668 + /* 3669 + * The object is missing on the replica or not (yet) 3670 + * 
readable. Clear pgid to force a resend to the primary 3671 + * via legacy_change. 3672 + */ 3673 + req->r_t.pgid.pool = 0; 3674 + req->r_t.pgid.seed = 0; 3675 + WARN_ON(!req->r_t.used_replica); 3676 + req->r_flags &= ~(CEPH_OSD_FLAG_BALANCE_READS | 3677 + CEPH_OSD_FLAG_LOCALIZE_READS); 3717 3678 req->r_tid = 0; 3718 3679 __submit_request(req, false); 3719 3680 goto out_unlock_osdc;
+102
net/ceph/osdmap.c
··· 2831 2831 free_crush_loc(loc); 2832 2832 } 2833 2833 } 2834 + 2835 + /* 2836 + * [a-zA-Z0-9-_.]+ 2837 + */ 2838 + static bool is_valid_crush_name(const char *name) 2839 + { 2840 + do { 2841 + if (!('a' <= *name && *name <= 'z') && 2842 + !('A' <= *name && *name <= 'Z') && 2843 + !('0' <= *name && *name <= '9') && 2844 + *name != '-' && *name != '_' && *name != '.') 2845 + return false; 2846 + } while (*++name != '\0'); 2847 + 2848 + return true; 2849 + } 2850 + 2851 + /* 2852 + * Gets the parent of an item. Returns its id (<0 because the 2853 + * parent is always a bucket), type id (>0 for the same reason, 2854 + * via @parent_type_id) and location (via @parent_loc). If no 2855 + * parent, returns 0. 2856 + * 2857 + * Does a linear search, as there are no parent pointers of any 2858 + * kind. Note that the result is ambiguous for items that occur 2859 + * multiple times in the map. 2860 + */ 2861 + static int get_immediate_parent(struct crush_map *c, int id, 2862 + u16 *parent_type_id, 2863 + struct crush_loc *parent_loc) 2864 + { 2865 + struct crush_bucket *b; 2866 + struct crush_name_node *type_cn, *cn; 2867 + int i, j; 2868 + 2869 + for (i = 0; i < c->max_buckets; i++) { 2870 + b = c->buckets[i]; 2871 + if (!b) 2872 + continue; 2873 + 2874 + /* ignore per-class shadow hierarchy */ 2875 + cn = lookup_crush_name(&c->names, b->id); 2876 + if (!cn || !is_valid_crush_name(cn->cn_name)) 2877 + continue; 2878 + 2879 + for (j = 0; j < b->size; j++) { 2880 + if (b->items[j] != id) 2881 + continue; 2882 + 2883 + *parent_type_id = b->type; 2884 + type_cn = lookup_crush_name(&c->type_names, b->type); 2885 + parent_loc->cl_type_name = type_cn->cn_name; 2886 + parent_loc->cl_name = cn->cn_name; 2887 + return b->id; 2888 + } 2889 + } 2890 + 2891 + return 0; /* no parent */ 2892 + } 2893 + 2894 + /* 2895 + * Calculates the locality/distance from an item to a client 2896 + * location expressed in terms of CRUSH hierarchy as a set of 2897 + * (bucket type name, bucket name) 
pairs. Specifically, looks 2898 + * for the lowest-valued bucket type for which the location of 2899 + * @id matches one of the locations in @locs, so for standard 2900 + * bucket types (host = 1, rack = 3, datacenter = 8, zone = 9) 2901 + * a matching host is closer than a matching rack and a matching 2902 + * data center is closer than a matching zone. 2903 + * 2904 + * Specifying multiple locations (a "multipath" location) such 2905 + * as "rack=foo1 rack=foo2 datacenter=bar" is allowed -- @locs 2906 + * is a multimap. The locality will be: 2907 + * 2908 + * - 3 for OSDs in racks foo1 and foo2 2909 + * - 8 for OSDs in data center bar 2910 + * - -1 for all other OSDs 2911 + * 2912 + * The lowest possible bucket type is 1, so the best locality 2913 + * for an OSD is 1 (i.e. a matching host). Locality 0 would be 2914 + * the OSD itself. 2915 + */ 2916 + int ceph_get_crush_locality(struct ceph_osdmap *osdmap, int id, 2917 + struct rb_root *locs) 2918 + { 2919 + struct crush_loc loc; 2920 + u16 type_id; 2921 + 2922 + /* 2923 + * Instead of repeated get_immediate_parent() calls, 2924 + * the location of @id could be obtained with a single 2925 + * depth-first traversal. 2926 + */ 2927 + for (;;) { 2928 + id = get_immediate_parent(osdmap->crush, id, &type_id, &loc); 2929 + if (id >= 0) 2930 + return -1; /* not local */ 2931 + 2932 + if (lookup_crush_loc(locs, &loc)) 2933 + return type_id; 2934 + } 2935 + }