Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'ceph-for-5.6-rc1' of https://github.com/ceph/ceph-client

Pull ceph fixes from Ilya Dryomov:

- a set of patches that fixes various corner cases in mount and umount
code (Xiubo Li). This has to do with choosing an MDS, distinguishing
between laggy and down MDSes and parsing the server path.

- inode initialization fixes (Jeff Layton). The one included here
mostly concerns things like open_by_handle(), and there is another one
that will come through Al.

- copy_file_range() now uses the new copy-from2 op (Luis Henriques).
The existing copy-from op turned out to be infeasible for generic
filesystem use; we disable the copy offload if OSDs don't support
copy-from2.

- a patch to link "rbd" and "block" devices together in sysfs (Hannes
Reinecke)

... and a smattering of cleanups from Xiubo, Jeff and Chengguang.

* tag 'ceph-for-5.6-rc1' of https://github.com/ceph/ceph-client: (25 commits)
rbd: set the 'device' link in sysfs
ceph: move net/ceph/ceph_fs.c to fs/ceph/util.c
ceph: print name of xattr in __ceph_{get,set}xattr() douts
ceph: print r_direct_hash in hex in __choose_mds() dout
ceph: use copy-from2 op in copy_file_range
ceph: close holes in structs ceph_mds_session and ceph_mds_request
rbd: work around -Wuninitialized warning
ceph: allocate the correct amount of extra bytes for the session features
ceph: rename get_session and switch to use ceph_get_mds_session
ceph: remove the extra slashes in the server path
ceph: add possible_max_rank and make the code more readable
ceph: print dentry offset in hex and fix xattr_version type
ceph: only touch the caps which have the subset mask requested
ceph: don't clear I_NEW until inode metadata is fully populated
ceph: retry the same mds later after the new session is opened
ceph: check availability of mds cluster on mount after wait timeout
ceph: keep the session state until it is released
ceph: add __send_request helper
ceph: ensure we have a new cap before continuing in fill_inode
ceph: drop unused ttl_from parameter from fill_inode
...

+360 -193
+2 -2
drivers/block/rbd.c
··· 2662 2662 u64 off, u64 len) 2663 2663 { 2664 2664 struct ceph_file_extent ex = { off, len }; 2665 - union rbd_img_fill_iter dummy; 2665 + union rbd_img_fill_iter dummy = {}; 2666 2666 struct rbd_img_fill_ctx fctx = { 2667 2667 .pos_type = OBJ_REQUEST_NODATA, 2668 2668 .pos = &dummy, ··· 7143 7143 if (rc) 7144 7144 goto err_out_image_lock; 7145 7145 7146 - add_disk(rbd_dev->disk); 7146 + device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL); 7147 7147 /* see rbd_init_disk() */ 7148 7148 blk_put_queue(rbd_dev->disk->queue); 7149 7149
+1 -1
fs/ceph/Makefile
··· 8 8 ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ 9 9 export.o caps.o snap.o xattr.o quota.o io.o \ 10 10 mds_client.o mdsmap.o strings.o ceph_frag.o \ 11 - debugfs.o 11 + debugfs.o util.o 12 12 13 13 ceph-$(CONFIG_CEPH_FSCACHE) += cache.o 14 14 ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o
+2 -2
fs/ceph/acl.c
··· 222 222 err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8); 223 223 if (err) 224 224 goto out_err; 225 - err = ceph_pagelist_encode_string(pagelist, 226 - XATTR_NAME_POSIX_ACL_DEFAULT, len); 225 + ceph_pagelist_encode_string(pagelist, 226 + XATTR_NAME_POSIX_ACL_DEFAULT, len); 227 227 err = posix_acl_to_xattr(&init_user_ns, default_acl, 228 228 tmp_buf, val_size2); 229 229 if (err < 0)
+2 -1
fs/ceph/caps.c
··· 908 908 ci_node); 909 909 if (!__cap_is_valid(cap)) 910 910 continue; 911 - __touch_cap(cap); 911 + if (cap->issued & mask) 912 + __touch_cap(cap); 912 913 } 913 914 } 914 915 return 1;
+1 -1
fs/ceph/debugfs.c
··· 33 33 seq_printf(s, "max_mds %d\n", mdsmap->m_max_mds); 34 34 seq_printf(s, "session_timeout %d\n", mdsmap->m_session_timeout); 35 35 seq_printf(s, "session_autoclose %d\n", mdsmap->m_session_autoclose); 36 - for (i = 0; i < mdsmap->m_num_mds; i++) { 36 + for (i = 0; i < mdsmap->possible_max_rank; i++) { 37 37 struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr; 38 38 int state = mdsmap->m_info[i].state; 39 39 seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
+2 -2
fs/ceph/dir.c
··· 1186 1186 struct dentry *dn = di->dentry; 1187 1187 struct ceph_mds_client *mdsc; 1188 1188 1189 - dout("dentry_dir_lease_touch %p %p '%pd' (offset %lld)\n", 1189 + dout("dentry_dir_lease_touch %p %p '%pd' (offset 0x%llx)\n", 1190 1190 di, dn, dn, di->offset); 1191 1191 1192 1192 if (!list_empty(&di->lease_list)) { ··· 1567 1567 inode = d_inode(dentry); 1568 1568 } 1569 1569 1570 - dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry, 1570 + dout("d_revalidate %p '%pd' inode %p offset 0x%llx\n", dentry, 1571 1571 dentry, inode, ceph_dentry(dentry)->offset); 1572 1572 1573 1573 /* always trust cached snapped dentries, snapdir dentry */
+10 -1
fs/ceph/file.c
··· 1974 1974 if (ceph_test_mount_opt(src_fsc, NOCOPYFROM)) 1975 1975 return -EOPNOTSUPP; 1976 1976 1977 + if (!src_fsc->have_copy_from2) 1978 + return -EOPNOTSUPP; 1979 + 1977 1980 /* 1978 1981 * Striped file layouts require that we copy partial objects, but the 1979 1982 * OSD copy-from operation only supports full-object copies. Limit ··· 2104 2101 CEPH_OSD_OP_FLAG_FADVISE_NOCACHE, 2105 2102 &dst_oid, &dst_oloc, 2106 2103 CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | 2107 - CEPH_OSD_OP_FLAG_FADVISE_DONTNEED, 0); 2104 + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED, 2105 + dst_ci->i_truncate_seq, dst_ci->i_truncate_size, 2106 + CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ); 2108 2107 if (err) { 2108 + if (err == -EOPNOTSUPP) { 2109 + src_fsc->have_copy_from2 = false; 2110 + pr_notice("OSDs don't support copy-from2; disabling copy offload\n"); 2111 + } 2109 2112 dout("ceph_osdc_copy_from returned %d\n", err); 2110 2113 if (!ret) 2111 2114 ret = err;
+33 -14
fs/ceph/inode.c
··· 55 55 inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino); 56 56 if (!inode) 57 57 return ERR_PTR(-ENOMEM); 58 - if (inode->i_state & I_NEW) { 58 + if (inode->i_state & I_NEW) 59 59 dout("get_inode created new inode %p %llx.%llx ino %llx\n", 60 60 inode, ceph_vinop(inode), (u64)inode->i_ino); 61 - unlock_new_inode(inode); 62 - } 63 61 64 62 dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino, 65 63 vino.snap, inode); ··· 86 88 inode->i_fop = &ceph_snapdir_fops; 87 89 ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ 88 90 ci->i_rbytes = 0; 91 + 92 + if (inode->i_state & I_NEW) 93 + unlock_new_inode(inode); 94 + 89 95 return inode; 90 96 } 91 97 ··· 730 728 static int fill_inode(struct inode *inode, struct page *locked_page, 731 729 struct ceph_mds_reply_info_in *iinfo, 732 730 struct ceph_mds_reply_dirfrag *dirinfo, 733 - struct ceph_mds_session *session, 734 - unsigned long ttl_from, int cap_fmode, 731 + struct ceph_mds_session *session, int cap_fmode, 735 732 struct ceph_cap_reservation *caps_reservation) 736 733 { 737 734 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; ··· 755 754 info_caps = le32_to_cpu(info->cap.caps); 756 755 757 756 /* prealloc new cap struct */ 758 - if (info_caps && ceph_snap(inode) == CEPH_NOSNAP) 757 + if (info_caps && ceph_snap(inode) == CEPH_NOSNAP) { 759 758 new_cap = ceph_get_cap(mdsc, caps_reservation); 759 + if (!new_cap) 760 + return -ENOMEM; 761 + } 760 762 761 763 /* 762 764 * prealloc xattr data, if it looks like we'll need it. 
only ··· 1241 1237 if (dir) { 1242 1238 err = fill_inode(dir, NULL, 1243 1239 &rinfo->diri, rinfo->dirfrag, 1244 - session, req->r_request_started, -1, 1240 + session, -1, 1245 1241 &req->r_caps_reservation); 1246 1242 if (err < 0) 1247 1243 goto done; ··· 1306 1302 err = PTR_ERR(in); 1307 1303 goto done; 1308 1304 } 1309 - req->r_target_inode = in; 1310 1305 1311 1306 err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL, 1312 - session, req->r_request_started, 1307 + session, 1313 1308 (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) && 1314 - rinfo->head->result == 0) ? req->r_fmode : -1, 1309 + rinfo->head->result == 0) ? req->r_fmode : -1, 1315 1310 &req->r_caps_reservation); 1316 1311 if (err < 0) { 1317 1312 pr_err("fill_inode badness %p %llx.%llx\n", 1318 1313 in, ceph_vinop(in)); 1314 + if (in->i_state & I_NEW) 1315 + discard_new_inode(in); 1319 1316 goto done; 1320 1317 } 1318 + req->r_target_inode = in; 1319 + if (in->i_state & I_NEW) 1320 + unlock_new_inode(in); 1321 1321 } 1322 1322 1323 1323 /* ··· 1501 1493 continue; 1502 1494 } 1503 1495 rc = fill_inode(in, NULL, &rde->inode, NULL, session, 1504 - req->r_request_started, -1, 1505 - &req->r_caps_reservation); 1496 + -1, &req->r_caps_reservation); 1506 1497 if (rc < 0) { 1507 1498 pr_err("fill_inode badness on %p got %d\n", in, rc); 1508 1499 err = rc; 1500 + if (in->i_state & I_NEW) { 1501 + ihold(in); 1502 + discard_new_inode(in); 1503 + } 1504 + } else if (in->i_state & I_NEW) { 1505 + unlock_new_inode(in); 1509 1506 } 1507 + 1510 1508 /* avoid calling iput_final() in mds dispatch threads */ 1511 1509 ceph_async_iput(in); 1512 1510 } ··· 1708 1694 } 1709 1695 1710 1696 ret = fill_inode(in, NULL, &rde->inode, NULL, session, 1711 - req->r_request_started, -1, 1712 - &req->r_caps_reservation); 1697 + -1, &req->r_caps_reservation); 1713 1698 if (ret < 0) { 1714 1699 pr_err("fill_inode badness on %p\n", in); 1715 1700 if (d_really_is_negative(dn)) { 1716 1701 /* avoid calling iput_final() in 
mds 1717 1702 * dispatch threads */ 1703 + if (in->i_state & I_NEW) { 1704 + ihold(in); 1705 + discard_new_inode(in); 1706 + } 1718 1707 ceph_async_iput(in); 1719 1708 } 1720 1709 d_drop(dn); 1721 1710 err = ret; 1722 1711 goto next_item; 1723 1712 } 1713 + if (in->i_state & I_NEW) 1714 + unlock_new_inode(in); 1724 1715 1725 1716 if (d_really_is_negative(dn)) { 1726 1717 if (ceph_security_xattr_deadlock(in)) {
+97 -74
fs/ceph/mds_client.c
··· 9 9 #include <linux/debugfs.h> 10 10 #include <linux/seq_file.h> 11 11 #include <linux/ratelimit.h> 12 + #include <linux/bits.h> 12 13 13 14 #include "super.h" 14 15 #include "mds_client.h" ··· 531 530 case CEPH_MDS_SESSION_OPEN: return "open"; 532 531 case CEPH_MDS_SESSION_HUNG: return "hung"; 533 532 case CEPH_MDS_SESSION_CLOSING: return "closing"; 533 + case CEPH_MDS_SESSION_CLOSED: return "closed"; 534 534 case CEPH_MDS_SESSION_RESTARTING: return "restarting"; 535 535 case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting"; 536 536 case CEPH_MDS_SESSION_REJECTED: return "rejected"; ··· 539 537 } 540 538 } 541 539 542 - static struct ceph_mds_session *get_session(struct ceph_mds_session *s) 540 + struct ceph_mds_session *ceph_get_mds_session(struct ceph_mds_session *s) 543 541 { 544 542 if (refcount_inc_not_zero(&s->s_ref)) { 545 543 dout("mdsc get_session %p %d -> %d\n", s, ··· 570 568 { 571 569 if (mds >= mdsc->max_sessions || !mdsc->sessions[mds]) 572 570 return NULL; 573 - return get_session(mdsc->sessions[mds]); 571 + return ceph_get_mds_session(mdsc->sessions[mds]); 574 572 } 575 573 576 574 static bool __have_session(struct ceph_mds_client *mdsc, int mds) ··· 599 597 { 600 598 struct ceph_mds_session *s; 601 599 602 - if (mds >= mdsc->mdsmap->m_num_mds) 600 + if (mds >= mdsc->mdsmap->possible_max_rank) 603 601 return ERR_PTR(-EINVAL); 604 602 605 603 s = kzalloc(sizeof(*s), GFP_NOFS); ··· 676 674 dout("__unregister_session mds%d %p\n", s->s_mds, s); 677 675 BUG_ON(mdsc->sessions[s->s_mds] != s); 678 676 mdsc->sessions[s->s_mds] = NULL; 679 - s->s_state = 0; 680 677 ceph_con_close(&s->s_con); 681 678 ceph_put_mds_session(s); 682 679 atomic_dec(&mdsc->num_sessions); ··· 879 878 * Called under mdsc->mutex. 
880 879 */ 881 880 static int __choose_mds(struct ceph_mds_client *mdsc, 882 - struct ceph_mds_request *req) 881 + struct ceph_mds_request *req, 882 + bool *random) 883 883 { 884 884 struct inode *inode; 885 885 struct ceph_inode_info *ci; ··· 890 888 u32 hash = req->r_direct_hash; 891 889 bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); 892 890 891 + if (random) 892 + *random = false; 893 + 893 894 /* 894 895 * is there a specific mds we should try? ignore hint if we have 895 896 * no session and the mds is not up (active or recovering). ··· 900 895 if (req->r_resend_mds >= 0 && 901 896 (__have_session(mdsc, req->r_resend_mds) || 902 897 ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) { 903 - dout("choose_mds using resend_mds mds%d\n", 898 + dout("%s using resend_mds mds%d\n", __func__, 904 899 req->r_resend_mds); 905 900 return req->r_resend_mds; 906 901 } ··· 918 913 rcu_read_lock(); 919 914 inode = get_nonsnap_parent(req->r_dentry); 920 915 rcu_read_unlock(); 921 - dout("__choose_mds using snapdir's parent %p\n", inode); 916 + dout("%s using snapdir's parent %p\n", __func__, inode); 922 917 } 923 918 } else if (req->r_dentry) { 924 919 /* ignore race with rename; old or new d_parent is okay */ ··· 938 933 /* direct snapped/virtual snapdir requests 939 934 * based on parent dir inode */ 940 935 inode = get_nonsnap_parent(parent); 941 - dout("__choose_mds using nonsnap parent %p\n", inode); 936 + dout("%s using nonsnap parent %p\n", __func__, inode); 942 937 } else { 943 938 /* dentry target */ 944 939 inode = d_inode(req->r_dentry); ··· 954 949 rcu_read_unlock(); 955 950 } 956 951 957 - dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash, 958 - (int)hash, mode); 952 + dout("%s %p is_hash=%d (0x%x) mode %d\n", __func__, inode, (int)is_hash, 953 + hash, mode); 959 954 if (!inode) 960 955 goto random; 961 956 ci = ceph_inode(inode); ··· 973 968 get_random_bytes(&r, 1); 974 969 r %= frag.ndist; 975 970 mds = 
frag.dist[r]; 976 - dout("choose_mds %p %llx.%llx " 977 - "frag %u mds%d (%d/%d)\n", 978 - inode, ceph_vinop(inode), 979 - frag.frag, mds, 980 - (int)r, frag.ndist); 971 + dout("%s %p %llx.%llx frag %u mds%d (%d/%d)\n", 972 + __func__, inode, ceph_vinop(inode), 973 + frag.frag, mds, (int)r, frag.ndist); 981 974 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= 982 - CEPH_MDS_STATE_ACTIVE) 975 + CEPH_MDS_STATE_ACTIVE && 976 + !ceph_mdsmap_is_laggy(mdsc->mdsmap, mds)) 983 977 goto out; 984 978 } 985 979 986 980 /* since this file/dir wasn't known to be 987 981 * replicated, then we want to look for the 988 982 * authoritative mds. */ 989 - mode = USE_AUTH_MDS; 990 983 if (frag.mds >= 0) { 991 984 /* choose auth mds */ 992 985 mds = frag.mds; 993 - dout("choose_mds %p %llx.%llx " 994 - "frag %u mds%d (auth)\n", 995 - inode, ceph_vinop(inode), frag.frag, mds); 986 + dout("%s %p %llx.%llx frag %u mds%d (auth)\n", 987 + __func__, inode, ceph_vinop(inode), 988 + frag.frag, mds); 996 989 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >= 997 - CEPH_MDS_STATE_ACTIVE) 998 - goto out; 990 + CEPH_MDS_STATE_ACTIVE) { 991 + if (mode == USE_ANY_MDS && 992 + !ceph_mdsmap_is_laggy(mdsc->mdsmap, 993 + mds)) 994 + goto out; 995 + } 999 996 } 997 + mode = USE_AUTH_MDS; 1000 998 } 1001 999 } 1002 1000 ··· 1015 1007 goto random; 1016 1008 } 1017 1009 mds = cap->session->s_mds; 1018 - dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n", 1010 + dout("%s %p %llx.%llx mds%d (%scap %p)\n", __func__, 1019 1011 inode, ceph_vinop(inode), mds, 1020 1012 cap == ci->i_auth_cap ? 
"auth " : "", cap); 1021 1013 spin_unlock(&ci->i_ceph_lock); ··· 1026 1018 return mds; 1027 1019 1028 1020 random: 1021 + if (random) 1022 + *random = true; 1023 + 1029 1024 mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap); 1030 - dout("choose_mds chose random mds%d\n", mds); 1025 + dout("%s chose random mds%d\n", __func__, mds); 1031 1026 return mds; 1032 1027 } 1033 1028 ··· 1056 1045 return msg; 1057 1046 } 1058 1047 1048 + static const unsigned char feature_bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED; 1049 + #define FEATURE_BYTES(c) (DIV_ROUND_UP((size_t)feature_bits[c - 1] + 1, 64) * 8) 1059 1050 static void encode_supported_features(void **p, void *end) 1060 1051 { 1061 - static const unsigned char bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED; 1062 - static const size_t count = ARRAY_SIZE(bits); 1052 + static const size_t count = ARRAY_SIZE(feature_bits); 1063 1053 1064 1054 if (count > 0) { 1065 1055 size_t i; 1066 - size_t size = ((size_t)bits[count - 1] + 64) / 64 * 8; 1056 + size_t size = FEATURE_BYTES(count); 1067 1057 1068 1058 BUG_ON(*p + 4 + size > end); 1069 1059 ceph_encode_32(p, size); 1070 1060 memset(*p, 0, size); 1071 1061 for (i = 0; i < count; i++) 1072 - ((unsigned char*)(*p))[i / 8] |= 1 << (bits[i] % 8); 1062 + ((unsigned char*)(*p))[i / 8] |= BIT(feature_bits[i] % 8); 1073 1063 *p += size; 1074 1064 } else { 1075 1065 BUG_ON(*p + 4 > end); ··· 1091 1079 int metadata_key_count = 0; 1092 1080 struct ceph_options *opt = mdsc->fsc->client->options; 1093 1081 struct ceph_mount_options *fsopt = mdsc->fsc->mount_options; 1082 + size_t size, count; 1094 1083 void *p, *end; 1095 1084 1096 1085 const char* metadata[][2] = { ··· 1109 1096 strlen(metadata[i][1]); 1110 1097 metadata_key_count++; 1111 1098 } 1099 + 1112 1100 /* supported feature */ 1113 - extra_bytes += 4 + 8; 1101 + size = 0; 1102 + count = ARRAY_SIZE(feature_bits); 1103 + if (count > 0) 1104 + size = FEATURE_BYTES(count); 1105 + extra_bytes += 4 + size; 1114 1106 1115 1107 /* Allocate the 
message */ 1116 1108 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes, ··· 1135 1117 * Serialize client metadata into waiting buffer space, using 1136 1118 * the format that userspace expects for map<string, string> 1137 1119 * 1138 - * ClientSession messages with metadata are v2 1120 + * ClientSession messages with metadata are v3 1139 1121 */ 1140 1122 msg->hdr.version = cpu_to_le16(3); 1141 1123 msg->hdr.compat_version = cpu_to_le16(1); ··· 1237 1219 struct ceph_mds_session *ts; 1238 1220 int i, mds = session->s_mds; 1239 1221 1240 - if (mds >= mdsc->mdsmap->m_num_mds) 1222 + if (mds >= mdsc->mdsmap->possible_max_rank) 1241 1223 return; 1242 1224 1243 1225 mi = &mdsc->mdsmap->m_info[mds]; ··· 1985 1967 if (mdsc->stopping) 1986 1968 return; 1987 1969 1988 - get_session(session); 1970 + ceph_get_mds_session(session); 1989 1971 if (queue_work(mdsc->fsc->cap_wq, 1990 1972 &session->s_cap_release_work)) { 1991 1973 dout("cap release work queued\n"); ··· 2534 2516 } 2535 2517 2536 2518 /* 2519 + * called under mdsc->mutex 2520 + */ 2521 + static int __send_request(struct ceph_mds_client *mdsc, 2522 + struct ceph_mds_session *session, 2523 + struct ceph_mds_request *req, 2524 + bool drop_cap_releases) 2525 + { 2526 + int err; 2527 + 2528 + err = __prepare_send_request(mdsc, req, session->s_mds, 2529 + drop_cap_releases); 2530 + if (!err) { 2531 + ceph_msg_get(req->r_request); 2532 + ceph_con_send(&session->s_con, req->r_request); 2533 + } 2534 + 2535 + return err; 2536 + } 2537 + 2538 + /* 2537 2539 * send request, or put it on the appropriate wait list. 
2538 2540 */ 2539 2541 static void __do_request(struct ceph_mds_client *mdsc, ··· 2562 2524 struct ceph_mds_session *session = NULL; 2563 2525 int mds = -1; 2564 2526 int err = 0; 2527 + bool random; 2565 2528 2566 2529 if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { 2567 2530 if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) ··· 2595 2556 if (!(mdsc->fsc->mount_options->flags & 2596 2557 CEPH_MOUNT_OPT_MOUNTWAIT) && 2597 2558 !ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) { 2598 - err = -ENOENT; 2599 - pr_info("probably no mds server is up\n"); 2559 + err = -EHOSTUNREACH; 2600 2560 goto finish; 2601 2561 } 2602 2562 } 2603 2563 2604 2564 put_request_session(req); 2605 2565 2606 - mds = __choose_mds(mdsc, req); 2566 + mds = __choose_mds(mdsc, req, &random); 2607 2567 if (mds < 0 || 2608 2568 ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { 2609 2569 dout("do_request no mds or not active, waiting for map\n"); ··· 2619 2581 goto finish; 2620 2582 } 2621 2583 } 2622 - req->r_session = get_session(session); 2584 + req->r_session = ceph_get_mds_session(session); 2623 2585 2624 2586 dout("do_request mds%d session %p state %s\n", mds, session, 2625 2587 ceph_session_state_name(session->s_state)); ··· 2630 2592 goto out_session; 2631 2593 } 2632 2594 if (session->s_state == CEPH_MDS_SESSION_NEW || 2633 - session->s_state == CEPH_MDS_SESSION_CLOSING) 2595 + session->s_state == CEPH_MDS_SESSION_CLOSING) { 2634 2596 __open_session(mdsc, session); 2597 + /* retry the same mds later */ 2598 + if (random) 2599 + req->r_resend_mds = mds; 2600 + } 2635 2601 list_add(&req->r_wait, &session->s_waiting); 2636 2602 goto out_session; 2637 2603 } ··· 2646 2604 if (req->r_request_started == 0) /* note request start time */ 2647 2605 req->r_request_started = jiffies; 2648 2606 2649 - err = __prepare_send_request(mdsc, req, mds, false); 2650 - if (!err) { 2651 - ceph_msg_get(req->r_request); 2652 - ceph_con_send(&session->s_con, 
req->r_request); 2653 - } 2607 + err = __send_request(mdsc, session, req, false); 2654 2608 2655 2609 out_session: 2656 2610 ceph_put_mds_session(session); ··· 2899 2861 mutex_unlock(&mdsc->mutex); 2900 2862 goto out; 2901 2863 } else { 2902 - int mds = __choose_mds(mdsc, req); 2864 + int mds = __choose_mds(mdsc, req, NULL); 2903 2865 if (mds >= 0 && mds != req->r_session->s_mds) { 2904 2866 dout("but auth changed, so resending\n"); 2905 2867 __do_request(mdsc, req); ··· 2915 2877 set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags); 2916 2878 __unregister_request(mdsc, req); 2917 2879 2880 + /* last request during umount? */ 2881 + if (mdsc->stopping && !__get_oldest_req(mdsc)) 2882 + complete_all(&mdsc->safe_umount_waiters); 2883 + 2918 2884 if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) { 2919 2885 /* 2920 2886 * We already handled the unsafe response, now do the ··· 2929 2887 */ 2930 2888 dout("got safe reply %llu, mds%d\n", tid, mds); 2931 2889 2932 - /* last unsafe request during umount? */ 2933 - if (mdsc->stopping && !__get_oldest_req(mdsc)) 2934 - complete_all(&mdsc->safe_umount_waiters); 2935 2890 mutex_unlock(&mdsc->mutex); 2936 2891 goto out; 2937 2892 } ··· 3143 3104 3144 3105 mutex_lock(&mdsc->mutex); 3145 3106 if (op == CEPH_SESSION_CLOSE) { 3146 - get_session(session); 3107 + ceph_get_mds_session(session); 3147 3108 __unregister_session(mdsc, session); 3148 3109 } 3149 3110 /* FIXME: this ttl calculation is generous */ ··· 3181 3142 case CEPH_SESSION_CLOSE: 3182 3143 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) 3183 3144 pr_info("mds%d reconnect denied\n", session->s_mds); 3145 + session->s_state = CEPH_MDS_SESSION_CLOSED; 3184 3146 cleanup_session_requests(mdsc, session); 3185 3147 remove_session_caps(session); 3186 3148 wake = 2; /* for good measure */ ··· 3249 3209 return; 3250 3210 } 3251 3211 3252 - 3253 3212 /* 3254 3213 * called under session->mutex. 
3255 3214 */ ··· 3257 3218 { 3258 3219 struct ceph_mds_request *req, *nreq; 3259 3220 struct rb_node *p; 3260 - int err; 3261 3221 3262 3222 dout("replay_unsafe_requests mds%d\n", session->s_mds); 3263 3223 3264 3224 mutex_lock(&mdsc->mutex); 3265 - list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) { 3266 - err = __prepare_send_request(mdsc, req, session->s_mds, true); 3267 - if (!err) { 3268 - ceph_msg_get(req->r_request); 3269 - ceph_con_send(&session->s_con, req->r_request); 3270 - } 3271 - } 3225 + list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) 3226 + __send_request(mdsc, session, req, true); 3272 3227 3273 3228 /* 3274 3229 * also re-send old requests when MDS enters reconnect stage. So that MDS ··· 3277 3244 if (req->r_attempts == 0) 3278 3245 continue; /* only old requests */ 3279 3246 if (req->r_session && 3280 - req->r_session->s_mds == session->s_mds) { 3281 - err = __prepare_send_request(mdsc, req, 3282 - session->s_mds, true); 3283 - if (!err) { 3284 - ceph_msg_get(req->r_request); 3285 - ceph_con_send(&session->s_con, req->r_request); 3286 - } 3287 - } 3247 + req->r_session->s_mds == session->s_mds) 3248 + __send_request(mdsc, session, req, true); 3288 3249 } 3289 3250 mutex_unlock(&mdsc->mutex); 3290 3251 } ··· 3789 3762 dout("check_new_map new %u old %u\n", 3790 3763 newmap->m_epoch, oldmap->m_epoch); 3791 3764 3792 - for (i = 0; i < oldmap->m_num_mds && i < mdsc->max_sessions; i++) { 3765 + for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) { 3793 3766 if (!mdsc->sessions[i]) 3794 3767 continue; 3795 3768 s = mdsc->sessions[i]; ··· 3803 3776 ceph_mdsmap_is_laggy(newmap, i) ? 
" (laggy)" : "", 3804 3777 ceph_session_state_name(s->s_state)); 3805 3778 3806 - if (i >= newmap->m_num_mds) { 3779 + if (i >= newmap->possible_max_rank) { 3807 3780 /* force close session for stopped mds */ 3808 - get_session(s); 3781 + ceph_get_mds_session(s); 3809 3782 __unregister_session(mdsc, s); 3810 3783 __wake_requests(mdsc, &s->s_waiting); 3811 3784 mutex_unlock(&mdsc->mutex); ··· 3860 3833 } 3861 3834 } 3862 3835 3863 - for (i = 0; i < newmap->m_num_mds && i < mdsc->max_sessions; i++) { 3836 + for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) { 3864 3837 s = mdsc->sessions[i]; 3865 3838 if (!s) 3866 3839 continue; ··· 4406 4379 mutex_lock(&mdsc->mutex); 4407 4380 for (i = 0; i < mdsc->max_sessions; i++) { 4408 4381 if (mdsc->sessions[i]) { 4409 - session = get_session(mdsc->sessions[i]); 4382 + session = ceph_get_mds_session(mdsc->sessions[i]); 4410 4383 __unregister_session(mdsc, session); 4411 4384 mutex_unlock(&mdsc->mutex); 4412 4385 mutex_lock(&session->s_mutex); ··· 4634 4607 { 4635 4608 struct ceph_mds_session *s = con->private; 4636 4609 4637 - if (get_session(s)) { 4638 - dout("mdsc con_get %p ok (%d)\n", s, refcount_read(&s->s_ref)); 4610 + if (ceph_get_mds_session(s)) 4639 4611 return con; 4640 - } 4641 - dout("mdsc con_get %p FAIL\n", s); 4642 4612 return NULL; 4643 4613 } 4644 4614 ··· 4643 4619 { 4644 4620 struct ceph_mds_session *s = con->private; 4645 4621 4646 - dout("mdsc con_put %p (%d)\n", s, refcount_read(&s->s_ref) - 1); 4647 4622 ceph_put_mds_session(s); 4648 4623 } 4649 4624
+22 -17
fs/ceph/mds_client.h
··· 17 17 #include <linux/ceph/auth.h> 18 18 19 19 /* The first 8 bits are reserved for old ceph releases */ 20 - #define CEPHFS_FEATURE_MIMIC 8 21 - #define CEPHFS_FEATURE_REPLY_ENCODING 9 22 - #define CEPHFS_FEATURE_RECLAIM_CLIENT 10 23 - #define CEPHFS_FEATURE_LAZY_CAP_WANTED 11 24 - #define CEPHFS_FEATURE_MULTI_RECONNECT 12 20 + enum ceph_feature_type { 21 + CEPHFS_FEATURE_MIMIC = 8, 22 + CEPHFS_FEATURE_REPLY_ENCODING, 23 + CEPHFS_FEATURE_RECLAIM_CLIENT, 24 + CEPHFS_FEATURE_LAZY_CAP_WANTED, 25 + CEPHFS_FEATURE_MULTI_RECONNECT, 25 26 26 - #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ 27 + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_MULTI_RECONNECT, 28 + }; 29 + 30 + /* 31 + * This will always have the highest feature bit value 32 + * as the last element of the array. 33 + */ 34 + #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ 27 35 0, 1, 2, 3, 4, 5, 6, 7, \ 28 36 CEPHFS_FEATURE_MIMIC, \ 29 37 CEPHFS_FEATURE_REPLY_ENCODING, \ 30 38 CEPHFS_FEATURE_LAZY_CAP_WANTED, \ 31 39 CEPHFS_FEATURE_MULTI_RECONNECT, \ 40 + \ 41 + CEPHFS_FEATURE_MAX, \ 32 42 } 33 43 #define CEPHFS_FEATURES_CLIENT_REQUIRED {} 34 - 35 44 36 45 /* 37 46 * Some lock dependencies: ··· 160 151 CEPH_MDS_SESSION_RESTARTING = 5, 161 152 CEPH_MDS_SESSION_RECONNECTING = 6, 162 153 CEPH_MDS_SESSION_CLOSING = 7, 163 - CEPH_MDS_SESSION_REJECTED = 8, 154 + CEPH_MDS_SESSION_CLOSED = 8, 155 + CEPH_MDS_SESSION_REJECTED = 9, 164 156 }; 165 157 166 158 struct ceph_mds_session { ··· 184 174 185 175 /* protected by s_cap_lock */ 186 176 spinlock_t s_cap_lock; 177 + refcount_t s_ref; 187 178 struct list_head s_caps; /* all caps issued by this session */ 188 179 struct ceph_cap *s_cap_iterator; 189 180 int s_nr_caps; ··· 199 188 unsigned long s_renew_requested; /* last time we sent a renew req */ 200 189 u64 s_renew_seq; 201 190 202 - refcount_t s_ref; 203 191 struct list_head s_waiting; /* waiting requests */ 204 192 struct list_head s_unsafe; /* unsafe requests */ 205 193 }; ··· 234 224 struct rb_node r_node; 235 225 struct 
ceph_mds_client *r_mdsc; 236 226 227 + struct kref r_kref; 237 228 int r_op; /* mds op code */ 238 229 239 230 /* operation on what? */ ··· 305 294 int r_resend_mds; /* mds to resend to next, if any*/ 306 295 u32 r_sent_on_mseq; /* cap mseq request was sent at*/ 307 296 308 - struct kref r_kref; 309 297 struct list_head r_wait; 310 298 struct completion r_completion; 311 299 struct completion r_safe_completion; ··· 461 451 extern struct ceph_mds_session * 462 452 __ceph_lookup_mds_session(struct ceph_mds_client *, int mds); 463 453 464 - static inline struct ceph_mds_session * 465 - ceph_get_mds_session(struct ceph_mds_session *s) 466 - { 467 - refcount_inc(&s->s_ref); 468 - return s; 469 - } 470 - 471 454 extern const char *ceph_session_state_name(int s); 472 455 456 + extern struct ceph_mds_session * 457 + ceph_get_mds_session(struct ceph_mds_session *s); 473 458 extern void ceph_put_mds_session(struct ceph_mds_session *s); 474 459 475 460 extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
+52 -39
fs/ceph/mdsmap.c
··· 13 13 14 14 #include "super.h" 15 15 16 + #define CEPH_MDS_IS_READY(i, ignore_laggy) \ 17 + (m->m_info[i].state > 0 && ignore_laggy ? true : !m->m_info[i].laggy) 16 18 17 - /* 18 - * choose a random mds that is "up" (i.e. has a state > 0), or -1. 19 - */ 20 - int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) 19 + static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy) 21 20 { 22 21 int n = 0; 23 22 int i, j; 24 23 25 - /* special case for one mds */ 26 - if (1 == m->m_num_mds && m->m_info[0].state > 0) 27 - return 0; 28 - 29 24 /* count */ 30 - for (i = 0; i < m->m_num_mds; i++) 31 - if (m->m_info[i].state > 0) 25 + for (i = 0; i < m->possible_max_rank; i++) 26 + if (CEPH_MDS_IS_READY(i, ignore_laggy)) 32 27 n++; 33 28 if (n == 0) 34 29 return -1; 35 30 36 31 /* pick */ 37 32 n = prandom_u32() % n; 38 - for (j = 0, i = 0; i < m->m_num_mds; i++) { 39 - if (m->m_info[i].state > 0) 33 + for (j = 0, i = 0; i < m->possible_max_rank; i++) { 34 + if (CEPH_MDS_IS_READY(i, ignore_laggy)) 40 35 j++; 41 36 if (j > n) 42 37 break; 43 38 } 44 39 45 40 return i; 41 + } 42 + 43 + /* 44 + * choose a random mds that is "up" (i.e. has a state > 0), or -1. 45 + */ 46 + int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) 47 + { 48 + int mds; 49 + 50 + mds = __mdsmap_get_random_mds(m, false); 51 + if (mds == m->possible_max_rank || mds == -1) 52 + mds = __mdsmap_get_random_mds(m, true); 53 + 54 + return mds == m->possible_max_rank ? 
-1 : mds; 46 55 } 47 56 48 57 #define __decode_and_drop_type(p, end, type, bad) \ ··· 147 138 m->m_session_autoclose = ceph_decode_32(p); 148 139 m->m_max_file_size = ceph_decode_64(p); 149 140 m->m_max_mds = ceph_decode_32(p); 150 - m->m_num_mds = m->m_max_mds; 151 141 152 - m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS); 142 + /* 143 + * pick out the active nodes as the m_num_active_mds, the 144 + * m_num_active_mds maybe larger than m_max_mds when decreasing 145 + * the max_mds in cluster side, in other case it should less 146 + * than or equal to m_max_mds. 147 + */ 148 + m->m_num_active_mds = n = ceph_decode_32(p); 149 + 150 + /* 151 + * the possible max rank, it maybe larger than the m_num_active_mds, 152 + * for example if the mds_max == 2 in the cluster, when the MDS(0) 153 + * was laggy and being replaced by a new MDS, we will temporarily 154 + * receive a new mds map with n_num_mds == 1 and the active MDS(1), 155 + * and the mds rank >= m_num_active_mds. 156 + */ 157 + m->possible_max_rank = max(m->m_num_active_mds, m->m_max_mds); 158 + 159 + m->m_info = kcalloc(m->possible_max_rank, sizeof(*m->m_info), GFP_NOFS); 153 160 if (!m->m_info) 154 161 goto nomem; 155 162 156 163 /* pick out active nodes from mds_info (state > 0) */ 157 - n = ceph_decode_32(p); 158 164 for (i = 0; i < n; i++) { 159 165 u64 global_id; 160 166 u32 namelen; ··· 239 215 ceph_mds_state_name(state), 240 216 laggy ? 
"(laggy)" : ""); 241 217 242 - if (mds < 0 || state <= 0) 218 + if (mds < 0 || mds >= m->possible_max_rank) { 219 + pr_warn("mdsmap_decode got incorrect mds(%d)\n", mds); 243 220 continue; 221 + } 244 222 245 - if (mds >= m->m_num_mds) { 246 - int new_num = max(mds + 1, m->m_num_mds * 2); 247 - void *new_m_info = krealloc(m->m_info, 248 - new_num * sizeof(*m->m_info), 249 - GFP_NOFS | __GFP_ZERO); 250 - if (!new_m_info) 251 - goto nomem; 252 - m->m_info = new_m_info; 253 - m->m_num_mds = new_num; 223 + if (state <= 0) { 224 + pr_warn("mdsmap_decode got incorrect state(%s)\n", 225 + ceph_mds_state_name(state)); 226 + continue; 254 227 } 255 228 256 229 info = &m->m_info[mds]; ··· 267 246 } else { 268 247 info->export_targets = NULL; 269 248 } 270 - } 271 - if (m->m_num_mds > m->m_max_mds) { 272 - /* find max up mds */ 273 - for (i = m->m_num_mds; i >= m->m_max_mds; i--) { 274 - if (i == 0 || m->m_info[i-1].state > 0) 275 - break; 276 - } 277 - m->m_num_mds = i; 278 249 } 279 250 280 251 /* pg_pools */ ··· 309 296 310 297 for (i = 0; i < n; i++) { 311 298 s32 mds = ceph_decode_32(p); 312 - if (mds >= 0 && mds < m->m_num_mds) { 299 + if (mds >= 0 && mds < m->possible_max_rank) { 313 300 if (m->m_info[mds].laggy) 314 301 num_laggy++; 315 302 } 316 303 } 317 304 m->m_num_laggy = num_laggy; 318 305 319 - if (n > m->m_num_mds) { 306 + if (n > m->possible_max_rank) { 320 307 void *new_m_info = krealloc(m->m_info, 321 308 n * sizeof(*m->m_info), 322 309 GFP_NOFS | __GFP_ZERO); ··· 324 311 goto nomem; 325 312 m->m_info = new_m_info; 326 313 } 327 - m->m_num_mds = n; 314 + m->possible_max_rank = n; 328 315 } 329 316 330 317 /* inc */ ··· 395 382 { 396 383 int i; 397 384 398 - for (i = 0; i < m->m_num_mds; i++) 385 + for (i = 0; i < m->possible_max_rank; i++) 399 386 kfree(m->m_info[i].export_targets); 400 387 kfree(m->m_info); 401 388 kfree(m->m_data_pg_pools); ··· 409 396 return false; 410 397 if (m->m_damaged) 411 398 return false; 412 - if (m->m_num_laggy > 0) 399 + if 
(m->m_num_laggy == m->m_num_active_mds) 413 400 return false; 414 - for (i = 0; i < m->m_num_mds; i++) { 401 + for (i = 0; i < m->possible_max_rank; i++) { 415 402 if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE) 416 403 nr_active++; 417 404 }
+108 -20
fs/ceph/super.c
··· 107 107 return 0; 108 108 } 109 109 110 - 111 110 static int ceph_sync_fs(struct super_block *sb, int wait) 112 111 { 113 112 struct ceph_fs_client *fsc = ceph_sb_to_client(sb); ··· 210 211 211 212 /* 212 213 * Parse the source parameter. Distinguish the server list from the path. 213 - * Internally we do not include the leading '/' in the path. 214 214 * 215 215 * The source will look like: 216 216 * <server_spec>[,<server_spec>...]:[<path>] ··· 230 232 231 233 dev_name_end = strchr(dev_name, '/'); 232 234 if (dev_name_end) { 233 - if (strlen(dev_name_end) > 1) { 234 - kfree(fsopt->server_path); 235 - fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); 236 - if (!fsopt->server_path) 237 - return -ENOMEM; 238 - } 235 + kfree(fsopt->server_path); 236 + 237 + /* 238 + * The server_path will include the whole chars from userland 239 + * including the leading '/'. 240 + */ 241 + fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL); 242 + if (!fsopt->server_path) 243 + return -ENOMEM; 239 244 } else { 240 245 dev_name_end = dev_name + strlen(dev_name); 241 246 } ··· 462 461 return strcmp(s1, s2); 463 462 } 464 463 464 + /** 465 + * path_remove_extra_slash - Remove the extra slashes in the server path 466 + * @server_path: the server path and could be NULL 467 + * 468 + * Return NULL if the path is NULL or only consists of "/", or a string 469 + * without any extra slashes including the leading slash(es) and the 470 + * slash(es) at the end of the server path, such as: 471 + * "//dir1////dir2///" --> "dir1/dir2" 472 + */ 473 + static char *path_remove_extra_slash(const char *server_path) 474 + { 475 + const char *path = server_path; 476 + const char *cur, *end; 477 + char *buf, *p; 478 + int len; 479 + 480 + /* if the server path is omitted */ 481 + if (!path) 482 + return NULL; 483 + 484 + /* remove all the leading slashes */ 485 + while (*path == '/') 486 + path++; 487 + 488 + /* if the server path only consists of slashes */ 489 + if (*path == '\0') 490 + 
return NULL; 491 + 492 + len = strlen(path); 493 + 494 + buf = kmalloc(len + 1, GFP_KERNEL); 495 + if (!buf) 496 + return ERR_PTR(-ENOMEM); 497 + 498 + end = path + len; 499 + p = buf; 500 + do { 501 + cur = strchr(path, '/'); 502 + if (!cur) 503 + cur = end; 504 + 505 + len = cur - path; 506 + 507 + /* including one '/' */ 508 + if (cur != end) 509 + len += 1; 510 + 511 + memcpy(p, path, len); 512 + p += len; 513 + 514 + while (cur <= end && *cur == '/') 515 + cur++; 516 + path = cur; 517 + } while (path < end); 518 + 519 + *p = '\0'; 520 + 521 + /* 522 + * remove the last slash if there has and just to make sure that 523 + * we will get something like "dir1/dir2" 524 + */ 525 + if (*(--p) == '/') 526 + *p = '\0'; 527 + 528 + return buf; 529 + } 530 + 465 531 static int compare_mount_options(struct ceph_mount_options *new_fsopt, 466 532 struct ceph_options *new_opt, 467 533 struct ceph_fs_client *fsc) ··· 536 468 struct ceph_mount_options *fsopt1 = new_fsopt; 537 469 struct ceph_mount_options *fsopt2 = fsc->mount_options; 538 470 int ofs = offsetof(struct ceph_mount_options, snapdir_name); 471 + char *p1, *p2; 539 472 int ret; 540 473 541 474 ret = memcmp(fsopt1, fsopt2, ofs); ··· 549 480 ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace); 550 481 if (ret) 551 482 return ret; 552 - ret = strcmp_null(fsopt1->server_path, fsopt2->server_path); 483 + 484 + p1 = path_remove_extra_slash(fsopt1->server_path); 485 + if (IS_ERR(p1)) 486 + return PTR_ERR(p1); 487 + p2 = path_remove_extra_slash(fsopt2->server_path); 488 + if (IS_ERR(p2)) { 489 + kfree(p1); 490 + return PTR_ERR(p2); 491 + } 492 + ret = strcmp_null(p1, p2); 493 + kfree(p1); 494 + kfree(p2); 553 495 if (ret) 554 496 return ret; 497 + 555 498 ret = strcmp_null(fsopt1->fscache_uniq, fsopt2->fscache_uniq); 556 499 if (ret) 557 500 return ret; ··· 718 637 fsc->sb = NULL; 719 638 fsc->mount_state = CEPH_MOUNT_MOUNTING; 720 639 fsc->filp_gen = 1; 640 + fsc->have_copy_from2 = true; 721 641 722 642 
atomic_long_set(&fsc->writeback_count, 0); 723 643 ··· 870 788 ceph_fscache_unregister(); 871 789 } 872 790 873 - 874 791 /* 875 792 * ceph_umount_begin - initiate forced umount. Tear down down the 876 793 * mount, skipping steps that may hang while waiting for server(s). ··· 949 868 return root; 950 869 } 951 870 952 - 953 - 954 - 955 871 /* 956 872 * mount: join the ceph cluster, and open root directory. 957 873 */ ··· 963 885 mutex_lock(&fsc->client->mount_mutex); 964 886 965 887 if (!fsc->sb->s_root) { 966 - const char *path; 888 + const char *path, *p; 967 889 err = __ceph_open_session(fsc->client, started); 968 890 if (err < 0) 969 891 goto out; ··· 975 897 goto out; 976 898 } 977 899 978 - if (!fsc->mount_options->server_path) { 979 - path = ""; 980 - dout("mount opening path \\t\n"); 981 - } else { 982 - path = fsc->mount_options->server_path + 1; 983 - dout("mount opening path %s\n", path); 900 + p = path_remove_extra_slash(fsc->mount_options->server_path); 901 + if (IS_ERR(p)) { 902 + err = PTR_ERR(p); 903 + goto out; 984 904 } 905 + /* if the server path is omitted or just consists of '/' */ 906 + if (!p) 907 + path = ""; 908 + else 909 + path = p; 910 + dout("mount opening path '%s'\n", path); 985 911 986 912 ceph_fs_debugfs_init(fsc); 987 913 988 914 root = open_root_dentry(fsc, path, started); 915 + kfree(p); 989 916 if (IS_ERR(root)) { 990 917 err = PTR_ERR(root); 991 918 goto out; ··· 1153 1070 return 0; 1154 1071 1155 1072 out_splat: 1073 + if (!ceph_mdsmap_is_cluster_available(fsc->mdsc->mdsmap)) { 1074 + pr_info("No mds server is up or the cluster is laggy\n"); 1075 + err = -EHOSTUNREACH; 1076 + } 1077 + 1156 1078 ceph_mdsc_close_sessions(fsc->mdsc); 1157 1079 deactivate_locked_super(sb); 1158 1080 goto out_final;
+2
fs/ceph/super.h
··· 106 106 unsigned long last_auto_reconnect; 107 107 bool blacklisted; 108 108 109 + bool have_copy_from2; 110 + 109 111 u32 filp_gen; 110 112 loff_t max_file_size; 111 113
+4 -3
fs/ceph/xattr.c
··· 655 655 u32 len; 656 656 const char *name, *val; 657 657 struct ceph_inode_info *ci = ceph_inode(inode); 658 - int xattr_version; 658 + u64 xattr_version; 659 659 struct ceph_inode_xattr **xattrs = NULL; 660 660 int err = 0; 661 661 int i; ··· 851 851 req_mask = __get_request_mask(inode); 852 852 853 853 spin_lock(&ci->i_ceph_lock); 854 - dout("getxattr %p ver=%lld index_ver=%lld\n", inode, 854 + dout("getxattr %p name '%s' ver=%lld index_ver=%lld\n", inode, name, 855 855 ci->i_xattrs.version, ci->i_xattrs.index_version); 856 856 857 857 if (ci->i_xattrs.version == 0 || ··· 1078 1078 } 1079 1079 } 1080 1080 1081 - dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); 1081 + dout("setxattr %p name '%s' issued %s\n", inode, name, 1082 + ceph_cap_string(issued)); 1082 1083 __build_xattrs(inode); 1083 1084 1084 1085 required_blob_size = __get_required_blob_size(ci, name_len, val_len);
+6 -5
include/linux/ceph/mdsmap.h
··· 25 25 u32 m_session_timeout; /* seconds */ 26 26 u32 m_session_autoclose; /* seconds */ 27 27 u64 m_max_file_size; 28 - u32 m_max_mds; /* size of m_addr, m_state arrays */ 29 - int m_num_mds; 28 + u32 m_max_mds; /* expected up:active mds number */ 29 + u32 m_num_active_mds; /* actual up:active mds number */ 30 + u32 possible_max_rank; /* possible max rank index */ 30 31 struct ceph_mds_info *m_info; 31 32 32 33 /* which object pools file data can be stored in */ ··· 43 42 static inline struct ceph_entity_addr * 44 43 ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w) 45 44 { 46 - if (w >= m->m_num_mds) 45 + if (w >= m->possible_max_rank) 47 46 return NULL; 48 47 return &m->m_info[w].addr; 49 48 } ··· 51 50 static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w) 52 51 { 53 52 BUG_ON(w < 0); 54 - if (w >= m->m_num_mds) 53 + if (w >= m->possible_max_rank) 55 54 return CEPH_MDS_STATE_DNE; 56 55 return m->m_info[w].state; 57 56 } 58 57 59 58 static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w) 60 59 { 61 - if (w >= 0 && w < m->m_num_mds) 60 + if (w >= 0 && w < m->possible_max_rank) 62 61 return m->m_info[w].laggy; 63 62 return false; 64 63 }
+1
include/linux/ceph/osd_client.h
··· 534 534 struct ceph_object_id *dst_oid, 535 535 struct ceph_object_locator *dst_oloc, 536 536 u32 dst_fadvise_flags, 537 + u32 truncate_seq, u64 truncate_size, 537 538 u8 copy_from_flags); 538 539 539 540 /* watch/notify */
+2
include/linux/ceph/rados.h
··· 256 256 \ 257 257 /* tiering */ \ 258 258 f(COPY_FROM, __CEPH_OSD_OP(WR, DATA, 26), "copy-from") \ 259 + f(COPY_FROM2, __CEPH_OSD_OP(WR, DATA, 45), "copy-from2") \ 259 260 f(COPY_GET_CLASSIC, __CEPH_OSD_OP(RD, DATA, 27), "copy-get-classic") \ 260 261 f(UNDIRTY, __CEPH_OSD_OP(WR, DATA, 28), "undirty") \ 261 262 f(ISDIRTY, __CEPH_OSD_OP(RD, DATA, 29), "isdirty") \ ··· 447 446 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE = 8, /* map snap direct to 448 447 * cloneid */ 449 448 CEPH_OSD_COPY_FROM_FLAG_RWORDERED = 16, /* order with write */ 449 + CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ = 32, /* send truncate_{seq,size} */ 450 450 }; 451 451 452 452 enum {
+1 -1
net/ceph/Makefile
··· 13 13 auth.o auth_none.o \ 14 14 crypto.o armor.o \ 15 15 auth_x.o \ 16 - ceph_fs.o ceph_strings.o ceph_hash.o \ 16 + ceph_strings.o ceph_hash.o \ 17 17 pagevec.o snapshot.o string_table.o
-4
net/ceph/ceph_fs.c fs/ceph/util.c
··· 39 39 fl->stripe_count == 0 && fl->object_size == 0) 40 40 fl->pool_id = -1; 41 41 } 42 - EXPORT_SYMBOL(ceph_file_layout_from_legacy); 43 42 44 43 void ceph_file_layout_to_legacy(struct ceph_file_layout *fl, 45 44 struct ceph_file_layout_legacy *legacy) ··· 51 52 else 52 53 legacy->fl_pg_pool = 0; 53 54 } 54 - EXPORT_SYMBOL(ceph_file_layout_to_legacy); 55 55 56 56 int ceph_flags_to_mode(int flags) 57 57 { ··· 80 82 81 83 return mode; 82 84 } 83 - EXPORT_SYMBOL(ceph_flags_to_mode); 84 85 85 86 int ceph_caps_for_mode(int mode) 86 87 { ··· 98 101 99 102 return caps; 100 103 } 101 - EXPORT_SYMBOL(ceph_caps_for_mode);
+12 -6
net/ceph/osd_client.c
··· 402 402 case CEPH_OSD_OP_LIST_WATCHERS: 403 403 ceph_osd_data_release(&op->list_watchers.response_data); 404 404 break; 405 - case CEPH_OSD_OP_COPY_FROM: 405 + case CEPH_OSD_OP_COPY_FROM2: 406 406 ceph_osd_data_release(&op->copy_from.osd_data); 407 407 break; 408 408 default: ··· 697 697 case CEPH_OSD_OP_SETXATTR: 698 698 case CEPH_OSD_OP_CMPXATTR: 699 699 case CEPH_OSD_OP_NOTIFY_ACK: 700 - case CEPH_OSD_OP_COPY_FROM: 700 + case CEPH_OSD_OP_COPY_FROM2: 701 701 *num_request_data_items += 1; 702 702 break; 703 703 ··· 1029 1029 case CEPH_OSD_OP_CREATE: 1030 1030 case CEPH_OSD_OP_DELETE: 1031 1031 break; 1032 - case CEPH_OSD_OP_COPY_FROM: 1032 + case CEPH_OSD_OP_COPY_FROM2: 1033 1033 dst->copy_from.snapid = cpu_to_le64(src->copy_from.snapid); 1034 1034 dst->copy_from.src_version = 1035 1035 cpu_to_le64(src->copy_from.src_version); ··· 1966 1966 ceph_osdc_msg_data_add(request_msg, 1967 1967 &op->notify_ack.request_data); 1968 1968 break; 1969 - case CEPH_OSD_OP_COPY_FROM: 1969 + case CEPH_OSD_OP_COPY_FROM2: 1970 1970 ceph_osdc_msg_data_add(request_msg, 1971 1971 &op->copy_from.osd_data); 1972 1972 break; ··· 5315 5315 struct ceph_object_locator *src_oloc, 5316 5316 u32 src_fadvise_flags, 5317 5317 u32 dst_fadvise_flags, 5318 + u32 truncate_seq, u64 truncate_size, 5318 5319 u8 copy_from_flags) 5319 5320 { 5320 5321 struct ceph_osd_req_op *op; ··· 5326 5325 if (IS_ERR(pages)) 5327 5326 return PTR_ERR(pages); 5328 5327 5329 - op = _osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM, dst_fadvise_flags); 5328 + op = _osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM2, 5329 + dst_fadvise_flags); 5330 5330 op->copy_from.snapid = src_snapid; 5331 5331 op->copy_from.src_version = src_version; 5332 5332 op->copy_from.flags = copy_from_flags; ··· 5337 5335 end = p + PAGE_SIZE; 5338 5336 ceph_encode_string(&p, end, src_oid->name, src_oid->name_len); 5339 5337 encode_oloc(&p, end, src_oloc); 5338 + ceph_encode_32(&p, truncate_seq); 5339 + ceph_encode_64(&p, truncate_size); 5340 5340 
op->indata_len = PAGE_SIZE - (end - p); 5341 5341 5342 5342 ceph_osd_data_pages_init(&op->copy_from.osd_data, pages, ··· 5354 5350 struct ceph_object_id *dst_oid, 5355 5351 struct ceph_object_locator *dst_oloc, 5356 5352 u32 dst_fadvise_flags, 5353 + u32 truncate_seq, u64 truncate_size, 5357 5354 u8 copy_from_flags) 5358 5355 { 5359 5356 struct ceph_osd_request *req; ··· 5371 5366 5372 5367 ret = osd_req_op_copy_from_init(req, src_snapid, src_version, src_oid, 5373 5368 src_oloc, src_fadvise_flags, 5374 - dst_fadvise_flags, copy_from_flags); 5369 + dst_fadvise_flags, truncate_seq, 5370 + truncate_size, copy_from_flags); 5375 5371 if (ret) 5376 5372 goto out; 5377 5373