Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ceph: auto reconnect after blacklisted

Make the client use OSD replies and session messages to infer whether it
has been blacklisted. The client reconnects to the cluster using a new
entity addr if it is blacklisted. Auto reconnect is limited to once every
30 minutes.

Auto reconnect is disabled by default. It can be enabled/disabled by the
recover_session=<no|clean> mount option. In 'clean' mode, the client drops
any dirty data/metadata, invalidates page caches and invalidates all
writable file handles. After reconnect, file locks become stale because
the MDS loses track of them. If an inode contains any stale file locks,
read/write on the inode is not allowed until applications release all
stale file locks.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>

authored by

Yan, Zheng and committed by
Ilya Dryomov
131d7eb4 81f148a9

+93 -8
+14
Documentation/filesystems/ceph.txt
··· 158 158 copies. Currently, it's only used in copy_file_range, which will revert 159 159 to the default VFS implementation if this option is used. 160 160 161 + recover_session=<no|clean> 162 + Set auto reconnect mode in the case where the client is blacklisted. The 163 + available modes are "no" and "clean". The default is "no". 164 + 165 + * no: never attempt to reconnect when client detects that it has been 166 + blacklisted. Operations will generally fail after being blacklisted. 167 + 168 + * clean: client reconnects to the ceph cluster automatically when it 169 + detects that it has been blacklisted. During reconnect, client drops 170 + dirty data/metadata, invalidates page caches and writable file handles. 171 + After reconnect, file locks become stale because the MDS loses track 172 + of them. If an inode contains any stale file locks, read/write on the 173 + inode is not allowed until applications release all stale file locks. 174 + 161 175 More Information 162 176 ================ 163 177
+17 -5
fs/ceph/addr.c
··· 189 189 { 190 190 struct inode *inode = file_inode(filp); 191 191 struct ceph_inode_info *ci = ceph_inode(inode); 192 - struct ceph_osd_client *osdc = 193 - &ceph_inode_to_client(inode)->client->osdc; 192 + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 194 193 int err = 0; 195 194 u64 off = page_offset(page); 196 195 u64 len = PAGE_SIZE; ··· 218 219 219 220 dout("readpage inode %p file %p page %p index %lu\n", 220 221 inode, filp, page, page->index); 221 - err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 222 - off, &len, 222 + err = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), 223 + &ci->i_layout, off, &len, 223 224 ci->i_truncate_seq, ci->i_truncate_size, 224 225 &page, 1, 0); 225 226 if (err == -ENOENT) ··· 227 228 if (err < 0) { 228 229 SetPageError(page); 229 230 ceph_fscache_readpage_cancel(inode, page); 231 + if (err == -EBLACKLISTED) 232 + fsc->blacklisted = true; 230 233 goto out; 231 234 } 232 235 if (err < PAGE_SIZE) ··· 267 266 int i; 268 267 269 268 dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); 269 + if (rc == -EBLACKLISTED) 270 + ceph_inode_to_client(inode)->blacklisted = true; 270 271 271 272 /* unlock all pages, zeroing any data we didn't read */ 272 273 osd_data = osd_req_op_extent_osd_data(req, 0); ··· 644 641 end_page_writeback(page); 645 642 return err; 646 643 } 644 + if (err == -EBLACKLISTED) 645 + fsc->blacklisted = true; 647 646 dout("writepage setting page/mapping error %d %p\n", 648 647 err, page); 649 648 SetPageError(page); ··· 726 721 if (rc < 0) { 727 722 mapping_set_error(mapping, rc); 728 723 ceph_set_error_write(ci); 724 + if (rc == -EBLACKLISTED) 725 + fsc->blacklisted = true; 729 726 } else { 730 727 ceph_clear_error_write(ci); 731 728 } ··· 1955 1948 1956 1949 if (err >= 0 || err == -ENOENT) 1957 1950 have |= POOL_READ; 1958 - else if (err != -EPERM) 1951 + else if (err != -EPERM) { 1952 + if (err == -EBLACKLISTED) 1953 + fsc->blacklisted = true; 1959 1954 goto 
out_unlock; 1955 + } 1960 1956 1961 1957 if (err2 == 0 || err2 == -EEXIST) 1962 1958 have |= POOL_WRITE; 1963 1959 else if (err2 != -EPERM) { 1960 + if (err2 == -EBLACKLISTED) 1961 + fsc->blacklisted = true; 1964 1962 err = err2; 1965 1963 goto out_unlock; 1966 1964 }
+7 -1
fs/ceph/file.c
··· 698 698 ceph_release_page_vector(pages, num_pages); 699 699 } 700 700 701 - if (ret <= 0 || off >= i_size || !more) 701 + if (ret < 0) { 702 + if (ret == -EBLACKLISTED) 703 + fsc->blacklisted = true; 704 + break; 705 + } 706 + 707 + if (off >= i_size || !more) 702 708 break; 703 709 } 704 710
+32 -2
fs/ceph/mds_client.c
··· 3032 3032 pr_err("mdsc_handle_forward decode error err=%d\n", err); 3033 3033 } 3034 3034 3035 - static int __decode_and_drop_session_metadata(void **p, void *end) 3035 + static int __decode_session_metadata(void **p, void *end, 3036 + bool *blacklisted) 3036 3037 { 3037 3038 /* map<string,string> */ 3038 3039 u32 n; 3040 + bool err_str; 3039 3041 ceph_decode_32_safe(p, end, n, bad); 3040 3042 while (n-- > 0) { 3041 3043 u32 len; 3042 3044 ceph_decode_32_safe(p, end, len, bad); 3043 3045 ceph_decode_need(p, end, len, bad); 3046 + err_str = !strncmp(*p, "error_string", len); 3044 3047 *p += len; 3045 3048 ceph_decode_32_safe(p, end, len, bad); 3046 3049 ceph_decode_need(p, end, len, bad); 3050 + if (err_str && strnstr(*p, "blacklisted", len)) 3051 + *blacklisted = true; 3047 3052 *p += len; 3048 3053 } 3049 3054 return 0; ··· 3072 3067 u64 seq; 3073 3068 unsigned long features = 0; 3074 3069 int wake = 0; 3070 + bool blacklisted = false; 3075 3071 3076 3072 /* decode */ 3077 3073 ceph_decode_need(&p, end, sizeof(*h), bad); ··· 3085 3079 if (msg_version >= 3) { 3086 3080 u32 len; 3087 3081 /* version >= 2, metadata */ 3088 - if (__decode_and_drop_session_metadata(&p, end) < 0) 3082 + if (__decode_session_metadata(&p, end, &blacklisted) < 0) 3089 3083 goto bad; 3090 3084 /* version >= 3, feature bits */ 3091 3085 ceph_decode_32_safe(&p, end, len, bad); ··· 3172 3166 session->s_state = CEPH_MDS_SESSION_REJECTED; 3173 3167 cleanup_session_requests(mdsc, session); 3174 3168 remove_session_caps(session); 3169 + if (blacklisted) 3170 + mdsc->fsc->blacklisted = true; 3175 3171 wake = 2; /* for good measure */ 3176 3172 break; 3177 3173 ··· 4023 4015 mutex_unlock(&mdsc->mutex); 4024 4016 } 4025 4017 4018 + static void maybe_recover_session(struct ceph_mds_client *mdsc) 4019 + { 4020 + struct ceph_fs_client *fsc = mdsc->fsc; 4026 4021 4022 + if (!ceph_test_mount_opt(fsc, CLEANRECOVER)) 4023 + return; 4024 + 4025 + if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED) 
4026 + return; 4027 + 4028 + if (!READ_ONCE(fsc->blacklisted)) 4029 + return; 4030 + 4031 + if (fsc->last_auto_reconnect && 4032 + time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30)) 4033 + return; 4034 + 4035 + pr_info("auto reconnect after blacklisted\n"); 4036 + fsc->last_auto_reconnect = jiffies; 4037 + ceph_force_reconnect(fsc->sb); 4038 + } 4027 4039 4028 4040 /* 4029 4041 * delayed work -- periodically trim expired leases, renew caps with mds ··· 4116 4088 ceph_queue_cap_reclaim_work(mdsc); 4117 4089 4118 4090 ceph_trim_snapid_map(mdsc); 4091 + 4092 + maybe_recover_session(mdsc); 4119 4093 4120 4094 schedule_delayed(mdsc); 4121 4095 }
+19
fs/ceph/super.c
··· 143 143 Opt_snapdirname, 144 144 Opt_mds_namespace, 145 145 Opt_fscache_uniq, 146 + Opt_recover_session, 146 147 Opt_last_string, 147 148 /* string args above */ 148 149 Opt_dirstat, ··· 185 184 /* int args above */ 186 185 {Opt_snapdirname, "snapdirname=%s"}, 187 186 {Opt_mds_namespace, "mds_namespace=%s"}, 187 + {Opt_recover_session, "recover_session=%s"}, 188 188 {Opt_fscache_uniq, "fsc=%s"}, 189 189 /* string args above */ 190 190 {Opt_dirstat, "dirstat"}, ··· 255 253 GFP_KERNEL); 256 254 if (!fsopt->mds_namespace) 257 255 return -ENOMEM; 256 + break; 257 + case Opt_recover_session: 258 + if (!strncmp(argstr[0].from, "no", 259 + argstr[0].to - argstr[0].from)) { 260 + fsopt->flags &= ~CEPH_MOUNT_OPT_CLEANRECOVER; 261 + } else if (!strncmp(argstr[0].from, "clean", 262 + argstr[0].to - argstr[0].from)) { 263 + fsopt->flags |= CEPH_MOUNT_OPT_CLEANRECOVER; 264 + } else { 265 + return -EINVAL; 266 + } 258 267 break; 259 268 case Opt_fscache_uniq: 260 269 kfree(fsopt->fscache_uniq); ··· 589 576 590 577 if (fsopt->mds_namespace) 591 578 seq_show_option(m, "mds_namespace", fsopt->mds_namespace); 579 + 580 + if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER) 581 + seq_show_option(m, "recover_session", "clean"); 582 + 592 583 if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) 593 584 seq_printf(m, ",wsize=%d", fsopt->wsize); 594 585 if (fsopt->rsize != CEPH_MAX_READ_SIZE) ··· 1186 1169 ceph_reset_client_addr(fsc->client); 1187 1170 1188 1171 ceph_osdc_clear_abort_err(&fsc->client->osdc); 1172 + 1173 + fsc->blacklisted = false; 1189 1174 fsc->mount_state = CEPH_MOUNT_MOUNTED; 1190 1175 1191 1176 if (sb->s_root) {
+4
fs/ceph/super.h
··· 31 31 #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ 32 32 #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) 33 33 34 + #define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blacklisted */ 34 35 #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ 35 36 #define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ 36 37 #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ ··· 102 101 struct ceph_client *client; 103 102 104 103 unsigned long mount_state; 104 + 105 + unsigned long last_auto_reconnect; 106 + bool blacklisted; 105 107 106 108 u32 filp_gen; 107 109 loff_t max_file_size;