Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'ceph-for-5.20-rc1' of https://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
"We have a good pile of various fixes and cleanups from Xiubo, Jeff,
Luis and others, almost exclusively in the filesystem.

Several patches touch files outside of our normal purview to set the
stage for bringing in Jeff's long awaited ceph+fscrypt series in the
near future. All of them have appropriate acks and sat in linux-next
for a while"

* tag 'ceph-for-5.20-rc1' of https://github.com/ceph/ceph-client: (27 commits)
libceph: clean up ceph_osdc_start_request prototype
libceph: fix ceph_pagelist_reserve() comment typo
ceph: remove useless check for the folio
ceph: don't truncate file in atomic_open
ceph: make f_bsize always equal to f_frsize
ceph: flush the dirty caps immediatelly when quota is approaching
libceph: print fsid and epoch with osd id
libceph: check pointer before assigned to "c->rules[]"
ceph: don't get the inline data for new creating files
ceph: update the auth cap when the async create req is forwarded
ceph: make change_auth_cap_ses a global symbol
ceph: fix incorrect old_size length in ceph_mds_request_args
ceph: switch back to testing for NULL folio->private in ceph_dirty_folio
ceph: call netfs_subreq_terminated with was_async == false
ceph: convert to generic_file_llseek
ceph: fix the incorrect comment for the ceph_mds_caps struct
ceph: don't leak snap_rwsem in handle_cap_grant
ceph: prevent a client from exceeding the MDS maximum xattr size
ceph: choose auth MDS for getxattr with the Xs caps
ceph: add session already open notify support
...

+538 -233
+3 -3
drivers/block/rbd.c
··· 1297 1297 dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n", 1298 1298 __func__, osd_req, obj_req, obj_req->ex.oe_objno, 1299 1299 obj_req->ex.oe_off, obj_req->ex.oe_len); 1300 - ceph_osdc_start_request(osd_req->r_osdc, osd_req, false); 1300 + ceph_osdc_start_request(osd_req->r_osdc, osd_req); 1301 1301 } 1302 1302 1303 1303 /* ··· 2081 2081 if (ret) 2082 2082 return ret; 2083 2083 2084 - ceph_osdc_start_request(osdc, req, false); 2084 + ceph_osdc_start_request(osdc, req); 2085 2085 return 0; 2086 2086 } 2087 2087 ··· 4768 4768 if (ret) 4769 4769 goto out_req; 4770 4770 4771 - ceph_osdc_start_request(osdc, req, false); 4771 + ceph_osdc_start_request(osdc, req); 4772 4772 ret = ceph_osdc_wait_request(osdc, req); 4773 4773 if (ret >= 0) 4774 4774 ceph_copy_from_page_vector(pages, buf, 0, ret);
+24 -35
fs/ceph/addr.c
··· 122 122 * Reference snap context in folio->private. Also set 123 123 * PagePrivate so that we get invalidate_folio callback. 124 124 */ 125 - VM_BUG_ON_FOLIO(folio_test_private(folio), folio); 125 + VM_WARN_ON_FOLIO(folio->private, folio); 126 126 folio_attach_private(folio, snapc); 127 127 128 128 return ceph_fscache_dirty_folio(mapping, folio); ··· 237 237 if (err >= 0 && err < subreq->len) 238 238 __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); 239 239 240 - netfs_subreq_terminated(subreq, err, true); 240 + netfs_subreq_terminated(subreq, err, false); 241 241 242 242 num_pages = calc_pages_for(osd_data->alignment, osd_data->length); 243 243 ceph_put_page_vector(osd_data->pages, num_pages, false); ··· 313 313 int err = 0; 314 314 u64 len = subreq->len; 315 315 316 - if (ci->i_inline_version != CEPH_INLINE_NONE && 317 - ceph_netfs_issue_op_inline(subreq)) 316 + if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq)) 318 317 return; 319 318 320 319 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len, ··· 337 338 /* should always give us a page-aligned read */ 338 339 WARN_ON_ONCE(page_off); 339 340 len = err; 341 + err = 0; 340 342 341 343 osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); 342 344 req->r_callback = finish_netfs_read; ··· 345 345 req->r_inode = inode; 346 346 ihold(inode); 347 347 348 - err = ceph_osdc_start_request(req->r_osdc, req, false); 349 - if (err) 350 - iput(inode); 348 + ceph_osdc_start_request(req->r_osdc, req); 351 349 out: 352 350 ceph_osdc_put_request(req); 353 351 if (err) ··· 619 621 dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len); 620 622 621 623 req->r_mtime = inode->i_mtime; 622 - err = ceph_osdc_start_request(osdc, req, true); 623 - if (!err) 624 - err = ceph_osdc_wait_request(osdc, req); 624 + ceph_osdc_start_request(osdc, req); 625 + err = ceph_osdc_wait_request(osdc, req); 625 626 626 627 ceph_update_write_metrics(&fsc->mdsc->metric, 
req->r_start_latency, 627 628 req->r_end_latency, len, err); ··· 1148 1151 } 1149 1152 1150 1153 req->r_mtime = inode->i_mtime; 1151 - rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); 1152 - BUG_ON(rc); 1154 + ceph_osdc_start_request(&fsc->client->osdc, req); 1153 1155 req = NULL; 1154 1156 1155 1157 wbc->nr_to_write -= i; ··· 1323 1327 int r; 1324 1328 1325 1329 r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, &folio, NULL); 1326 - if (r == 0) 1327 - folio_wait_fscache(folio); 1328 - if (r < 0) { 1329 - if (folio) 1330 - folio_put(folio); 1331 - } else { 1332 - WARN_ON_ONCE(!folio_test_locked(folio)); 1333 - *pagep = &folio->page; 1334 - } 1335 - return r; 1330 + if (r < 0) 1331 + return r; 1332 + 1333 + folio_wait_fscache(folio); 1334 + WARN_ON_ONCE(!folio_test_locked(folio)); 1335 + *pagep = &folio->page; 1336 + return 0; 1336 1337 } 1337 1338 1338 1339 /* ··· 1432 1439 inode, off, ceph_cap_string(got)); 1433 1440 1434 1441 if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || 1435 - ci->i_inline_version == CEPH_INLINE_NONE) { 1442 + !ceph_has_inline_data(ci)) { 1436 1443 CEPH_DEFINE_RW_CONTEXT(rw_ctx, got); 1437 1444 ceph_add_rw_context(fi, &rw_ctx); 1438 1445 ret = filemap_fault(vmf); ··· 1689 1696 } 1690 1697 1691 1698 req->r_mtime = inode->i_mtime; 1692 - err = ceph_osdc_start_request(&fsc->client->osdc, req, false); 1693 - if (!err) 1694 - err = ceph_osdc_wait_request(&fsc->client->osdc, req); 1699 + ceph_osdc_start_request(&fsc->client->osdc, req); 1700 + err = ceph_osdc_wait_request(&fsc->client->osdc, req); 1695 1701 ceph_osdc_put_request(req); 1696 1702 if (err < 0) 1697 1703 goto out_unlock; ··· 1731 1739 } 1732 1740 1733 1741 req->r_mtime = inode->i_mtime; 1734 - err = ceph_osdc_start_request(&fsc->client->osdc, req, false); 1735 - if (!err) 1736 - err = ceph_osdc_wait_request(&fsc->client->osdc, req); 1742 + ceph_osdc_start_request(&fsc->client->osdc, req); 1743 + err = ceph_osdc_wait_request(&fsc->client->osdc, 
req); 1737 1744 1738 1745 ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, 1739 1746 req->r_end_latency, len, err); ··· 1903 1912 1904 1913 osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE, 1905 1914 0, false, true); 1906 - err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); 1915 + ceph_osdc_start_request(&fsc->client->osdc, rd_req); 1907 1916 1908 1917 wr_req->r_mtime = ci->netfs.inode.i_mtime; 1909 - err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); 1918 + ceph_osdc_start_request(&fsc->client->osdc, wr_req); 1910 1919 1911 - if (!err) 1912 - err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req); 1913 - if (!err2) 1914 - err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req); 1920 + err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req); 1921 + err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req); 1915 1922 1916 1923 if (err >= 0 || err == -ENOENT) 1917 1924 have |= POOL_READ;
+19 -19
fs/ceph/caps.c
··· 602 602 * @ci: inode to be moved 603 603 * @session: new auth caps session 604 604 */ 605 - static void change_auth_cap_ses(struct ceph_inode_info *ci, 606 - struct ceph_mds_session *session) 605 + void change_auth_cap_ses(struct ceph_inode_info *ci, 606 + struct ceph_mds_session *session) 607 607 { 608 608 lockdep_assert_held(&ci->i_ceph_lock); 609 609 ··· 1978 1978 } 1979 1979 1980 1980 dout("check_caps %llx.%llx file_want %s used %s dirty %s flushing %s" 1981 - " issued %s revoking %s retain %s %s%s\n", ceph_vinop(inode), 1981 + " issued %s revoking %s retain %s %s%s%s\n", ceph_vinop(inode), 1982 1982 ceph_cap_string(file_wanted), 1983 1983 ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), 1984 1984 ceph_cap_string(ci->i_flushing_caps), 1985 1985 ceph_cap_string(issued), ceph_cap_string(revoking), 1986 1986 ceph_cap_string(retain), 1987 1987 (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", 1988 - (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : ""); 1988 + (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "", 1989 + (flags & CHECK_CAPS_NOINVAL) ? 
" NOINVAL" : ""); 1989 1990 1990 1991 /* 1991 1992 * If we no longer need to hold onto old our caps, and we may ··· 3006 3005 } 3007 3006 3008 3007 if (S_ISREG(ci->netfs.inode.i_mode) && 3009 - ci->i_inline_version != CEPH_INLINE_NONE && 3008 + ceph_has_inline_data(ci) && 3010 3009 (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && 3011 3010 i_size_read(inode) > 0) { 3012 3011 struct page *page = ··· 3579 3578 fill_inline = true; 3580 3579 } 3581 3580 3582 - if (ci->i_auth_cap == cap && 3583 - le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { 3584 - if (newcaps & ~extra_info->issued) 3585 - wake = true; 3581 + if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { 3582 + if (ci->i_auth_cap == cap) { 3583 + if (newcaps & ~extra_info->issued) 3584 + wake = true; 3586 3585 3587 - if (ci->i_requested_max_size > max_size || 3588 - !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) { 3589 - /* re-request max_size if necessary */ 3590 - ci->i_requested_max_size = 0; 3591 - wake = true; 3586 + if (ci->i_requested_max_size > max_size || 3587 + !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) { 3588 + /* re-request max_size if necessary */ 3589 + ci->i_requested_max_size = 0; 3590 + wake = true; 3591 + } 3592 + 3593 + ceph_kick_flushing_inode_caps(session, ci); 3592 3594 } 3593 - 3594 - ceph_kick_flushing_inode_caps(session, ci); 3595 - spin_unlock(&ci->i_ceph_lock); 3596 3595 up_read(&session->s_mdsc->snap_rwsem); 3597 - } else { 3598 - spin_unlock(&ci->i_ceph_lock); 3599 3596 } 3597 + spin_unlock(&ci->i_ceph_lock); 3600 3598 3601 3599 if (fill_inline) 3602 3600 ceph_fill_inline_data(inode, NULL, extra_info->inline_data,
+70 -9
fs/ceph/dir.c
··· 856 856 if (ceph_snap(dir) != CEPH_NOSNAP) 857 857 return -EROFS; 858 858 859 + err = ceph_wait_on_conflict_unlink(dentry); 860 + if (err) 861 + return err; 862 + 859 863 if (ceph_quota_is_max_files_exceeded(dir)) { 860 864 err = -EDQUOT; 861 865 goto out; ··· 922 918 if (ceph_snap(dir) != CEPH_NOSNAP) 923 919 return -EROFS; 924 920 921 + err = ceph_wait_on_conflict_unlink(dentry); 922 + if (err) 923 + return err; 924 + 925 925 if (ceph_quota_is_max_files_exceeded(dir)) { 926 926 err = -EDQUOT; 927 927 goto out; ··· 976 968 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); 977 969 struct ceph_mds_request *req; 978 970 struct ceph_acl_sec_ctx as_ctx = {}; 979 - int err = -EROFS; 971 + int err; 980 972 int op; 973 + 974 + err = ceph_wait_on_conflict_unlink(dentry); 975 + if (err) 976 + return err; 981 977 982 978 if (ceph_snap(dir) == CEPH_SNAPDIR) { 983 979 /* mkdir .snap/foo is a MKSNAP */ ··· 992 980 dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode); 993 981 op = CEPH_MDS_OP_MKDIR; 994 982 } else { 983 + err = -EROFS; 995 984 goto out; 996 985 } 997 986 ··· 1050 1037 struct ceph_mds_request *req; 1051 1038 int err; 1052 1039 1040 + err = ceph_wait_on_conflict_unlink(dentry); 1041 + if (err) 1042 + return err; 1043 + 1053 1044 if (ceph_snap(dir) != CEPH_NOSNAP) 1054 1045 return -EROFS; 1055 1046 ··· 1088 1071 static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, 1089 1072 struct ceph_mds_request *req) 1090 1073 { 1074 + struct dentry *dentry = req->r_dentry; 1075 + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 1076 + struct ceph_dentry_info *di = ceph_dentry(dentry); 1091 1077 int result = req->r_err ? 
req->r_err : 1092 1078 le32_to_cpu(req->r_reply_info.head->result); 1079 + 1080 + if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags)) 1081 + pr_warn("%s dentry %p:%pd async unlink bit is not set\n", 1082 + __func__, dentry, dentry); 1083 + 1084 + spin_lock(&fsc->async_unlink_conflict_lock); 1085 + hash_del_rcu(&di->hnode); 1086 + spin_unlock(&fsc->async_unlink_conflict_lock); 1087 + 1088 + spin_lock(&dentry->d_lock); 1089 + di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK; 1090 + wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT); 1091 + spin_unlock(&dentry->d_lock); 1092 + 1093 + synchronize_rcu(); 1093 1094 1094 1095 if (result == -EJUKEBOX) 1095 1096 goto out; ··· 1116 1081 if (result) { 1117 1082 int pathlen = 0; 1118 1083 u64 base = 0; 1119 - char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, 1084 + char *path = ceph_mdsc_build_path(dentry, &pathlen, 1120 1085 &base, 0); 1121 1086 1122 1087 /* mark error on parent + clear complete */ ··· 1124 1089 ceph_dir_clear_complete(req->r_parent); 1125 1090 1126 1091 /* drop the dentry -- we don't know its status */ 1127 - if (!d_unhashed(req->r_dentry)) 1128 - d_drop(req->r_dentry); 1092 + if (!d_unhashed(dentry)) 1093 + d_drop(dentry); 1129 1094 1130 1095 /* mark inode itself for an error (since metadata is bogus) */ 1131 1096 mapping_set_error(req->r_old_inode->i_mapping, result); 1132 1097 1133 - pr_warn("ceph: async unlink failure path=(%llx)%s result=%d!\n", 1098 + pr_warn("async unlink failure path=(%llx)%s result=%d!\n", 1134 1099 base, IS_ERR(path) ? 
"<<bad>>" : path, result); 1135 1100 ceph_mdsc_free_path(path, pathlen); 1136 1101 } ··· 1215 1180 1216 1181 if (try_async && op == CEPH_MDS_OP_UNLINK && 1217 1182 (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) { 1183 + struct ceph_dentry_info *di = ceph_dentry(dentry); 1184 + 1218 1185 dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir), 1219 1186 dentry->d_name.len, dentry->d_name.name, 1220 1187 ceph_cap_string(req->r_dir_caps)); ··· 1224 1187 req->r_callback = ceph_async_unlink_cb; 1225 1188 req->r_old_inode = d_inode(dentry); 1226 1189 ihold(req->r_old_inode); 1190 + 1191 + spin_lock(&dentry->d_lock); 1192 + di->flags |= CEPH_DENTRY_ASYNC_UNLINK; 1193 + spin_unlock(&dentry->d_lock); 1194 + 1195 + spin_lock(&fsc->async_unlink_conflict_lock); 1196 + hash_add_rcu(fsc->async_unlink_conflict, &di->hnode, 1197 + dentry->d_name.hash); 1198 + spin_unlock(&fsc->async_unlink_conflict_lock); 1199 + 1227 1200 err = ceph_mdsc_submit_request(mdsc, dir, req); 1228 1201 if (!err) { 1229 1202 /* ··· 1242 1195 */ 1243 1196 drop_nlink(inode); 1244 1197 d_delete(dentry); 1245 - } else if (err == -EJUKEBOX) { 1246 - try_async = false; 1247 - ceph_mdsc_put_request(req); 1248 - goto retry; 1198 + } else { 1199 + spin_lock(&fsc->async_unlink_conflict_lock); 1200 + hash_del_rcu(&di->hnode); 1201 + spin_unlock(&fsc->async_unlink_conflict_lock); 1202 + 1203 + spin_lock(&dentry->d_lock); 1204 + di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK; 1205 + spin_unlock(&dentry->d_lock); 1206 + 1207 + if (err == -EJUKEBOX) { 1208 + try_async = false; 1209 + ceph_mdsc_put_request(req); 1210 + goto retry; 1211 + } 1249 1212 } 1250 1213 } else { 1251 1214 set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); ··· 1293 1236 if ((old_dir != new_dir) && 1294 1237 (!ceph_quota_is_same_realm(old_dir, new_dir))) 1295 1238 return -EXDEV; 1239 + 1240 + err = ceph_wait_on_conflict_unlink(new_dentry); 1241 + if (err) 1242 + return err; 1296 1243 1297 1244 dout("rename dir %p dentry %p to dir %p dentry 
%p\n", 1298 1245 old_dir, old_dentry, new_dir, new_dentry);
+48 -75
fs/ceph/file.c
··· 240 240 INIT_LIST_HEAD(&fi->rw_contexts); 241 241 fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen); 242 242 243 - if ((file->f_mode & FMODE_WRITE) && 244 - ci->i_inline_version != CEPH_INLINE_NONE) { 243 + if ((file->f_mode & FMODE_WRITE) && ceph_has_inline_data(ci)) { 245 244 ret = ceph_uninline_data(file); 246 245 if (ret < 0) 247 246 goto error; ··· 567 568 char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, 568 569 &base, 0); 569 570 570 - pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n", 571 + pr_warn("async create failure path=(%llx)%s result=%d!\n", 571 572 base, IS_ERR(path) ? "<<bad>>" : path, result); 572 573 ceph_mdsc_free_path(path, pathlen); 573 574 ··· 610 611 struct ceph_mds_reply_inode in = { }; 611 612 struct ceph_mds_reply_info_in iinfo = { .in = &in }; 612 613 struct ceph_inode_info *ci = ceph_inode(dir); 614 + struct ceph_dentry_info *di = ceph_dentry(dentry); 613 615 struct inode *inode; 614 616 struct timespec64 now; 615 617 struct ceph_string *pool_ns; ··· 709 709 file->f_mode |= FMODE_CREATED; 710 710 ret = finish_open(file, dentry, ceph_open); 711 711 } 712 + 713 + spin_lock(&dentry->d_lock); 714 + di->flags &= ~CEPH_DENTRY_ASYNC_CREATE; 715 + wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT); 716 + spin_unlock(&dentry->d_lock); 717 + 712 718 return ret; 713 719 } 714 720 ··· 740 734 741 735 if (dentry->d_name.len > NAME_MAX) 742 736 return -ENAMETOOLONG; 737 + 738 + err = ceph_wait_on_conflict_unlink(dentry); 739 + if (err) 740 + return err; 741 + /* 742 + * Do not truncate the file, since atomic_open is called before the 743 + * permission check. The caller will do the truncation afterward. 
744 + */ 745 + flags &= ~O_TRUNC; 743 746 744 747 if (flags & O_CREAT) { 745 748 if (ceph_quota_is_max_files_exceeded(dir)) ··· 796 781 (req->r_dir_caps = 797 782 try_prep_async_create(dir, dentry, &lo, 798 783 &req->r_deleg_ino))) { 784 + struct ceph_dentry_info *di = ceph_dentry(dentry); 785 + 799 786 set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); 800 787 req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL); 801 788 req->r_callback = ceph_async_create_cb; 789 + 790 + spin_lock(&dentry->d_lock); 791 + di->flags |= CEPH_DENTRY_ASYNC_CREATE; 792 + spin_unlock(&dentry->d_lock); 793 + 802 794 err = ceph_mdsc_submit_request(mdsc, dir, req); 803 795 if (!err) { 804 796 err = ceph_finish_async_create(dir, dentry, ··· 824 802 } 825 803 826 804 set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); 827 - err = ceph_mdsc_do_request(mdsc, 828 - (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, 829 - req); 805 + err = ceph_mdsc_do_request(mdsc, (flags & O_CREAT) ? dir : NULL, req); 830 806 if (err == -ENOENT) { 831 807 dentry = ceph_handle_snapdir(req, dentry); 832 808 if (IS_ERR(dentry)) { ··· 980 960 981 961 osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off, 982 962 false, false); 983 - ret = ceph_osdc_start_request(osdc, req, false); 984 - if (!ret) 985 - ret = ceph_osdc_wait_request(osdc, req); 963 + ceph_osdc_start_request(osdc, req); 964 + ret = ceph_osdc_wait_request(osdc, req); 986 965 987 966 ceph_update_read_metrics(&fsc->mdsc->metric, 988 967 req->r_start_latency, ··· 1244 1225 req->r_inode = inode; 1245 1226 req->r_priv = aio_req; 1246 1227 1247 - ret = ceph_osdc_start_request(req->r_osdc, req, false); 1228 + ceph_osdc_start_request(req->r_osdc, req); 1248 1229 out: 1249 1230 if (ret < 0) { 1250 1231 req->r_result = ret; ··· 1381 1362 continue; 1382 1363 } 1383 1364 1384 - ret = ceph_osdc_start_request(req->r_osdc, req, false); 1385 - if (!ret) 1386 - ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 1365 + ceph_osdc_start_request(req->r_osdc, req); 1366 + 
ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 1387 1367 1388 1368 if (write) 1389 1369 ceph_update_write_metrics(metric, req->r_start_latency, ··· 1445 1427 r_private_item); 1446 1428 list_del_init(&req->r_private_item); 1447 1429 if (ret >= 0) 1448 - ret = ceph_osdc_start_request(req->r_osdc, 1449 - req, false); 1430 + ceph_osdc_start_request(req->r_osdc, req); 1450 1431 if (ret < 0) { 1451 1432 req->r_result = ret; 1452 1433 ceph_aio_complete_req(req); ··· 1558 1541 false, true); 1559 1542 1560 1543 req->r_mtime = mtime; 1561 - ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 1562 - if (!ret) 1563 - ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 1544 + ceph_osdc_start_request(&fsc->client->osdc, req); 1545 + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 1564 1546 1565 1547 ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, 1566 1548 req->r_end_latency, len, ret); ··· 1643 1627 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, 1644 1628 ceph_cap_string(got)); 1645 1629 1646 - if (ci->i_inline_version == CEPH_INLINE_NONE) { 1630 + if (!ceph_has_inline_data(ci)) { 1647 1631 if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { 1648 1632 ret = ceph_direct_read_write(iocb, to, 1649 1633 NULL, NULL); ··· 1906 1890 if (dirty) 1907 1891 __mark_inode_dirty(inode, dirty); 1908 1892 if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos)) 1909 - ceph_check_caps(ci, 0, NULL); 1893 + ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL); 1910 1894 } 1911 1895 1912 1896 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", ··· 1946 1930 */ 1947 1931 static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) 1948 1932 { 1949 - struct inode *inode = file->f_mapping->host; 1950 - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 1951 - loff_t i_size; 1952 - loff_t ret; 1953 - 1954 - inode_lock(inode); 1955 - 1956 1933 if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { 1934 + 
struct inode *inode = file_inode(file); 1935 + int ret; 1936 + 1957 1937 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); 1958 1938 if (ret < 0) 1959 - goto out; 1939 + return ret; 1960 1940 } 1961 - 1962 - i_size = i_size_read(inode); 1963 - switch (whence) { 1964 - case SEEK_END: 1965 - offset += i_size; 1966 - break; 1967 - case SEEK_CUR: 1968 - /* 1969 - * Here we special-case the lseek(fd, 0, SEEK_CUR) 1970 - * position-querying operation. Avoid rewriting the "same" 1971 - * f_pos value back to the file because a concurrent read(), 1972 - * write() or lseek() might have altered it 1973 - */ 1974 - if (offset == 0) { 1975 - ret = file->f_pos; 1976 - goto out; 1977 - } 1978 - offset += file->f_pos; 1979 - break; 1980 - case SEEK_DATA: 1981 - if (offset < 0 || offset >= i_size) { 1982 - ret = -ENXIO; 1983 - goto out; 1984 - } 1985 - break; 1986 - case SEEK_HOLE: 1987 - if (offset < 0 || offset >= i_size) { 1988 - ret = -ENXIO; 1989 - goto out; 1990 - } 1991 - offset = i_size; 1992 - break; 1993 - } 1994 - 1995 - ret = vfs_setpos(file, offset, max(i_size, fsc->max_file_size)); 1996 - 1997 - out: 1998 - inode_unlock(inode); 1999 - return ret; 1941 + return generic_file_llseek(file, offset, whence); 2000 1942 } 2001 1943 2002 1944 static inline void ceph_zero_partial_page( ··· 2023 2049 } 2024 2050 2025 2051 req->r_mtime = inode->i_mtime; 2026 - ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 2027 - if (!ret) { 2028 - ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 2029 - if (ret == -ENOENT) 2030 - ret = 0; 2031 - } 2052 + ceph_osdc_start_request(&fsc->client->osdc, req); 2053 + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 2054 + if (ret == -ENOENT) 2055 + ret = 0; 2032 2056 ceph_osdc_put_request(req); 2033 2057 2034 2058 out: ··· 2328 2356 if (IS_ERR(req)) 2329 2357 ret = PTR_ERR(req); 2330 2358 else { 2331 - ceph_osdc_start_request(osdc, req, false); 2359 + ceph_osdc_start_request(osdc, req); 2332 2360 ret = 
ceph_osdc_wait_request(osdc, req); 2333 2361 ceph_update_copyfrom_metrics(&fsc->mdsc->metric, 2334 2362 req->r_start_latency, ··· 2521 2549 /* Let the MDS know about dst file size change */ 2522 2550 if (ceph_inode_set_size(dst_inode, dst_off) || 2523 2551 ceph_quota_is_max_bytes_approaching(dst_inode, dst_off)) 2524 - ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY, NULL); 2552 + ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH, 2553 + NULL); 2525 2554 } 2526 2555 /* Mark Fw dirty */ 2527 2556 spin_lock(&dst_ci->i_ceph_lock);
+10 -3
fs/ceph/inode.c
··· 1049 1049 iinfo->inline_version >= ci->i_inline_version) { 1050 1050 int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; 1051 1051 ci->i_inline_version = iinfo->inline_version; 1052 - if (ci->i_inline_version != CEPH_INLINE_NONE && 1052 + if (ceph_has_inline_data(ci) && 1053 1053 (locked_page || (info_caps & cache_caps))) 1054 1054 fill_inline = true; 1055 1055 } ··· 2275 2275 * 2276 2276 * This cost much when doing the Locker state transition and 2277 2277 * usually will need to revoke caps from clients. 2278 + * 2279 + * And for the 'Xs' caps for getxattr we will also choose the 2280 + * auth MDS, because the MDS side code is buggy due to setxattr 2281 + * won't notify the replica MDSes when the values changed and 2282 + * the replica MDS will return the old values. Though we will 2283 + * fix it in MDS code, but this still makes sense for old ceph. 2278 2284 */ 2279 2285 if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL)) 2280 - || (mask & CEPH_STAT_RSTAT)) 2286 + || (mask & (CEPH_STAT_RSTAT | CEPH_STAT_CAP_XATTR))) 2281 2287 return USE_AUTH_MDS; 2282 2288 else 2283 2289 return USE_ANY_MDS; ··· 2327 2321 if (inline_version == 0) { 2328 2322 /* the reply is supposed to contain inline data */ 2329 2323 err = -EINVAL; 2330 - } else if (inline_version == CEPH_INLINE_NONE) { 2324 + } else if (inline_version == CEPH_INLINE_NONE || 2325 + inline_version == 1) { 2331 2326 err = -ENODATA; 2332 2327 } else { 2333 2328 err = req->r_reply_info.targeti.inline_len;
+157 -8
fs/ceph/mds_client.c
··· 456 456 dout("added delegated inode 0x%llx\n", 457 457 start - 1); 458 458 } else if (err == -EBUSY) { 459 - pr_warn("ceph: MDS delegated inode 0x%llx more than once.\n", 459 + pr_warn("MDS delegated inode 0x%llx more than once.\n", 460 460 start - 1); 461 461 } else { 462 462 return err; ··· 653 653 if (!info->dir_entries) 654 654 return; 655 655 free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size)); 656 + } 657 + 658 + /* 659 + * In async unlink case the kclient won't wait for the first reply 660 + * from MDS and just drop all the links and unhash the dentry and then 661 + * succeeds immediately. 662 + * 663 + * For any new create/link/rename,etc requests followed by using the 664 + * same file names we must wait for the first reply of the inflight 665 + * unlink request, or the MDS possibly will fail these following 666 + * requests with -EEXIST if the inflight async unlink request was 667 + * delayed for some reasons. 668 + * 669 + * And the worst case is that for the none async openc request it will 670 + * successfully open the file if the CDentry hasn't been unlinked yet, 671 + * but later the previous delayed async unlink request will remove the 672 + * CDenty. That means the just created file is possiblly deleted later 673 + * by accident. 674 + * 675 + * We need to wait for the inflight async unlink requests to finish 676 + * when creating new files/directories by using the same file names. 
677 + */ 678 + int ceph_wait_on_conflict_unlink(struct dentry *dentry) 679 + { 680 + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 681 + struct dentry *pdentry = dentry->d_parent; 682 + struct dentry *udentry, *found = NULL; 683 + struct ceph_dentry_info *di; 684 + struct qstr dname; 685 + u32 hash = dentry->d_name.hash; 686 + int err; 687 + 688 + dname.name = dentry->d_name.name; 689 + dname.len = dentry->d_name.len; 690 + 691 + rcu_read_lock(); 692 + hash_for_each_possible_rcu(fsc->async_unlink_conflict, di, 693 + hnode, hash) { 694 + udentry = di->dentry; 695 + 696 + spin_lock(&udentry->d_lock); 697 + if (udentry->d_name.hash != hash) 698 + goto next; 699 + if (unlikely(udentry->d_parent != pdentry)) 700 + goto next; 701 + if (!hash_hashed(&di->hnode)) 702 + goto next; 703 + 704 + if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags)) 705 + pr_warn("%s dentry %p:%pd async unlink bit is not set\n", 706 + __func__, dentry, dentry); 707 + 708 + if (!d_same_name(udentry, pdentry, &dname)) 709 + goto next; 710 + 711 + spin_unlock(&udentry->d_lock); 712 + found = dget(udentry); 713 + break; 714 + next: 715 + spin_unlock(&udentry->d_lock); 716 + } 717 + rcu_read_unlock(); 718 + 719 + if (likely(!found)) 720 + return 0; 721 + 722 + dout("%s dentry %p:%pd conflict with old %p:%pd\n", __func__, 723 + dentry, dentry, found, found); 724 + 725 + err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT, 726 + TASK_KILLABLE); 727 + dput(found); 728 + return err; 656 729 } 657 730 658 731 ··· 1293 1220 if (count > 0) { 1294 1221 size_t i; 1295 1222 size_t size = FEATURE_BYTES(count); 1223 + unsigned long bit; 1296 1224 1297 1225 if (WARN_ON_ONCE(*p + 4 + size > end)) 1298 1226 return -ERANGE; 1299 1227 1300 1228 ceph_encode_32(p, size); 1301 1229 memset(*p, 0, size); 1302 - for (i = 0; i < count; i++) 1303 - ((unsigned char*)(*p))[i / 8] |= BIT(feature_bits[i] % 8); 1230 + for (i = 0; i < count; i++) { 1231 + bit = feature_bits[i]; 1232 + ((unsigned char 
*)(*p))[bit / 8] |= BIT(bit % 8); 1233 + } 1304 1234 *p += size; 1305 1235 } else { 1306 1236 if (WARN_ON_ONCE(*p + 4 > end)) ··· 2960 2884 if (req->r_request_started == 0) /* note request start time */ 2961 2885 req->r_request_started = jiffies; 2962 2886 2887 + /* 2888 + * For async create we will choose the auth MDS of frag in parent 2889 + * directory to send the request and ususally this works fine, but 2890 + * if the migrated the dirtory to another MDS before it could handle 2891 + * it the request will be forwarded. 2892 + * 2893 + * And then the auth cap will be changed. 2894 + */ 2895 + if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) && req->r_num_fwd) { 2896 + struct ceph_dentry_info *di = ceph_dentry(req->r_dentry); 2897 + struct ceph_inode_info *ci; 2898 + struct ceph_cap *cap; 2899 + 2900 + /* 2901 + * The request maybe handled very fast and the new inode 2902 + * hasn't been linked to the dentry yet. We need to wait 2903 + * for the ceph_finish_async_create(), which shouldn't be 2904 + * stuck too long or fail in thoery, to finish when forwarding 2905 + * the request. 
2906 + */ 2907 + if (!d_inode(req->r_dentry)) { 2908 + err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT, 2909 + TASK_KILLABLE); 2910 + if (err) { 2911 + mutex_lock(&req->r_fill_mutex); 2912 + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); 2913 + mutex_unlock(&req->r_fill_mutex); 2914 + goto out_session; 2915 + } 2916 + } 2917 + 2918 + ci = ceph_inode(d_inode(req->r_dentry)); 2919 + 2920 + spin_lock(&ci->i_ceph_lock); 2921 + cap = ci->i_auth_cap; 2922 + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE && mds != cap->mds) { 2923 + dout("do_request session changed for auth cap %d -> %d\n", 2924 + cap->session->s_mds, session->s_mds); 2925 + 2926 + /* Remove the auth cap from old session */ 2927 + spin_lock(&cap->session->s_cap_lock); 2928 + cap->session->s_nr_caps--; 2929 + list_del_init(&cap->session_caps); 2930 + spin_unlock(&cap->session->s_cap_lock); 2931 + 2932 + /* Add the auth cap to the new session */ 2933 + cap->mds = mds; 2934 + cap->session = session; 2935 + spin_lock(&session->s_cap_lock); 2936 + session->s_nr_caps++; 2937 + list_add_tail(&cap->session_caps, &session->s_caps); 2938 + spin_unlock(&session->s_cap_lock); 2939 + 2940 + change_auth_cap_ses(ci, session); 2941 + } 2942 + spin_unlock(&ci->i_ceph_lock); 2943 + } 2944 + 2963 2945 err = __send_request(session, req, false); 2964 2946 2965 2947 out_session: ··· 3598 3464 case CEPH_SESSION_OPEN: 3599 3465 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) 3600 3466 pr_info("mds%d reconnect success\n", session->s_mds); 3601 - session->s_state = CEPH_MDS_SESSION_OPEN; 3602 - session->s_features = features; 3603 - renewed_caps(mdsc, session, 0); 3604 - if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &session->s_features)) 3605 - metric_schedule_delayed(&mdsc->metric); 3467 + 3468 + if (session->s_state == CEPH_MDS_SESSION_OPEN) { 3469 + pr_notice("mds%d is already opened\n", session->s_mds); 3470 + } else { 3471 + session->s_state = CEPH_MDS_SESSION_OPEN; 3472 + session->s_features = features; 3473 + 
renewed_caps(mdsc, session, 0); 3474 + if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, 3475 + &session->s_features)) 3476 + metric_schedule_delayed(&mdsc->metric); 3477 + } 3478 + 3479 + /* 3480 + * The connection maybe broken and the session in client 3481 + * side has been reinitialized, need to update the seq 3482 + * anyway. 3483 + */ 3484 + if (!session->s_seq && seq) 3485 + session->s_seq = seq; 3486 + 3606 3487 wake = 1; 3607 3488 if (mdsc->stopping) 3608 3489 __close_session(mdsc, session);
+5 -8
fs/ceph/mds_client.h
··· 29 29 CEPHFS_FEATURE_MULTI_RECONNECT, 30 30 CEPHFS_FEATURE_DELEG_INO, 31 31 CEPHFS_FEATURE_METRIC_COLLECT, 32 + CEPHFS_FEATURE_ALTERNATE_NAME, 33 + CEPHFS_FEATURE_NOTIFY_SESSION_STATE, 32 34 33 - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT, 35 + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_NOTIFY_SESSION_STATE, 34 36 }; 35 37 36 - /* 37 - * This will always have the highest feature bit value 38 - * as the last element of the array. 39 - */ 40 38 #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ 41 39 0, 1, 2, 3, 4, 5, 6, 7, \ 42 40 CEPHFS_FEATURE_MIMIC, \ ··· 43 45 CEPHFS_FEATURE_MULTI_RECONNECT, \ 44 46 CEPHFS_FEATURE_DELEG_INO, \ 45 47 CEPHFS_FEATURE_METRIC_COLLECT, \ 46 - \ 47 - CEPHFS_FEATURE_MAX, \ 48 + CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \ 48 49 } 49 - #define CEPHFS_FEATURES_CLIENT_REQUIRED {} 50 50 51 51 /* 52 52 * Some lock dependencies: ··· 578 582 TASK_KILLABLE); 579 583 } 580 584 585 + extern int ceph_wait_on_conflict_unlink(struct dentry *dentry); 581 586 extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session); 582 587 extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino); 583 588 #endif
+18 -4
fs/ceph/mdsmap.c
··· 352 352 __decode_and_drop_type(p, end, u8, bad_ext); 353 353 } 354 354 if (mdsmap_ev >= 8) { 355 - u32 name_len; 356 355 /* enabled */ 357 356 ceph_decode_8_safe(p, end, m->m_enabled, bad_ext); 358 - ceph_decode_32_safe(p, end, name_len, bad_ext); 359 - ceph_decode_need(p, end, name_len, bad_ext); 360 - *p += name_len; 357 + /* fs_name */ 358 + ceph_decode_skip_string(p, end, bad_ext); 361 359 } 362 360 /* damaged */ 363 361 if (mdsmap_ev >= 9) { ··· 367 369 m->m_damaged = n > 0; 368 370 } else { 369 371 m->m_damaged = false; 372 + } 373 + if (mdsmap_ev >= 17) { 374 + /* balancer */ 375 + ceph_decode_skip_string(p, end, bad_ext); 376 + /* standby_count_wanted */ 377 + ceph_decode_skip_32(p, end, bad_ext); 378 + /* old_max_mds */ 379 + ceph_decode_skip_32(p, end, bad_ext); 380 + /* min_compat_client */ 381 + ceph_decode_skip_8(p, end, bad_ext); 382 + /* required_client_features */ 383 + ceph_decode_skip_set(p, end, 64, bad_ext); 384 + ceph_decode_64_safe(p, end, m->m_max_xattr_size, bad_ext); 385 + } else { 386 + /* This forces the usage of the (sync) SETXATTR Op */ 387 + m->m_max_xattr_size = 0; 370 388 } 371 389 bad_ext: 372 390 dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
+12 -7
fs/ceph/super.c
··· 72 72 buf->f_type = CEPH_SUPER_MAGIC; /* ?? */ 73 73 74 74 /* 75 - * express utilization in terms of large blocks to avoid 75 + * Express utilization in terms of large blocks to avoid 76 76 * overflow on 32-bit machines. 77 - * 78 - * NOTE: for the time being, we make bsize == frsize to humor 79 - * not-yet-ancient versions of glibc that are broken. 80 - * Someday, we will probably want to report a real block 81 - * size... whatever that may mean for a network file system! 82 77 */ 83 - buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; 84 78 buf->f_frsize = 1 << CEPH_BLOCK_SHIFT; 85 79 86 80 /* ··· 88 94 buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); 89 95 buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); 90 96 } 97 + 98 + /* 99 + * NOTE: for the time being, we make bsize == frsize to humor 100 + * not-yet-ancient versions of glibc that are broken. 101 + * Someday, we will probably want to report a real block 102 + * size... whatever that may mean for a network file system! 103 + */ 104 + buf->f_bsize = buf->f_frsize; 91 105 92 106 buf->f_files = le64_to_cpu(st.num_objects); 93 107 buf->f_ffree = -1; ··· 817 815 fsc->cap_wq = alloc_workqueue("ceph-cap", 0, 1); 818 816 if (!fsc->cap_wq) 819 817 goto fail_inode_wq; 818 + 819 + hash_init(fsc->async_unlink_conflict); 820 + spin_lock_init(&fsc->async_unlink_conflict_lock); 820 821 821 822 spin_lock(&ceph_fsc_lock); 822 823 list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list);
+26 -5
fs/ceph/super.h
··· 19 19 #include <linux/security.h> 20 20 #include <linux/netfs.h> 21 21 #include <linux/fscache.h> 22 + #include <linux/hashtable.h> 22 23 23 24 #include <linux/ceph/libceph.h> 24 25 ··· 100 99 char *mon_addr; 101 100 }; 102 101 102 + #define CEPH_ASYNC_CREATE_CONFLICT_BITS 8 103 + 103 104 struct ceph_fs_client { 104 105 struct super_block *sb; 105 106 ··· 126 123 127 124 struct workqueue_struct *inode_wq; 128 125 struct workqueue_struct *cap_wq; 126 + 127 + DECLARE_HASHTABLE(async_unlink_conflict, CEPH_ASYNC_CREATE_CONFLICT_BITS); 128 + spinlock_t async_unlink_conflict_lock; 129 129 130 130 #ifdef CONFIG_DEBUG_FS 131 131 struct dentry *debugfs_dentry_lru, *debugfs_caps; ··· 286 280 struct dentry *dentry; 287 281 struct ceph_mds_session *lease_session; 288 282 struct list_head lease_list; 289 - unsigned flags; 283 + struct hlist_node hnode; 284 + unsigned long flags; 290 285 int lease_shared_gen; 291 286 u32 lease_gen; 292 287 u32 lease_seq; ··· 296 289 u64 offset; 297 290 }; 298 291 299 - #define CEPH_DENTRY_REFERENCED 1 300 - #define CEPH_DENTRY_LEASE_LIST 2 301 - #define CEPH_DENTRY_SHRINK_LIST 4 302 - #define CEPH_DENTRY_PRIMARY_LINK 8 292 + #define CEPH_DENTRY_REFERENCED (1 << 0) 293 + #define CEPH_DENTRY_LEASE_LIST (1 << 1) 294 + #define CEPH_DENTRY_SHRINK_LIST (1 << 2) 295 + #define CEPH_DENTRY_PRIMARY_LINK (1 << 3) 296 + #define CEPH_DENTRY_ASYNC_UNLINK_BIT (4) 297 + #define CEPH_DENTRY_ASYNC_UNLINK (1 << CEPH_DENTRY_ASYNC_UNLINK_BIT) 298 + #define CEPH_DENTRY_ASYNC_CREATE_BIT (5) 299 + #define CEPH_DENTRY_ASYNC_CREATE (1 << CEPH_DENTRY_ASYNC_CREATE_BIT) 303 300 304 301 struct ceph_inode_xattrs_info { 305 302 /* ··· 769 758 extern void ceph_reservation_status(struct ceph_fs_client *client, 770 759 int *total, int *avail, int *used, 771 760 int *reserved, int *min); 761 + extern void change_auth_cap_ses(struct ceph_inode_info *ci, 762 + struct ceph_mds_session *session); 772 763 773 764 774 765 ··· 1230 1217 extern int ceph_pool_perm_check(struct inode 
*inode, int need); 1231 1218 extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); 1232 1219 int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate); 1220 + 1221 + static inline bool ceph_has_inline_data(struct ceph_inode_info *ci) 1222 + { 1223 + if (ci->i_inline_version == CEPH_INLINE_NONE || 1224 + ci->i_inline_version == 1) /* initial version, no data */ 1225 + return false; 1226 + return true; 1227 + } 1233 1228 1234 1229 /* file.c */ 1235 1230 extern const struct file_operations ceph_file_fops;
+8 -4
fs/ceph/xattr.c
··· 1086 1086 flags |= CEPH_XATTR_REMOVE; 1087 1087 } 1088 1088 1089 - dout("setxattr value=%.*s\n", (int)size, value); 1089 + dout("setxattr value size: %zu\n", size); 1090 1090 1091 1091 /* do request */ 1092 1092 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); ··· 1184 1184 spin_lock(&ci->i_ceph_lock); 1185 1185 retry: 1186 1186 issued = __ceph_caps_issued(ci, NULL); 1187 - if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) 1187 + required_blob_size = __get_required_blob_size(ci, name_len, val_len); 1188 + if ((ci->i_xattrs.version == 0) || !(issued & CEPH_CAP_XATTR_EXCL) || 1189 + (required_blob_size > mdsc->mdsmap->m_max_xattr_size)) { 1190 + dout("%s do sync setxattr: version: %llu size: %d max: %llu\n", 1191 + __func__, ci->i_xattrs.version, required_blob_size, 1192 + mdsc->mdsmap->m_max_xattr_size); 1188 1193 goto do_sync; 1194 + } 1189 1195 1190 1196 if (!lock_snap_rwsem && !ci->i_head_snapc) { 1191 1197 lock_snap_rwsem = true; ··· 1206 1200 dout("setxattr %p name '%s' issued %s\n", inode, name, 1207 1201 ceph_cap_string(issued)); 1208 1202 __build_xattrs(inode); 1209 - 1210 - required_blob_size = __get_required_blob_size(ci, name_len, val_len); 1211 1203 1212 1204 if (!ci->i_xattrs.prealloc_blob || 1213 1205 required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
+30 -6
fs/crypto/fname.c
··· 86 86 /** 87 87 * fscrypt_fname_encrypt() - encrypt a filename 88 88 * @inode: inode of the parent directory (for regular filenames) 89 - * or of the symlink (for symlink targets) 89 + * or of the symlink (for symlink targets). Key must already be 90 + * set up. 90 91 * @iname: the filename to encrypt 91 92 * @out: (output) the encrypted filename 92 93 * @olen: size of the encrypted filename. It must be at least @iname->len. ··· 138 137 139 138 return 0; 140 139 } 140 + EXPORT_SYMBOL_GPL(fscrypt_fname_encrypt); 141 141 142 142 /** 143 143 * fname_decrypt() - decrypt a filename ··· 266 264 return bp - dst; 267 265 } 268 266 269 - bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, 270 - u32 orig_len, u32 max_len, 271 - u32 *encrypted_len_ret) 267 + bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, 268 + u32 orig_len, u32 max_len, 269 + u32 *encrypted_len_ret) 272 270 { 273 271 int padding = 4 << (fscrypt_policy_flags(policy) & 274 272 FSCRYPT_POLICY_FLAGS_PAD_MASK); ··· 281 279 *encrypted_len_ret = min(encrypted_len, max_len); 282 280 return true; 283 281 } 282 + 283 + /** 284 + * fscrypt_fname_encrypted_size() - calculate length of encrypted filename 285 + * @inode: parent inode of dentry name being encrypted. Key must 286 + * already be set up. 287 + * @orig_len: length of the original filename 288 + * @max_len: maximum length to return 289 + * @encrypted_len_ret: where calculated length should be returned (on success) 290 + * 291 + * Filenames that are shorter than the maximum length may have their lengths 292 + * increased slightly by encryption, due to padding that is applied. 293 + * 294 + * Return: false if the orig_len is greater than max_len. Otherwise, true and 295 + * fill out encrypted_len_ret with the length (up to max_len). 
296 + */ 297 + bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, 298 + u32 max_len, u32 *encrypted_len_ret) 299 + { 300 + return __fscrypt_fname_encrypted_size(&inode->i_crypt_info->ci_policy, 301 + orig_len, max_len, 302 + encrypted_len_ret); 303 + } 304 + EXPORT_SYMBOL_GPL(fscrypt_fname_encrypted_size); 284 305 285 306 /** 286 307 * fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames ··· 460 435 return ret; 461 436 462 437 if (fscrypt_has_encryption_key(dir)) { 463 - if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy, 464 - iname->len, NAME_MAX, 438 + if (!fscrypt_fname_encrypted_size(dir, iname->len, NAME_MAX, 465 439 &fname->crypto_buf.len)) 466 440 return -ENAMETOOLONG; 467 441 fname->crypto_buf.name = kmalloc(fname->crypto_buf.len,
+3 -6
fs/crypto/fscrypt_private.h
··· 297 297 const struct fscrypt_info *ci); 298 298 299 299 /* fname.c */ 300 - int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, 301 - u8 *out, unsigned int olen); 302 - bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, 303 - u32 orig_len, u32 max_len, 304 - u32 *encrypted_len_ret); 300 + bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, 301 + u32 orig_len, u32 max_len, 302 + u32 *encrypted_len_ret); 305 303 306 304 /* hkdf.c */ 307 - 308 305 struct fscrypt_hkdf { 309 306 struct crypto_shash *hmac_tfm; 310 307 };
+3 -3
fs/crypto/hooks.c
··· 228 228 * counting it (even though it is meaningless for ciphertext) is simpler 229 229 * for now since filesystems will assume it is there and subtract it. 230 230 */ 231 - if (!fscrypt_fname_encrypted_size(policy, len, 232 - max_len - sizeof(struct fscrypt_symlink_data), 233 - &disk_link->len)) 231 + if (!__fscrypt_fname_encrypted_size(policy, len, 232 + max_len - sizeof(struct fscrypt_symlink_data), 233 + &disk_link->len)) 234 234 return -ENAMETOOLONG; 235 235 disk_link->len += sizeof(struct fscrypt_symlink_data); 236 236
+29 -6
fs/crypto/policy.c
··· 694 694 } 695 695 696 696 /** 697 + * fscrypt_context_for_new_inode() - create an encryption context for a new inode 698 + * @ctx: where context should be written 699 + * @inode: inode from which to fetch policy and nonce 700 + * 701 + * Given an in-core "prepared" (via fscrypt_prepare_new_inode) inode, 702 + * generate a new context and write it to ctx. ctx _must_ be at least 703 + * FSCRYPT_SET_CONTEXT_MAX_SIZE bytes. 704 + * 705 + * Return: size of the resulting context or a negative error code. 706 + */ 707 + int fscrypt_context_for_new_inode(void *ctx, struct inode *inode) 708 + { 709 + struct fscrypt_info *ci = inode->i_crypt_info; 710 + 711 + BUILD_BUG_ON(sizeof(union fscrypt_context) != 712 + FSCRYPT_SET_CONTEXT_MAX_SIZE); 713 + 714 + /* fscrypt_prepare_new_inode() should have set up the key already. */ 715 + if (WARN_ON_ONCE(!ci)) 716 + return -ENOKEY; 717 + 718 + return fscrypt_new_context(ctx, &ci->ci_policy, ci->ci_nonce); 719 + } 720 + EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode); 721 + 722 + /** 697 723 * fscrypt_set_context() - Set the fscrypt context of a new inode 698 724 * @inode: a new inode 699 725 * @fs_data: private data given by FS and passed to ->set_context() ··· 735 709 union fscrypt_context ctx; 736 710 int ctxsize; 737 711 738 - /* fscrypt_prepare_new_inode() should have set up the key already. */ 739 - if (WARN_ON_ONCE(!ci)) 740 - return -ENOKEY; 741 - 742 - BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE); 743 - ctxsize = fscrypt_new_context(&ctx, &ci->ci_policy, ci->ci_nonce); 712 + ctxsize = fscrypt_context_for_new_inode(&ctx, inode); 713 + if (ctxsize < 0) 714 + return ctxsize; 744 715 745 716 /* 746 717 * This may be the first time the inode number is available, so do any
+11 -4
fs/dcache.c
··· 2248 2248 } 2249 2249 EXPORT_SYMBOL(d_add_ci); 2250 2250 2251 - 2252 - static inline bool d_same_name(const struct dentry *dentry, 2253 - const struct dentry *parent, 2254 - const struct qstr *name) 2251 + /** 2252 + * d_same_name - compare dentry name with case-exact name 2253 + * @parent: parent dentry 2254 + * @dentry: the negative dentry that was passed to the parent's lookup func 2255 + * @name: the case-exact name to be associated with the returned dentry 2256 + * 2257 + * Return: true if names are same, or false 2258 + */ 2259 + bool d_same_name(const struct dentry *dentry, const struct dentry *parent, 2260 + const struct qstr *name) 2255 2261 { 2256 2262 if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) { 2257 2263 if (dentry->d_name.len != name->len) ··· 2268 2262 dentry->d_name.len, dentry->d_name.name, 2269 2263 name) == 0; 2270 2264 } 2265 + EXPORT_SYMBOL_GPL(d_same_name); 2271 2266 2272 2267 /** 2273 2268 * __d_lookup_rcu - search for a dentry (racy, store-free)
+7 -3
fs/inode.c
··· 422 422 INIT_LIST_HEAD(&inode->i_io_list); 423 423 INIT_LIST_HEAD(&inode->i_wb_list); 424 424 INIT_LIST_HEAD(&inode->i_lru); 425 + INIT_LIST_HEAD(&inode->i_sb_list); 425 426 __address_space_init_once(&inode->i_data); 426 427 i_size_ordered_init(inode); 427 428 } ··· 1022 1021 spin_lock(&inode->i_lock); 1023 1022 inode->i_state = 0; 1024 1023 spin_unlock(&inode->i_lock); 1025 - INIT_LIST_HEAD(&inode->i_sb_list); 1026 1024 } 1027 1025 return inode; 1028 1026 } ··· 1165 1165 { 1166 1166 struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); 1167 1167 struct inode *old; 1168 - bool creating = inode->i_state & I_CREATING; 1169 1168 1170 1169 again: 1171 1170 spin_lock(&inode_hash_lock); ··· 1198 1199 inode->i_state |= I_NEW; 1199 1200 hlist_add_head_rcu(&inode->i_hash, head); 1200 1201 spin_unlock(&inode->i_lock); 1201 - if (!creating) 1202 + 1203 + /* 1204 + * Add inode to the sb list if it's not already. It has I_NEW at this 1205 + * point, so it should be safe to test i_sb_list locklessly. 1206 + */ 1207 + if (list_empty(&inode->i_sb_list)) 1202 1208 inode_sb_list_add(inode); 1203 1209 unlock: 1204 1210 spin_unlock(&inode_hash_lock);
+4 -4
include/linux/ceph/ceph_fs.h
··· 433 433 __le32 stripe_unit; /* layout for newly created file */ 434 434 __le32 stripe_count; /* ... */ 435 435 __le32 object_size; 436 - __le32 file_replication; 437 - __le32 mask; /* CEPH_CAP_* */ 438 - __le32 old_size; 436 + __le32 pool; 437 + __le32 mask; /* CEPH_CAP_* */ 438 + __le64 old_size; 439 439 } __attribute__ ((packed)) open; 440 440 struct { 441 441 __le32 flags; ··· 768 768 __le32 xattr_len; 769 769 __le64 xattr_version; 770 770 771 - /* filelock */ 771 + /* a union of non-export and export bodies. */ 772 772 __le64 size, max_size, truncate_size; 773 773 __le32 truncate_seq; 774 774 struct ceph_timespec mtime, atime, ctime;
+1
include/linux/ceph/mdsmap.h
··· 25 25 u32 m_session_timeout; /* seconds */ 26 26 u32 m_session_autoclose; /* seconds */ 27 27 u64 m_max_file_size; 28 + u64 m_max_xattr_size; /* maximum size for xattrs blob */ 28 29 u32 m_max_mds; /* expected up:active mds number */ 29 30 u32 m_num_active_mds; /* actual up:active mds number */ 30 31 u32 possible_max_rank; /* possible max rank index */
+2 -3
include/linux/ceph/osd_client.h
··· 507 507 extern void ceph_osdc_get_request(struct ceph_osd_request *req); 508 508 extern void ceph_osdc_put_request(struct ceph_osd_request *req); 509 509 510 - extern int ceph_osdc_start_request(struct ceph_osd_client *osdc, 511 - struct ceph_osd_request *req, 512 - bool nofail); 510 + void ceph_osdc_start_request(struct ceph_osd_client *osdc, 511 + struct ceph_osd_request *req); 513 512 extern void ceph_osdc_cancel_request(struct ceph_osd_request *req); 514 513 extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc, 515 514 struct ceph_osd_request *req);
+2
include/linux/dcache.h
··· 233 233 wait_queue_head_t *); 234 234 extern struct dentry * d_splice_alias(struct inode *, struct dentry *); 235 235 extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); 236 + extern bool d_same_name(const struct dentry *dentry, const struct dentry *parent, 237 + const struct qstr *name); 236 238 extern struct dentry * d_exact_alias(struct dentry *, struct inode *); 237 239 extern struct dentry *d_find_any_alias(struct inode *inode); 238 240 extern struct dentry * d_obtain_alias(struct inode *);
+5
include/linux/fscrypt.h
··· 284 284 int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg); 285 285 int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg); 286 286 int fscrypt_has_permitted_context(struct inode *parent, struct inode *child); 287 + int fscrypt_context_for_new_inode(void *ctx, struct inode *inode); 287 288 int fscrypt_set_context(struct inode *inode, void *fs_data); 288 289 289 290 struct fscrypt_dummy_policy { ··· 328 327 int fscrypt_drop_inode(struct inode *inode); 329 328 330 329 /* fname.c */ 330 + int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, 331 + u8 *out, unsigned int olen); 332 + bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, 333 + u32 max_len, u32 *encrypted_len_ret); 331 334 int fscrypt_setup_filename(struct inode *inode, const struct qstr *iname, 332 335 int lookup, struct fscrypt_name *fname); 333 336
+10
include/linux/mmdebug.h
··· 54 54 } \ 55 55 unlikely(__ret_warn_once); \ 56 56 }) 57 + #define VM_WARN_ON_FOLIO(cond, folio) ({ \ 58 + int __ret_warn = !!(cond); \ 59 + \ 60 + if (unlikely(__ret_warn)) { \ 61 + dump_page(&folio->page, "VM_WARN_ON_FOLIO(" __stringify(cond)")");\ 62 + WARN_ON(1); \ 63 + } \ 64 + unlikely(__ret_warn); \ 65 + }) 57 66 #define VM_WARN_ON_ONCE_FOLIO(cond, folio) ({ \ 58 67 static bool __section(".data.once") __warned; \ 59 68 int __ret_warn_once = !!(cond); \ ··· 88 79 #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) 89 80 #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) 90 81 #define VM_WARN_ON_ONCE_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond) 82 + #define VM_WARN_ON_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) 91 83 #define VM_WARN_ON_ONCE_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) 92 84 #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) 93 85 #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond)
+6 -9
net/ceph/osd_client.c
··· 4578 4578 /* 4579 4579 * Register request, send initial attempt. 4580 4580 */ 4581 - int ceph_osdc_start_request(struct ceph_osd_client *osdc, 4582 - struct ceph_osd_request *req, 4583 - bool nofail) 4581 + void ceph_osdc_start_request(struct ceph_osd_client *osdc, 4582 + struct ceph_osd_request *req) 4584 4583 { 4585 4584 down_read(&osdc->lock); 4586 4585 submit_request(req, false); 4587 4586 up_read(&osdc->lock); 4588 - 4589 - return 0; 4590 4587 } 4591 4588 EXPORT_SYMBOL(ceph_osdc_start_request); 4592 4589 ··· 4753 4756 if (ret) 4754 4757 goto out_put_req; 4755 4758 4756 - ceph_osdc_start_request(osdc, req, false); 4759 + ceph_osdc_start_request(osdc, req); 4757 4760 linger_cancel(lreq); 4758 4761 linger_put(lreq); 4759 4762 ret = wait_request_timeout(req, opts->mount_timeout); ··· 4824 4827 if (ret) 4825 4828 goto out_put_req; 4826 4829 4827 - ceph_osdc_start_request(osdc, req, false); 4830 + ceph_osdc_start_request(osdc, req); 4828 4831 ret = ceph_osdc_wait_request(osdc, req); 4829 4832 4830 4833 out_put_req: ··· 5040 5043 if (ret) 5041 5044 goto out_put_req; 5042 5045 5043 - ceph_osdc_start_request(osdc, req, false); 5046 + ceph_osdc_start_request(osdc, req); 5044 5047 ret = ceph_osdc_wait_request(osdc, req); 5045 5048 if (ret >= 0) { 5046 5049 void *p = page_address(pages[0]); ··· 5117 5120 if (ret) 5118 5121 goto out_put_req; 5119 5122 5120 - ceph_osdc_start_request(osdc, req, false); 5123 + ceph_osdc_start_request(osdc, req); 5121 5124 ret = ceph_osdc_wait_request(osdc, req); 5122 5125 if (ret >= 0) { 5123 5126 ret = req->r_ops[0].rval;
+24 -8
net/ceph/osdmap.c
··· 11 11 #include <linux/crush/hash.h> 12 12 #include <linux/crush/mapper.h> 13 13 14 + static __printf(2, 3) 15 + void osdmap_info(const struct ceph_osdmap *map, const char *fmt, ...) 16 + { 17 + struct va_format vaf; 18 + va_list args; 19 + 20 + va_start(args, fmt); 21 + vaf.fmt = fmt; 22 + vaf.va = &args; 23 + 24 + printk(KERN_INFO "%s (%pU e%u): %pV", KBUILD_MODNAME, &map->fsid, 25 + map->epoch, &vaf); 26 + 27 + va_end(args); 28 + } 29 + 14 30 char *ceph_osdmap_state_str(char *str, int len, u32 state) 15 31 { 16 32 if (!len) ··· 587 571 goto bad; 588 572 #endif 589 573 r = kmalloc(struct_size(r, steps, yes), GFP_NOFS); 590 - c->rules[i] = r; 591 574 if (r == NULL) 592 575 goto badmem; 593 576 dout(" rule %d is at %p\n", i, r); 577 + c->rules[i] = r; 594 578 r->len = yes; 595 579 ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */ 596 580 ceph_decode_need(p, end, r->len*3*sizeof(u32), bad); ··· 1582 1566 if (ret) 1583 1567 return ret; 1584 1568 1585 - pr_info("osd%d primary-affinity 0x%x\n", osd, aff); 1569 + osdmap_info(map, "osd%d primary-affinity 0x%x\n", osd, aff); 1586 1570 } 1587 1571 1588 1572 return 0; ··· 1880 1864 osd = ceph_decode_32(p); 1881 1865 w = ceph_decode_32(p); 1882 1866 BUG_ON(osd >= map->max_osd); 1883 - pr_info("osd%d weight 0x%x %s\n", osd, w, 1884 - w == CEPH_OSD_IN ? "(in)" : 1885 - (w == CEPH_OSD_OUT ? "(out)" : "")); 1867 + osdmap_info(map, "osd%d weight 0x%x %s\n", osd, w, 1868 + w == CEPH_OSD_IN ? "(in)" : 1869 + (w == CEPH_OSD_OUT ? "(out)" : "")); 1886 1870 map->osd_weight[osd] = w; 1887 1871 1888 1872 /* ··· 1914 1898 BUG_ON(osd >= map->max_osd); 1915 1899 if ((map->osd_state[osd] & CEPH_OSD_UP) && 1916 1900 (xorstate & CEPH_OSD_UP)) 1917 - pr_info("osd%d down\n", osd); 1901 + osdmap_info(map, "osd%d down\n", osd); 1918 1902 if ((map->osd_state[osd] & CEPH_OSD_EXISTS) && 1919 1903 (xorstate & CEPH_OSD_EXISTS)) { 1920 - pr_info("osd%d does not exist\n", osd); 1904 + osdmap_info(map, "osd%d does not exist\n", osd); 1921 1905 ret = set_primary_affinity(map, osd, 1922 1906 CEPH_OSD_DEFAULT_PRIMARY_AFFINITY); 1923 1907 if (ret) ··· 1947 1931 1948 1932 dout("%s osd%d addr %s\n", __func__, osd, ceph_pr_addr(&addr)); 1949 1933 1950 - pr_info("osd%d up\n", osd); 1934 + osdmap_info(map, "osd%d up\n", osd); 1951 1935 map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP; 1952 1936 map->osd_addr[osd] = addr; 1953 1937 }
+1 -1
net/ceph/pagelist.c
··· 96 96 EXPORT_SYMBOL(ceph_pagelist_append); 97 97 98 98 /* Allocate enough pages for a pagelist to append the given amount 99 - * of data without without allocating. 99 + * of data without allocating. 100 100 * Returns: 0 on success, -ENOMEM on error. 101 101 */ 102 102 int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space)