Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge tag 'ceph-for-5.20-rc1' of https://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
"We have a good pile of various fixes and cleanups from Xiubo, Jeff,
Luis and others, almost exclusively in the filesystem.

Several patches touch files outside of our normal purview to set the
stage for bringing in Jeff's long awaited ceph+fscrypt series in the
near future. All of them have appropriate acks and sat in linux-next
for a while"

* tag 'ceph-for-5.20-rc1' of https://github.com/ceph/ceph-client: (27 commits)
libceph: clean up ceph_osdc_start_request prototype
libceph: fix ceph_pagelist_reserve() comment typo
ceph: remove useless check for the folio
ceph: don't truncate file in atomic_open
ceph: make f_bsize always equal to f_frsize
ceph: flush the dirty caps immediatelly when quota is approaching
libceph: print fsid and epoch with osd id
libceph: check pointer before assigned to "c->rules[]"
ceph: don't get the inline data for new creating files
ceph: update the auth cap when the async create req is forwarded
ceph: make change_auth_cap_ses a global symbol
ceph: fix incorrect old_size length in ceph_mds_request_args
ceph: switch back to testing for NULL folio->private in ceph_dirty_folio
ceph: call netfs_subreq_terminated with was_async == false
ceph: convert to generic_file_llseek
ceph: fix the incorrect comment for the ceph_mds_caps struct
ceph: don't leak snap_rwsem in handle_cap_grant
ceph: prevent a client from exceeding the MDS maximum xattr size
ceph: choose auth MDS for getxattr with the Xs caps
ceph: add session already open notify support
...

+538 -233
+3 -3
drivers/block/rbd.c
··· 1297 1297 dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n", 1298 1298 __func__, osd_req, obj_req, obj_req->ex.oe_objno, 1299 1299 obj_req->ex.oe_off, obj_req->ex.oe_len); 1300 - ceph_osdc_start_request(osd_req->r_osdc, osd_req, false); 1300 + ceph_osdc_start_request(osd_req->r_osdc, osd_req); 1301 1301 } 1302 1302 1303 1303 /* ··· 2081 2081 if (ret) 2082 2082 return ret; 2083 2083 2084 - ceph_osdc_start_request(osdc, req, false); 2084 + ceph_osdc_start_request(osdc, req); 2085 2085 return 0; 2086 2086 } 2087 2087 ··· 4768 4768 if (ret) 4769 4769 goto out_req; 4770 4770 4771 - ceph_osdc_start_request(osdc, req, false); 4771 + ceph_osdc_start_request(osdc, req); 4772 4772 ret = ceph_osdc_wait_request(osdc, req); 4773 4773 if (ret >= 0) 4774 4774 ceph_copy_from_page_vector(pages, buf, 0, ret);
+24 -35
fs/ceph/addr.c
··· 122 122 * Reference snap context in folio->private. Also set 123 123 * PagePrivate so that we get invalidate_folio callback. 124 124 */ 125 - VM_BUG_ON_FOLIO(folio_test_private(folio), folio); 125 + VM_WARN_ON_FOLIO(folio->private, folio); 126 126 folio_attach_private(folio, snapc); 127 127 128 128 return ceph_fscache_dirty_folio(mapping, folio); ··· 237 237 if (err >= 0 && err < subreq->len) 238 238 __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); 239 239 240 - netfs_subreq_terminated(subreq, err, true); 240 + netfs_subreq_terminated(subreq, err, false); 241 241 242 242 num_pages = calc_pages_for(osd_data->alignment, osd_data->length); 243 243 ceph_put_page_vector(osd_data->pages, num_pages, false); ··· 313 313 int err = 0; 314 314 u64 len = subreq->len; 315 315 316 - if (ci->i_inline_version != CEPH_INLINE_NONE && 317 - ceph_netfs_issue_op_inline(subreq)) 316 + if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq)) 318 317 return; 319 318 320 319 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len, ··· 337 338 /* should always give us a page-aligned read */ 338 339 WARN_ON_ONCE(page_off); 339 340 len = err; 341 + err = 0; 340 342 341 343 osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); 342 344 req->r_callback = finish_netfs_read; ··· 345 345 req->r_inode = inode; 346 346 ihold(inode); 347 347 348 - err = ceph_osdc_start_request(req->r_osdc, req, false); 349 - if (err) 350 - iput(inode); 348 + ceph_osdc_start_request(req->r_osdc, req); 351 349 out: 352 350 ceph_osdc_put_request(req); 353 351 if (err) ··· 619 621 dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len); 620 622 621 623 req->r_mtime = inode->i_mtime; 622 - err = ceph_osdc_start_request(osdc, req, true); 623 - if (!err) 624 - err = ceph_osdc_wait_request(osdc, req); 624 + ceph_osdc_start_request(osdc, req); 625 + err = ceph_osdc_wait_request(osdc, req); 625 626 626 627 ceph_update_write_metrics(&fsc->mdsc->metric, 
req->r_start_latency, 627 628 req->r_end_latency, len, err); ··· 1148 1151 } 1149 1152 1150 1153 req->r_mtime = inode->i_mtime; 1151 - rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); 1152 - BUG_ON(rc); 1154 + ceph_osdc_start_request(&fsc->client->osdc, req); 1153 1155 req = NULL; 1154 1156 1155 1157 wbc->nr_to_write -= i; ··· 1323 1327 int r; 1324 1328 1325 1329 r = netfs_write_begin(&ci->netfs, file, inode->i_mapping, pos, len, &folio, NULL); 1326 - if (r == 0) 1327 - folio_wait_fscache(folio); 1328 - if (r < 0) { 1329 - if (folio) 1330 - folio_put(folio); 1331 - } else { 1332 - WARN_ON_ONCE(!folio_test_locked(folio)); 1333 - *pagep = &folio->page; 1334 - } 1335 - return r; 1330 + if (r < 0) 1331 + return r; 1332 + 1333 + folio_wait_fscache(folio); 1334 + WARN_ON_ONCE(!folio_test_locked(folio)); 1335 + *pagep = &folio->page; 1336 + return 0; 1336 1337 } 1337 1338 1338 1339 /* ··· 1432 1439 inode, off, ceph_cap_string(got)); 1433 1440 1434 1441 if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || 1435 - ci->i_inline_version == CEPH_INLINE_NONE) { 1442 + !ceph_has_inline_data(ci)) { 1436 1443 CEPH_DEFINE_RW_CONTEXT(rw_ctx, got); 1437 1444 ceph_add_rw_context(fi, &rw_ctx); 1438 1445 ret = filemap_fault(vmf); ··· 1689 1696 } 1690 1697 1691 1698 req->r_mtime = inode->i_mtime; 1692 - err = ceph_osdc_start_request(&fsc->client->osdc, req, false); 1693 - if (!err) 1694 - err = ceph_osdc_wait_request(&fsc->client->osdc, req); 1699 + ceph_osdc_start_request(&fsc->client->osdc, req); 1700 + err = ceph_osdc_wait_request(&fsc->client->osdc, req); 1695 1701 ceph_osdc_put_request(req); 1696 1702 if (err < 0) 1697 1703 goto out_unlock; ··· 1731 1739 } 1732 1740 1733 1741 req->r_mtime = inode->i_mtime; 1734 - err = ceph_osdc_start_request(&fsc->client->osdc, req, false); 1735 - if (!err) 1736 - err = ceph_osdc_wait_request(&fsc->client->osdc, req); 1742 + ceph_osdc_start_request(&fsc->client->osdc, req); 1743 + err = ceph_osdc_wait_request(&fsc->client->osdc, 
req); 1737 1744 1738 1745 ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, 1739 1746 req->r_end_latency, len, err); ··· 1903 1912 1904 1913 osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE, 1905 1914 0, false, true); 1906 - err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); 1915 + ceph_osdc_start_request(&fsc->client->osdc, rd_req); 1907 1916 1908 1917 wr_req->r_mtime = ci->netfs.inode.i_mtime; 1909 - err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); 1918 + ceph_osdc_start_request(&fsc->client->osdc, wr_req); 1910 1919 1911 - if (!err) 1912 - err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req); 1913 - if (!err2) 1914 - err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req); 1920 + err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req); 1921 + err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req); 1915 1922 1916 1923 if (err >= 0 || err == -ENOENT) 1917 1924 have |= POOL_READ;
+19 -19
fs/ceph/caps.c
··· 602 602 * @ci: inode to be moved 603 603 * @session: new auth caps session 604 604 */ 605 - static void change_auth_cap_ses(struct ceph_inode_info *ci, 606 - struct ceph_mds_session *session) 605 + void change_auth_cap_ses(struct ceph_inode_info *ci, 606 + struct ceph_mds_session *session) 607 607 { 608 608 lockdep_assert_held(&ci->i_ceph_lock); 609 609 ··· 1978 1978 } 1979 1979 1980 1980 dout("check_caps %llx.%llx file_want %s used %s dirty %s flushing %s" 1981 - " issued %s revoking %s retain %s %s%s\n", ceph_vinop(inode), 1981 + " issued %s revoking %s retain %s %s%s%s\n", ceph_vinop(inode), 1982 1982 ceph_cap_string(file_wanted), 1983 1983 ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), 1984 1984 ceph_cap_string(ci->i_flushing_caps), 1985 1985 ceph_cap_string(issued), ceph_cap_string(revoking), 1986 1986 ceph_cap_string(retain), 1987 1987 (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", 1988 - (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : ""); 1988 + (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "", 1989 + (flags & CHECK_CAPS_NOINVAL) ? 
" NOINVAL" : ""); 1989 1990 1990 1991 /* 1991 1992 * If we no longer need to hold onto old our caps, and we may ··· 3006 3005 } 3007 3006 3008 3007 if (S_ISREG(ci->netfs.inode.i_mode) && 3009 - ci->i_inline_version != CEPH_INLINE_NONE && 3008 + ceph_has_inline_data(ci) && 3010 3009 (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && 3011 3010 i_size_read(inode) > 0) { 3012 3011 struct page *page = ··· 3579 3578 fill_inline = true; 3580 3579 } 3581 3580 3582 - if (ci->i_auth_cap == cap && 3583 - le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { 3584 - if (newcaps & ~extra_info->issued) 3585 - wake = true; 3581 + if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { 3582 + if (ci->i_auth_cap == cap) { 3583 + if (newcaps & ~extra_info->issued) 3584 + wake = true; 3586 3585 3587 - if (ci->i_requested_max_size > max_size || 3588 - !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) { 3589 - /* re-request max_size if necessary */ 3590 - ci->i_requested_max_size = 0; 3591 - wake = true; 3586 + if (ci->i_requested_max_size > max_size || 3587 + !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) { 3588 + /* re-request max_size if necessary */ 3589 + ci->i_requested_max_size = 0; 3590 + wake = true; 3591 + } 3592 + 3593 + ceph_kick_flushing_inode_caps(session, ci); 3592 3594 } 3593 - 3594 - ceph_kick_flushing_inode_caps(session, ci); 3595 - spin_unlock(&ci->i_ceph_lock); 3596 3595 up_read(&session->s_mdsc->snap_rwsem); 3597 - } else { 3598 - spin_unlock(&ci->i_ceph_lock); 3599 3596 } 3597 + spin_unlock(&ci->i_ceph_lock); 3600 3598 3601 3599 if (fill_inline) 3602 3600 ceph_fill_inline_data(inode, NULL, extra_info->inline_data,
+70 -9
fs/ceph/dir.c
··· 856 856 if (ceph_snap(dir) != CEPH_NOSNAP) 857 857 return -EROFS; 858 858 859 + err = ceph_wait_on_conflict_unlink(dentry); 860 + if (err) 861 + return err; 862 + 859 863 if (ceph_quota_is_max_files_exceeded(dir)) { 860 864 err = -EDQUOT; 861 865 goto out; ··· 922 918 if (ceph_snap(dir) != CEPH_NOSNAP) 923 919 return -EROFS; 924 920 921 + err = ceph_wait_on_conflict_unlink(dentry); 922 + if (err) 923 + return err; 924 + 925 925 if (ceph_quota_is_max_files_exceeded(dir)) { 926 926 err = -EDQUOT; 927 927 goto out; ··· 976 968 struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); 977 969 struct ceph_mds_request *req; 978 970 struct ceph_acl_sec_ctx as_ctx = {}; 979 - int err = -EROFS; 971 + int err; 980 972 int op; 973 + 974 + err = ceph_wait_on_conflict_unlink(dentry); 975 + if (err) 976 + return err; 981 977 982 978 if (ceph_snap(dir) == CEPH_SNAPDIR) { 983 979 /* mkdir .snap/foo is a MKSNAP */ ··· 992 980 dout("mkdir dir %p dn %p mode 0%ho\n", dir, dentry, mode); 993 981 op = CEPH_MDS_OP_MKDIR; 994 982 } else { 983 + err = -EROFS; 995 984 goto out; 996 985 } 997 986 ··· 1050 1037 struct ceph_mds_request *req; 1051 1038 int err; 1052 1039 1040 + err = ceph_wait_on_conflict_unlink(dentry); 1041 + if (err) 1042 + return err; 1043 + 1053 1044 if (ceph_snap(dir) != CEPH_NOSNAP) 1054 1045 return -EROFS; 1055 1046 ··· 1088 1071 static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, 1089 1072 struct ceph_mds_request *req) 1090 1073 { 1074 + struct dentry *dentry = req->r_dentry; 1075 + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 1076 + struct ceph_dentry_info *di = ceph_dentry(dentry); 1091 1077 int result = req->r_err ? 
req->r_err : 1092 1078 le32_to_cpu(req->r_reply_info.head->result); 1079 + 1080 + if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags)) 1081 + pr_warn("%s dentry %p:%pd async unlink bit is not set\n", 1082 + __func__, dentry, dentry); 1083 + 1084 + spin_lock(&fsc->async_unlink_conflict_lock); 1085 + hash_del_rcu(&di->hnode); 1086 + spin_unlock(&fsc->async_unlink_conflict_lock); 1087 + 1088 + spin_lock(&dentry->d_lock); 1089 + di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK; 1090 + wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT); 1091 + spin_unlock(&dentry->d_lock); 1092 + 1093 + synchronize_rcu(); 1093 1094 1094 1095 if (result == -EJUKEBOX) 1095 1096 goto out; ··· 1116 1081 if (result) { 1117 1082 int pathlen = 0; 1118 1083 u64 base = 0; 1119 - char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, 1084 + char *path = ceph_mdsc_build_path(dentry, &pathlen, 1120 1085 &base, 0); 1121 1086 1122 1087 /* mark error on parent + clear complete */ ··· 1124 1089 ceph_dir_clear_complete(req->r_parent); 1125 1090 1126 1091 /* drop the dentry -- we don't know its status */ 1127 - if (!d_unhashed(req->r_dentry)) 1128 - d_drop(req->r_dentry); 1092 + if (!d_unhashed(dentry)) 1093 + d_drop(dentry); 1129 1094 1130 1095 /* mark inode itself for an error (since metadata is bogus) */ 1131 1096 mapping_set_error(req->r_old_inode->i_mapping, result); 1132 1097 1133 - pr_warn("ceph: async unlink failure path=(%llx)%s result=%d!\n", 1098 + pr_warn("async unlink failure path=(%llx)%s result=%d!\n", 1134 1099 base, IS_ERR(path) ? 
"<<bad>>" : path, result); 1135 1100 ceph_mdsc_free_path(path, pathlen); 1136 1101 } ··· 1215 1180 1216 1181 if (try_async && op == CEPH_MDS_OP_UNLINK && 1217 1182 (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) { 1183 + struct ceph_dentry_info *di = ceph_dentry(dentry); 1184 + 1218 1185 dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir), 1219 1186 dentry->d_name.len, dentry->d_name.name, 1220 1187 ceph_cap_string(req->r_dir_caps)); ··· 1224 1187 req->r_callback = ceph_async_unlink_cb; 1225 1188 req->r_old_inode = d_inode(dentry); 1226 1189 ihold(req->r_old_inode); 1190 + 1191 + spin_lock(&dentry->d_lock); 1192 + di->flags |= CEPH_DENTRY_ASYNC_UNLINK; 1193 + spin_unlock(&dentry->d_lock); 1194 + 1195 + spin_lock(&fsc->async_unlink_conflict_lock); 1196 + hash_add_rcu(fsc->async_unlink_conflict, &di->hnode, 1197 + dentry->d_name.hash); 1198 + spin_unlock(&fsc->async_unlink_conflict_lock); 1199 + 1227 1200 err = ceph_mdsc_submit_request(mdsc, dir, req); 1228 1201 if (!err) { 1229 1202 /* ··· 1242 1195 */ 1243 1196 drop_nlink(inode); 1244 1197 d_delete(dentry); 1245 - } else if (err == -EJUKEBOX) { 1246 - try_async = false; 1247 - ceph_mdsc_put_request(req); 1248 - goto retry; 1198 + } else { 1199 + spin_lock(&fsc->async_unlink_conflict_lock); 1200 + hash_del_rcu(&di->hnode); 1201 + spin_unlock(&fsc->async_unlink_conflict_lock); 1202 + 1203 + spin_lock(&dentry->d_lock); 1204 + di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK; 1205 + spin_unlock(&dentry->d_lock); 1206 + 1207 + if (err == -EJUKEBOX) { 1208 + try_async = false; 1209 + ceph_mdsc_put_request(req); 1210 + goto retry; 1211 + } 1249 1212 } 1250 1213 } else { 1251 1214 set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); ··· 1293 1236 if ((old_dir != new_dir) && 1294 1237 (!ceph_quota_is_same_realm(old_dir, new_dir))) 1295 1238 return -EXDEV; 1239 + 1240 + err = ceph_wait_on_conflict_unlink(new_dentry); 1241 + if (err) 1242 + return err; 1296 1243 1297 1244 dout("rename dir %p dentry %p to dir %p dentry 
%p\n", 1298 1245 old_dir, old_dentry, new_dir, new_dentry);
+48 -75
fs/ceph/file.c
··· 240 240 INIT_LIST_HEAD(&fi->rw_contexts); 241 241 fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen); 242 242 243 - if ((file->f_mode & FMODE_WRITE) && 244 - ci->i_inline_version != CEPH_INLINE_NONE) { 243 + if ((file->f_mode & FMODE_WRITE) && ceph_has_inline_data(ci)) { 245 244 ret = ceph_uninline_data(file); 246 245 if (ret < 0) 247 246 goto error; ··· 567 568 char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, 568 569 &base, 0); 569 570 570 - pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n", 571 + pr_warn("async create failure path=(%llx)%s result=%d!\n", 571 572 base, IS_ERR(path) ? "<<bad>>" : path, result); 572 573 ceph_mdsc_free_path(path, pathlen); 573 574 ··· 610 611 struct ceph_mds_reply_inode in = { }; 611 612 struct ceph_mds_reply_info_in iinfo = { .in = &in }; 612 613 struct ceph_inode_info *ci = ceph_inode(dir); 614 + struct ceph_dentry_info *di = ceph_dentry(dentry); 613 615 struct inode *inode; 614 616 struct timespec64 now; 615 617 struct ceph_string *pool_ns; ··· 709 709 file->f_mode |= FMODE_CREATED; 710 710 ret = finish_open(file, dentry, ceph_open); 711 711 } 712 + 713 + spin_lock(&dentry->d_lock); 714 + di->flags &= ~CEPH_DENTRY_ASYNC_CREATE; 715 + wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT); 716 + spin_unlock(&dentry->d_lock); 717 + 712 718 return ret; 713 719 } 714 720 ··· 740 734 741 735 if (dentry->d_name.len > NAME_MAX) 742 736 return -ENAMETOOLONG; 737 + 738 + err = ceph_wait_on_conflict_unlink(dentry); 739 + if (err) 740 + return err; 741 + /* 742 + * Do not truncate the file, since atomic_open is called before the 743 + * permission check. The caller will do the truncation afterward. 
744 + */ 745 + flags &= ~O_TRUNC; 743 746 744 747 if (flags & O_CREAT) { 745 748 if (ceph_quota_is_max_files_exceeded(dir)) ··· 796 781 (req->r_dir_caps = 797 782 try_prep_async_create(dir, dentry, &lo, 798 783 &req->r_deleg_ino))) { 784 + struct ceph_dentry_info *di = ceph_dentry(dentry); 785 + 799 786 set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags); 800 787 req->r_args.open.flags |= cpu_to_le32(CEPH_O_EXCL); 801 788 req->r_callback = ceph_async_create_cb; 789 + 790 + spin_lock(&dentry->d_lock); 791 + di->flags |= CEPH_DENTRY_ASYNC_CREATE; 792 + spin_unlock(&dentry->d_lock); 793 + 802 794 err = ceph_mdsc_submit_request(mdsc, dir, req); 803 795 if (!err) { 804 796 err = ceph_finish_async_create(dir, dentry, ··· 824 802 } 825 803 826 804 set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); 827 - err = ceph_mdsc_do_request(mdsc, 828 - (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, 829 - req); 805 + err = ceph_mdsc_do_request(mdsc, (flags & O_CREAT) ? dir : NULL, req); 830 806 if (err == -ENOENT) { 831 807 dentry = ceph_handle_snapdir(req, dentry); 832 808 if (IS_ERR(dentry)) { ··· 980 960 981 961 osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off, 982 962 false, false); 983 - ret = ceph_osdc_start_request(osdc, req, false); 984 - if (!ret) 985 - ret = ceph_osdc_wait_request(osdc, req); 963 + ceph_osdc_start_request(osdc, req); 964 + ret = ceph_osdc_wait_request(osdc, req); 986 965 987 966 ceph_update_read_metrics(&fsc->mdsc->metric, 988 967 req->r_start_latency, ··· 1244 1225 req->r_inode = inode; 1245 1226 req->r_priv = aio_req; 1246 1227 1247 - ret = ceph_osdc_start_request(req->r_osdc, req, false); 1228 + ceph_osdc_start_request(req->r_osdc, req); 1248 1229 out: 1249 1230 if (ret < 0) { 1250 1231 req->r_result = ret; ··· 1381 1362 continue; 1382 1363 } 1383 1364 1384 - ret = ceph_osdc_start_request(req->r_osdc, req, false); 1385 - if (!ret) 1386 - ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 1365 + ceph_osdc_start_request(req->r_osdc, req); 1366 + 
ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 1387 1367 1388 1368 if (write) 1389 1369 ceph_update_write_metrics(metric, req->r_start_latency, ··· 1445 1427 r_private_item); 1446 1428 list_del_init(&req->r_private_item); 1447 1429 if (ret >= 0) 1448 - ret = ceph_osdc_start_request(req->r_osdc, 1449 - req, false); 1430 + ceph_osdc_start_request(req->r_osdc, req); 1450 1431 if (ret < 0) { 1451 1432 req->r_result = ret; 1452 1433 ceph_aio_complete_req(req); ··· 1558 1541 false, true); 1559 1542 1560 1543 req->r_mtime = mtime; 1561 - ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 1562 - if (!ret) 1563 - ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 1544 + ceph_osdc_start_request(&fsc->client->osdc, req); 1545 + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 1564 1546 1565 1547 ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, 1566 1548 req->r_end_latency, len, ret); ··· 1643 1627 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, 1644 1628 ceph_cap_string(got)); 1645 1629 1646 - if (ci->i_inline_version == CEPH_INLINE_NONE) { 1630 + if (!ceph_has_inline_data(ci)) { 1647 1631 if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { 1648 1632 ret = ceph_direct_read_write(iocb, to, 1649 1633 NULL, NULL); ··· 1906 1890 if (dirty) 1907 1891 __mark_inode_dirty(inode, dirty); 1908 1892 if (ceph_quota_is_max_bytes_approaching(inode, iocb->ki_pos)) 1909 - ceph_check_caps(ci, 0, NULL); 1893 + ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL); 1910 1894 } 1911 1895 1912 1896 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n", ··· 1946 1930 */ 1947 1931 static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) 1948 1932 { 1949 - struct inode *inode = file->f_mapping->host; 1950 - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 1951 - loff_t i_size; 1952 - loff_t ret; 1953 - 1954 - inode_lock(inode); 1955 - 1956 1933 if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { 1934 + 
struct inode *inode = file_inode(file); 1935 + int ret; 1936 + 1957 1937 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); 1958 1938 if (ret < 0) 1959 - goto out; 1939 + return ret; 1960 1940 } 1961 - 1962 - i_size = i_size_read(inode); 1963 - switch (whence) { 1964 - case SEEK_END: 1965 - offset += i_size; 1966 - break; 1967 - case SEEK_CUR: 1968 - /* 1969 - * Here we special-case the lseek(fd, 0, SEEK_CUR) 1970 - * position-querying operation. Avoid rewriting the "same" 1971 - * f_pos value back to the file because a concurrent read(), 1972 - * write() or lseek() might have altered it 1973 - */ 1974 - if (offset == 0) { 1975 - ret = file->f_pos; 1976 - goto out; 1977 - } 1978 - offset += file->f_pos; 1979 - break; 1980 - case SEEK_DATA: 1981 - if (offset < 0 || offset >= i_size) { 1982 - ret = -ENXIO; 1983 - goto out; 1984 - } 1985 - break; 1986 - case SEEK_HOLE: 1987 - if (offset < 0 || offset >= i_size) { 1988 - ret = -ENXIO; 1989 - goto out; 1990 - } 1991 - offset = i_size; 1992 - break; 1993 - } 1994 - 1995 - ret = vfs_setpos(file, offset, max(i_size, fsc->max_file_size)); 1996 - 1997 - out: 1998 - inode_unlock(inode); 1999 - return ret; 1941 + return generic_file_llseek(file, offset, whence); 2000 1942 } 2001 1943 2002 1944 static inline void ceph_zero_partial_page( ··· 2023 2049 } 2024 2050 2025 2051 req->r_mtime = inode->i_mtime; 2026 - ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); 2027 - if (!ret) { 2028 - ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 2029 - if (ret == -ENOENT) 2030 - ret = 0; 2031 - } 2052 + ceph_osdc_start_request(&fsc->client->osdc, req); 2053 + ret = ceph_osdc_wait_request(&fsc->client->osdc, req); 2054 + if (ret == -ENOENT) 2055 + ret = 0; 2032 2056 ceph_osdc_put_request(req); 2033 2057 2034 2058 out: ··· 2328 2356 if (IS_ERR(req)) 2329 2357 ret = PTR_ERR(req); 2330 2358 else { 2331 - ceph_osdc_start_request(osdc, req, false); 2359 + ceph_osdc_start_request(osdc, req); 2332 2360 ret = 
ceph_osdc_wait_request(osdc, req); 2333 2361 ceph_update_copyfrom_metrics(&fsc->mdsc->metric, 2334 2362 req->r_start_latency, ··· 2521 2549 /* Let the MDS know about dst file size change */ 2522 2550 if (ceph_inode_set_size(dst_inode, dst_off) || 2523 2551 ceph_quota_is_max_bytes_approaching(dst_inode, dst_off)) 2524 - ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY, NULL); 2552 + ceph_check_caps(dst_ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_FLUSH, 2553 + NULL); 2525 2554 } 2526 2555 /* Mark Fw dirty */ 2527 2556 spin_lock(&dst_ci->i_ceph_lock);
+10 -3
fs/ceph/inode.c
··· 1049 1049 iinfo->inline_version >= ci->i_inline_version) { 1050 1050 int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; 1051 1051 ci->i_inline_version = iinfo->inline_version; 1052 - if (ci->i_inline_version != CEPH_INLINE_NONE && 1052 + if (ceph_has_inline_data(ci) && 1053 1053 (locked_page || (info_caps & cache_caps))) 1054 1054 fill_inline = true; 1055 1055 } ··· 2275 2275 * 2276 2276 * This cost much when doing the Locker state transition and 2277 2277 * usually will need to revoke caps from clients. 2278 + * 2279 + * And for the 'Xs' caps for getxattr we will also choose the 2280 + * auth MDS, because the MDS side code is buggy due to setxattr 2281 + * won't notify the replica MDSes when the values changed and 2282 + * the replica MDS will return the old values. Though we will 2283 + * fix it in MDS code, but this still makes sense for old ceph. 2278 2284 */ 2279 2285 if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL)) 2280 - || (mask & CEPH_STAT_RSTAT)) 2286 + || (mask & (CEPH_STAT_RSTAT | CEPH_STAT_CAP_XATTR))) 2281 2287 return USE_AUTH_MDS; 2282 2288 else 2283 2289 return USE_ANY_MDS; ··· 2327 2321 if (inline_version == 0) { 2328 2322 /* the reply is supposed to contain inline data */ 2329 2323 err = -EINVAL; 2330 - } else if (inline_version == CEPH_INLINE_NONE) { 2324 + } else if (inline_version == CEPH_INLINE_NONE || 2325 + inline_version == 1) { 2331 2326 err = -ENODATA; 2332 2327 } else { 2333 2328 err = req->r_reply_info.targeti.inline_len;
+157 -8
fs/ceph/mds_client.c
··· 456 456 dout("added delegated inode 0x%llx\n", 457 457 start - 1); 458 458 } else if (err == -EBUSY) { 459 - pr_warn("ceph: MDS delegated inode 0x%llx more than once.\n", 459 + pr_warn("MDS delegated inode 0x%llx more than once.\n", 460 460 start - 1); 461 461 } else { 462 462 return err; ··· 653 653 if (!info->dir_entries) 654 654 return; 655 655 free_pages((unsigned long)info->dir_entries, get_order(info->dir_buf_size)); 656 + } 657 + 658 + /* 659 + * In async unlink case the kclient won't wait for the first reply 660 + * from MDS and just drop all the links and unhash the dentry and then 661 + * succeeds immediately. 662 + * 663 + * For any new create/link/rename,etc requests followed by using the 664 + * same file names we must wait for the first reply of the inflight 665 + * unlink request, or the MDS possibly will fail these following 666 + * requests with -EEXIST if the inflight async unlink request was 667 + * delayed for some reasons. 668 + * 669 + * And the worst case is that for the none async openc request it will 670 + * successfully open the file if the CDentry hasn't been unlinked yet, 671 + * but later the previous delayed async unlink request will remove the 672 + * CDenty. That means the just created file is possiblly deleted later 673 + * by accident. 674 + * 675 + * We need to wait for the inflight async unlink requests to finish 676 + * when creating new files/directories by using the same file names. 
677 + */ 678 + int ceph_wait_on_conflict_unlink(struct dentry *dentry) 679 + { 680 + struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); 681 + struct dentry *pdentry = dentry->d_parent; 682 + struct dentry *udentry, *found = NULL; 683 + struct ceph_dentry_info *di; 684 + struct qstr dname; 685 + u32 hash = dentry->d_name.hash; 686 + int err; 687 + 688 + dname.name = dentry->d_name.name; 689 + dname.len = dentry->d_name.len; 690 + 691 + rcu_read_lock(); 692 + hash_for_each_possible_rcu(fsc->async_unlink_conflict, di, 693 + hnode, hash) { 694 + udentry = di->dentry; 695 + 696 + spin_lock(&udentry->d_lock); 697 + if (udentry->d_name.hash != hash) 698 + goto next; 699 + if (unlikely(udentry->d_parent != pdentry)) 700 + goto next; 701 + if (!hash_hashed(&di->hnode)) 702 + goto next; 703 + 704 + if (!test_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags)) 705 + pr_warn("%s dentry %p:%pd async unlink bit is not set\n", 706 + __func__, dentry, dentry); 707 + 708 + if (!d_same_name(udentry, pdentry, &dname)) 709 + goto next; 710 + 711 + spin_unlock(&udentry->d_lock); 712 + found = dget(udentry); 713 + break; 714 + next: 715 + spin_unlock(&udentry->d_lock); 716 + } 717 + rcu_read_unlock(); 718 + 719 + if (likely(!found)) 720 + return 0; 721 + 722 + dout("%s dentry %p:%pd conflict with old %p:%pd\n", __func__, 723 + dentry, dentry, found, found); 724 + 725 + err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT, 726 + TASK_KILLABLE); 727 + dput(found); 728 + return err; 656 729 } 657 730 658 731 ··· 1293 1220 if (count > 0) { 1294 1221 size_t i; 1295 1222 size_t size = FEATURE_BYTES(count); 1223 + unsigned long bit; 1296 1224 1297 1225 if (WARN_ON_ONCE(*p + 4 + size > end)) 1298 1226 return -ERANGE; 1299 1227 1300 1228 ceph_encode_32(p, size); 1301 1229 memset(*p, 0, size); 1302 - for (i = 0; i < count; i++) 1303 - ((unsigned char*)(*p))[i / 8] |= BIT(feature_bits[i] % 8); 1230 + for (i = 0; i < count; i++) { 1231 + bit = feature_bits[i]; 1232 + ((unsigned char 
*)(*p))[bit / 8] |= BIT(bit % 8); 1233 + } 1304 1234 *p += size; 1305 1235 } else { 1306 1236 if (WARN_ON_ONCE(*p + 4 > end)) ··· 2960 2884 if (req->r_request_started == 0) /* note request start time */ 2961 2885 req->r_request_started = jiffies; 2962 2886 2887 + /* 2888 + * For async create we will choose the auth MDS of frag in parent 2889 + * directory to send the request and ususally this works fine, but 2890 + * if the migrated the dirtory to another MDS before it could handle 2891 + * it the request will be forwarded. 2892 + * 2893 + * And then the auth cap will be changed. 2894 + */ 2895 + if (test_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags) && req->r_num_fwd) { 2896 + struct ceph_dentry_info *di = ceph_dentry(req->r_dentry); 2897 + struct ceph_inode_info *ci; 2898 + struct ceph_cap *cap; 2899 + 2900 + /* 2901 + * The request maybe handled very fast and the new inode 2902 + * hasn't been linked to the dentry yet. We need to wait 2903 + * for the ceph_finish_async_create(), which shouldn't be 2904 + * stuck too long or fail in thoery, to finish when forwarding 2905 + * the request. 
2906 + */ 2907 + if (!d_inode(req->r_dentry)) { 2908 + err = wait_on_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT, 2909 + TASK_KILLABLE); 2910 + if (err) { 2911 + mutex_lock(&req->r_fill_mutex); 2912 + set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags); 2913 + mutex_unlock(&req->r_fill_mutex); 2914 + goto out_session; 2915 + } 2916 + } 2917 + 2918 + ci = ceph_inode(d_inode(req->r_dentry)); 2919 + 2920 + spin_lock(&ci->i_ceph_lock); 2921 + cap = ci->i_auth_cap; 2922 + if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE && mds != cap->mds) { 2923 + dout("do_request session changed for auth cap %d -> %d\n", 2924 + cap->session->s_mds, session->s_mds); 2925 + 2926 + /* Remove the auth cap from old session */ 2927 + spin_lock(&cap->session->s_cap_lock); 2928 + cap->session->s_nr_caps--; 2929 + list_del_init(&cap->session_caps); 2930 + spin_unlock(&cap->session->s_cap_lock); 2931 + 2932 + /* Add the auth cap to the new session */ 2933 + cap->mds = mds; 2934 + cap->session = session; 2935 + spin_lock(&session->s_cap_lock); 2936 + session->s_nr_caps++; 2937 + list_add_tail(&cap->session_caps, &session->s_caps); 2938 + spin_unlock(&session->s_cap_lock); 2939 + 2940 + change_auth_cap_ses(ci, session); 2941 + } 2942 + spin_unlock(&ci->i_ceph_lock); 2943 + } 2944 + 2963 2945 err = __send_request(session, req, false); 2964 2946 2965 2947 out_session: ··· 3598 3464 case CEPH_SESSION_OPEN: 3599 3465 if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) 3600 3466 pr_info("mds%d reconnect success\n", session->s_mds); 3601 - session->s_state = CEPH_MDS_SESSION_OPEN; 3602 - session->s_features = features; 3603 - renewed_caps(mdsc, session, 0); 3604 - if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, &session->s_features)) 3605 - metric_schedule_delayed(&mdsc->metric); 3467 + 3468 + if (session->s_state == CEPH_MDS_SESSION_OPEN) { 3469 + pr_notice("mds%d is already opened\n", session->s_mds); 3470 + } else { 3471 + session->s_state = CEPH_MDS_SESSION_OPEN; 3472 + session->s_features = features; 3473 + 
renewed_caps(mdsc, session, 0); 3474 + if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT, 3475 + &session->s_features)) 3476 + metric_schedule_delayed(&mdsc->metric); 3477 + } 3478 + 3479 + /* 3480 + * The connection maybe broken and the session in client 3481 + * side has been reinitialized, need to update the seq 3482 + * anyway. 3483 + */ 3484 + if (!session->s_seq && seq) 3485 + session->s_seq = seq; 3486 + 3606 3487 wake = 1; 3607 3488 if (mdsc->stopping) 3608 3489 __close_session(mdsc, session);
+5 -8
fs/ceph/mds_client.h
··· 29 29 CEPHFS_FEATURE_MULTI_RECONNECT, 30 30 CEPHFS_FEATURE_DELEG_INO, 31 31 CEPHFS_FEATURE_METRIC_COLLECT, 32 + CEPHFS_FEATURE_ALTERNATE_NAME, 33 + CEPHFS_FEATURE_NOTIFY_SESSION_STATE, 32 34 33 - CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT, 35 + CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_NOTIFY_SESSION_STATE, 34 36 }; 35 37 36 - /* 37 - * This will always have the highest feature bit value 38 - * as the last element of the array. 39 - */ 40 38 #define CEPHFS_FEATURES_CLIENT_SUPPORTED { \ 41 39 0, 1, 2, 3, 4, 5, 6, 7, \ 42 40 CEPHFS_FEATURE_MIMIC, \ ··· 43 45 CEPHFS_FEATURE_MULTI_RECONNECT, \ 44 46 CEPHFS_FEATURE_DELEG_INO, \ 45 47 CEPHFS_FEATURE_METRIC_COLLECT, \ 46 - \ 47 - CEPHFS_FEATURE_MAX, \ 48 + CEPHFS_FEATURE_NOTIFY_SESSION_STATE, \ 48 49 } 49 - #define CEPHFS_FEATURES_CLIENT_REQUIRED {} 50 50 51 51 /* 52 52 * Some lock dependencies: ··· 578 582 TASK_KILLABLE); 579 583 } 580 584 585 + extern int ceph_wait_on_conflict_unlink(struct dentry *dentry); 581 586 extern u64 ceph_get_deleg_ino(struct ceph_mds_session *session); 582 587 extern int ceph_restore_deleg_ino(struct ceph_mds_session *session, u64 ino); 583 588 #endif
+18 -4
fs/ceph/mdsmap.c
··· 352 352 __decode_and_drop_type(p, end, u8, bad_ext); 353 353 } 354 354 if (mdsmap_ev >= 8) { 355 - u32 name_len; 356 355 /* enabled */ 357 356 ceph_decode_8_safe(p, end, m->m_enabled, bad_ext); 358 - ceph_decode_32_safe(p, end, name_len, bad_ext); 359 - ceph_decode_need(p, end, name_len, bad_ext); 360 - *p += name_len; 357 + /* fs_name */ 358 + ceph_decode_skip_string(p, end, bad_ext); 361 359 } 362 360 /* damaged */ 363 361 if (mdsmap_ev >= 9) { ··· 367 369 m->m_damaged = n > 0; 368 370 } else { 369 371 m->m_damaged = false; 372 + } 373 + if (mdsmap_ev >= 17) { 374 + /* balancer */ 375 + ceph_decode_skip_string(p, end, bad_ext); 376 + /* standby_count_wanted */ 377 + ceph_decode_skip_32(p, end, bad_ext); 378 + /* old_max_mds */ 379 + ceph_decode_skip_32(p, end, bad_ext); 380 + /* min_compat_client */ 381 + ceph_decode_skip_8(p, end, bad_ext); 382 + /* required_client_features */ 383 + ceph_decode_skip_set(p, end, 64, bad_ext); 384 + ceph_decode_64_safe(p, end, m->m_max_xattr_size, bad_ext); 385 + } else { 386 + /* This forces the usage of the (sync) SETXATTR Op */ 387 + m->m_max_xattr_size = 0; 370 388 } 371 389 bad_ext: 372 390 dout("mdsmap_decode m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
+12 -7
fs/ceph/super.c
··· 72 72 buf->f_type = CEPH_SUPER_MAGIC; /* ?? */ 73 73 74 74 /* 75 - * express utilization in terms of large blocks to avoid 75 + * Express utilization in terms of large blocks to avoid 76 76 * overflow on 32-bit machines. 77 - * 78 - * NOTE: for the time being, we make bsize == frsize to humor 79 - * not-yet-ancient versions of glibc that are broken. 80 - * Someday, we will probably want to report a real block 81 - * size... whatever that may mean for a network file system! 82 77 */ 83 - buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; 84 78 buf->f_frsize = 1 << CEPH_BLOCK_SHIFT; 85 79 86 80 /* ··· 88 94 buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); 89 95 buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); 90 96 } 97 + 98 + /* 99 + * NOTE: for the time being, we make bsize == frsize to humor 100 + * not-yet-ancient versions of glibc that are broken. 101 + * Someday, we will probably want to report a real block 102 + * size... whatever that may mean for a network file system! 103 + */ 104 + buf->f_bsize = buf->f_frsize; 91 105 92 106 buf->f_files = le64_to_cpu(st.num_objects); 93 107 buf->f_ffree = -1; ··· 817 815 fsc->cap_wq = alloc_workqueue("ceph-cap", 0, 1); 818 816 if (!fsc->cap_wq) 819 817 goto fail_inode_wq; 818 + 819 + hash_init(fsc->async_unlink_conflict); 820 + spin_lock_init(&fsc->async_unlink_conflict_lock); 820 821 821 822 spin_lock(&ceph_fsc_lock); 822 823 list_add_tail(&fsc->metric_wakeup, &ceph_fsc_list);
+26 -5
fs/ceph/super.h
··· 19 19 #include <linux/security.h> 20 20 #include <linux/netfs.h> 21 21 #include <linux/fscache.h> 22 + #include <linux/hashtable.h> 22 23 23 24 #include <linux/ceph/libceph.h> 24 25 ··· 100 99 char *mon_addr; 101 100 }; 102 101 102 + #define CEPH_ASYNC_CREATE_CONFLICT_BITS 8 103 + 103 104 struct ceph_fs_client { 104 105 struct super_block *sb; 105 106 ··· 126 123 127 124 struct workqueue_struct *inode_wq; 128 125 struct workqueue_struct *cap_wq; 126 + 127 + DECLARE_HASHTABLE(async_unlink_conflict, CEPH_ASYNC_CREATE_CONFLICT_BITS); 128 + spinlock_t async_unlink_conflict_lock; 129 129 130 130 #ifdef CONFIG_DEBUG_FS 131 131 struct dentry *debugfs_dentry_lru, *debugfs_caps; ··· 286 280 struct dentry *dentry; 287 281 struct ceph_mds_session *lease_session; 288 282 struct list_head lease_list; 289 - unsigned flags; 283 + struct hlist_node hnode; 284 + unsigned long flags; 290 285 int lease_shared_gen; 291 286 u32 lease_gen; 292 287 u32 lease_seq; ··· 296 289 u64 offset; 297 290 }; 298 291 299 - #define CEPH_DENTRY_REFERENCED 1 300 - #define CEPH_DENTRY_LEASE_LIST 2 301 - #define CEPH_DENTRY_SHRINK_LIST 4 302 - #define CEPH_DENTRY_PRIMARY_LINK 8 292 + #define CEPH_DENTRY_REFERENCED (1 << 0) 293 + #define CEPH_DENTRY_LEASE_LIST (1 << 1) 294 + #define CEPH_DENTRY_SHRINK_LIST (1 << 2) 295 + #define CEPH_DENTRY_PRIMARY_LINK (1 << 3) 296 + #define CEPH_DENTRY_ASYNC_UNLINK_BIT (4) 297 + #define CEPH_DENTRY_ASYNC_UNLINK (1 << CEPH_DENTRY_ASYNC_UNLINK_BIT) 298 + #define CEPH_DENTRY_ASYNC_CREATE_BIT (5) 299 + #define CEPH_DENTRY_ASYNC_CREATE (1 << CEPH_DENTRY_ASYNC_CREATE_BIT) 303 300 304 301 struct ceph_inode_xattrs_info { 305 302 /* ··· 769 758 extern void ceph_reservation_status(struct ceph_fs_client *client, 770 759 int *total, int *avail, int *used, 771 760 int *reserved, int *min); 761 + extern void change_auth_cap_ses(struct ceph_inode_info *ci, 762 + struct ceph_mds_session *session); 772 763 773 764 774 765 ··· 1230 1217 extern int ceph_pool_perm_check(struct inode 
*inode, int need); 1231 1218 extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); 1232 1219 int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate); 1220 + 1221 + static inline bool ceph_has_inline_data(struct ceph_inode_info *ci) 1222 + { 1223 + if (ci->i_inline_version == CEPH_INLINE_NONE || 1224 + ci->i_inline_version == 1) /* initial version, no data */ 1225 + return false; 1226 + return true; 1227 + } 1233 1228 1234 1229 /* file.c */ 1235 1230 extern const struct file_operations ceph_file_fops;
+8 -4
fs/ceph/xattr.c
··· 1086 1086 flags |= CEPH_XATTR_REMOVE; 1087 1087 } 1088 1088 1089 - dout("setxattr value=%.*s\n", (int)size, value); 1089 + dout("setxattr value size: %zu\n", size); 1090 1090 1091 1091 /* do request */ 1092 1092 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); ··· 1184 1184 spin_lock(&ci->i_ceph_lock); 1185 1185 retry: 1186 1186 issued = __ceph_caps_issued(ci, NULL); 1187 - if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) 1187 + required_blob_size = __get_required_blob_size(ci, name_len, val_len); 1188 + if ((ci->i_xattrs.version == 0) || !(issued & CEPH_CAP_XATTR_EXCL) || 1189 + (required_blob_size > mdsc->mdsmap->m_max_xattr_size)) { 1190 + dout("%s do sync setxattr: version: %llu size: %d max: %llu\n", 1191 + __func__, ci->i_xattrs.version, required_blob_size, 1192 + mdsc->mdsmap->m_max_xattr_size); 1188 1193 goto do_sync; 1194 + } 1189 1195 1190 1196 if (!lock_snap_rwsem && !ci->i_head_snapc) { 1191 1197 lock_snap_rwsem = true; ··· 1206 1200 dout("setxattr %p name '%s' issued %s\n", inode, name, 1207 1201 ceph_cap_string(issued)); 1208 1202 __build_xattrs(inode); 1209 - 1210 - required_blob_size = __get_required_blob_size(ci, name_len, val_len); 1211 1203 1212 1204 if (!ci->i_xattrs.prealloc_blob || 1213 1205 required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
+30 -6
fs/crypto/fname.c
··· 86 86 /** 87 87 * fscrypt_fname_encrypt() - encrypt a filename 88 88 * @inode: inode of the parent directory (for regular filenames) 89 - * or of the symlink (for symlink targets) 89 + * or of the symlink (for symlink targets). Key must already be 90 + * set up. 90 91 * @iname: the filename to encrypt 91 92 * @out: (output) the encrypted filename 92 93 * @olen: size of the encrypted filename. It must be at least @iname->len. ··· 138 137 139 138 return 0; 140 139 } 140 + EXPORT_SYMBOL_GPL(fscrypt_fname_encrypt); 141 141 142 142 /** 143 143 * fname_decrypt() - decrypt a filename ··· 266 264 return bp - dst; 267 265 } 268 266 269 - bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, 270 - u32 orig_len, u32 max_len, 271 - u32 *encrypted_len_ret) 267 + bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, 268 + u32 orig_len, u32 max_len, 269 + u32 *encrypted_len_ret) 272 270 { 273 271 int padding = 4 << (fscrypt_policy_flags(policy) & 274 272 FSCRYPT_POLICY_FLAGS_PAD_MASK); ··· 281 279 *encrypted_len_ret = min(encrypted_len, max_len); 282 280 return true; 283 281 } 282 + 283 + /** 284 + * fscrypt_fname_encrypted_size() - calculate length of encrypted filename 285 + * @inode: parent inode of dentry name being encrypted. Key must 286 + * already be set up. 287 + * @orig_len: length of the original filename 288 + * @max_len: maximum length to return 289 + * @encrypted_len_ret: where calculated length should be returned (on success) 290 + * 291 + * Filenames that are shorter than the maximum length may have their lengths 292 + * increased slightly by encryption, due to padding that is applied. 293 + * 294 + * Return: false if the orig_len is greater than max_len. Otherwise, true and 295 + * fill out encrypted_len_ret with the length (up to max_len). 
296 + */ 297 + bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, 298 + u32 max_len, u32 *encrypted_len_ret) 299 + { 300 + return __fscrypt_fname_encrypted_size(&inode->i_crypt_info->ci_policy, 301 + orig_len, max_len, 302 + encrypted_len_ret); 303 + } 304 + EXPORT_SYMBOL_GPL(fscrypt_fname_encrypted_size); 284 305 285 306 /** 286 307 * fscrypt_fname_alloc_buffer() - allocate a buffer for presented filenames ··· 460 435 return ret; 461 436 462 437 if (fscrypt_has_encryption_key(dir)) { 463 - if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy, 464 - iname->len, NAME_MAX, 438 + if (!fscrypt_fname_encrypted_size(dir, iname->len, NAME_MAX, 465 439 &fname->crypto_buf.len)) 466 440 return -ENAMETOOLONG; 467 441 fname->crypto_buf.name = kmalloc(fname->crypto_buf.len,
+3 -6
fs/crypto/fscrypt_private.h
··· 297 297 const struct fscrypt_info *ci); 298 298 299 299 /* fname.c */ 300 - int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, 301 - u8 *out, unsigned int olen); 302 - bool fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, 303 - u32 orig_len, u32 max_len, 304 - u32 *encrypted_len_ret); 300 + bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, 301 + u32 orig_len, u32 max_len, 302 + u32 *encrypted_len_ret); 305 303 306 304 /* hkdf.c */ 307 - 308 305 struct fscrypt_hkdf { 309 306 struct crypto_shash *hmac_tfm; 310 307 };
+3 -3
fs/crypto/hooks.c
··· 228 228 * counting it (even though it is meaningless for ciphertext) is simpler 229 229 * for now since filesystems will assume it is there and subtract it. 230 230 */ 231 - if (!fscrypt_fname_encrypted_size(policy, len, 232 - max_len - sizeof(struct fscrypt_symlink_data), 233 - &disk_link->len)) 231 + if (!__fscrypt_fname_encrypted_size(policy, len, 232 + max_len - sizeof(struct fscrypt_symlink_data), 233 + &disk_link->len)) 234 234 return -ENAMETOOLONG; 235 235 disk_link->len += sizeof(struct fscrypt_symlink_data); 236 236
+29 -6
fs/crypto/policy.c
··· 694 694 } 695 695 696 696 /** 697 + * fscrypt_context_for_new_inode() - create an encryption context for a new inode 698 + * @ctx: where context should be written 699 + * @inode: inode from which to fetch policy and nonce 700 + * 701 + * Given an in-core "prepared" (via fscrypt_prepare_new_inode) inode, 702 + * generate a new context and write it to ctx. ctx _must_ be at least 703 + * FSCRYPT_SET_CONTEXT_MAX_SIZE bytes. 704 + * 705 + * Return: size of the resulting context or a negative error code. 706 + */ 707 + int fscrypt_context_for_new_inode(void *ctx, struct inode *inode) 708 + { 709 + struct fscrypt_info *ci = inode->i_crypt_info; 710 + 711 + BUILD_BUG_ON(sizeof(union fscrypt_context) != 712 + FSCRYPT_SET_CONTEXT_MAX_SIZE); 713 + 714 + /* fscrypt_prepare_new_inode() should have set up the key already. */ 715 + if (WARN_ON_ONCE(!ci)) 716 + return -ENOKEY; 717 + 718 + return fscrypt_new_context(ctx, &ci->ci_policy, ci->ci_nonce); 719 + } 720 + EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode); 721 + 722 + /** 697 723 * fscrypt_set_context() - Set the fscrypt context of a new inode 698 724 * @inode: a new inode 699 725 * @fs_data: private data given by FS and passed to ->set_context() ··· 735 709 union fscrypt_context ctx; 736 710 int ctxsize; 737 711 738 - /* fscrypt_prepare_new_inode() should have set up the key already. */ 739 - if (WARN_ON_ONCE(!ci)) 740 - return -ENOKEY; 741 - 742 - BUILD_BUG_ON(sizeof(ctx) != FSCRYPT_SET_CONTEXT_MAX_SIZE); 743 - ctxsize = fscrypt_new_context(&ctx, &ci->ci_policy, ci->ci_nonce); 712 + ctxsize = fscrypt_context_for_new_inode(&ctx, inode); 713 + if (ctxsize < 0) 714 + return ctxsize; 744 715 745 716 /* 746 717 * This may be the first time the inode number is available, so do any
+11 -4
fs/dcache.c
··· 2248 2248 } 2249 2249 EXPORT_SYMBOL(d_add_ci); 2250 2250 2251 - 2252 - static inline bool d_same_name(const struct dentry *dentry, 2253 - const struct dentry *parent, 2254 - const struct qstr *name) 2251 + /** 2252 + * d_same_name - compare dentry name with case-exact name 2253 + * @parent: parent dentry 2254 + * @dentry: the negative dentry that was passed to the parent's lookup func 2255 + * @name: the case-exact name to be associated with the returned dentry 2256 + * 2257 + * Return: true if names are same, or false 2258 + */ 2259 + bool d_same_name(const struct dentry *dentry, const struct dentry *parent, 2260 + const struct qstr *name) 2255 2261 { 2256 2262 if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) { 2257 2263 if (dentry->d_name.len != name->len) ··· 2268 2262 dentry->d_name.len, dentry->d_name.name, 2269 2263 name) == 0; 2270 2264 } 2265 + EXPORT_SYMBOL_GPL(d_same_name); 2271 2266 2272 2267 /** 2273 2268 * __d_lookup_rcu - search for a dentry (racy, store-free)
+7 -3
fs/inode.c
··· 422 422 INIT_LIST_HEAD(&inode->i_io_list); 423 423 INIT_LIST_HEAD(&inode->i_wb_list); 424 424 INIT_LIST_HEAD(&inode->i_lru); 425 + INIT_LIST_HEAD(&inode->i_sb_list); 425 426 __address_space_init_once(&inode->i_data); 426 427 i_size_ordered_init(inode); 427 428 } ··· 1022 1021 spin_lock(&inode->i_lock); 1023 1022 inode->i_state = 0; 1024 1023 spin_unlock(&inode->i_lock); 1025 - INIT_LIST_HEAD(&inode->i_sb_list); 1026 1024 } 1027 1025 return inode; 1028 1026 } ··· 1165 1165 { 1166 1166 struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); 1167 1167 struct inode *old; 1168 - bool creating = inode->i_state & I_CREATING; 1169 1168 1170 1169 again: 1171 1170 spin_lock(&inode_hash_lock); ··· 1198 1199 inode->i_state |= I_NEW; 1199 1200 hlist_add_head_rcu(&inode->i_hash, head); 1200 1201 spin_unlock(&inode->i_lock); 1201 - if (!creating) 1202 + 1203 + /* 1204 + * Add inode to the sb list if it's not already. It has I_NEW at this 1205 + * point, so it should be safe to test i_sb_list locklessly. 1206 + */ 1207 + if (list_empty(&inode->i_sb_list)) 1202 1208 inode_sb_list_add(inode); 1203 1209 unlock: 1204 1210 spin_unlock(&inode_hash_lock);
+4 -4
include/linux/ceph/ceph_fs.h
··· 433 433 __le32 stripe_unit; /* layout for newly created file */ 434 434 __le32 stripe_count; /* ... */ 435 435 __le32 object_size; 436 - __le32 file_replication; 437 - __le32 mask; /* CEPH_CAP_* */ 438 - __le32 old_size; 436 + __le32 pool; 437 + __le32 mask; /* CEPH_CAP_* */ 438 + __le64 old_size; 439 439 } __attribute__ ((packed)) open; 440 440 struct { 441 441 __le32 flags; ··· 768 768 __le32 xattr_len; 769 769 __le64 xattr_version; 770 770 771 - /* filelock */ 771 + /* a union of non-export and export bodies. */ 772 772 __le64 size, max_size, truncate_size; 773 773 __le32 truncate_seq; 774 774 struct ceph_timespec mtime, atime, ctime;
+1
include/linux/ceph/mdsmap.h
··· 25 25 u32 m_session_timeout; /* seconds */ 26 26 u32 m_session_autoclose; /* seconds */ 27 27 u64 m_max_file_size; 28 + u64 m_max_xattr_size; /* maximum size for xattrs blob */ 28 29 u32 m_max_mds; /* expected up:active mds number */ 29 30 u32 m_num_active_mds; /* actual up:active mds number */ 30 31 u32 possible_max_rank; /* possible max rank index */
+2 -3
include/linux/ceph/osd_client.h
··· 507 507 extern void ceph_osdc_get_request(struct ceph_osd_request *req); 508 508 extern void ceph_osdc_put_request(struct ceph_osd_request *req); 509 509 510 - extern int ceph_osdc_start_request(struct ceph_osd_client *osdc, 511 - struct ceph_osd_request *req, 512 - bool nofail); 510 + void ceph_osdc_start_request(struct ceph_osd_client *osdc, 511 + struct ceph_osd_request *req); 513 512 extern void ceph_osdc_cancel_request(struct ceph_osd_request *req); 514 513 extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc, 515 514 struct ceph_osd_request *req);
+2
include/linux/dcache.h
··· 233 233 wait_queue_head_t *); 234 234 extern struct dentry * d_splice_alias(struct inode *, struct dentry *); 235 235 extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); 236 + extern bool d_same_name(const struct dentry *dentry, const struct dentry *parent, 237 + const struct qstr *name); 236 238 extern struct dentry * d_exact_alias(struct dentry *, struct inode *); 237 239 extern struct dentry *d_find_any_alias(struct inode *inode); 238 240 extern struct dentry * d_obtain_alias(struct inode *);
+5
include/linux/fscrypt.h
··· 284 284 int fscrypt_ioctl_get_policy_ex(struct file *filp, void __user *arg); 285 285 int fscrypt_ioctl_get_nonce(struct file *filp, void __user *arg); 286 286 int fscrypt_has_permitted_context(struct inode *parent, struct inode *child); 287 + int fscrypt_context_for_new_inode(void *ctx, struct inode *inode); 287 288 int fscrypt_set_context(struct inode *inode, void *fs_data); 288 289 289 290 struct fscrypt_dummy_policy { ··· 328 327 int fscrypt_drop_inode(struct inode *inode); 329 328 330 329 /* fname.c */ 330 + int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, 331 + u8 *out, unsigned int olen); 332 + bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, 333 + u32 max_len, u32 *encrypted_len_ret); 331 334 int fscrypt_setup_filename(struct inode *inode, const struct qstr *iname, 332 335 int lookup, struct fscrypt_name *fname); 333 336
+10
include/linux/mmdebug.h
··· 54 54 } \ 55 55 unlikely(__ret_warn_once); \ 56 56 }) 57 + #define VM_WARN_ON_FOLIO(cond, folio) ({ \ 58 + int __ret_warn = !!(cond); \ 59 + \ 60 + if (unlikely(__ret_warn)) { \ 61 + dump_page(&folio->page, "VM_WARN_ON_FOLIO(" __stringify(cond)")");\ 62 + WARN_ON(1); \ 63 + } \ 64 + unlikely(__ret_warn); \ 65 + }) 57 66 #define VM_WARN_ON_ONCE_FOLIO(cond, folio) ({ \ 58 67 static bool __section(".data.once") __warned; \ 59 68 int __ret_warn_once = !!(cond); \ ··· 88 79 #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) 89 80 #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) 90 81 #define VM_WARN_ON_ONCE_PAGE(cond, page) BUILD_BUG_ON_INVALID(cond) 82 + #define VM_WARN_ON_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) 91 83 #define VM_WARN_ON_ONCE_FOLIO(cond, folio) BUILD_BUG_ON_INVALID(cond) 92 84 #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) 93 85 #define VM_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond)
+6 -9
net/ceph/osd_client.c
··· 4578 4578 /* 4579 4579 * Register request, send initial attempt. 4580 4580 */ 4581 - int ceph_osdc_start_request(struct ceph_osd_client *osdc, 4582 - struct ceph_osd_request *req, 4583 - bool nofail) 4581 + void ceph_osdc_start_request(struct ceph_osd_client *osdc, 4582 + struct ceph_osd_request *req) 4584 4583 { 4585 4584 down_read(&osdc->lock); 4586 4585 submit_request(req, false); 4587 4586 up_read(&osdc->lock); 4588 - 4589 - return 0; 4590 4587 } 4591 4588 EXPORT_SYMBOL(ceph_osdc_start_request); 4592 4589 ··· 4753 4756 if (ret) 4754 4757 goto out_put_req; 4755 4758 4756 - ceph_osdc_start_request(osdc, req, false); 4759 + ceph_osdc_start_request(osdc, req); 4757 4760 linger_cancel(lreq); 4758 4761 linger_put(lreq); 4759 4762 ret = wait_request_timeout(req, opts->mount_timeout); ··· 4824 4827 if (ret) 4825 4828 goto out_put_req; 4826 4829 4827 - ceph_osdc_start_request(osdc, req, false); 4830 + ceph_osdc_start_request(osdc, req); 4828 4831 ret = ceph_osdc_wait_request(osdc, req); 4829 4832 4830 4833 out_put_req: ··· 5040 5043 if (ret) 5041 5044 goto out_put_req; 5042 5045 5043 - ceph_osdc_start_request(osdc, req, false); 5046 + ceph_osdc_start_request(osdc, req); 5044 5047 ret = ceph_osdc_wait_request(osdc, req); 5045 5048 if (ret >= 0) { 5046 5049 void *p = page_address(pages[0]); ··· 5117 5120 if (ret) 5118 5121 goto out_put_req; 5119 5122 5120 - ceph_osdc_start_request(osdc, req, false); 5123 + ceph_osdc_start_request(osdc, req); 5121 5124 ret = ceph_osdc_wait_request(osdc, req); 5122 5125 if (ret >= 0) { 5123 5126 ret = req->r_ops[0].rval;
+24 -8
net/ceph/osdmap.c
··· 11 11 #include <linux/crush/hash.h> 12 12 #include <linux/crush/mapper.h> 13 13 14 + static __printf(2, 3) 15 + void osdmap_info(const struct ceph_osdmap *map, const char *fmt, ...) 16 + { 17 + struct va_format vaf; 18 + va_list args; 19 + 20 + va_start(args, fmt); 21 + vaf.fmt = fmt; 22 + vaf.va = &args; 23 + 24 + printk(KERN_INFO "%s (%pU e%u): %pV", KBUILD_MODNAME, &map->fsid, 25 + map->epoch, &vaf); 26 + 27 + va_end(args); 28 + } 29 + 14 30 char *ceph_osdmap_state_str(char *str, int len, u32 state) 15 31 { 16 32 if (!len) ··· 587 571 goto bad; 588 572 #endif 589 573 r = kmalloc(struct_size(r, steps, yes), GFP_NOFS); 590 - c->rules[i] = r; 591 574 if (r == NULL) 592 575 goto badmem; 593 576 dout(" rule %d is at %p\n", i, r); 577 + c->rules[i] = r; 594 578 r->len = yes; 595 579 ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */ 596 580 ceph_decode_need(p, end, r->len*3*sizeof(u32), bad); ··· 1582 1566 if (ret) 1583 1567 return ret; 1584 1568 1585 - pr_info("osd%d primary-affinity 0x%x\n", osd, aff); 1569 + osdmap_info(map, "osd%d primary-affinity 0x%x\n", osd, aff); 1586 1570 } 1587 1571 1588 1572 return 0; ··· 1880 1864 osd = ceph_decode_32(p); 1881 1865 w = ceph_decode_32(p); 1882 1866 BUG_ON(osd >= map->max_osd); 1883 - pr_info("osd%d weight 0x%x %s\n", osd, w, 1884 - w == CEPH_OSD_IN ? "(in)" : 1885 - (w == CEPH_OSD_OUT ? "(out)" : "")); 1867 + osdmap_info(map, "osd%d weight 0x%x %s\n", osd, w, 1868 + w == CEPH_OSD_IN ? "(in)" : 1869 + (w == CEPH_OSD_OUT ? "(out)" : "")); 1886 1870 map->osd_weight[osd] = w; 1887 1871 1888 1872 /* ··· 1914 1898 BUG_ON(osd >= map->max_osd); 1915 1899 if ((map->osd_state[osd] & CEPH_OSD_UP) && 1916 1900 (xorstate & CEPH_OSD_UP)) 1917 - pr_info("osd%d down\n", osd); 1901 + osdmap_info(map, "osd%d down\n", osd); 1918 1902 if ((map->osd_state[osd] & CEPH_OSD_EXISTS) && 1919 1903 (xorstate & CEPH_OSD_EXISTS)) { 1920 - pr_info("osd%d does not exist\n", osd); 1904 + osdmap_info(map, "osd%d does not exist\n", osd); 1921 1905 ret = set_primary_affinity(map, osd, 1922 1906 CEPH_OSD_DEFAULT_PRIMARY_AFFINITY); 1923 1907 if (ret) ··· 1947 1931 1948 1932 dout("%s osd%d addr %s\n", __func__, osd, ceph_pr_addr(&addr)); 1949 1933 1950 - pr_info("osd%d up\n", osd); 1934 + osdmap_info(map, "osd%d up\n", osd); 1951 1935 map->osd_state[osd] |= CEPH_OSD_EXISTS | CEPH_OSD_UP; 1952 1936 map->osd_addr[osd] = addr; 1953 1937 }
+1 -1
net/ceph/pagelist.c
··· 96 96 EXPORT_SYMBOL(ceph_pagelist_append); 97 97 98 98 /* Allocate enough pages for a pagelist to append the given amount 99 - * of data without without allocating. 99 + * of data without allocating. 100 100 * Returns: 0 on success, -ENOMEM on error. 101 101 */ 102 102 int ceph_pagelist_reserve(struct ceph_pagelist *pl, size_t space)