Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

+7 -4

drivers/block/rbd.c

··· 2370 2370 opcode = CEPH_OSD_OP_READ; 2371 2371 } 2372 2372 2373 - osd_req_op_extent_init(osd_request, num_ops, opcode, offset, length, 2374 - 0, 0); 2373 + if (opcode == CEPH_OSD_OP_DELETE) 2374 + osd_req_op_init(osd_request, num_ops, opcode); 2375 + else 2376 + osd_req_op_extent_init(osd_request, num_ops, opcode, 2377 + offset, length, 0, 0); 2378 + 2375 2379 if (obj_request->type == OBJ_REQUEST_BIO) 2376 2380 osd_req_op_extent_osd_data_bio(osd_request, num_ops, 2377 2381 obj_request->bio_list, length); ··· 3409 3405 if (result) 3410 3406 rbd_warn(rbd_dev, "%s %llx at %llx result %d", 3411 3407 obj_op_name(op_type), length, offset, result); 3412 - if (snapc) 3413 - ceph_put_snap_context(snapc); 3408 + ceph_put_snap_context(snapc); 3414 3409 blk_end_request_all(rq, result); 3415 3410 } 3416 3411

+262 -11

fs/ceph/addr.c

··· 192 192 struct ceph_osd_client *osdc = 193 193 &ceph_inode_to_client(inode)->client->osdc; 194 194 int err = 0; 195 + u64 off = page_offset(page); 195 196 u64 len = PAGE_CACHE_SIZE; 196 197 197 - err = ceph_readpage_from_fscache(inode, page); 198 + if (off >= i_size_read(inode)) { 199 + zero_user_segment(page, err, PAGE_CACHE_SIZE); 200 + SetPageUptodate(page); 201 + return 0; 202 + } 198 203 204 + /* 205 + * Uptodate inline data should have been added into page cache 206 + * while getting Fcr caps. 207 + */ 208 + if (ci->i_inline_version != CEPH_INLINE_NONE) 209 + return -EINVAL; 210 + 211 + err = ceph_readpage_from_fscache(inode, page); 199 212 if (err == 0) 200 213 goto out; 201 214 202 215 dout("readpage inode %p file %p page %p index %lu\n", 203 216 inode, filp, page, page->index); 204 217 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 205 - (u64) page_offset(page), &len, 218 + off, &len, 206 219 ci->i_truncate_seq, ci->i_truncate_size, 207 220 &page, 1, 0); 208 221 if (err == -ENOENT) ··· 332 319 off, len); 333 320 vino = ceph_vino(inode); 334 321 req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, 335 - 1, CEPH_OSD_OP_READ, 322 + 0, 1, CEPH_OSD_OP_READ, 336 323 CEPH_OSD_FLAG_READ, NULL, 337 324 ci->i_truncate_seq, ci->i_truncate_size, 338 325 false); ··· 396 383 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 397 384 int rc = 0; 398 385 int max = 0; 386 + 387 + if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE) 388 + return -EINVAL; 399 389 400 390 rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list, 401 391 &nr_pages); ··· 689 673 int rc = 0; 690 674 unsigned wsize = 1 << inode->i_blkbits; 691 675 struct ceph_osd_request *req = NULL; 692 - int do_sync; 676 + int do_sync = 0; 693 677 u64 truncate_size, snap_size; 694 678 u32 truncate_seq; 695 679 ··· 766 750 last_snapc = snapc; 767 751 768 752 while (!done && index <= end) { 769 - int num_ops = do_sync ? 2 : 1; 770 753 unsigned i; 771 754 int first; 772 755 pgoff_t next; ··· 865 850 len = wsize; 866 851 req = ceph_osdc_new_request(&fsc->client->osdc, 867 852 &ci->i_layout, vino, 868 - offset, &len, num_ops, 853 + offset, &len, 0, 854 + do_sync ? 2 : 1, 869 855 CEPH_OSD_OP_WRITE, 870 856 CEPH_OSD_FLAG_WRITE | 871 857 CEPH_OSD_FLAG_ONDISK, ··· 877 861 unlock_page(page); 878 862 break; 879 863 } 864 + 865 + if (do_sync) 866 + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); 880 867 881 868 req->r_callback = writepages_finish; 882 869 req->r_inode = inode; ··· 1223 1204 struct inode *inode = file_inode(vma->vm_file); 1224 1205 struct ceph_inode_info *ci = ceph_inode(inode); 1225 1206 struct ceph_file_info *fi = vma->vm_file->private_data; 1207 + struct page *pinned_page = NULL; 1226 1208 loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT; 1227 1209 int want, got, ret; 1228 1210 ··· 1235 1215 want = CEPH_CAP_FILE_CACHE; 1236 1216 while (1) { 1237 1217 got = 0; 1238 - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); 1218 + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, 1219 + -1, &got, &pinned_page); 1239 1220 if (ret == 0) 1240 1221 break; 1241 1222 if (ret != -ERESTARTSYS) { ··· 1247 1226 dout("filemap_fault %p %llu~%zd got cap refs on %s\n", 1248 1227 inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got)); 1249 1228 1250 - ret = filemap_fault(vma, vmf); 1229 + if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || 1230 + ci->i_inline_version == CEPH_INLINE_NONE) 1231 + ret = filemap_fault(vma, vmf); 1232 + else 1233 + ret = -EAGAIN; 1251 1234 1252 1235 dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n", 1253 1236 inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret); 1237 + if (pinned_page) 1238 + page_cache_release(pinned_page); 1254 1239 ceph_put_cap_refs(ci, got); 1255 1240 1241 + if (ret != -EAGAIN) 1242 + return ret; 1243 + 1244 + /* read inline data */ 1245 + if (off >= PAGE_CACHE_SIZE) { 1246 + /* does not support inline data > PAGE_SIZE */ 1247 + ret = VM_FAULT_SIGBUS; 1248 + } else { 1249 + int ret1; 1250 + struct address_space *mapping = inode->i_mapping; 1251 + struct page *page = find_or_create_page(mapping, 0, 1252 + mapping_gfp_mask(mapping) & 1253 + ~__GFP_FS); 1254 + if (!page) { 1255 + ret = VM_FAULT_OOM; 1256 + goto out; 1257 + } 1258 + ret1 = __ceph_do_getattr(inode, page, 1259 + CEPH_STAT_CAP_INLINE_DATA, true); 1260 + if (ret1 < 0 || off >= i_size_read(inode)) { 1261 + unlock_page(page); 1262 + page_cache_release(page); 1263 + ret = VM_FAULT_SIGBUS; 1264 + goto out; 1265 + } 1266 + if (ret1 < PAGE_CACHE_SIZE) 1267 + zero_user_segment(page, ret1, PAGE_CACHE_SIZE); 1268 + else 1269 + flush_dcache_page(page); 1270 + SetPageUptodate(page); 1271 + vmf->page = page; 1272 + ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; 1273 + } 1274 + out: 1275 + dout("filemap_fault %p %llu~%zd read inline data ret %d\n", 1276 + inode, off, (size_t)PAGE_CACHE_SIZE, ret); 1256 1277 return ret; 1257 1278 } 1258 1279 ··· 1313 1250 size_t len; 1314 1251 int want, got, ret; 1315 1252 1253 + if (ci->i_inline_version != CEPH_INLINE_NONE) { 1254 + struct page *locked_page = NULL; 1255 + if (off == 0) { 1256 + lock_page(page); 1257 + locked_page = page; 1258 + } 1259 + ret = ceph_uninline_data(vma->vm_file, locked_page); 1260 + if (locked_page) 1261 + unlock_page(locked_page); 1262 + if (ret < 0) 1263 + return VM_FAULT_SIGBUS; 1264 + } 1265 + 1316 1266 if (off + PAGE_CACHE_SIZE <= size) 1317 1267 len = PAGE_CACHE_SIZE; 1318 1268 else ··· 1339 1263 want = CEPH_CAP_FILE_BUFFER; 1340 1264 while (1) { 1341 1265 got = 0; 1342 - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, off + len); 1266 + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, 1267 + &got, NULL); 1343 1268 if (ret == 0) 1344 1269 break; 1345 1270 if (ret != -ERESTARTSYS) { ··· 1374 1297 ret = VM_FAULT_SIGBUS; 1375 1298 } 1376 1299 out: 1377 - if (ret != VM_FAULT_LOCKED) { 1300 + if (ret != VM_FAULT_LOCKED) 1378 1301 unlock_page(page); 1379 - } else { 1302 + if (ret == VM_FAULT_LOCKED || 1303 + ci->i_inline_version != CEPH_INLINE_NONE) { 1380 1304 int dirty; 1381 1305 spin_lock(&ci->i_ceph_lock); 1306 + ci->i_inline_version = CEPH_INLINE_NONE; 1382 1307 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); 1383 1308 spin_unlock(&ci->i_ceph_lock); 1384 1309 if (dirty) ··· 1392 1313 ceph_put_cap_refs(ci, got); 1393 1314 1394 1315 return ret; 1316 + } 1317 + 1318 + void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, 1319 + char *data, size_t len) 1320 + { 1321 + struct address_space *mapping = inode->i_mapping; 1322 + struct page *page; 1323 + 1324 + if (locked_page) { 1325 + page = locked_page; 1326 + } else { 1327 + if (i_size_read(inode) == 0) 1328 + return; 1329 + page = find_or_create_page(mapping, 0, 1330 + mapping_gfp_mask(mapping) & ~__GFP_FS); 1331 + if (!page) 1332 + return; 1333 + if (PageUptodate(page)) { 1334 + unlock_page(page); 1335 + page_cache_release(page); 1336 + return; 1337 + } 1338 + } 1339 + 1340 + dout("fill_inline_data %p %llx.%llx len %lu locked_page %p\n", 1341 + inode, ceph_vinop(inode), len, locked_page); 1342 + 1343 + if (len > 0) { 1344 + void *kaddr = kmap_atomic(page); 1345 + memcpy(kaddr, data, len); 1346 + kunmap_atomic(kaddr); 1347 + } 1348 + 1349 + if (page != locked_page) { 1350 + if (len < PAGE_CACHE_SIZE) 1351 + zero_user_segment(page, len, PAGE_CACHE_SIZE); 1352 + else 1353 + flush_dcache_page(page); 1354 + 1355 + SetPageUptodate(page); 1356 + unlock_page(page); 1357 + page_cache_release(page); 1358 + } 1359 + } 1360 + 1361 + int ceph_uninline_data(struct file *filp, struct page *locked_page) 1362 + { 1363 + struct inode *inode = file_inode(filp); 1364 + struct ceph_inode_info *ci = ceph_inode(inode); 1365 + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 1366 + struct ceph_osd_request *req; 1367 + struct page *page = NULL; 1368 + u64 len, inline_version; 1369 + int err = 0; 1370 + bool from_pagecache = false; 1371 + 1372 + spin_lock(&ci->i_ceph_lock); 1373 + inline_version = ci->i_inline_version; 1374 + spin_unlock(&ci->i_ceph_lock); 1375 + 1376 + dout("uninline_data %p %llx.%llx inline_version %llu\n", 1377 + inode, ceph_vinop(inode), inline_version); 1378 + 1379 + if (inline_version == 1 || /* initial version, no data */ 1380 + inline_version == CEPH_INLINE_NONE) 1381 + goto out; 1382 + 1383 + if (locked_page) { 1384 + page = locked_page; 1385 + WARN_ON(!PageUptodate(page)); 1386 + } else if (ceph_caps_issued(ci) & 1387 + (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) { 1388 + page = find_get_page(inode->i_mapping, 0); 1389 + if (page) { 1390 + if (PageUptodate(page)) { 1391 + from_pagecache = true; 1392 + lock_page(page); 1393 + } else { 1394 + page_cache_release(page); 1395 + page = NULL; 1396 + } 1397 + } 1398 + } 1399 + 1400 + if (page) { 1401 + len = i_size_read(inode); 1402 + if (len > PAGE_CACHE_SIZE) 1403 + len = PAGE_CACHE_SIZE; 1404 + } else { 1405 + page = __page_cache_alloc(GFP_NOFS); 1406 + if (!page) { 1407 + err = -ENOMEM; 1408 + goto out; 1409 + } 1410 + err = __ceph_do_getattr(inode, page, 1411 + CEPH_STAT_CAP_INLINE_DATA, true); 1412 + if (err < 0) { 1413 + /* no inline data */ 1414 + if (err == -ENODATA) 1415 + err = 0; 1416 + goto out; 1417 + } 1418 + len = err; 1419 + } 1420 + 1421 + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 1422 + ceph_vino(inode), 0, &len, 0, 1, 1423 + CEPH_OSD_OP_CREATE, 1424 + CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, 1425 + ci->i_snap_realm->cached_context, 1426 + 0, 0, false); 1427 + if (IS_ERR(req)) { 1428 + err = PTR_ERR(req); 1429 + goto out; 1430 + } 1431 + 1432 + ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); 1433 + err = ceph_osdc_start_request(&fsc->client->osdc, req, false); 1434 + if (!err) 1435 + err = ceph_osdc_wait_request(&fsc->client->osdc, req); 1436 + ceph_osdc_put_request(req); 1437 + if (err < 0) 1438 + goto out; 1439 + 1440 + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 1441 + ceph_vino(inode), 0, &len, 1, 3, 1442 + CEPH_OSD_OP_WRITE, 1443 + CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, 1444 + ci->i_snap_realm->cached_context, 1445 + ci->i_truncate_seq, ci->i_truncate_size, 1446 + false); 1447 + if (IS_ERR(req)) { 1448 + err = PTR_ERR(req); 1449 + goto out; 1450 + } 1451 + 1452 + osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false); 1453 + 1454 + err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR, 1455 + "inline_version", &inline_version, 1456 + sizeof(inline_version), 1457 + CEPH_OSD_CMPXATTR_OP_GT, 1458 + CEPH_OSD_CMPXATTR_MODE_U64); 1459 + if (err) 1460 + goto out_put; 1461 + 1462 + err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR, 1463 + "inline_version", &inline_version, 1464 + sizeof(inline_version), 0, 0); 1465 + if (err) 1466 + goto out_put; 1467 + 1468 + ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); 1469 + err = ceph_osdc_start_request(&fsc->client->osdc, req, false); 1470 + if (!err) 1471 + err = ceph_osdc_wait_request(&fsc->client->osdc, req); 1472 + out_put: 1473 + ceph_osdc_put_request(req); 1474 + if (err == -ECANCELED) 1475 + err = 0; 1476 + out: 1477 + if (page && page != locked_page) { 1478 + if (from_pagecache) { 1479 + unlock_page(page); 1480 + page_cache_release(page); 1481 + } else 1482 + __free_pages(page, 0); 1483 + } 1484 + 1485 + dout("uninline_data %p %llx.%llx inline_version %llu = %d\n", 1486 + inode, ceph_vinop(inode), inline_version, err); 1487 + return err; 1395 1488 } 1396 1489 1397 1490 static struct vm_operations_struct ceph_vmops = {

+102 -30

fs/ceph/caps.c

··· 975 975 kuid_t uid, kgid_t gid, umode_t mode, 976 976 u64 xattr_version, 977 977 struct ceph_buffer *xattrs_buf, 978 - u64 follows) 978 + u64 follows, bool inline_data) 979 979 { 980 980 struct ceph_mds_caps *fc; 981 981 struct ceph_msg *msg; 982 + void *p; 983 + size_t extra_len; 982 984 983 985 dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" 984 986 " seq %u/%u mseq %u follows %lld size %llu/%llu" ··· 990 988 seq, issue_seq, mseq, follows, size, max_size, 991 989 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); 992 990 993 - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false); 991 + /* flock buffer size + inline version + inline data size */ 992 + extra_len = 4 + 8 + 4; 993 + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, 994 + GFP_NOFS, false); 994 995 if (!msg) 995 996 return -ENOMEM; 996 997 ··· 1024 1019 fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid)); 1025 1020 fc->gid = cpu_to_le32(from_kgid(&init_user_ns, gid)); 1026 1021 fc->mode = cpu_to_le32(mode); 1022 + 1023 + p = fc + 1; 1024 + /* flock buffer size */ 1025 + ceph_encode_32(&p, 0); 1026 + /* inline version */ 1027 + ceph_encode_64(&p, inline_data ? 0 : CEPH_INLINE_NONE); 1028 + /* inline data size */ 1029 + ceph_encode_32(&p, 0); 1027 1030 1028 1031 fc->xattr_version = cpu_to_le64(xattr_version); 1029 1032 if (xattrs_buf) { ··· 1139 1126 u64 flush_tid = 0; 1140 1127 int i; 1141 1128 int ret; 1129 + bool inline_data; 1142 1130 1143 1131 held = cap->issued | cap->implemented; 1144 1132 revoking = cap->implemented & ~cap->issued; ··· 1223 1209 xattr_version = ci->i_xattrs.version; 1224 1210 } 1225 1211 1212 + inline_data = ci->i_inline_version != CEPH_INLINE_NONE; 1213 + 1226 1214 spin_unlock(&ci->i_ceph_lock); 1227 1215 1228 1216 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, 1229 1217 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, 1230 1218 size, max_size, &mtime, &atime, time_warp_seq, 1231 1219 uid, gid, mode, xattr_version, xattr_blob, 1232 - follows); 1220 + follows, inline_data); 1233 1221 if (ret < 0) { 1234 1222 dout("error sending cap msg, must requeue %p\n", inode); 1235 1223 delayed = 1; ··· 1352 1336 capsnap->time_warp_seq, 1353 1337 capsnap->uid, capsnap->gid, capsnap->mode, 1354 1338 capsnap->xattr_version, capsnap->xattr_blob, 1355 - capsnap->follows); 1339 + capsnap->follows, capsnap->inline_data); 1356 1340 1357 1341 next_follows = capsnap->follows + 1; 1358 1342 ceph_put_cap_snap(capsnap); ··· 2073 2057 * requested from the MDS. 2074 2058 */ 2075 2059 static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, 2076 - int *got, loff_t endoff, int *check_max, int *err) 2060 + loff_t endoff, int *got, struct page **pinned_page, 2061 + int *check_max, int *err) 2077 2062 { 2078 2063 struct inode *inode = &ci->vfs_inode; 2079 2064 int ret = 0; 2080 - int have, implemented; 2065 + int have, implemented, _got = 0; 2081 2066 int file_wanted; 2082 2067 2083 2068 dout("get_cap_refs %p need %s want %s\n", inode, 2084 2069 ceph_cap_string(need), ceph_cap_string(want)); 2070 + again: 2085 2071 spin_lock(&ci->i_ceph_lock); 2086 2072 2087 2073 /* make sure file is actually open */ ··· 2093 2075 ceph_cap_string(need), ceph_cap_string(file_wanted)); 2094 2076 *err = -EBADF; 2095 2077 ret = 1; 2096 - goto out; 2078 + goto out_unlock; 2097 2079 } 2098 2080 2099 2081 /* finish pending truncate */ ··· 2113 2095 *check_max = 1; 2114 2096 ret = 1; 2115 2097 } 2116 - goto out; 2098 + goto out_unlock; 2117 2099 } 2118 2100 /* 2119 2101 * If a sync write is in progress, we must wait, so that we ··· 2121 2103 */ 2122 2104 if (__ceph_have_pending_cap_snap(ci)) { 2123 2105 dout("get_cap_refs %p cap_snap_pending\n", inode); 2124 - goto out; 2106 + goto out_unlock; 2125 2107 } 2126 2108 } 2127 2109 ··· 2138 2120 inode, ceph_cap_string(have), ceph_cap_string(not), 2139 2121 ceph_cap_string(revoking)); 2140 2122 if ((revoking & not) == 0) { 2141 - *got = need | (have & want); 2142 - __take_cap_refs(ci, *got); 2123 + _got = need | (have & want); 2124 + __take_cap_refs(ci, _got); 2143 2125 ret = 1; 2144 2126 } 2145 2127 } else { 2146 2128 dout("get_cap_refs %p have %s needed %s\n", inode, 2147 2129 ceph_cap_string(have), ceph_cap_string(need)); 2148 2130 } 2149 - out: 2131 + out_unlock: 2150 2132 spin_unlock(&ci->i_ceph_lock); 2133 + 2134 + if (ci->i_inline_version != CEPH_INLINE_NONE && 2135 + (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && 2136 + i_size_read(inode) > 0) { 2137 + int ret1; 2138 + struct page *page = find_get_page(inode->i_mapping, 0); 2139 + if (page) { 2140 + if (PageUptodate(page)) { 2141 + *pinned_page = page; 2142 + goto out; 2143 + } 2144 + page_cache_release(page); 2145 + } 2146 + /* 2147 + * drop cap refs first because getattr while holding 2148 + * caps refs can cause deadlock. 2149 + */ 2150 + ceph_put_cap_refs(ci, _got); 2151 + _got = 0; 2152 + 2153 + /* getattr request will bring inline data into page cache */ 2154 + ret1 = __ceph_do_getattr(inode, NULL, 2155 + CEPH_STAT_CAP_INLINE_DATA, true); 2156 + if (ret1 >= 0) { 2157 + ret = 0; 2158 + goto again; 2159 + } 2160 + *err = ret1; 2161 + ret = 1; 2162 + } 2163 + out: 2151 2164 dout("get_cap_refs %p ret %d got %s\n", inode, 2152 - ret, ceph_cap_string(*got)); 2165 + ret, ceph_cap_string(_got)); 2166 + *got = _got; 2153 2167 return ret; 2154 2168 } 2155 2169 ··· 2218 2168 * due to a small max_size, make sure we check_max_size (and possibly 2219 2169 * ask the mds) so we don't get hung up indefinitely. 2220 2170 */ 2221 - int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got, 2222 - loff_t endoff) 2171 + int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, 2172 + loff_t endoff, int *got, struct page **pinned_page) 2223 2173 { 2224 2174 int check_max, ret, err; 2225 2175 ··· 2229 2179 check_max = 0; 2230 2180 err = 0; 2231 2181 ret = wait_event_interruptible(ci->i_cap_wq, 2232 - try_get_cap_refs(ci, need, want, 2233 - got, endoff, 2182 + try_get_cap_refs(ci, need, want, endoff, 2183 + got, pinned_page, 2234 2184 &check_max, &err)); 2235 2185 if (err) 2236 2186 ret = err; ··· 2433 2383 static void handle_cap_grant(struct ceph_mds_client *mdsc, 2434 2384 struct inode *inode, struct ceph_mds_caps *grant, 2435 2385 void *snaptrace, int snaptrace_len, 2386 + u64 inline_version, 2387 + void *inline_data, int inline_len, 2436 2388 struct ceph_buffer *xattr_buf, 2437 2389 struct ceph_mds_session *session, 2438 2390 struct ceph_cap *cap, int issued) ··· 2455 2403 bool queue_invalidate = false; 2456 2404 bool queue_revalidate = false; 2457 2405 bool deleted_inode = false; 2406 + bool fill_inline = false; 2458 2407 2459 2408 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", 2460 2409 inode, cap, mds, seq, ceph_cap_string(newcaps)); ··· 2629 2576 } 2630 2577 BUG_ON(cap->issued & ~cap->implemented); 2631 2578 2579 + if (inline_version > 0 && inline_version >= ci->i_inline_version) { 2580 + ci->i_inline_version = inline_version; 2581 + if (ci->i_inline_version != CEPH_INLINE_NONE && 2582 + (newcaps & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO))) 2583 + fill_inline = true; 2584 + } 2585 + 2632 2586 spin_unlock(&ci->i_ceph_lock); 2633 2587 2634 2588 if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { ··· 2648 2588 if (newcaps & ~issued) 2649 2589 wake = true; 2650 2590 } 2591 + 2592 + if (fill_inline) 2593 + ceph_fill_inline_data(inode, NULL, inline_data, inline_len); 2651 2594 2652 2595 if (queue_trunc) { 2653 2596 ceph_queue_vmtruncate(inode); ··· 3059 2996 u64 cap_id; 3060 2997 u64 size, max_size; 3061 2998 u64 tid; 2999 + u64 inline_version = 0; 3000 + void *inline_data = NULL; 3001 + u32 inline_len = 0; 3062 3002 void *snaptrace; 3063 3003 size_t snaptrace_len; 3064 - void *flock; 3065 - void *end; 3066 - u32 flock_len; 3004 + void *p, *end; 3067 3005 3068 3006 dout("handle_caps from mds%d\n", mds); 3069 3007 ··· 3085 3021 3086 3022 snaptrace = h + 1; 3087 3023 snaptrace_len = le32_to_cpu(h->snap_trace_len); 3024 + p = snaptrace + snaptrace_len; 3088 3025 3089 3026 if (le16_to_cpu(msg->hdr.version) >= 2) { 3090 - void *p = snaptrace + snaptrace_len; 3027 + u32 flock_len; 3091 3028 ceph_decode_32_safe(&p, end, flock_len, bad); 3092 3029 if (p + flock_len > end) 3093 3030 goto bad; 3094 - flock = p; 3095 - } else { 3096 - flock = NULL; 3097 - flock_len = 0; 3031 + p += flock_len; 3098 3032 } 3099 3033 3100 3034 if (le16_to_cpu(msg->hdr.version) >= 3) { 3101 3035 if (op == CEPH_CAP_OP_IMPORT) { 3102 - void *p = flock + flock_len; 3103 3036 if (p + sizeof(*peer) > end) 3104 3037 goto bad; 3105 3038 peer = p; 3039 + p += sizeof(*peer); 3106 3040 } else if (op == CEPH_CAP_OP_EXPORT) { 3107 3041 /* recorded in unused fields */ 3108 3042 peer = (void *)&h->size; 3109 3043 } 3044 + } 3045 + 3046 + if (le16_to_cpu(msg->hdr.version) >= 4) { 3047 + ceph_decode_64_safe(&p, end, inline_version, bad); 3048 + ceph_decode_32_safe(&p, end, inline_len, bad); 3049 + if (p + inline_len > end) 3050 + goto bad; 3051 + inline_data = p; 3052 + p += inline_len; 3110 3053 } 3111 3054 3112 3055 /* lookup ino */ ··· 3156 3085 handle_cap_import(mdsc, inode, h, peer, session, 3157 3086 &cap, &issued); 3158 3087 handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len, 3088 + inline_version, inline_data, inline_len, 3159 3089 msg->middle, session, cap, issued); 3160 3090 goto done_unlocked; 3161 3091 } ··· 3177 3105 case CEPH_CAP_OP_GRANT: 3178 3106 __ceph_caps_issued(ci, &issued); 3179 3107 issued |= __ceph_caps_dirty(ci); 3180 - handle_cap_grant(mdsc, inode, h, NULL, 0, msg->middle, 3181 - session, cap, issued); 3108 + handle_cap_grant(mdsc, inode, h, NULL, 0, 3109 + inline_version, inline_data, inline_len, 3110 + msg->middle, session, cap, issued); 3182 3111 goto done_unlocked; 3183 3112 3184 3113 case CEPH_CAP_OP_FLUSH_ACK: ··· 3210 3137 done: 3211 3138 mutex_unlock(&session->s_mutex); 3212 3139 done_unlocked: 3213 - if (inode) 3214 - iput(inode); 3140 + iput(inode); 3215 3141 return; 3216 3142 3217 3143 bad:

+18 -9

fs/ceph/dir.c

··· 183 183 spin_unlock(&parent->d_lock); 184 184 185 185 /* make sure a dentry wasn't dropped while we didn't have parent lock */ 186 - if (!ceph_dir_is_complete(dir)) { 186 + if (!ceph_dir_is_complete_ordered(dir)) { 187 187 dout(" lost dir complete on %p; falling back to mds\n", dir); 188 188 dput(dentry); 189 189 err = -EAGAIN; ··· 261 261 262 262 /* always start with . and .. */ 263 263 if (ctx->pos == 0) { 264 - /* note dir version at start of readdir so we can tell 265 - * if any dentries get dropped */ 266 - fi->dir_release_count = atomic_read(&ci->i_release_count); 267 - 268 264 dout("readdir off 0 -> '.'\n"); 269 265 if (!dir_emit(ctx, ".", 1, 270 266 ceph_translate_ino(inode->i_sb, inode->i_ino), ··· 285 289 if ((ctx->pos == 2 || fi->dentry) && 286 290 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && 287 291 ceph_snap(inode) != CEPH_SNAPDIR && 288 - __ceph_dir_is_complete(ci) && 292 + __ceph_dir_is_complete_ordered(ci) && 289 293 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { 290 294 u32 shared_gen = ci->i_shared_gen; 291 295 spin_unlock(&ci->i_ceph_lock); ··· 307 311 } 308 312 309 313 /* proceed with a normal readdir */ 314 + 315 + if (ctx->pos == 2) { 316 + /* note dir version at start of readdir so we can tell 317 + * if any dentries get dropped */ 318 + fi->dir_release_count = atomic_read(&ci->i_release_count); 319 + fi->dir_ordered_count = ci->i_ordered_count; 320 + } 310 321 311 322 more: 312 323 /* do we have the correct frag content buffered? */ ··· 449 446 */ 450 447 spin_lock(&ci->i_ceph_lock); 451 448 if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { 452 - dout(" marking %p complete\n", inode); 453 - __ceph_dir_set_complete(ci, fi->dir_release_count); 449 + if (ci->i_ordered_count == fi->dir_ordered_count) 450 + dout(" marking %p complete and ordered\n", inode); 451 + else 452 + dout(" marking %p complete\n", inode); 453 + __ceph_dir_set_complete(ci, fi->dir_release_count, 454 + fi->dir_ordered_count); 454 455 } 455 456 spin_unlock(&ci->i_ceph_lock); 456 457 ··· 812 805 acls.pagelist = NULL; 813 806 } 814 807 err = ceph_mdsc_do_request(mdsc, dir, req); 815 - if (!err && !req->r_reply_info.head->is_dentry) 808 + if (!err && 809 + !req->r_reply_info.head->is_target && 810 + !req->r_reply_info.head->is_dentry) 816 811 err = ceph_handle_notrace_create(dir, dentry); 817 812 ceph_mdsc_put_request(req); 818 813 out:

+83 -14

fs/ceph/file.c

··· 333 333 return 0; 334 334 } 335 335 336 + enum { 337 + CHECK_EOF = 1, 338 + READ_INLINE = 2, 339 + }; 340 + 336 341 /* 337 342 * Read a range of bytes striped over one or more objects. Iterate over 338 343 * objects we stripe over. (That's not atomic, but good enough for now.) ··· 417 412 ret = read; 418 413 /* did we bounce off eof? */ 419 414 if (pos + left > inode->i_size) 420 - *checkeof = 1; 415 + *checkeof = CHECK_EOF; 421 416 } 422 417 423 418 dout("striped_read returns %d\n", ret); ··· 603 598 snapc = ci->i_snap_realm->cached_context; 604 599 vino = ceph_vino(inode); 605 600 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 606 - vino, pos, &len, 601 + vino, pos, &len, 0, 607 602 2,/*include a 'startsync' command*/ 608 603 CEPH_OSD_OP_WRITE, flags, snapc, 609 604 ci->i_truncate_seq, ··· 613 608 ret = PTR_ERR(req); 614 609 break; 615 610 } 611 + 612 + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); 616 613 617 614 n = iov_iter_get_pages_alloc(from, &pages, len, &start); 618 615 if (unlikely(n < 0)) { ··· 720 713 snapc = ci->i_snap_realm->cached_context; 721 714 vino = ceph_vino(inode); 722 715 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 723 - vino, pos, &len, 1, 716 + vino, pos, &len, 0, 1, 724 717 CEPH_OSD_OP_WRITE, flags, snapc, 725 718 ci->i_truncate_seq, 726 719 ci->i_truncate_size, ··· 810 803 size_t len = iocb->ki_nbytes; 811 804 struct inode *inode = file_inode(filp); 812 805 struct ceph_inode_info *ci = ceph_inode(inode); 806 + struct page *pinned_page = NULL; 813 807 ssize_t ret; 814 808 int want, got = 0; 815 - int checkeof = 0, read = 0; 809 + int retry_op = 0, read = 0; 816 810 817 811 again: 818 812 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", ··· 823 815 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; 824 816 else 825 817 want = CEPH_CAP_FILE_CACHE; 826 - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); 818 + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); 827 819 if (ret < 0) 828 820 return ret; 829 821 ··· 835 827 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, 836 828 ceph_cap_string(got)); 837 829 838 - /* hmm, this isn't really async... */ 839 - ret = ceph_sync_read(iocb, to, &checkeof); 830 + if (ci->i_inline_version == CEPH_INLINE_NONE) { 831 + /* hmm, this isn't really async... */ 832 + ret = ceph_sync_read(iocb, to, &retry_op); 833 + } else { 834 + retry_op = READ_INLINE; 835 + } 840 836 } else { 841 837 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", 842 838 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, ··· 850 838 } 851 839 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", 852 840 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); 841 + if (pinned_page) { 842 + page_cache_release(pinned_page); 843 + pinned_page = NULL; 844 + } 853 845 ceph_put_cap_refs(ci, got); 846 + if (retry_op && ret >= 0) { 847 + int statret; 848 + struct page *page = NULL; 849 + loff_t i_size; 850 + if (retry_op == READ_INLINE) { 851 + page = __page_cache_alloc(GFP_NOFS); 852 + if (!page) 853 + return -ENOMEM; 854 + } 854 855 855 - if (checkeof && ret >= 0) { 856 - int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); 856 + statret = __ceph_do_getattr(inode, page, 857 + CEPH_STAT_CAP_INLINE_DATA, !!page); 858 + if (statret < 0) { 859 + __free_page(page); 860 + if (statret == -ENODATA) { 861 + BUG_ON(retry_op != READ_INLINE); 862 + goto again; 863 + } 864 + return statret; 865 + } 866 + 867 + i_size = i_size_read(inode); 868 + if (retry_op == READ_INLINE) { 869 + /* does not support inline data > PAGE_SIZE */ 870 + if (i_size > PAGE_CACHE_SIZE) { 871 + ret = -EIO; 872 + } else if (iocb->ki_pos < i_size) { 873 + loff_t end = min_t(loff_t, i_size, 874 + iocb->ki_pos + len); 875 + if (statret < end) 876 + zero_user_segment(page, statret, end); 877 + ret = copy_page_to_iter(page, 878 + iocb->ki_pos & ~PAGE_MASK, 879 + end - iocb->ki_pos, to); 880 + iocb->ki_pos += ret; 881 + } else { 882 + ret = 0; 883 + } 884 + __free_pages(page, 0); 885 + return ret; 886 + } 857 887 858 888 /* hit EOF or hole? */ 859 - if (statret == 0 && iocb->ki_pos < inode->i_size && 889 + if (retry_op == CHECK_EOF && iocb->ki_pos < i_size && 860 890 ret < len) { 861 891 dout("sync_read hit hole, ppos %lld < size %lld" 862 892 ", reading more\n", iocb->ki_pos, ··· 906 852 907 853 read += ret; 908 854 len -= ret; 909 - checkeof = 0; 855 + retry_op = 0; 910 856 goto again; 911 857 } 912 858 } ··· 963 909 if (err) 964 910 goto out; 965 911 912 + if (ci->i_inline_version != CEPH_INLINE_NONE) { 913 + err = ceph_uninline_data(file, NULL); 914 + if (err < 0) 915 + goto out; 916 + } 917 + 966 918 retry_snap: 967 919 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) { 968 920 err = -ENOSPC; ··· 982 922 else 983 923 want = CEPH_CAP_FILE_BUFFER; 984 924 got = 0; 985 - err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos + count); 925 + err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, pos + count, 926 + &got, NULL); 986 927 if (err < 0) 987 928 goto out; 988 929 ··· 1030 969 if (written >= 0) { 1031 970 int dirty; 1032 971 spin_lock(&ci->i_ceph_lock); 972 + ci->i_inline_version = CEPH_INLINE_NONE; 1033 973 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); 1034 974 spin_unlock(&ci->i_ceph_lock); 1035 975 if (dirty) ··· 1173 1111 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 1174 1112 ceph_vino(inode), 1175 1113 offset, length, 1176 - 1, op, 1114 + 0, 1, op, 1177 1115 CEPH_OSD_FLAG_WRITE | 1178 1116 CEPH_OSD_FLAG_ONDISK, 1179 1117 NULL, 0, 0, false); ··· 1276 1214 goto unlock; 1277 1215 } 1278 1216 1217 + if (ci->i_inline_version != CEPH_INLINE_NONE) { 1218 + ret = ceph_uninline_data(file, NULL); 1219 + if (ret < 0) 1220 + goto unlock; 1221 + } 1222 + 1279 1223 size = i_size_read(inode); 1280 1224 if (!(mode & FALLOC_FL_KEEP_SIZE)) 1281 1225 endoff = offset + length; ··· 1291 1223 else 1292 1224 want = CEPH_CAP_FILE_BUFFER; 1293 1225 1294 - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); 1226 + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); 1295 1227 if (ret < 0) 1296 1228 goto unlock; 1297 1229 ··· 1308 1240 1309 1241 if (!ret) { 1310 1242 spin_lock(&ci->i_ceph_lock); 1243 + ci->i_inline_version = CEPH_INLINE_NONE; 1311 1244 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); 1312 1245 spin_unlock(&ci->i_ceph_lock); 1313 1246 if (dirty)

+47 -12

fs/ceph/inode.c

··· 387 387 spin_lock_init(&ci->i_ceph_lock); 388 388 389 389 ci->i_version = 0; 390 + ci->i_inline_version = 0; 390 391 ci->i_time_warp_seq = 0; 391 392 ci->i_ceph_flags = 0; 393 + ci->i_ordered_count = 0; 392 394 atomic_set(&ci->i_release_count, 1); 393 395 atomic_set(&ci->i_complete_count, 0); 394 396 ci->i_symlink = NULL; ··· 659 657 * Populate an inode based on info from mds. May be called on new or 660 658 * existing inodes. 661 659 */ 662 - static int fill_inode(struct inode *inode, 660 + static int fill_inode(struct inode *inode, struct page *locked_page, 663 661 struct ceph_mds_reply_info_in *iinfo, 664 662 struct ceph_mds_reply_dirfrag *dirinfo, 665 663 struct ceph_mds_session *session, ··· 677 675 bool wake = false; 678 676 bool queue_trunc = false; 679 677 bool new_version = false; 678 + bool fill_inline = false; 680 679 681 680 dout("fill_inode %p ino %llx.%llx v %llu had %llu\n", 682 681 inode, ceph_vinop(inode), le64_to_cpu(info->version), ··· 848 845 (issued & CEPH_CAP_FILE_EXCL) == 0 && 849 846 !__ceph_dir_is_complete(ci)) { 850 847 dout(" marking %p complete (empty)\n", inode); 851 - __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count)); 848 + __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count), 849 + ci->i_ordered_count); 852 850 } 853 851 854 852 /* were we issued a capability? */ ··· 877 873 ceph_vinop(inode)); 878 874 __ceph_get_fmode(ci, cap_fmode); 879 875 } 876 + 877 + if (iinfo->inline_version > 0 && 878 + iinfo->inline_version >= ci->i_inline_version) { 879 + int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; 880 + ci->i_inline_version = iinfo->inline_version; 881 + if (ci->i_inline_version != CEPH_INLINE_NONE && 882 + (locked_page || 883 + (le32_to_cpu(info->cap.caps) & cache_caps))) 884 + fill_inline = true; 885 + } 886 + 880 887 spin_unlock(&ci->i_ceph_lock); 888 + 889 + if (fill_inline) 890 + ceph_fill_inline_data(inode, locked_page, 891 + iinfo->inline_data, iinfo->inline_len); 881 892 882 893 if (wake) 883 894 wake_up_all(&ci->i_cap_wq); ··· 1081 1062 struct inode *dir = req->r_locked_dir; 1082 1063 1083 1064 if (dir) { 1084 - err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag, 1065 + err = fill_inode(dir, NULL, 1066 + &rinfo->diri, rinfo->dirfrag, 1085 1067 session, req->r_request_started, -1, 1086 1068 &req->r_caps_reservation); 1087 1069 if (err < 0) ··· 1152 1132 } 1153 1133 req->r_target_inode = in; 1154 1134 1155 - err = fill_inode(in, &rinfo->targeti, NULL, 1135 + err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL, 1156 1136 session, req->r_request_started, 1157 1137 (!req->r_aborted && rinfo->head->result == 0) ? 1158 1138 req->r_fmode : -1, ··· 1224 1204 ceph_invalidate_dentry_lease(dn); 1225 1205 1226 1206 /* d_move screws up sibling dentries' offsets */ 1227 - ceph_dir_clear_complete(dir); 1228 - ceph_dir_clear_complete(olddir); 1207 + ceph_dir_clear_ordered(dir); 1208 + ceph_dir_clear_ordered(olddir); 1229 1209 1230 1210 dout("dn %p gets new offset %lld\n", req->r_old_dentry, 1231 1211 ceph_dentry(req->r_old_dentry)->offset); ··· 1237 1217 if (!rinfo->head->is_target) { 1238 1218 dout("fill_trace null dentry\n"); 1239 1219 if (dn->d_inode) { 1220 + ceph_dir_clear_ordered(dir); 1240 1221 dout("d_delete %p\n", dn); 1241 1222 d_delete(dn); 1242 1223 } else { ··· 1254 1233 1255 1234 /* attach proper inode */ 1256 1235 if (!dn->d_inode) { 1257 - ceph_dir_clear_complete(dir); 1236 + ceph_dir_clear_ordered(dir); 1258 1237 ihold(in); 1259 1238 dn = splice_dentry(dn, in, &have_lease); 1260 1239 if (IS_ERR(dn)) { ··· 1284 1263 BUG_ON(!dir); 1285 1264 BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR); 1286 1265 dout(" linking snapped dir %p to dn %p\n", in, dn); 1287 - ceph_dir_clear_complete(dir); 1266 + ceph_dir_clear_ordered(dir); 1288 1267 ihold(in); 1289 1268 dn = splice_dentry(dn, in, NULL); 1290 1269 if (IS_ERR(dn)) { ··· 1321 1300 dout("new_inode badness got %d\n", err); 1322 1301 continue; 1323 1302 } 1324 - rc = fill_inode(in, &rinfo->dir_in[i], NULL, session, 1303 + rc = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, 1325 1304 req->r_request_started, -1, 1326 1305 &req->r_caps_reservation); 1327 1306 if (rc < 0) { ··· 1437 1416 } 1438 1417 } 1439 1418 1440 - if (fill_inode(in, &rinfo->dir_in[i], NULL, session, 1419 + if (fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, 1441 1420 req->r_request_started, -1, 1442 1421 &req->r_caps_reservation) < 0) { 1443 1422 pr_err("fill_inode badness on %p\n", in); ··· 1920 1899 * Verify that we have a lease on the given mask. If not, 1921 1900 * do a getattr against an mds. 1922 1901 */ 1923 - int ceph_do_getattr(struct inode *inode, int mask, bool force) 1902 + int __ceph_do_getattr(struct inode *inode, struct page *locked_page, 1903 + int mask, bool force) 1924 1904 { 1925 1905 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); 1926 1906 struct ceph_mds_client *mdsc = fsc->mdsc; ··· 1933 1911 return 0; 1934 1912 } 1935 1913 1936 - dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode); 1914 + dout("do_getattr inode %p mask %s mode 0%o\n", 1915 + inode, ceph_cap_string(mask), inode->i_mode); 1937 1916 if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) 1938 1917 return 0; 1939 1918 ··· 1945 1922 ihold(inode); 1946 1923 req->r_num_caps = 1; 1947 1924 req->r_args.getattr.mask = cpu_to_le32(mask); 1925 + req->r_locked_page = locked_page; 1948 1926 err = ceph_mdsc_do_request(mdsc, NULL, req); 1927 + if (locked_page && err == 0) { 1928 + u64 inline_version = req->r_reply_info.targeti.inline_version; 1929 + if (inline_version == 0) { 1930 + /* the reply is supposed to contain inline data */ 1931 + err = -EINVAL; 1932 + } else if (inline_version == CEPH_INLINE_NONE) { 1933 + err = -ENODATA; 1934 + } else { 1935 + err = req->r_reply_info.targeti.inline_len; 1936 + } 1937 + } 1949 1938 ceph_mdsc_put_request(req); 1950 1939 dout("do_getattr result=%d\n", err); 1951 1940 return err;

+54 -10

fs/ceph/locks.c

··· 9 9 #include <linux/ceph/pagelist.h> 10 10 11 11 static u64 lock_secret; 12 + static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, 13 + struct ceph_mds_request *req); 12 14 13 15 static inline u64 secure_addr(void *addr) 14 16 { ··· 42 40 u64 length = 0; 43 41 u64 owner; 44 42 43 + if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK) 44 + wait = 0; 45 + 45 46 req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); 46 47 if (IS_ERR(req)) 47 48 return PTR_ERR(req); ··· 73 68 req->r_args.filelock_change.length = cpu_to_le64(length); 74 69 req->r_args.filelock_change.wait = wait; 75 70 71 + if (wait) 72 + req->r_wait_for_completion = ceph_lock_wait_for_completion; 73 + 76 74 err = ceph_mdsc_do_request(mdsc, inode, req); 77 75 78 76 if (operation == CEPH_MDS_OP_GETFILELOCK) { ··· 102 94 (int)operation, (u64)fl->fl_pid, fl->fl_start, 103 95 length, wait, fl->fl_type, err); 104 96 return err; 97 + } 98 + 99 + static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, 100 + struct ceph_mds_request *req) 101 + { 102 + struct ceph_mds_request *intr_req; 103 + struct inode *inode = req->r_inode; 104 + int err, lock_type; 105 + 106 + BUG_ON(req->r_op != CEPH_MDS_OP_SETFILELOCK); 107 + if (req->r_args.filelock_change.rule == CEPH_LOCK_FCNTL) 108 + lock_type = CEPH_LOCK_FCNTL_INTR; 109 + else if (req->r_args.filelock_change.rule == CEPH_LOCK_FLOCK) 110 + lock_type = CEPH_LOCK_FLOCK_INTR; 111 + else 112 + BUG_ON(1); 113 + BUG_ON(req->r_args.filelock_change.type == CEPH_LOCK_UNLOCK); 114 + 115 + err = wait_for_completion_interruptible(&req->r_completion); 116 + if (!err) 117 + return 0; 118 + 119 + dout("ceph_lock_wait_for_completion: request %llu was interrupted\n", 120 + req->r_tid); 121 + 122 + intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK, 123 + USE_AUTH_MDS); 124 + if (IS_ERR(intr_req)) 125 + return PTR_ERR(intr_req); 126 + 127 + intr_req->r_inode = inode; 128 + ihold(inode); 129 + intr_req->r_num_caps = 1; 130 + 131 + intr_req->r_args.filelock_change = req->r_args.filelock_change; 132 + intr_req->r_args.filelock_change.rule = lock_type; 133 + intr_req->r_args.filelock_change.type = CEPH_LOCK_UNLOCK; 134 + 135 + err = ceph_mdsc_do_request(mdsc, inode, intr_req); 136 + ceph_mdsc_put_request(intr_req); 137 + 138 + if (err && err != -ERESTARTSYS) 139 + return err; 140 + 141 + wait_for_completion(&req->r_completion); 142 + return 0; 105 143 } 106 144 107 145 /** ··· 197 143 err); 198 144 } 199 145 } 200 - 201 - } else if (err == -ERESTARTSYS) { 202 - dout("undoing lock\n"); 203 - ceph_lock_message(CEPH_LOCK_FCNTL, op, file, 204 - CEPH_LOCK_UNLOCK, 0, fl); 205 146 } 206 147 return err; 207 148 } ··· 235 186 file, CEPH_LOCK_UNLOCK, 0, fl); 236 187 dout("got %d on flock_lock_file_wait, undid lock", err); 237 188 } 238 - } else if (err == -ERESTARTSYS) { 239 - dout("undoing lock\n"); 240 - ceph_lock_message(CEPH_LOCK_FLOCK, 241 - CEPH_MDS_OP_SETFILELOCK, 242 - file, CEPH_LOCK_UNLOCK, 0, fl); 243 189 } 244 190 return err; 245 191 }

+35 -6

fs/ceph/mds_client.c

··· 89 89 ceph_decode_need(p, end, info->xattr_len, bad); 90 90 info->xattr_data = *p; 91 91 *p += info->xattr_len; 92 + 93 + if (features & CEPH_FEATURE_MDS_INLINE_DATA) { 94 + ceph_decode_64_safe(p, end, info->inline_version, bad); 95 + ceph_decode_32_safe(p, end, info->inline_len, bad); 96 + ceph_decode_need(p, end, info->inline_len, bad); 97 + info->inline_data = *p; 98 + *p += info->inline_len; 99 + } else 100 + info->inline_version = CEPH_INLINE_NONE; 101 + 92 102 return 0; 93 103 bad: 94 104 return err; ··· 534 524 } 535 525 if (req->r_locked_dir) 536 526 ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); 537 - if (req->r_target_inode) 538 - iput(req->r_target_inode); 527 + iput(req->r_target_inode); 539 528 if (req->r_dentry) 540 529 dput(req->r_dentry); 541 530 if (req->r_old_dentry) ··· 870 861 /* 871 862 * Serialize client metadata into waiting buffer space, using 872 863 * the format that userspace expects for map<string, string> 864 + * 865 + * ClientSession messages with metadata are v2 873 866 */ 874 - msg->hdr.version = 2; /* ClientSession messages with metadata are v2 */ 867 + msg->hdr.version = cpu_to_le16(2); 868 + msg->hdr.compat_version = cpu_to_le16(1); 875 869 876 870 /* The write pointer, following the session_head structure */ 877 871 p = msg->front.iov_base + sizeof(*h); ··· 1078 1066 session->s_cap_iterator = NULL; 1079 1067 spin_unlock(&session->s_cap_lock); 1080 1068 1081 - if (last_inode) 1082 - iput(last_inode); 1069 + iput(last_inode); 1083 1070 if (old_cap) 1084 1071 ceph_put_cap(session->s_mdsc, old_cap); 1085 1072 ··· 1885 1874 goto out_free2; 1886 1875 } 1887 1876 1888 - msg->hdr.version = 2; 1877 + msg->hdr.version = cpu_to_le16(2); 1889 1878 msg->hdr.tid = cpu_to_le64(req->r_tid); 1890 1879 1891 1880 head = msg->front.iov_base; ··· 2219 2208 &req->r_completion, req->r_timeout); 2220 2209 if (err == 0) 2221 2210 err = -EIO; 2211 + } else if (req->r_wait_for_completion) { 2212 + err = req->r_wait_for_completion(mdsc, req); 2222 2213 } else { 2223 2214 err = wait_for_completion_killable(&req->r_completion); 2224 2215 } ··· 3757 3744 return msg; 3758 3745 } 3759 3746 3747 + static int sign_message(struct ceph_connection *con, struct ceph_msg *msg) 3748 + { 3749 + struct ceph_mds_session *s = con->private; 3750 + struct ceph_auth_handshake *auth = &s->s_auth; 3751 + return ceph_auth_sign_message(auth, msg); 3752 + } 3753 + 3754 + static int check_message_signature(struct ceph_connection *con, struct ceph_msg *msg) 3755 + { 3756 + struct ceph_mds_session *s = con->private; 3757 + struct ceph_auth_handshake *auth = &s->s_auth; 3758 + return ceph_auth_check_message_signature(auth, msg); 3759 + } 3760 + 3760 3761 static const struct ceph_connection_operations mds_con_ops = { 3761 3762 .get = con_get, 3762 3763 .put = con_put, ··· 3780 3753 .invalidate_authorizer = invalidate_authorizer, 3781 3754 .peer_reset = peer_reset, 3782 3755 .alloc_msg = mds_alloc_msg, 3756 + .sign_message = sign_message, 3757 + .check_message_signature = check_message_signature, 3783 3758 }; 3784 3759 3785 3760 /* eof */

+10

fs/ceph/mds_client.h

··· 41 41 char *symlink; 42 42 u32 xattr_len; 43 43 char *xattr_data; 44 + u64 inline_version; 45 + u32 inline_len; 46 + char *inline_data; 44 47 }; 45 48 46 49 /* ··· 169 166 */ 170 167 typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc, 171 168 struct ceph_mds_request *req); 169 + /* 170 + * wait for request completion callback 171 + */ 172 + typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc, 173 + struct ceph_mds_request *req); 172 174 173 175 /* 174 176 * an in-flight mds request ··· 223 215 int r_request_release_offset; 224 216 struct ceph_msg *r_reply; 225 217 struct ceph_mds_reply_info_parsed r_reply_info; 218 + struct page *r_locked_page; 226 219 int r_err; 227 220 bool r_aborted; 228 221 ··· 248 239 struct completion r_completion; 249 240 struct completion r_safe_completion; 250 241 ceph_mds_request_callback_t r_callback; 242 + ceph_mds_request_wait_callback_t r_wait_for_completion; 251 243 struct list_head r_unsafe_item; /* per-session unsafe list item */ 252 244 bool r_got_unsafe, r_got_safe, r_got_result; 253 245

+30 -7

fs/ceph/snap.c

··· 288 288 return 0; 289 289 } 290 290 291 + 292 + static struct ceph_snap_context *empty_snapc; 293 + 291 294 /* 292 295 * build the snap context for a given realm. 293 296 */ ··· 331 328 return 0; 332 329 } 333 330 331 + if (num == 0 && realm->seq == empty_snapc->seq) { 332 + ceph_get_snap_context(empty_snapc); 333 + snapc = empty_snapc; 334 + goto done; 335 + } 336 + 334 337 /* alloc new snap context */ 335 338 err = -ENOMEM; 336 339 if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64)) ··· 374 365 realm->ino, realm, snapc, snapc->seq, 375 366 (unsigned int) snapc->num_snaps); 376 367 377 - if (realm->cached_context) 378 - ceph_put_snap_context(realm->cached_context); 368 + done: 369 + ceph_put_snap_context(realm->cached_context); 379 370 realm->cached_context = snapc; 380 371 return 0; 381 372 ··· 475 466 cap_snap. lucky us. */ 476 467 dout("queue_cap_snap %p already pending\n", inode); 477 468 kfree(capsnap); 469 + } else if (ci->i_snap_realm->cached_context == empty_snapc) { 470 + dout("queue_cap_snap %p empty snapc\n", inode); 471 + kfree(capsnap); 478 472 } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| 479 473 CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) { 480 474 struct ceph_snap_context *snapc = ci->i_head_snapc; ··· 515 503 capsnap->xattr_blob = NULL; 516 504 capsnap->xattr_version = 0; 517 505 } 506 + 507 + capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; 518 508 519 509 /* dirty page count moved from _head to this cap_snap; 520 510 all subsequent writes page dirties occur _after_ this ··· 604 590 if (!inode) 605 591 continue; 606 592 spin_unlock(&realm->inodes_with_caps_lock); 607 - if (lastinode) 608 - iput(lastinode); 593 + iput(lastinode); 609 594 lastinode = inode; 610 595 ceph_queue_cap_snap(ci); 611 596 spin_lock(&realm->inodes_with_caps_lock); 612 597 } 613 598 spin_unlock(&realm->inodes_with_caps_lock); 614 - if (lastinode) 615 - iput(lastinode); 599 + iput(lastinode); 616 600 617 601 list_for_each_entry(child, &realm->children, child_item) { 618 602 dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n", ··· 940 928 return; 941 929 } 942 930 931 + int __init ceph_snap_init(void) 932 + { 933 + empty_snapc = ceph_create_snap_context(0, GFP_NOFS); 934 + if (!empty_snapc) 935 + return -ENOMEM; 936 + empty_snapc->seq = 1; 937 + return 0; 938 + } 943 939 944 - 940 + void ceph_snap_exit(void) 941 + { 942 + ceph_put_snap_context(empty_snapc); 943 + }

+10 -6

fs/ceph/super.c

··· 515 515 struct ceph_fs_client *fsc; 516 516 const u64 supported_features = 517 517 CEPH_FEATURE_FLOCK | 518 - CEPH_FEATURE_DIRLAYOUTHASH; 518 + CEPH_FEATURE_DIRLAYOUTHASH | 519 + CEPH_FEATURE_MDS_INLINE_DATA; 519 520 const u64 required_features = 0; 520 521 int page_count; 521 522 size_t size; ··· 1018 1017 }; 1019 1018 MODULE_ALIAS_FS("ceph"); 1020 1019 1021 - #define _STRINGIFY(x) #x 1022 - #define STRINGIFY(x) _STRINGIFY(x) 1023 - 1024 1020 static int __init init_ceph(void) 1025 1021 { 1026 1022 int ret = init_caches(); ··· 1026 1028 1027 1029 ceph_flock_init(); 1028 1030 ceph_xattr_init(); 1031 + ret = ceph_snap_init(); 1032 + if (ret) 1033 + goto out_xattr; 1029 1034 ret = register_filesystem(&ceph_fs_type); 1030 1035 if (ret) 1031 - goto out_icache; 1036 + goto out_snap; 1032 1037 1033 1038 pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL); 1034 1039 1035 1040 return 0; 1036 1041 1037 - out_icache: 1042 + out_snap: 1043 + ceph_snap_exit(); 1044 + out_xattr: 1038 1045 ceph_xattr_exit(); 1039 1046 destroy_caches(); 1040 1047 out: ··· 1050 1047 { 1051 1048 dout("exit_ceph\n"); 1052 1049 unregister_filesystem(&ceph_fs_type); 1050 + ceph_snap_exit(); 1053 1051 ceph_xattr_exit(); 1054 1052 destroy_caches(); 1055 1053 }

+46 -9

fs/ceph/super.h

··· 161 161 u64 time_warp_seq; 162 162 int writing; /* a sync write is still in progress */ 163 163 int dirty_pages; /* dirty pages awaiting writeback */ 164 + bool inline_data; 164 165 }; 165 166 166 167 static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) ··· 254 253 spinlock_t i_ceph_lock; 255 254 256 255 u64 i_version; 256 + u64 i_inline_version; 257 257 u32 i_time_warp_seq; 258 258 259 259 unsigned i_ceph_flags; 260 + int i_ordered_count; 260 261 atomic_t i_release_count; 261 262 atomic_t i_complete_count; 262 263 ··· 437 434 /* 438 435 * Ceph inode. 439 436 */ 440 - #define CEPH_I_NODELAY 4 /* do not delay cap release */ 441 - #define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ 442 - #define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ 437 + #define CEPH_I_DIR_ORDERED 1 /* dentries in dir are ordered */ 438 + #define CEPH_I_NODELAY 4 /* do not delay cap release */ 439 + #define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ 440 + #define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ 443 441 444 442 static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, 445 - int release_count) 443 + int release_count, int ordered_count) 446 444 { 447 445 atomic_set(&ci->i_complete_count, release_count); 446 + if (ci->i_ordered_count == ordered_count) 447 + ci->i_ceph_flags |= CEPH_I_DIR_ORDERED; 448 + else 449 + ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED; 448 450 } 449 451 450 452 static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci) ··· 463 455 atomic_read(&ci->i_release_count); 464 456 } 465 457 458 + static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci) 459 + { 460 + return __ceph_dir_is_complete(ci) && 461 + (ci->i_ceph_flags & CEPH_I_DIR_ORDERED); 462 + } 463 + 466 464 static inline void ceph_dir_clear_complete(struct inode *inode) 467 465 { 468 466 __ceph_dir_clear_complete(ceph_inode(inode)); 469 467 } 470 468 471 - static inline bool ceph_dir_is_complete(struct inode *inode) 469 + static inline void ceph_dir_clear_ordered(struct inode *inode) 472 470 { 473 - return __ceph_dir_is_complete(ceph_inode(inode)); 471 + struct ceph_inode_info *ci = ceph_inode(inode); 472 + spin_lock(&ci->i_ceph_lock); 473 + ci->i_ordered_count++; 474 + ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED; 475 + spin_unlock(&ci->i_ceph_lock); 474 476 } 475 477 478 + static inline bool ceph_dir_is_complete_ordered(struct inode *inode) 479 + { 480 + struct ceph_inode_info *ci = ceph_inode(inode); 481 + bool ret; 482 + spin_lock(&ci->i_ceph_lock); 483 + ret = __ceph_dir_is_complete_ordered(ci); 484 + spin_unlock(&ci->i_ceph_lock); 485 + return ret; 486 + } 476 487 477 488 /* find a specific frag @f */ 478 489 extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, ··· 607 580 char *last_name; /* last entry in previous chunk */ 608 581 struct dentry *dentry; /* next dentry (for dcache readdir) */ 609 582 int dir_release_count; 583 + int dir_ordered_count; 610 584 611 585 /* used for -o dirstat read() on directory thing */ 612 586 char *dir_info; ··· 701 673 extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, 702 674 struct ceph_cap_snap *capsnap); 703 675 extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); 676 + extern int ceph_snap_init(void); 677 + extern void ceph_snap_exit(void); 704 678 705 679 /* 706 680 * a cap_snap is "pending" if it is still awaiting an in-progress ··· 745 715 extern void ceph_queue_invalidate(struct inode *inode); 746 716 extern void ceph_queue_writeback(struct inode *inode); 747 717 748 - extern int ceph_do_getattr(struct inode *inode, int mask, bool force); 718 + extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page, 719 + int mask, bool force); 720 + static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) 721 + { 722 + return __ceph_do_getattr(inode, NULL, mask, force); 723 + } 749 724 extern int ceph_permission(struct inode *inode, int mask); 750 725 extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); 751 726 extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, ··· 865 830 int mds, int drop, int unless); 866 831 867 832 extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, 868 - int *got, loff_t endoff); 833 + loff_t endoff, int *got, struct page **pinned_page); 869 834 870 835 /* for counting open files by mode */ 871 836 static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode) ··· 887 852 struct file *file, unsigned flags, umode_t mode, 888 853 int *opened); 889 854 extern int ceph_release(struct inode *inode, struct file *filp); 890 - 855 + extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, 856 + char *data, size_t len); 857 + int ceph_uninline_data(struct file *filp, struct page *locked_page); 891 858 /* dir.c */ 892 859 extern const struct file_operations ceph_dir_fops; 893 860 extern const struct inode_operations ceph_dir_iops;

+10

fs/ceph/super.h.rej

··· 1 + --- fs/ceph/super.h 2 + +++ fs/ceph/super.h 3 + @@ -254,6 +255,7 @@ 4 + spinlock_t i_ceph_lock; 5 + 6 + u64 i_version; 7 + + u64 i_inline_version; 8 + u32 i_time_warp_seq; 9 + 10 + unsigned i_ceph_flags;

+5 -2

fs/ceph/xattr.c

··· 854 854 struct ceph_pagelist *pagelist = NULL; 855 855 int err; 856 856 857 - if (value) { 857 + if (size > 0) { 858 858 /* copy value into pagelist */ 859 859 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); 860 860 if (!pagelist) ··· 864 864 err = ceph_pagelist_append(pagelist, value, size); 865 865 if (err) 866 866 goto out; 867 - } else { 867 + } else if (!value) { 868 868 flags |= CEPH_XATTR_REMOVE; 869 869 } 870 870 ··· 1000 1000 1001 1001 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 1002 1002 return generic_setxattr(dentry, name, value, size, flags); 1003 + 1004 + if (size == 0) 1005 + value = ""; /* empty EA, do not remove */ 1003 1006 1004 1007 return __ceph_setxattr(dentry, name, value, size, flags); 1005 1008 }

+26

include/linux/ceph/auth.h

··· 13 13 14 14 struct ceph_auth_client; 15 15 struct ceph_authorizer; 16 + struct ceph_msg; 16 17 17 18 struct ceph_auth_handshake { 18 19 struct ceph_authorizer *authorizer; ··· 21 20 size_t authorizer_buf_len; 22 21 void *authorizer_reply_buf; 23 22 size_t authorizer_reply_buf_len; 23 + int (*sign_message)(struct ceph_auth_handshake *auth, 24 + struct ceph_msg *msg); 25 + int (*check_message_signature)(struct ceph_auth_handshake *auth, 26 + struct ceph_msg *msg); 24 27 }; 25 28 26 29 struct ceph_auth_client_ops { ··· 71 66 void (*reset)(struct ceph_auth_client *ac); 72 67 73 68 void (*destroy)(struct ceph_auth_client *ac); 69 + 70 + int (*sign_message)(struct ceph_auth_handshake *auth, 71 + struct ceph_msg *msg); 72 + int (*check_message_signature)(struct ceph_auth_handshake *auth, 73 + struct ceph_msg *msg); 74 74 }; 75 75 76 76 struct ceph_auth_client { ··· 123 113 extern void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, 124 114 int peer_type); 125 115 116 + static inline int ceph_auth_sign_message(struct ceph_auth_handshake *auth, 117 + struct ceph_msg *msg) 118 + { 119 + if (auth->sign_message) 120 + return auth->sign_message(auth, msg); 121 + return 0; 122 + } 123 + 124 + static inline 125 + int ceph_auth_check_message_signature(struct ceph_auth_handshake *auth, 126 + struct ceph_msg *msg) 127 + { 128 + if (auth->check_message_signature) 129 + return auth->check_message_signature(auth, msg); 130 + return 0; 131 + } 126 132 #endif

+1 -2

include/linux/ceph/buffer.h

··· 10 10 /* 11 11 * a simple reference counted buffer. 12 12 * 13 - * use kmalloc for small sizes (<= one page), vmalloc for larger 14 - * sizes. 13 + * use kmalloc for smaller sizes, vmalloc for larger sizes. 15 14 */ 16 15 struct ceph_buffer { 17 16 struct kref kref;

+1

include/linux/ceph/ceph_features.h

+8 -2

include/linux/ceph/ceph_fs.h

··· 522 522 __le32 dist[]; 523 523 } __attribute__ ((packed)); 524 524 525 - #define CEPH_LOCK_FCNTL 1 526 - #define CEPH_LOCK_FLOCK 2 525 + #define CEPH_LOCK_FCNTL 1 526 + #define CEPH_LOCK_FLOCK 2 527 + #define CEPH_LOCK_FCNTL_INTR 3 528 + #define CEPH_LOCK_FLOCK_INTR 4 529 + 527 530 528 531 #define CEPH_LOCK_SHARED 1 529 532 #define CEPH_LOCK_EXCL 2 ··· 552 549 553 550 int ceph_flags_to_mode(int flags); 554 551 552 + #define CEPH_INLINE_NONE ((__u64)-1) 555 553 556 554 /* capability bits */ 557 555 #define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */ ··· 617 613 CEPH_CAP_LINK_SHARED | \ 618 614 CEPH_CAP_FILE_SHARED | \ 619 615 CEPH_CAP_XATTR_SHARED) 616 + #define CEPH_STAT_CAP_INLINE_DATA (CEPH_CAP_FILE_SHARED | \ 617 + CEPH_CAP_FILE_RD) 620 618 621 619 #define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \ 622 620 CEPH_CAP_LINK_SHARED | \

+1 -1

include/linux/ceph/libceph.h

··· 29 29 #define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */ 30 30 #define CEPH_OPT_MYIP (1<<2) /* specified my ip */ 31 31 #define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes */ 32 + #define CEPH_OPT_NOMSGAUTH (1<<4) /* not require cephx message signature */ 32 33 33 34 #define CEPH_OPT_DEFAULT (0) 34 35 ··· 185 184 extern const char *ceph_msg_type_name(int type); 186 185 extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); 187 186 extern void *ceph_kvmalloc(size_t size, gfp_t flags); 188 - extern void ceph_kvfree(const void *ptr); 189 187 190 188 extern struct ceph_options *ceph_parse_options(char *options, 191 189 const char *dev_name, const char *dev_name_end,

+8 -1

include/linux/ceph/messenger.h

··· 42 42 struct ceph_msg * (*alloc_msg) (struct ceph_connection *con, 43 43 struct ceph_msg_header *hdr, 44 44 int *skip); 45 + int (*sign_message) (struct ceph_connection *con, struct ceph_msg *msg); 46 + 47 + int (*check_message_signature) (struct ceph_connection *con, 48 + struct ceph_msg *msg); 45 49 }; 46 50 47 51 /* use format string %s%d */ ··· 146 142 */ 147 143 struct ceph_msg { 148 144 struct ceph_msg_header hdr; /* header */ 149 - struct ceph_msg_footer footer; /* footer */ 145 + union { 146 + struct ceph_msg_footer footer; /* footer */ 147 + struct ceph_msg_footer_old old_footer; /* old format footer */ 148 + }; 150 149 struct kvec front; /* unaligned blobs of message */ 151 150 struct ceph_buffer *middle; 152 151

+10 -1

include/linux/ceph/msgr.h

··· 152 152 receiver: mask against ~PAGE_MASK */ 153 153 154 154 struct ceph_entity_name src; 155 - __le32 reserved; 155 + __le16 compat_version; 156 + __le16 reserved; 156 157 __le32 crc; /* header crc32c */ 157 158 } __attribute__ ((packed)); 158 159 ··· 165 164 /* 166 165 * follows data payload 167 166 */ 167 + struct ceph_msg_footer_old { 168 + __le32 front_crc, middle_crc, data_crc; 169 + __u8 flags; 170 + } __attribute__ ((packed)); 171 + 168 172 struct ceph_msg_footer { 169 173 __le32 front_crc, middle_crc, data_crc; 174 + // sig holds the 64 bits of the digital signature for the message PLR 175 + __le64 sig; 170 176 __u8 flags; 171 177 } __attribute__ ((packed)); 172 178 173 179 #define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */ 174 180 #define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */ 181 + #define CEPH_MSG_FOOTER_SIGNED (1<<2) /* msg was signed */ 175 182 176 183 177 184 #endif

+12 -1

include/linux/ceph/osd_client.h

··· 87 87 struct ceph_osd_data osd_data; 88 88 } extent; 89 89 struct { 90 + __le32 name_len; 91 + __le32 value_len; 92 + __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ 93 + __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ 94 + struct ceph_osd_data osd_data; 95 + } xattr; 96 + struct { 90 97 const char *class_name; 91 98 const char *method_name; 92 99 struct ceph_osd_data request_info; ··· 302 295 extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req, 303 296 unsigned int which, u16 opcode, 304 297 const char *class, const char *method); 298 + extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, 299 + u16 opcode, const char *name, const void *value, 300 + size_t size, u8 cmp_op, u8 cmp_mode); 305 301 extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, 306 302 unsigned int which, u16 opcode, 307 303 u64 cookie, u64 version, int flag); ··· 328 318 struct ceph_file_layout *layout, 329 319 struct ceph_vino vino, 330 320 u64 offset, u64 *len, 331 - int num_ops, int opcode, int flags, 321 + unsigned int which, int num_ops, 322 + int opcode, int flags, 332 323 struct ceph_snap_context *snapc, 333 324 u32 truncate_seq, u64 truncate_size, 334 325 bool use_mempool);

+3 -1

include/linux/ceph/pagelist.h

··· 1 1 #ifndef __FS_CEPH_PAGELIST_H 2 2 #define __FS_CEPH_PAGELIST_H 3 3 4 - #include <linux/list.h> 4 + #include <asm/byteorder.h> 5 5 #include <linux/atomic.h> 6 + #include <linux/list.h> 7 + #include <linux/types.h> 6 8 7 9 struct ceph_pagelist { 8 10 struct list_head head;

+69 -7

net/ceph/auth_x.c

··· 8 8 9 9 #include <linux/ceph/decode.h> 10 10 #include <linux/ceph/auth.h> 11 + #include <linux/ceph/messenger.h> 11 12 12 13 #include "crypto.h" 13 14 #include "auth_x.h" ··· 294 293 dout("build_authorizer for %s %p\n", 295 294 ceph_entity_type_name(th->service), au); 296 295 296 + ceph_crypto_key_destroy(&au->session_key); 297 + ret = ceph_crypto_key_clone(&au->session_key, &th->session_key); 298 + if (ret) 299 + return ret; 300 + 297 301 maxlen = sizeof(*msg_a) + sizeof(msg_b) + 298 302 ceph_x_encrypt_buflen(ticket_blob_len); 299 303 dout(" need len %d\n", maxlen); ··· 308 302 } 309 303 if (!au->buf) { 310 304 au->buf = ceph_buffer_new(maxlen, GFP_NOFS); 311 - if (!au->buf) 305 + if (!au->buf) { 306 + ceph_crypto_key_destroy(&au->session_key); 312 307 return -ENOMEM; 308 + } 313 309 } 314 310 au->service = th->service; 315 311 au->secret_id = th->secret_id; ··· 337 329 get_random_bytes(&au->nonce, sizeof(au->nonce)); 338 330 msg_b.struct_v = 1; 339 331 msg_b.nonce = cpu_to_le64(au->nonce); 340 - ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b), 332 + ret = ceph_x_encrypt(&au->session_key, &msg_b, sizeof(msg_b), 341 333 p, end - p); 342 334 if (ret < 0) 343 335 goto out_buf; ··· 568 560 auth->authorizer_buf_len = au->buf->vec.iov_len; 569 561 auth->authorizer_reply_buf = au->reply_buf; 570 562 auth->authorizer_reply_buf_len = sizeof (au->reply_buf); 563 + auth->sign_message = ac->ops->sign_message; 564 + auth->check_message_signature = ac->ops->check_message_signature; 571 565 572 566 return 0; 573 567 } ··· 598 588 struct ceph_authorizer *a, size_t len) 599 589 { 600 590 struct ceph_x_authorizer *au = (void *)a; 601 - struct ceph_x_ticket_handler *th; 602 591 int ret = 0; 603 592 struct ceph_x_authorize_reply reply; 604 593 void *preply = &reply; 605 594 void *p = au->reply_buf; 606 595 void *end = p + sizeof(au->reply_buf); 607 596 608 - th = get_ticket_handler(ac, au->service); 609 - if (IS_ERR(th)) 610 - return PTR_ERR(th); 611 - ret = ceph_x_decrypt(&th->session_key, &p, end, &preply, sizeof(reply)); 597 + ret = ceph_x_decrypt(&au->session_key, &p, end, &preply, sizeof(reply)); 612 598 if (ret < 0) 613 599 return ret; 614 600 if (ret != sizeof(reply)) ··· 624 618 { 625 619 struct ceph_x_authorizer *au = (void *)a; 626 620 621 + ceph_crypto_key_destroy(&au->session_key); 627 622 ceph_buffer_put(au->buf); 628 623 kfree(au); 629 624 } ··· 670 663 memset(&th->validity, 0, sizeof(th->validity)); 671 664 } 672 665 666 + static int calcu_signature(struct ceph_x_authorizer *au, 667 + struct ceph_msg *msg, __le64 *sig) 668 + { 669 + int ret; 670 + char tmp_enc[40]; 671 + __le32 tmp[5] = { 672 + 16u, msg->hdr.crc, msg->footer.front_crc, 673 + msg->footer.middle_crc, msg->footer.data_crc, 674 + }; 675 + ret = ceph_x_encrypt(&au->session_key, &tmp, sizeof(tmp), 676 + tmp_enc, sizeof(tmp_enc)); 677 + if (ret < 0) 678 + return ret; 679 + *sig = *(__le64*)(tmp_enc + 4); 680 + return 0; 681 + } 682 + 683 + static int ceph_x_sign_message(struct ceph_auth_handshake *auth, 684 + struct ceph_msg *msg) 685 + { 686 + int ret; 687 + if (!auth->authorizer) 688 + return 0; 689 + ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer, 690 + msg, &msg->footer.sig); 691 + if (ret < 0) 692 + return ret; 693 + msg->footer.flags |= CEPH_MSG_FOOTER_SIGNED; 694 + return 0; 695 + } 696 + 697 + static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth, 698 + struct ceph_msg *msg) 699 + { 700 + __le64 sig_check; 701 + int ret; 702 + 703 + if (!auth->authorizer) 704 + return 0; 705 + ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer, 706 + msg, &sig_check); 707 + if (ret < 0) 708 + return ret; 709 + if (sig_check == msg->footer.sig) 710 + return 0; 711 + if (msg->footer.flags & CEPH_MSG_FOOTER_SIGNED) 712 + dout("ceph_x_check_message_signature %p has signature %llx " 713 + "expect %llx\n", msg, msg->footer.sig, sig_check); 714 + else 715 + dout("ceph_x_check_message_signature %p sender did not set " 716 + "CEPH_MSG_FOOTER_SIGNED\n", msg); 717 + return -EBADMSG; 718 + } 673 719 674 720 static const struct ceph_auth_client_ops ceph_x_ops = { 675 721 .name = "x", ··· 737 677 .invalidate_authorizer = ceph_x_invalidate_authorizer, 738 678 .reset = ceph_x_reset, 739 679 .destroy = ceph_x_destroy, 680 + .sign_message = ceph_x_sign_message, 681 + .check_message_signature = ceph_x_check_message_signature, 740 682 }; 741 683 742 684

+1

net/ceph/auth_x.h

··· 26 26 27 27 28 28 struct ceph_x_authorizer { 29 + struct ceph_crypto_key session_key; 29 30 struct ceph_buffer *buf; 30 31 unsigned int service; 31 32 u64 nonce;

+2 -2

net/ceph/buffer.c

··· 6 6 7 7 #include <linux/ceph/buffer.h> 8 8 #include <linux/ceph/decode.h> 9 - #include <linux/ceph/libceph.h> /* for ceph_kv{malloc,free} */ 9 + #include <linux/ceph/libceph.h> /* for ceph_kvmalloc */ 10 10 11 11 struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) 12 12 { ··· 35 35 struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref); 36 36 37 37 dout("buffer_release %p\n", b); 38 - ceph_kvfree(b->vec.iov_base); 38 + kvfree(b->vec.iov_base); 39 39 kfree(b); 40 40 } 41 41 EXPORT_SYMBOL(ceph_buffer_release);

+13 -8

net/ceph/ceph_common.c

··· 184 184 return __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); 185 185 } 186 186 187 - void ceph_kvfree(const void *ptr) 188 - { 189 - if (is_vmalloc_addr(ptr)) 190 - vfree(ptr); 191 - else 192 - kfree(ptr); 193 - } 194 - 195 187 196 188 static int parse_fsid(const char *str, struct ceph_fsid *fsid) 197 189 { ··· 237 245 Opt_noshare, 238 246 Opt_crc, 239 247 Opt_nocrc, 248 + Opt_cephx_require_signatures, 249 + Opt_nocephx_require_signatures, 240 250 }; 241 251 242 252 static match_table_t opt_tokens = { ··· 257 263 {Opt_noshare, "noshare"}, 258 264 {Opt_crc, "crc"}, 259 265 {Opt_nocrc, "nocrc"}, 266 + {Opt_cephx_require_signatures, "cephx_require_signatures"}, 267 + {Opt_nocephx_require_signatures, "nocephx_require_signatures"}, 260 268 {-1, NULL} 261 269 }; 262 270 ··· 457 461 case Opt_nocrc: 458 462 opt->flags |= CEPH_OPT_NOCRC; 459 463 break; 464 + case Opt_cephx_require_signatures: 465 + opt->flags &= ~CEPH_OPT_NOMSGAUTH; 466 + break; 467 + case Opt_nocephx_require_signatures: 468 + opt->flags |= CEPH_OPT_NOMSGAUTH; 469 + break; 460 470 461 471 default: 462 472 BUG_ON(token); ··· 505 503 mutex_init(&client->mount_mutex); 506 504 init_waitqueue_head(&client->auth_wq); 507 505 client->auth_err = 0; 506 + 507 + if (!ceph_test_opt(client, NOMSGAUTH)) 508 + required_features |= CEPH_FEATURE_MSG_AUTH; 508 509 509 510 client->extra_mon_dispatch = NULL; 510 511 client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT |

+30 -4

net/ceph/messenger.c

··· 1196 1196 dout("prepare_write_message_footer %p\n", con); 1197 1197 con->out_kvec_is_msg = true; 1198 1198 con->out_kvec[v].iov_base = &m->footer; 1199 - con->out_kvec[v].iov_len = sizeof(m->footer); 1200 - con->out_kvec_bytes += sizeof(m->footer); 1199 + if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { 1200 + if (con->ops->sign_message) 1201 + con->ops->sign_message(con, m); 1202 + else 1203 + m->footer.sig = 0; 1204 + con->out_kvec[v].iov_len = sizeof(m->footer); 1205 + con->out_kvec_bytes += sizeof(m->footer); 1206 + } else { 1207 + m->old_footer.flags = m->footer.flags; 1208 + con->out_kvec[v].iov_len = sizeof(m->old_footer); 1209 + con->out_kvec_bytes += sizeof(m->old_footer); 1210 + } 1201 1211 con->out_kvec_left++; 1202 1212 con->out_more = m->more_to_follow; 1203 1213 con->out_msg_done = true; ··· 2259 2249 int ret; 2260 2250 unsigned int front_len, middle_len, data_len; 2261 2251 bool do_datacrc = !con->msgr->nocrc; 2252 + bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH); 2262 2253 u64 seq; 2263 2254 u32 crc; 2264 2255 ··· 2372 2361 } 2373 2362 2374 2363 /* footer */ 2375 - size = sizeof (m->footer); 2364 + if (need_sign) 2365 + size = sizeof(m->footer); 2366 + else 2367 + size = sizeof(m->old_footer); 2368 + 2376 2369 end += size; 2377 2370 ret = read_partial(con, end, size, &m->footer); 2378 2371 if (ret <= 0) 2379 2372 return ret; 2373 + 2374 + if (!need_sign) { 2375 + m->footer.flags = m->old_footer.flags; 2376 + m->footer.sig = 0; 2377 + } 2380 2378 2381 2379 dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", 2382 2380 m, front_len, m->footer.front_crc, middle_len, ··· 2407 2387 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { 2408 2388 pr_err("read_partial_message %p data crc %u != exp. %u\n", m, 2409 2389 con->in_data_crc, le32_to_cpu(m->footer.data_crc)); 2390 + return -EBADMSG; 2391 + } 2392 + 2393 + if (need_sign && con->ops->check_message_signature && 2394 + con->ops->check_message_signature(con, m)) { 2395 + pr_err("read_partial_message %p signature check failed\n", m); 2410 2396 return -EBADMSG; 2411 2397 } 2412 2398 ··· 3314 3288 static void ceph_msg_free(struct ceph_msg *m) 3315 3289 { 3316 3290 dout("%s %p\n", __func__, m); 3317 - ceph_kvfree(m->front.iov_base); 3291 + kvfree(m->front.iov_base); 3318 3292 kmem_cache_free(ceph_msg_cache, m); 3319 3293 } 3320 3294

+88 -30

net/ceph/osd_client.c

··· 292 292 ceph_osd_data_release(&op->cls.request_data); 293 293 ceph_osd_data_release(&op->cls.response_data); 294 294 break; 295 + case CEPH_OSD_OP_SETXATTR: 296 + case CEPH_OSD_OP_CMPXATTR: 297 + ceph_osd_data_release(&op->xattr.osd_data); 298 + break; 295 299 default: 296 300 break; 297 301 } ··· 480 476 size_t payload_len = 0; 481 477 482 478 BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE && 483 - opcode != CEPH_OSD_OP_DELETE && opcode != CEPH_OSD_OP_ZERO && 484 - opcode != CEPH_OSD_OP_TRUNCATE); 479 + opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE); 485 480 486 481 op->extent.offset = offset; 487 482 op->extent.length = length; ··· 547 544 op->payload_len = payload_len; 548 545 } 549 546 EXPORT_SYMBOL(osd_req_op_cls_init); 547 + 548 + int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, 549 + u16 opcode, const char *name, const void *value, 550 + size_t size, u8 cmp_op, u8 cmp_mode) 551 + { 552 + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); 553 + struct ceph_pagelist *pagelist; 554 + size_t payload_len; 555 + 556 + BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR); 557 + 558 + pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); 559 + if (!pagelist) 560 + return -ENOMEM; 561 + 562 + ceph_pagelist_init(pagelist); 563 + 564 + payload_len = strlen(name); 565 + op->xattr.name_len = payload_len; 566 + ceph_pagelist_append(pagelist, name, payload_len); 567 + 568 + op->xattr.value_len = size; 569 + ceph_pagelist_append(pagelist, value, size); 570 + payload_len += size; 571 + 572 + op->xattr.cmp_op = cmp_op; 573 + op->xattr.cmp_mode = cmp_mode; 574 + 575 + ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist); 576 + op->payload_len = payload_len; 577 + return 0; 578 + } 579 + EXPORT_SYMBOL(osd_req_op_xattr_init); 550 580 551 581 void osd_req_op_watch_init(struct ceph_osd_request *osd_req, 552 582 unsigned int which, u16 opcode, ··· 662 626 case CEPH_OSD_OP_READ: 663 627 case CEPH_OSD_OP_WRITE: 664 628 case CEPH_OSD_OP_ZERO: 665 - case CEPH_OSD_OP_DELETE: 666 629 case CEPH_OSD_OP_TRUNCATE: 667 630 if (src->op == CEPH_OSD_OP_WRITE) 668 631 request_data_len = src->extent.length; ··· 711 676 dst->alloc_hint.expected_write_size = 712 677 cpu_to_le64(src->alloc_hint.expected_write_size); 713 678 break; 679 + case CEPH_OSD_OP_SETXATTR: 680 + case CEPH_OSD_OP_CMPXATTR: 681 + dst->xattr.name_len = cpu_to_le32(src->xattr.name_len); 682 + dst->xattr.value_len = cpu_to_le32(src->xattr.value_len); 683 + dst->xattr.cmp_op = src->xattr.cmp_op; 684 + dst->xattr.cmp_mode = src->xattr.cmp_mode; 685 + osd_data = &src->xattr.osd_data; 686 + ceph_osdc_msg_data_add(req->r_request, osd_data); 687 + request_data_len = osd_data->pagelist->length; 688 + break; 689 + case CEPH_OSD_OP_CREATE: 690 + case CEPH_OSD_OP_DELETE: 691 + break; 714 692 default: 715 693 pr_err("unsupported osd opcode %s\n", 716 694 ceph_osd_op_name(src->op)); ··· 753 705 struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, 754 706 struct ceph_file_layout *layout, 755 707 struct ceph_vino vino, 756 - u64 off, u64 *plen, int num_ops, 708 + u64 off, u64 *plen, 709 + unsigned int which, int num_ops, 757 710 int opcode, int flags, 758 711 struct ceph_snap_context *snapc, 759 712 u32 truncate_seq, ··· 765 716 u64 objnum = 0; 766 717 u64 objoff = 0; 767 718 u64 objlen = 0; 768 - u32 object_size; 769 - u64 object_base; 770 719 int r; 771 720 772 721 BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE && 773 - opcode != CEPH_OSD_OP_DELETE && opcode != CEPH_OSD_OP_ZERO && 774 - opcode != CEPH_OSD_OP_TRUNCATE); 722 + opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE && 723 + opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE); 775 724 776 725 req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool, 777 726 GFP_NOFS); ··· 785 738 return ERR_PTR(r); 786 739 } 787 740 788 - object_size = le32_to_cpu(layout->fl_object_size); 789 - object_base = off - objoff; 790 - if (!(truncate_seq == 1 && truncate_size == -1ULL)) { 791 - if (truncate_size <= object_base) { 792 - truncate_size = 0; 793 - } else { 794 - truncate_size -= object_base; 795 - if (truncate_size > object_size) 796 - truncate_size = object_size; 741 + if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) { 742 + osd_req_op_init(req, which, opcode); 743 + } else { 744 + u32 object_size = le32_to_cpu(layout->fl_object_size); 745 + u32 object_base = off - objoff; 746 + if (!(truncate_seq == 1 && truncate_size == -1ULL)) { 747 + if (truncate_size <= object_base) { 748 + truncate_size = 0; 749 + } else { 750 + truncate_size -= object_base; 751 + if (truncate_size > object_size) 752 + truncate_size = object_size; 753 + } 797 754 } 755 + osd_req_op_extent_init(req, which, opcode, objoff, objlen, 756 + truncate_size, truncate_seq); 798 757 } 799 - 800 - osd_req_op_extent_init(req, 0, opcode, objoff, objlen, 801 - truncate_size, truncate_seq); 802 - 803 - /* 804 - * A second op in the ops array means the caller wants to 805 - * also issue a include a 'startsync' command so that the 806 - * osd will flush data quickly. 807 - */ 808 - if (num_ops > 1) 809 - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); 810 758 811 759 req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); 812 760 ··· 2668 2626 2669 2627 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino, 2670 2628 vino.snap, off, *plen); 2671 - req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 1, 2629 + req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1, 2672 2630 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 2673 2631 NULL, truncate_seq, truncate_size, 2674 2632 false); ··· 2711 2669 int page_align = off & ~PAGE_MASK; 2712 2670 2713 2671 BUG_ON(vino.snap != CEPH_NOSNAP); /* snapshots aren't writeable */ 2714 - req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 1, 2672 + req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, 2715 2673 CEPH_OSD_OP_WRITE, 2716 2674 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, 2717 2675 snapc, truncate_seq, truncate_size, ··· 2962 2920 return ceph_monc_validate_auth(&osdc->client->monc); 2963 2921 } 2964 2922 2923 + static int sign_message(struct ceph_connection *con, struct ceph_msg *msg) 2924 + { 2925 + struct ceph_osd *o = con->private; 2926 + struct ceph_auth_handshake *auth = &o->o_auth; 2927 + return ceph_auth_sign_message(auth, msg); 2928 + } 2929 + 2930 + static int check_message_signature(struct ceph_connection *con, struct ceph_msg *msg) 2931 + { 2932 + struct ceph_osd *o = con->private; 2933 + struct ceph_auth_handshake *auth = &o->o_auth; 2934 + return ceph_auth_check_message_signature(auth, msg); 2935 + } 2936 + 2965 2937 static const struct ceph_connection_operations osd_con_ops = { 2966 2938 .get = get_osd_con, 2967 2939 .put = put_osd_con, ··· 2984 2928 .verify_authorizer_reply = verify_authorizer_reply, 2985 2929 .invalidate_authorizer = invalidate_authorizer, 2986 2930 .alloc_msg = alloc_msg, 2931 + .sign_message = sign_message, 2932 + .check_message_signature = check_message_signature, 2987 2933 .fault = osd_reset, 2988 2934 };