Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull ceph updates from Sage Weil:
"The big item here is support for inline data for CephFS and for
message signatures from Zheng. There are also several bug fixes,
including interrupted flock request handling, 0-length xattrs, mksnap,
cached readdir results, and a message version compat field. Finally
there are several cleanups from Ilya, Dan, and Markus.

Note that there is another series coming soon that fixes some bugs in
the RBD 'lingering' requests, but it isn't quite ready yet."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (27 commits)
ceph: fix setting empty extended attribute
ceph: fix mksnap crash
ceph: do_sync is never initialized
libceph: fixup includes in pagelist.h
ceph: support inline data feature
ceph: flush inline version
ceph: convert inline data to normal data before data write
ceph: sync read inline data
ceph: fetch inline data when getting Fcr cap refs
ceph: use getattr request to fetch inline data
ceph: add inline data to pagecache
ceph: parse inline data in MClientReply and MClientCaps
libceph: specify position of extent operation
libceph: add CREATE osd operation support
libceph: add SETXATTR/CMPXATTR osd operations support
rbd: don't treat CEPH_OSD_OP_DELETE as extent op
ceph: remove unused stringification macros
libceph: require cephx message signature by default
ceph: introduce global empty snap context
ceph: message versioning fixes
...

+992 -180
+7 -4
drivers/block/rbd.c
··· 2370 2370 opcode = CEPH_OSD_OP_READ; 2371 2371 } 2372 2372 2373 - osd_req_op_extent_init(osd_request, num_ops, opcode, offset, length, 2374 - 0, 0); 2373 + if (opcode == CEPH_OSD_OP_DELETE) 2374 + osd_req_op_init(osd_request, num_ops, opcode); 2375 + else 2376 + osd_req_op_extent_init(osd_request, num_ops, opcode, 2377 + offset, length, 0, 0); 2378 + 2375 2379 if (obj_request->type == OBJ_REQUEST_BIO) 2376 2380 osd_req_op_extent_osd_data_bio(osd_request, num_ops, 2377 2381 obj_request->bio_list, length); ··· 3409 3405 if (result) 3410 3406 rbd_warn(rbd_dev, "%s %llx at %llx result %d", 3411 3407 obj_op_name(op_type), length, offset, result); 3412 - if (snapc) 3413 - ceph_put_snap_context(snapc); 3408 + ceph_put_snap_context(snapc); 3414 3409 blk_end_request_all(rq, result); 3415 3410 } 3416 3411
+262 -11
fs/ceph/addr.c
··· 192 192 struct ceph_osd_client *osdc = 193 193 &ceph_inode_to_client(inode)->client->osdc; 194 194 int err = 0; 195 + u64 off = page_offset(page); 195 196 u64 len = PAGE_CACHE_SIZE; 196 197 197 - err = ceph_readpage_from_fscache(inode, page); 198 + if (off >= i_size_read(inode)) { 199 + zero_user_segment(page, err, PAGE_CACHE_SIZE); 200 + SetPageUptodate(page); 201 + return 0; 202 + } 198 203 204 + /* 205 + * Uptodate inline data should have been added into page cache 206 + * while getting Fcr caps. 207 + */ 208 + if (ci->i_inline_version != CEPH_INLINE_NONE) 209 + return -EINVAL; 210 + 211 + err = ceph_readpage_from_fscache(inode, page); 199 212 if (err == 0) 200 213 goto out; 201 214 202 215 dout("readpage inode %p file %p page %p index %lu\n", 203 216 inode, filp, page, page->index); 204 217 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 205 - (u64) page_offset(page), &len, 218 + off, &len, 206 219 ci->i_truncate_seq, ci->i_truncate_size, 207 220 &page, 1, 0); 208 221 if (err == -ENOENT) ··· 332 319 off, len); 333 320 vino = ceph_vino(inode); 334 321 req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, 335 - 1, CEPH_OSD_OP_READ, 322 + 0, 1, CEPH_OSD_OP_READ, 336 323 CEPH_OSD_FLAG_READ, NULL, 337 324 ci->i_truncate_seq, ci->i_truncate_size, 338 325 false); ··· 396 383 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 397 384 int rc = 0; 398 385 int max = 0; 386 + 387 + if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE) 388 + return -EINVAL; 399 389 400 390 rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list, 401 391 &nr_pages); ··· 689 673 int rc = 0; 690 674 unsigned wsize = 1 << inode->i_blkbits; 691 675 struct ceph_osd_request *req = NULL; 692 - int do_sync; 676 + int do_sync = 0; 693 677 u64 truncate_size, snap_size; 694 678 u32 truncate_seq; 695 679 ··· 766 750 last_snapc = snapc; 767 751 768 752 while (!done && index <= end) { 769 - int num_ops = do_sync ? 
2 : 1; 770 753 unsigned i; 771 754 int first; 772 755 pgoff_t next; ··· 865 850 len = wsize; 866 851 req = ceph_osdc_new_request(&fsc->client->osdc, 867 852 &ci->i_layout, vino, 868 - offset, &len, num_ops, 853 + offset, &len, 0, 854 + do_sync ? 2 : 1, 869 855 CEPH_OSD_OP_WRITE, 870 856 CEPH_OSD_FLAG_WRITE | 871 857 CEPH_OSD_FLAG_ONDISK, ··· 877 861 unlock_page(page); 878 862 break; 879 863 } 864 + 865 + if (do_sync) 866 + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); 880 867 881 868 req->r_callback = writepages_finish; 882 869 req->r_inode = inode; ··· 1223 1204 struct inode *inode = file_inode(vma->vm_file); 1224 1205 struct ceph_inode_info *ci = ceph_inode(inode); 1225 1206 struct ceph_file_info *fi = vma->vm_file->private_data; 1207 + struct page *pinned_page = NULL; 1226 1208 loff_t off = vmf->pgoff << PAGE_CACHE_SHIFT; 1227 1209 int want, got, ret; 1228 1210 ··· 1235 1215 want = CEPH_CAP_FILE_CACHE; 1236 1216 while (1) { 1237 1217 got = 0; 1238 - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); 1218 + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, 1219 + -1, &got, &pinned_page); 1239 1220 if (ret == 0) 1240 1221 break; 1241 1222 if (ret != -ERESTARTSYS) { ··· 1247 1226 dout("filemap_fault %p %llu~%zd got cap refs on %s\n", 1248 1227 inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got)); 1249 1228 1250 - ret = filemap_fault(vma, vmf); 1229 + if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || 1230 + ci->i_inline_version == CEPH_INLINE_NONE) 1231 + ret = filemap_fault(vma, vmf); 1232 + else 1233 + ret = -EAGAIN; 1251 1234 1252 1235 dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n", 1253 1236 inode, off, (size_t)PAGE_CACHE_SIZE, ceph_cap_string(got), ret); 1237 + if (pinned_page) 1238 + page_cache_release(pinned_page); 1254 1239 ceph_put_cap_refs(ci, got); 1255 1240 1241 + if (ret != -EAGAIN) 1242 + return ret; 1243 + 1244 + /* read inline data */ 1245 + if (off >= PAGE_CACHE_SIZE) { 1246 + /* does not support inline 
data > PAGE_SIZE */ 1247 + ret = VM_FAULT_SIGBUS; 1248 + } else { 1249 + int ret1; 1250 + struct address_space *mapping = inode->i_mapping; 1251 + struct page *page = find_or_create_page(mapping, 0, 1252 + mapping_gfp_mask(mapping) & 1253 + ~__GFP_FS); 1254 + if (!page) { 1255 + ret = VM_FAULT_OOM; 1256 + goto out; 1257 + } 1258 + ret1 = __ceph_do_getattr(inode, page, 1259 + CEPH_STAT_CAP_INLINE_DATA, true); 1260 + if (ret1 < 0 || off >= i_size_read(inode)) { 1261 + unlock_page(page); 1262 + page_cache_release(page); 1263 + ret = VM_FAULT_SIGBUS; 1264 + goto out; 1265 + } 1266 + if (ret1 < PAGE_CACHE_SIZE) 1267 + zero_user_segment(page, ret1, PAGE_CACHE_SIZE); 1268 + else 1269 + flush_dcache_page(page); 1270 + SetPageUptodate(page); 1271 + vmf->page = page; 1272 + ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; 1273 + } 1274 + out: 1275 + dout("filemap_fault %p %llu~%zd read inline data ret %d\n", 1276 + inode, off, (size_t)PAGE_CACHE_SIZE, ret); 1256 1277 return ret; 1257 1278 } 1258 1279 ··· 1313 1250 size_t len; 1314 1251 int want, got, ret; 1315 1252 1253 + if (ci->i_inline_version != CEPH_INLINE_NONE) { 1254 + struct page *locked_page = NULL; 1255 + if (off == 0) { 1256 + lock_page(page); 1257 + locked_page = page; 1258 + } 1259 + ret = ceph_uninline_data(vma->vm_file, locked_page); 1260 + if (locked_page) 1261 + unlock_page(locked_page); 1262 + if (ret < 0) 1263 + return VM_FAULT_SIGBUS; 1264 + } 1265 + 1316 1266 if (off + PAGE_CACHE_SIZE <= size) 1317 1267 len = PAGE_CACHE_SIZE; 1318 1268 else ··· 1339 1263 want = CEPH_CAP_FILE_BUFFER; 1340 1264 while (1) { 1341 1265 got = 0; 1342 - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, off + len); 1266 + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, 1267 + &got, NULL); 1343 1268 if (ret == 0) 1344 1269 break; 1345 1270 if (ret != -ERESTARTSYS) { ··· 1374 1297 ret = VM_FAULT_SIGBUS; 1375 1298 } 1376 1299 out: 1377 - if (ret != VM_FAULT_LOCKED) { 1300 + if (ret != VM_FAULT_LOCKED) 1378 1301 
unlock_page(page); 1379 - } else { 1302 + if (ret == VM_FAULT_LOCKED || 1303 + ci->i_inline_version != CEPH_INLINE_NONE) { 1380 1304 int dirty; 1381 1305 spin_lock(&ci->i_ceph_lock); 1306 + ci->i_inline_version = CEPH_INLINE_NONE; 1382 1307 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); 1383 1308 spin_unlock(&ci->i_ceph_lock); 1384 1309 if (dirty) ··· 1392 1313 ceph_put_cap_refs(ci, got); 1393 1314 1394 1315 return ret; 1316 + } 1317 + 1318 + void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, 1319 + char *data, size_t len) 1320 + { 1321 + struct address_space *mapping = inode->i_mapping; 1322 + struct page *page; 1323 + 1324 + if (locked_page) { 1325 + page = locked_page; 1326 + } else { 1327 + if (i_size_read(inode) == 0) 1328 + return; 1329 + page = find_or_create_page(mapping, 0, 1330 + mapping_gfp_mask(mapping) & ~__GFP_FS); 1331 + if (!page) 1332 + return; 1333 + if (PageUptodate(page)) { 1334 + unlock_page(page); 1335 + page_cache_release(page); 1336 + return; 1337 + } 1338 + } 1339 + 1340 + dout("fill_inline_data %p %llx.%llx len %lu locked_page %p\n", 1341 + inode, ceph_vinop(inode), len, locked_page); 1342 + 1343 + if (len > 0) { 1344 + void *kaddr = kmap_atomic(page); 1345 + memcpy(kaddr, data, len); 1346 + kunmap_atomic(kaddr); 1347 + } 1348 + 1349 + if (page != locked_page) { 1350 + if (len < PAGE_CACHE_SIZE) 1351 + zero_user_segment(page, len, PAGE_CACHE_SIZE); 1352 + else 1353 + flush_dcache_page(page); 1354 + 1355 + SetPageUptodate(page); 1356 + unlock_page(page); 1357 + page_cache_release(page); 1358 + } 1359 + } 1360 + 1361 + int ceph_uninline_data(struct file *filp, struct page *locked_page) 1362 + { 1363 + struct inode *inode = file_inode(filp); 1364 + struct ceph_inode_info *ci = ceph_inode(inode); 1365 + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 1366 + struct ceph_osd_request *req; 1367 + struct page *page = NULL; 1368 + u64 len, inline_version; 1369 + int err = 0; 1370 + bool from_pagecache = 
false; 1371 + 1372 + spin_lock(&ci->i_ceph_lock); 1373 + inline_version = ci->i_inline_version; 1374 + spin_unlock(&ci->i_ceph_lock); 1375 + 1376 + dout("uninline_data %p %llx.%llx inline_version %llu\n", 1377 + inode, ceph_vinop(inode), inline_version); 1378 + 1379 + if (inline_version == 1 || /* initial version, no data */ 1380 + inline_version == CEPH_INLINE_NONE) 1381 + goto out; 1382 + 1383 + if (locked_page) { 1384 + page = locked_page; 1385 + WARN_ON(!PageUptodate(page)); 1386 + } else if (ceph_caps_issued(ci) & 1387 + (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) { 1388 + page = find_get_page(inode->i_mapping, 0); 1389 + if (page) { 1390 + if (PageUptodate(page)) { 1391 + from_pagecache = true; 1392 + lock_page(page); 1393 + } else { 1394 + page_cache_release(page); 1395 + page = NULL; 1396 + } 1397 + } 1398 + } 1399 + 1400 + if (page) { 1401 + len = i_size_read(inode); 1402 + if (len > PAGE_CACHE_SIZE) 1403 + len = PAGE_CACHE_SIZE; 1404 + } else { 1405 + page = __page_cache_alloc(GFP_NOFS); 1406 + if (!page) { 1407 + err = -ENOMEM; 1408 + goto out; 1409 + } 1410 + err = __ceph_do_getattr(inode, page, 1411 + CEPH_STAT_CAP_INLINE_DATA, true); 1412 + if (err < 0) { 1413 + /* no inline data */ 1414 + if (err == -ENODATA) 1415 + err = 0; 1416 + goto out; 1417 + } 1418 + len = err; 1419 + } 1420 + 1421 + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 1422 + ceph_vino(inode), 0, &len, 0, 1, 1423 + CEPH_OSD_OP_CREATE, 1424 + CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, 1425 + ci->i_snap_realm->cached_context, 1426 + 0, 0, false); 1427 + if (IS_ERR(req)) { 1428 + err = PTR_ERR(req); 1429 + goto out; 1430 + } 1431 + 1432 + ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); 1433 + err = ceph_osdc_start_request(&fsc->client->osdc, req, false); 1434 + if (!err) 1435 + err = ceph_osdc_wait_request(&fsc->client->osdc, req); 1436 + ceph_osdc_put_request(req); 1437 + if (err < 0) 1438 + goto out; 1439 + 1440 + req = 
ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 1441 + ceph_vino(inode), 0, &len, 1, 3, 1442 + CEPH_OSD_OP_WRITE, 1443 + CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, 1444 + ci->i_snap_realm->cached_context, 1445 + ci->i_truncate_seq, ci->i_truncate_size, 1446 + false); 1447 + if (IS_ERR(req)) { 1448 + err = PTR_ERR(req); 1449 + goto out; 1450 + } 1451 + 1452 + osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false); 1453 + 1454 + err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR, 1455 + "inline_version", &inline_version, 1456 + sizeof(inline_version), 1457 + CEPH_OSD_CMPXATTR_OP_GT, 1458 + CEPH_OSD_CMPXATTR_MODE_U64); 1459 + if (err) 1460 + goto out_put; 1461 + 1462 + err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR, 1463 + "inline_version", &inline_version, 1464 + sizeof(inline_version), 0, 0); 1465 + if (err) 1466 + goto out_put; 1467 + 1468 + ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); 1469 + err = ceph_osdc_start_request(&fsc->client->osdc, req, false); 1470 + if (!err) 1471 + err = ceph_osdc_wait_request(&fsc->client->osdc, req); 1472 + out_put: 1473 + ceph_osdc_put_request(req); 1474 + if (err == -ECANCELED) 1475 + err = 0; 1476 + out: 1477 + if (page && page != locked_page) { 1478 + if (from_pagecache) { 1479 + unlock_page(page); 1480 + page_cache_release(page); 1481 + } else 1482 + __free_pages(page, 0); 1483 + } 1484 + 1485 + dout("uninline_data %p %llx.%llx inline_version %llu = %d\n", 1486 + inode, ceph_vinop(inode), inline_version, err); 1487 + return err; 1395 1488 } 1396 1489 1397 1490 static struct vm_operations_struct ceph_vmops = {
+102 -30
fs/ceph/caps.c
··· 975 975 kuid_t uid, kgid_t gid, umode_t mode, 976 976 u64 xattr_version, 977 977 struct ceph_buffer *xattrs_buf, 978 - u64 follows) 978 + u64 follows, bool inline_data) 979 979 { 980 980 struct ceph_mds_caps *fc; 981 981 struct ceph_msg *msg; 982 + void *p; 983 + size_t extra_len; 982 984 983 985 dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" 984 986 " seq %u/%u mseq %u follows %lld size %llu/%llu" ··· 990 988 seq, issue_seq, mseq, follows, size, max_size, 991 989 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); 992 990 993 - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS, false); 991 + /* flock buffer size + inline version + inline data size */ 992 + extra_len = 4 + 8 + 4; 993 + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, 994 + GFP_NOFS, false); 994 995 if (!msg) 995 996 return -ENOMEM; 996 997 ··· 1024 1019 fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid)); 1025 1020 fc->gid = cpu_to_le32(from_kgid(&init_user_ns, gid)); 1026 1021 fc->mode = cpu_to_le32(mode); 1022 + 1023 + p = fc + 1; 1024 + /* flock buffer size */ 1025 + ceph_encode_32(&p, 0); 1026 + /* inline version */ 1027 + ceph_encode_64(&p, inline_data ? 
0 : CEPH_INLINE_NONE); 1028 + /* inline data size */ 1029 + ceph_encode_32(&p, 0); 1027 1030 1028 1031 fc->xattr_version = cpu_to_le64(xattr_version); 1029 1032 if (xattrs_buf) { ··· 1139 1126 u64 flush_tid = 0; 1140 1127 int i; 1141 1128 int ret; 1129 + bool inline_data; 1142 1130 1143 1131 held = cap->issued | cap->implemented; 1144 1132 revoking = cap->implemented & ~cap->issued; ··· 1223 1209 xattr_version = ci->i_xattrs.version; 1224 1210 } 1225 1211 1212 + inline_data = ci->i_inline_version != CEPH_INLINE_NONE; 1213 + 1226 1214 spin_unlock(&ci->i_ceph_lock); 1227 1215 1228 1216 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, 1229 1217 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, 1230 1218 size, max_size, &mtime, &atime, time_warp_seq, 1231 1219 uid, gid, mode, xattr_version, xattr_blob, 1232 - follows); 1220 + follows, inline_data); 1233 1221 if (ret < 0) { 1234 1222 dout("error sending cap msg, must requeue %p\n", inode); 1235 1223 delayed = 1; ··· 1352 1336 capsnap->time_warp_seq, 1353 1337 capsnap->uid, capsnap->gid, capsnap->mode, 1354 1338 capsnap->xattr_version, capsnap->xattr_blob, 1355 - capsnap->follows); 1339 + capsnap->follows, capsnap->inline_data); 1356 1340 1357 1341 next_follows = capsnap->follows + 1; 1358 1342 ceph_put_cap_snap(capsnap); ··· 2073 2057 * requested from the MDS. 
2074 2058 */ 2075 2059 static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, 2076 - int *got, loff_t endoff, int *check_max, int *err) 2060 + loff_t endoff, int *got, struct page **pinned_page, 2061 + int *check_max, int *err) 2077 2062 { 2078 2063 struct inode *inode = &ci->vfs_inode; 2079 2064 int ret = 0; 2080 - int have, implemented; 2065 + int have, implemented, _got = 0; 2081 2066 int file_wanted; 2082 2067 2083 2068 dout("get_cap_refs %p need %s want %s\n", inode, 2084 2069 ceph_cap_string(need), ceph_cap_string(want)); 2070 + again: 2085 2071 spin_lock(&ci->i_ceph_lock); 2086 2072 2087 2073 /* make sure file is actually open */ ··· 2093 2075 ceph_cap_string(need), ceph_cap_string(file_wanted)); 2094 2076 *err = -EBADF; 2095 2077 ret = 1; 2096 - goto out; 2078 + goto out_unlock; 2097 2079 } 2098 2080 2099 2081 /* finish pending truncate */ ··· 2113 2095 *check_max = 1; 2114 2096 ret = 1; 2115 2097 } 2116 - goto out; 2098 + goto out_unlock; 2117 2099 } 2118 2100 /* 2119 2101 * If a sync write is in progress, we must wait, so that we ··· 2121 2103 */ 2122 2104 if (__ceph_have_pending_cap_snap(ci)) { 2123 2105 dout("get_cap_refs %p cap_snap_pending\n", inode); 2124 - goto out; 2106 + goto out_unlock; 2125 2107 } 2126 2108 } 2127 2109 ··· 2138 2120 inode, ceph_cap_string(have), ceph_cap_string(not), 2139 2121 ceph_cap_string(revoking)); 2140 2122 if ((revoking & not) == 0) { 2141 - *got = need | (have & want); 2142 - __take_cap_refs(ci, *got); 2123 + _got = need | (have & want); 2124 + __take_cap_refs(ci, _got); 2143 2125 ret = 1; 2144 2126 } 2145 2127 } else { 2146 2128 dout("get_cap_refs %p have %s needed %s\n", inode, 2147 2129 ceph_cap_string(have), ceph_cap_string(need)); 2148 2130 } 2149 - out: 2131 + out_unlock: 2150 2132 spin_unlock(&ci->i_ceph_lock); 2133 + 2134 + if (ci->i_inline_version != CEPH_INLINE_NONE && 2135 + (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && 2136 + i_size_read(inode) > 0) { 2137 + int ret1; 2138 + 
struct page *page = find_get_page(inode->i_mapping, 0); 2139 + if (page) { 2140 + if (PageUptodate(page)) { 2141 + *pinned_page = page; 2142 + goto out; 2143 + } 2144 + page_cache_release(page); 2145 + } 2146 + /* 2147 + * drop cap refs first because getattr while holding 2148 + * caps refs can cause deadlock. 2149 + */ 2150 + ceph_put_cap_refs(ci, _got); 2151 + _got = 0; 2152 + 2153 + /* getattr request will bring inline data into page cache */ 2154 + ret1 = __ceph_do_getattr(inode, NULL, 2155 + CEPH_STAT_CAP_INLINE_DATA, true); 2156 + if (ret1 >= 0) { 2157 + ret = 0; 2158 + goto again; 2159 + } 2160 + *err = ret1; 2161 + ret = 1; 2162 + } 2163 + out: 2151 2164 dout("get_cap_refs %p ret %d got %s\n", inode, 2152 - ret, ceph_cap_string(*got)); 2165 + ret, ceph_cap_string(_got)); 2166 + *got = _got; 2153 2167 return ret; 2154 2168 } 2155 2169 ··· 2218 2168 * due to a small max_size, make sure we check_max_size (and possibly 2219 2169 * ask the mds) so we don't get hung up indefinitely. 2220 2170 */ 2221 - int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got, 2222 - loff_t endoff) 2171 + int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, 2172 + loff_t endoff, int *got, struct page **pinned_page) 2223 2173 { 2224 2174 int check_max, ret, err; 2225 2175 ··· 2229 2179 check_max = 0; 2230 2180 err = 0; 2231 2181 ret = wait_event_interruptible(ci->i_cap_wq, 2232 - try_get_cap_refs(ci, need, want, 2233 - got, endoff, 2182 + try_get_cap_refs(ci, need, want, endoff, 2183 + got, pinned_page, 2234 2184 &check_max, &err)); 2235 2185 if (err) 2236 2186 ret = err; ··· 2433 2383 static void handle_cap_grant(struct ceph_mds_client *mdsc, 2434 2384 struct inode *inode, struct ceph_mds_caps *grant, 2435 2385 void *snaptrace, int snaptrace_len, 2386 + u64 inline_version, 2387 + void *inline_data, int inline_len, 2436 2388 struct ceph_buffer *xattr_buf, 2437 2389 struct ceph_mds_session *session, 2438 2390 struct ceph_cap *cap, int issued) ··· 2455 
2403 bool queue_invalidate = false; 2456 2404 bool queue_revalidate = false; 2457 2405 bool deleted_inode = false; 2406 + bool fill_inline = false; 2458 2407 2459 2408 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", 2460 2409 inode, cap, mds, seq, ceph_cap_string(newcaps)); ··· 2629 2576 } 2630 2577 BUG_ON(cap->issued & ~cap->implemented); 2631 2578 2579 + if (inline_version > 0 && inline_version >= ci->i_inline_version) { 2580 + ci->i_inline_version = inline_version; 2581 + if (ci->i_inline_version != CEPH_INLINE_NONE && 2582 + (newcaps & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO))) 2583 + fill_inline = true; 2584 + } 2585 + 2632 2586 spin_unlock(&ci->i_ceph_lock); 2633 2587 2634 2588 if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { ··· 2648 2588 if (newcaps & ~issued) 2649 2589 wake = true; 2650 2590 } 2591 + 2592 + if (fill_inline) 2593 + ceph_fill_inline_data(inode, NULL, inline_data, inline_len); 2651 2594 2652 2595 if (queue_trunc) { 2653 2596 ceph_queue_vmtruncate(inode); ··· 3059 2996 u64 cap_id; 3060 2997 u64 size, max_size; 3061 2998 u64 tid; 2999 + u64 inline_version = 0; 3000 + void *inline_data = NULL; 3001 + u32 inline_len = 0; 3062 3002 void *snaptrace; 3063 3003 size_t snaptrace_len; 3064 - void *flock; 3065 - void *end; 3066 - u32 flock_len; 3004 + void *p, *end; 3067 3005 3068 3006 dout("handle_caps from mds%d\n", mds); 3069 3007 ··· 3085 3021 3086 3022 snaptrace = h + 1; 3087 3023 snaptrace_len = le32_to_cpu(h->snap_trace_len); 3024 + p = snaptrace + snaptrace_len; 3088 3025 3089 3026 if (le16_to_cpu(msg->hdr.version) >= 2) { 3090 - void *p = snaptrace + snaptrace_len; 3027 + u32 flock_len; 3091 3028 ceph_decode_32_safe(&p, end, flock_len, bad); 3092 3029 if (p + flock_len > end) 3093 3030 goto bad; 3094 - flock = p; 3095 - } else { 3096 - flock = NULL; 3097 - flock_len = 0; 3031 + p += flock_len; 3098 3032 } 3099 3033 3100 3034 if (le16_to_cpu(msg->hdr.version) >= 3) { 3101 3035 if (op == CEPH_CAP_OP_IMPORT) { 3102 - void *p = 
flock + flock_len; 3103 3036 if (p + sizeof(*peer) > end) 3104 3037 goto bad; 3105 3038 peer = p; 3039 + p += sizeof(*peer); 3106 3040 } else if (op == CEPH_CAP_OP_EXPORT) { 3107 3041 /* recorded in unused fields */ 3108 3042 peer = (void *)&h->size; 3109 3043 } 3044 + } 3045 + 3046 + if (le16_to_cpu(msg->hdr.version) >= 4) { 3047 + ceph_decode_64_safe(&p, end, inline_version, bad); 3048 + ceph_decode_32_safe(&p, end, inline_len, bad); 3049 + if (p + inline_len > end) 3050 + goto bad; 3051 + inline_data = p; 3052 + p += inline_len; 3110 3053 } 3111 3054 3112 3055 /* lookup ino */ ··· 3156 3085 handle_cap_import(mdsc, inode, h, peer, session, 3157 3086 &cap, &issued); 3158 3087 handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len, 3088 + inline_version, inline_data, inline_len, 3159 3089 msg->middle, session, cap, issued); 3160 3090 goto done_unlocked; 3161 3091 } ··· 3177 3105 case CEPH_CAP_OP_GRANT: 3178 3106 __ceph_caps_issued(ci, &issued); 3179 3107 issued |= __ceph_caps_dirty(ci); 3180 - handle_cap_grant(mdsc, inode, h, NULL, 0, msg->middle, 3181 - session, cap, issued); 3108 + handle_cap_grant(mdsc, inode, h, NULL, 0, 3109 + inline_version, inline_data, inline_len, 3110 + msg->middle, session, cap, issued); 3182 3111 goto done_unlocked; 3183 3112 3184 3113 case CEPH_CAP_OP_FLUSH_ACK: ··· 3210 3137 done: 3211 3138 mutex_unlock(&session->s_mutex); 3212 3139 done_unlocked: 3213 - if (inode) 3214 - iput(inode); 3140 + iput(inode); 3215 3141 return; 3216 3142 3217 3143 bad:
+18 -9
fs/ceph/dir.c
··· 183 183 spin_unlock(&parent->d_lock); 184 184 185 185 /* make sure a dentry wasn't dropped while we didn't have parent lock */ 186 - if (!ceph_dir_is_complete(dir)) { 186 + if (!ceph_dir_is_complete_ordered(dir)) { 187 187 dout(" lost dir complete on %p; falling back to mds\n", dir); 188 188 dput(dentry); 189 189 err = -EAGAIN; ··· 261 261 262 262 /* always start with . and .. */ 263 263 if (ctx->pos == 0) { 264 - /* note dir version at start of readdir so we can tell 265 - * if any dentries get dropped */ 266 - fi->dir_release_count = atomic_read(&ci->i_release_count); 267 - 268 264 dout("readdir off 0 -> '.'\n"); 269 265 if (!dir_emit(ctx, ".", 1, 270 266 ceph_translate_ino(inode->i_sb, inode->i_ino), ··· 285 289 if ((ctx->pos == 2 || fi->dentry) && 286 290 !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && 287 291 ceph_snap(inode) != CEPH_SNAPDIR && 288 - __ceph_dir_is_complete(ci) && 292 + __ceph_dir_is_complete_ordered(ci) && 289 293 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) { 290 294 u32 shared_gen = ci->i_shared_gen; 291 295 spin_unlock(&ci->i_ceph_lock); ··· 307 311 } 308 312 309 313 /* proceed with a normal readdir */ 314 + 315 + if (ctx->pos == 2) { 316 + /* note dir version at start of readdir so we can tell 317 + * if any dentries get dropped */ 318 + fi->dir_release_count = atomic_read(&ci->i_release_count); 319 + fi->dir_ordered_count = ci->i_ordered_count; 320 + } 310 321 311 322 more: 312 323 /* do we have the correct frag content buffered? 
*/ ··· 449 446 */ 450 447 spin_lock(&ci->i_ceph_lock); 451 448 if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { 452 - dout(" marking %p complete\n", inode); 453 - __ceph_dir_set_complete(ci, fi->dir_release_count); 449 + if (ci->i_ordered_count == fi->dir_ordered_count) 450 + dout(" marking %p complete and ordered\n", inode); 451 + else 452 + dout(" marking %p complete\n", inode); 453 + __ceph_dir_set_complete(ci, fi->dir_release_count, 454 + fi->dir_ordered_count); 454 455 } 455 456 spin_unlock(&ci->i_ceph_lock); 456 457 ··· 812 805 acls.pagelist = NULL; 813 806 } 814 807 err = ceph_mdsc_do_request(mdsc, dir, req); 815 - if (!err && !req->r_reply_info.head->is_dentry) 808 + if (!err && 809 + !req->r_reply_info.head->is_target && 810 + !req->r_reply_info.head->is_dentry) 816 811 err = ceph_handle_notrace_create(dir, dentry); 817 812 ceph_mdsc_put_request(req); 818 813 out:
+83 -14
fs/ceph/file.c
··· 333 333 return 0; 334 334 } 335 335 336 + enum { 337 + CHECK_EOF = 1, 338 + READ_INLINE = 2, 339 + }; 340 + 336 341 /* 337 342 * Read a range of bytes striped over one or more objects. Iterate over 338 343 * objects we stripe over. (That's not atomic, but good enough for now.) ··· 417 412 ret = read; 418 413 /* did we bounce off eof? */ 419 414 if (pos + left > inode->i_size) 420 - *checkeof = 1; 415 + *checkeof = CHECK_EOF; 421 416 } 422 417 423 418 dout("striped_read returns %d\n", ret); ··· 603 598 snapc = ci->i_snap_realm->cached_context; 604 599 vino = ceph_vino(inode); 605 600 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 606 - vino, pos, &len, 601 + vino, pos, &len, 0, 607 602 2,/*include a 'startsync' command*/ 608 603 CEPH_OSD_OP_WRITE, flags, snapc, 609 604 ci->i_truncate_seq, ··· 613 608 ret = PTR_ERR(req); 614 609 break; 615 610 } 611 + 612 + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); 616 613 617 614 n = iov_iter_get_pages_alloc(from, &pages, len, &start); 618 615 if (unlikely(n < 0)) { ··· 720 713 snapc = ci->i_snap_realm->cached_context; 721 714 vino = ceph_vino(inode); 722 715 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 723 - vino, pos, &len, 1, 716 + vino, pos, &len, 0, 1, 724 717 CEPH_OSD_OP_WRITE, flags, snapc, 725 718 ci->i_truncate_seq, 726 719 ci->i_truncate_size, ··· 810 803 size_t len = iocb->ki_nbytes; 811 804 struct inode *inode = file_inode(filp); 812 805 struct ceph_inode_info *ci = ceph_inode(inode); 806 + struct page *pinned_page = NULL; 813 807 ssize_t ret; 814 808 int want, got = 0; 815 - int checkeof = 0, read = 0; 809 + int retry_op = 0, read = 0; 816 810 817 811 again: 818 812 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", ··· 823 815 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; 824 816 else 825 817 want = CEPH_CAP_FILE_CACHE; 826 - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); 818 + ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, 
&pinned_page); 827 819 if (ret < 0) 828 820 return ret; 829 821 ··· 835 827 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, 836 828 ceph_cap_string(got)); 837 829 838 - /* hmm, this isn't really async... */ 839 - ret = ceph_sync_read(iocb, to, &checkeof); 830 + if (ci->i_inline_version == CEPH_INLINE_NONE) { 831 + /* hmm, this isn't really async... */ 832 + ret = ceph_sync_read(iocb, to, &retry_op); 833 + } else { 834 + retry_op = READ_INLINE; 835 + } 840 836 } else { 841 837 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", 842 838 inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, ··· 850 838 } 851 839 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", 852 840 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); 841 + if (pinned_page) { 842 + page_cache_release(pinned_page); 843 + pinned_page = NULL; 844 + } 853 845 ceph_put_cap_refs(ci, got); 846 + if (retry_op && ret >= 0) { 847 + int statret; 848 + struct page *page = NULL; 849 + loff_t i_size; 850 + if (retry_op == READ_INLINE) { 851 + page = __page_cache_alloc(GFP_NOFS); 852 + if (!page) 853 + return -ENOMEM; 854 + } 854 855 855 - if (checkeof && ret >= 0) { 856 - int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); 856 + statret = __ceph_do_getattr(inode, page, 857 + CEPH_STAT_CAP_INLINE_DATA, !!page); 858 + if (statret < 0) { 859 + __free_page(page); 860 + if (statret == -ENODATA) { 861 + BUG_ON(retry_op != READ_INLINE); 862 + goto again; 863 + } 864 + return statret; 865 + } 866 + 867 + i_size = i_size_read(inode); 868 + if (retry_op == READ_INLINE) { 869 + /* does not support inline data > PAGE_SIZE */ 870 + if (i_size > PAGE_CACHE_SIZE) { 871 + ret = -EIO; 872 + } else if (iocb->ki_pos < i_size) { 873 + loff_t end = min_t(loff_t, i_size, 874 + iocb->ki_pos + len); 875 + if (statret < end) 876 + zero_user_segment(page, statret, end); 877 + ret = copy_page_to_iter(page, 878 + iocb->ki_pos & ~PAGE_MASK, 879 + end - iocb->ki_pos, to); 880 + iocb->ki_pos += 
ret; 881 + } else { 882 + ret = 0; 883 + } 884 + __free_pages(page, 0); 885 + return ret; 886 + } 857 887 858 888 /* hit EOF or hole? */ 859 - if (statret == 0 && iocb->ki_pos < inode->i_size && 889 + if (retry_op == CHECK_EOF && iocb->ki_pos < i_size && 860 890 ret < len) { 861 891 dout("sync_read hit hole, ppos %lld < size %lld" 862 892 ", reading more\n", iocb->ki_pos, ··· 906 852 907 853 read += ret; 908 854 len -= ret; 909 - checkeof = 0; 855 + retry_op = 0; 910 856 goto again; 911 857 } 912 858 } ··· 963 909 if (err) 964 910 goto out; 965 911 912 + if (ci->i_inline_version != CEPH_INLINE_NONE) { 913 + err = ceph_uninline_data(file, NULL); 914 + if (err < 0) 915 + goto out; 916 + } 917 + 966 918 retry_snap: 967 919 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) { 968 920 err = -ENOSPC; ··· 982 922 else 983 923 want = CEPH_CAP_FILE_BUFFER; 984 924 got = 0; 985 - err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, pos + count); 925 + err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, pos + count, 926 + &got, NULL); 986 927 if (err < 0) 987 928 goto out; 988 929 ··· 1030 969 if (written >= 0) { 1031 970 int dirty; 1032 971 spin_lock(&ci->i_ceph_lock); 972 + ci->i_inline_version = CEPH_INLINE_NONE; 1033 973 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); 1034 974 spin_unlock(&ci->i_ceph_lock); 1035 975 if (dirty) ··· 1173 1111 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 1174 1112 ceph_vino(inode), 1175 1113 offset, length, 1176 - 1, op, 1114 + 0, 1, op, 1177 1115 CEPH_OSD_FLAG_WRITE | 1178 1116 CEPH_OSD_FLAG_ONDISK, 1179 1117 NULL, 0, 0, false); ··· 1276 1214 goto unlock; 1277 1215 } 1278 1216 1217 + if (ci->i_inline_version != CEPH_INLINE_NONE) { 1218 + ret = ceph_uninline_data(file, NULL); 1219 + if (ret < 0) 1220 + goto unlock; 1221 + } 1222 + 1279 1223 size = i_size_read(inode); 1280 1224 if (!(mode & FALLOC_FL_KEEP_SIZE)) 1281 1225 endoff = offset + length; ··· 1291 1223 else 1292 1224 want = CEPH_CAP_FILE_BUFFER; 1293 1225 1294 
- ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, &got, endoff); 1226 + ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); 1295 1227 if (ret < 0) 1296 1228 goto unlock; 1297 1229 ··· 1308 1240 1309 1241 if (!ret) { 1310 1242 spin_lock(&ci->i_ceph_lock); 1243 + ci->i_inline_version = CEPH_INLINE_NONE; 1311 1244 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); 1312 1245 spin_unlock(&ci->i_ceph_lock); 1313 1246 if (dirty)
+47 -12
fs/ceph/inode.c
··· 387 387 spin_lock_init(&ci->i_ceph_lock); 388 388 389 389 ci->i_version = 0; 390 + ci->i_inline_version = 0; 390 391 ci->i_time_warp_seq = 0; 391 392 ci->i_ceph_flags = 0; 393 + ci->i_ordered_count = 0; 392 394 atomic_set(&ci->i_release_count, 1); 393 395 atomic_set(&ci->i_complete_count, 0); 394 396 ci->i_symlink = NULL; ··· 659 657 * Populate an inode based on info from mds. May be called on new or 660 658 * existing inodes. 661 659 */ 662 - static int fill_inode(struct inode *inode, 660 + static int fill_inode(struct inode *inode, struct page *locked_page, 663 661 struct ceph_mds_reply_info_in *iinfo, 664 662 struct ceph_mds_reply_dirfrag *dirinfo, 665 663 struct ceph_mds_session *session, ··· 677 675 bool wake = false; 678 676 bool queue_trunc = false; 679 677 bool new_version = false; 678 + bool fill_inline = false; 680 679 681 680 dout("fill_inode %p ino %llx.%llx v %llu had %llu\n", 682 681 inode, ceph_vinop(inode), le64_to_cpu(info->version), ··· 848 845 (issued & CEPH_CAP_FILE_EXCL) == 0 && 849 846 !__ceph_dir_is_complete(ci)) { 850 847 dout(" marking %p complete (empty)\n", inode); 851 - __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count)); 848 + __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count), 849 + ci->i_ordered_count); 852 850 } 853 851 854 852 /* were we issued a capability? 
*/ ··· 877 873 ceph_vinop(inode)); 878 874 __ceph_get_fmode(ci, cap_fmode); 879 875 } 876 + 877 + if (iinfo->inline_version > 0 && 878 + iinfo->inline_version >= ci->i_inline_version) { 879 + int cache_caps = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; 880 + ci->i_inline_version = iinfo->inline_version; 881 + if (ci->i_inline_version != CEPH_INLINE_NONE && 882 + (locked_page || 883 + (le32_to_cpu(info->cap.caps) & cache_caps))) 884 + fill_inline = true; 885 + } 886 + 880 887 spin_unlock(&ci->i_ceph_lock); 888 + 889 + if (fill_inline) 890 + ceph_fill_inline_data(inode, locked_page, 891 + iinfo->inline_data, iinfo->inline_len); 881 892 882 893 if (wake) 883 894 wake_up_all(&ci->i_cap_wq); ··· 1081 1062 struct inode *dir = req->r_locked_dir; 1082 1063 1083 1064 if (dir) { 1084 - err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag, 1065 + err = fill_inode(dir, NULL, 1066 + &rinfo->diri, rinfo->dirfrag, 1085 1067 session, req->r_request_started, -1, 1086 1068 &req->r_caps_reservation); 1087 1069 if (err < 0) ··· 1152 1132 } 1153 1133 req->r_target_inode = in; 1154 1134 1155 - err = fill_inode(in, &rinfo->targeti, NULL, 1135 + err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL, 1156 1136 session, req->r_request_started, 1157 1137 (!req->r_aborted && rinfo->head->result == 0) ? 
1158 1138 req->r_fmode : -1, ··· 1224 1204 ceph_invalidate_dentry_lease(dn); 1225 1205 1226 1206 /* d_move screws up sibling dentries' offsets */ 1227 - ceph_dir_clear_complete(dir); 1228 - ceph_dir_clear_complete(olddir); 1207 + ceph_dir_clear_ordered(dir); 1208 + ceph_dir_clear_ordered(olddir); 1229 1209 1230 1210 dout("dn %p gets new offset %lld\n", req->r_old_dentry, 1231 1211 ceph_dentry(req->r_old_dentry)->offset); ··· 1237 1217 if (!rinfo->head->is_target) { 1238 1218 dout("fill_trace null dentry\n"); 1239 1219 if (dn->d_inode) { 1220 + ceph_dir_clear_ordered(dir); 1240 1221 dout("d_delete %p\n", dn); 1241 1222 d_delete(dn); 1242 1223 } else { ··· 1254 1233 1255 1234 /* attach proper inode */ 1256 1235 if (!dn->d_inode) { 1257 - ceph_dir_clear_complete(dir); 1236 + ceph_dir_clear_ordered(dir); 1258 1237 ihold(in); 1259 1238 dn = splice_dentry(dn, in, &have_lease); 1260 1239 if (IS_ERR(dn)) { ··· 1284 1263 BUG_ON(!dir); 1285 1264 BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR); 1286 1265 dout(" linking snapped dir %p to dn %p\n", in, dn); 1287 - ceph_dir_clear_complete(dir); 1266 + ceph_dir_clear_ordered(dir); 1288 1267 ihold(in); 1289 1268 dn = splice_dentry(dn, in, NULL); 1290 1269 if (IS_ERR(dn)) { ··· 1321 1300 dout("new_inode badness got %d\n", err); 1322 1301 continue; 1323 1302 } 1324 - rc = fill_inode(in, &rinfo->dir_in[i], NULL, session, 1303 + rc = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, 1325 1304 req->r_request_started, -1, 1326 1305 &req->r_caps_reservation); 1327 1306 if (rc < 0) { ··· 1437 1416 } 1438 1417 } 1439 1418 1440 - if (fill_inode(in, &rinfo->dir_in[i], NULL, session, 1419 + if (fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, 1441 1420 req->r_request_started, -1, 1442 1421 &req->r_caps_reservation) < 0) { 1443 1422 pr_err("fill_inode badness on %p\n", in); ··· 1920 1899 * Verify that we have a lease on the given mask. If not, 1921 1900 * do a getattr against an mds. 
1922 1901 */ 1923 - int ceph_do_getattr(struct inode *inode, int mask, bool force) 1902 + int __ceph_do_getattr(struct inode *inode, struct page *locked_page, 1903 + int mask, bool force) 1924 1904 { 1925 1905 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); 1926 1906 struct ceph_mds_client *mdsc = fsc->mdsc; ··· 1933 1911 return 0; 1934 1912 } 1935 1913 1936 - dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode); 1914 + dout("do_getattr inode %p mask %s mode 0%o\n", 1915 + inode, ceph_cap_string(mask), inode->i_mode); 1937 1916 if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) 1938 1917 return 0; 1939 1918 ··· 1945 1922 ihold(inode); 1946 1923 req->r_num_caps = 1; 1947 1924 req->r_args.getattr.mask = cpu_to_le32(mask); 1925 + req->r_locked_page = locked_page; 1948 1926 err = ceph_mdsc_do_request(mdsc, NULL, req); 1927 + if (locked_page && err == 0) { 1928 + u64 inline_version = req->r_reply_info.targeti.inline_version; 1929 + if (inline_version == 0) { 1930 + /* the reply is supposed to contain inline data */ 1931 + err = -EINVAL; 1932 + } else if (inline_version == CEPH_INLINE_NONE) { 1933 + err = -ENODATA; 1934 + } else { 1935 + err = req->r_reply_info.targeti.inline_len; 1936 + } 1937 + } 1949 1938 ceph_mdsc_put_request(req); 1950 1939 dout("do_getattr result=%d\n", err); 1951 1940 return err;
+54 -10
fs/ceph/locks.c
··· 9 9 #include <linux/ceph/pagelist.h> 10 10 11 11 static u64 lock_secret; 12 + static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, 13 + struct ceph_mds_request *req); 12 14 13 15 static inline u64 secure_addr(void *addr) 14 16 { ··· 42 40 u64 length = 0; 43 41 u64 owner; 44 42 43 + if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK) 44 + wait = 0; 45 + 45 46 req = ceph_mdsc_create_request(mdsc, operation, USE_AUTH_MDS); 46 47 if (IS_ERR(req)) 47 48 return PTR_ERR(req); ··· 73 68 req->r_args.filelock_change.length = cpu_to_le64(length); 74 69 req->r_args.filelock_change.wait = wait; 75 70 71 + if (wait) 72 + req->r_wait_for_completion = ceph_lock_wait_for_completion; 73 + 76 74 err = ceph_mdsc_do_request(mdsc, inode, req); 77 75 78 76 if (operation == CEPH_MDS_OP_GETFILELOCK) { ··· 102 94 (int)operation, (u64)fl->fl_pid, fl->fl_start, 103 95 length, wait, fl->fl_type, err); 104 96 return err; 97 + } 98 + 99 + static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, 100 + struct ceph_mds_request *req) 101 + { 102 + struct ceph_mds_request *intr_req; 103 + struct inode *inode = req->r_inode; 104 + int err, lock_type; 105 + 106 + BUG_ON(req->r_op != CEPH_MDS_OP_SETFILELOCK); 107 + if (req->r_args.filelock_change.rule == CEPH_LOCK_FCNTL) 108 + lock_type = CEPH_LOCK_FCNTL_INTR; 109 + else if (req->r_args.filelock_change.rule == CEPH_LOCK_FLOCK) 110 + lock_type = CEPH_LOCK_FLOCK_INTR; 111 + else 112 + BUG_ON(1); 113 + BUG_ON(req->r_args.filelock_change.type == CEPH_LOCK_UNLOCK); 114 + 115 + err = wait_for_completion_interruptible(&req->r_completion); 116 + if (!err) 117 + return 0; 118 + 119 + dout("ceph_lock_wait_for_completion: request %llu was interrupted\n", 120 + req->r_tid); 121 + 122 + intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK, 123 + USE_AUTH_MDS); 124 + if (IS_ERR(intr_req)) 125 + return PTR_ERR(intr_req); 126 + 127 + intr_req->r_inode = inode; 128 + ihold(inode); 129 + 
intr_req->r_num_caps = 1; 130 + 131 + intr_req->r_args.filelock_change = req->r_args.filelock_change; 132 + intr_req->r_args.filelock_change.rule = lock_type; 133 + intr_req->r_args.filelock_change.type = CEPH_LOCK_UNLOCK; 134 + 135 + err = ceph_mdsc_do_request(mdsc, inode, intr_req); 136 + ceph_mdsc_put_request(intr_req); 137 + 138 + if (err && err != -ERESTARTSYS) 139 + return err; 140 + 141 + wait_for_completion(&req->r_completion); 142 + return 0; 105 143 } 106 144 107 145 /** ··· 197 143 err); 198 144 } 199 145 } 200 - 201 - } else if (err == -ERESTARTSYS) { 202 - dout("undoing lock\n"); 203 - ceph_lock_message(CEPH_LOCK_FCNTL, op, file, 204 - CEPH_LOCK_UNLOCK, 0, fl); 205 146 } 206 147 return err; 207 148 } ··· 235 186 file, CEPH_LOCK_UNLOCK, 0, fl); 236 187 dout("got %d on flock_lock_file_wait, undid lock", err); 237 188 } 238 - } else if (err == -ERESTARTSYS) { 239 - dout("undoing lock\n"); 240 - ceph_lock_message(CEPH_LOCK_FLOCK, 241 - CEPH_MDS_OP_SETFILELOCK, 242 - file, CEPH_LOCK_UNLOCK, 0, fl); 243 189 } 244 190 return err; 245 191 }
+35 -6
fs/ceph/mds_client.c
··· 89 89 ceph_decode_need(p, end, info->xattr_len, bad); 90 90 info->xattr_data = *p; 91 91 *p += info->xattr_len; 92 + 93 + if (features & CEPH_FEATURE_MDS_INLINE_DATA) { 94 + ceph_decode_64_safe(p, end, info->inline_version, bad); 95 + ceph_decode_32_safe(p, end, info->inline_len, bad); 96 + ceph_decode_need(p, end, info->inline_len, bad); 97 + info->inline_data = *p; 98 + *p += info->inline_len; 99 + } else 100 + info->inline_version = CEPH_INLINE_NONE; 101 + 92 102 return 0; 93 103 bad: 94 104 return err; ··· 534 524 } 535 525 if (req->r_locked_dir) 536 526 ceph_put_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN); 537 - if (req->r_target_inode) 538 - iput(req->r_target_inode); 527 + iput(req->r_target_inode); 539 528 if (req->r_dentry) 540 529 dput(req->r_dentry); 541 530 if (req->r_old_dentry) ··· 870 861 /* 871 862 * Serialize client metadata into waiting buffer space, using 872 863 * the format that userspace expects for map<string, string> 864 + * 865 + * ClientSession messages with metadata are v2 873 866 */ 874 - msg->hdr.version = 2; /* ClientSession messages with metadata are v2 */ 867 + msg->hdr.version = cpu_to_le16(2); 868 + msg->hdr.compat_version = cpu_to_le16(1); 875 869 876 870 /* The write pointer, following the session_head structure */ 877 871 p = msg->front.iov_base + sizeof(*h); ··· 1078 1066 session->s_cap_iterator = NULL; 1079 1067 spin_unlock(&session->s_cap_lock); 1080 1068 1081 - if (last_inode) 1082 - iput(last_inode); 1069 + iput(last_inode); 1083 1070 if (old_cap) 1084 1071 ceph_put_cap(session->s_mdsc, old_cap); 1085 1072 ··· 1885 1874 goto out_free2; 1886 1875 } 1887 1876 1888 - msg->hdr.version = 2; 1877 + msg->hdr.version = cpu_to_le16(2); 1889 1878 msg->hdr.tid = cpu_to_le64(req->r_tid); 1890 1879 1891 1880 head = msg->front.iov_base; ··· 2219 2208 &req->r_completion, req->r_timeout); 2220 2209 if (err == 0) 2221 2210 err = -EIO; 2211 + } else if (req->r_wait_for_completion) { 2212 + err = req->r_wait_for_completion(mdsc, 
req); 2222 2213 } else { 2223 2214 err = wait_for_completion_killable(&req->r_completion); 2224 2215 } ··· 3757 3744 return msg; 3758 3745 } 3759 3746 3747 + static int sign_message(struct ceph_connection *con, struct ceph_msg *msg) 3748 + { 3749 + struct ceph_mds_session *s = con->private; 3750 + struct ceph_auth_handshake *auth = &s->s_auth; 3751 + return ceph_auth_sign_message(auth, msg); 3752 + } 3753 + 3754 + static int check_message_signature(struct ceph_connection *con, struct ceph_msg *msg) 3755 + { 3756 + struct ceph_mds_session *s = con->private; 3757 + struct ceph_auth_handshake *auth = &s->s_auth; 3758 + return ceph_auth_check_message_signature(auth, msg); 3759 + } 3760 + 3760 3761 static const struct ceph_connection_operations mds_con_ops = { 3761 3762 .get = con_get, 3762 3763 .put = con_put, ··· 3780 3753 .invalidate_authorizer = invalidate_authorizer, 3781 3754 .peer_reset = peer_reset, 3782 3755 .alloc_msg = mds_alloc_msg, 3756 + .sign_message = sign_message, 3757 + .check_message_signature = check_message_signature, 3783 3758 }; 3784 3759 3785 3760 /* eof */
+10
fs/ceph/mds_client.h
··· 41 41 char *symlink; 42 42 u32 xattr_len; 43 43 char *xattr_data; 44 + u64 inline_version; 45 + u32 inline_len; 46 + char *inline_data; 44 47 }; 45 48 46 49 /* ··· 169 166 */ 170 167 typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc, 171 168 struct ceph_mds_request *req); 169 + /* 170 + * wait for request completion callback 171 + */ 172 + typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc, 173 + struct ceph_mds_request *req); 172 174 173 175 /* 174 176 * an in-flight mds request ··· 223 215 int r_request_release_offset; 224 216 struct ceph_msg *r_reply; 225 217 struct ceph_mds_reply_info_parsed r_reply_info; 218 + struct page *r_locked_page; 226 219 int r_err; 227 220 bool r_aborted; 228 221 ··· 248 239 struct completion r_completion; 249 240 struct completion r_safe_completion; 250 241 ceph_mds_request_callback_t r_callback; 242 + ceph_mds_request_wait_callback_t r_wait_for_completion; 251 243 struct list_head r_unsafe_item; /* per-session unsafe list item */ 252 244 bool r_got_unsafe, r_got_safe, r_got_result; 253 245
+30 -7
fs/ceph/snap.c
··· 288 288 return 0; 289 289 } 290 290 291 + 292 + static struct ceph_snap_context *empty_snapc; 293 + 291 294 /* 292 295 * build the snap context for a given realm. 293 296 */ ··· 331 328 return 0; 332 329 } 333 330 331 + if (num == 0 && realm->seq == empty_snapc->seq) { 332 + ceph_get_snap_context(empty_snapc); 333 + snapc = empty_snapc; 334 + goto done; 335 + } 336 + 334 337 /* alloc new snap context */ 335 338 err = -ENOMEM; 336 339 if (num > (SIZE_MAX - sizeof(*snapc)) / sizeof(u64)) ··· 374 365 realm->ino, realm, snapc, snapc->seq, 375 366 (unsigned int) snapc->num_snaps); 376 367 377 - if (realm->cached_context) 378 - ceph_put_snap_context(realm->cached_context); 368 + done: 369 + ceph_put_snap_context(realm->cached_context); 379 370 realm->cached_context = snapc; 380 371 return 0; 381 372 ··· 475 466 cap_snap. lucky us. */ 476 467 dout("queue_cap_snap %p already pending\n", inode); 477 468 kfree(capsnap); 469 + } else if (ci->i_snap_realm->cached_context == empty_snapc) { 470 + dout("queue_cap_snap %p empty snapc\n", inode); 471 + kfree(capsnap); 478 472 } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| 479 473 CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) { 480 474 struct ceph_snap_context *snapc = ci->i_head_snapc; ··· 515 503 capsnap->xattr_blob = NULL; 516 504 capsnap->xattr_version = 0; 517 505 } 506 + 507 + capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; 518 508 519 509 /* dirty page count moved from _head to this cap_snap; 520 510 all subsequent writes page dirties occur _after_ this ··· 604 590 if (!inode) 605 591 continue; 606 592 spin_unlock(&realm->inodes_with_caps_lock); 607 - if (lastinode) 608 - iput(lastinode); 593 + iput(lastinode); 609 594 lastinode = inode; 610 595 ceph_queue_cap_snap(ci); 611 596 spin_lock(&realm->inodes_with_caps_lock); 612 597 } 613 598 spin_unlock(&realm->inodes_with_caps_lock); 614 - if (lastinode) 615 - iput(lastinode); 599 + iput(lastinode); 616 600 617 601 list_for_each_entry(child, 
&realm->children, child_item) { 618 602 dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n", ··· 940 928 return; 941 929 } 942 930 931 + int __init ceph_snap_init(void) 932 + { 933 + empty_snapc = ceph_create_snap_context(0, GFP_NOFS); 934 + if (!empty_snapc) 935 + return -ENOMEM; 936 + empty_snapc->seq = 1; 937 + return 0; 938 + } 943 939 944 - 940 + void ceph_snap_exit(void) 941 + { 942 + ceph_put_snap_context(empty_snapc); 943 + }
+10 -6
fs/ceph/super.c
··· 515 515 struct ceph_fs_client *fsc; 516 516 const u64 supported_features = 517 517 CEPH_FEATURE_FLOCK | 518 - CEPH_FEATURE_DIRLAYOUTHASH; 518 + CEPH_FEATURE_DIRLAYOUTHASH | 519 + CEPH_FEATURE_MDS_INLINE_DATA; 519 520 const u64 required_features = 0; 520 521 int page_count; 521 522 size_t size; ··· 1018 1017 }; 1019 1018 MODULE_ALIAS_FS("ceph"); 1020 1019 1021 - #define _STRINGIFY(x) #x 1022 - #define STRINGIFY(x) _STRINGIFY(x) 1023 - 1024 1020 static int __init init_ceph(void) 1025 1021 { 1026 1022 int ret = init_caches(); ··· 1026 1028 1027 1029 ceph_flock_init(); 1028 1030 ceph_xattr_init(); 1031 + ret = ceph_snap_init(); 1032 + if (ret) 1033 + goto out_xattr; 1029 1034 ret = register_filesystem(&ceph_fs_type); 1030 1035 if (ret) 1031 - goto out_icache; 1036 + goto out_snap; 1032 1037 1033 1038 pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL); 1034 1039 1035 1040 return 0; 1036 1041 1037 - out_icache: 1042 + out_snap: 1043 + ceph_snap_exit(); 1044 + out_xattr: 1038 1045 ceph_xattr_exit(); 1039 1046 destroy_caches(); 1040 1047 out: ··· 1050 1047 { 1051 1048 dout("exit_ceph\n"); 1052 1049 unregister_filesystem(&ceph_fs_type); 1050 + ceph_snap_exit(); 1053 1051 ceph_xattr_exit(); 1054 1052 destroy_caches(); 1055 1053 }
+46 -9
fs/ceph/super.h
··· 161 161 u64 time_warp_seq; 162 162 int writing; /* a sync write is still in progress */ 163 163 int dirty_pages; /* dirty pages awaiting writeback */ 164 + bool inline_data; 164 165 }; 165 166 166 167 static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) ··· 254 253 spinlock_t i_ceph_lock; 255 254 256 255 u64 i_version; 256 + u64 i_inline_version; 257 257 u32 i_time_warp_seq; 258 258 259 259 unsigned i_ceph_flags; 260 + int i_ordered_count; 260 261 atomic_t i_release_count; 261 262 atomic_t i_complete_count; 262 263 ··· 437 434 /* 438 435 * Ceph inode. 439 436 */ 440 - #define CEPH_I_NODELAY 4 /* do not delay cap release */ 441 - #define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ 442 - #define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ 437 + #define CEPH_I_DIR_ORDERED 1 /* dentries in dir are ordered */ 438 + #define CEPH_I_NODELAY 4 /* do not delay cap release */ 439 + #define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ 440 + #define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ 443 441 444 442 static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, 445 - int release_count) 443 + int release_count, int ordered_count) 446 444 { 447 445 atomic_set(&ci->i_complete_count, release_count); 446 + if (ci->i_ordered_count == ordered_count) 447 + ci->i_ceph_flags |= CEPH_I_DIR_ORDERED; 448 + else 449 + ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED; 448 450 } 449 451 450 452 static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci) ··· 463 455 atomic_read(&ci->i_release_count); 464 456 } 465 457 458 + static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci) 459 + { 460 + return __ceph_dir_is_complete(ci) && 461 + (ci->i_ceph_flags & CEPH_I_DIR_ORDERED); 462 + } 463 + 466 464 static inline void ceph_dir_clear_complete(struct inode *inode) 467 465 { 468 466 __ceph_dir_clear_complete(ceph_inode(inode)); 469 467 } 470 468 471 - static inline bool ceph_dir_is_complete(struct inode 
*inode) 469 + static inline void ceph_dir_clear_ordered(struct inode *inode) 472 470 { 473 - return __ceph_dir_is_complete(ceph_inode(inode)); 471 + struct ceph_inode_info *ci = ceph_inode(inode); 472 + spin_lock(&ci->i_ceph_lock); 473 + ci->i_ordered_count++; 474 + ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED; 475 + spin_unlock(&ci->i_ceph_lock); 474 476 } 475 477 478 + static inline bool ceph_dir_is_complete_ordered(struct inode *inode) 479 + { 480 + struct ceph_inode_info *ci = ceph_inode(inode); 481 + bool ret; 482 + spin_lock(&ci->i_ceph_lock); 483 + ret = __ceph_dir_is_complete_ordered(ci); 484 + spin_unlock(&ci->i_ceph_lock); 485 + return ret; 486 + } 476 487 477 488 /* find a specific frag @f */ 478 489 extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, ··· 607 580 char *last_name; /* last entry in previous chunk */ 608 581 struct dentry *dentry; /* next dentry (for dcache readdir) */ 609 582 int dir_release_count; 583 + int dir_ordered_count; 610 584 611 585 /* used for -o dirstat read() on directory thing */ 612 586 char *dir_info; ··· 701 673 extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, 702 674 struct ceph_cap_snap *capsnap); 703 675 extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); 676 + extern int ceph_snap_init(void); 677 + extern void ceph_snap_exit(void); 704 678 705 679 /* 706 680 * a cap_snap is "pending" if it is still awaiting an in-progress ··· 745 715 extern void ceph_queue_invalidate(struct inode *inode); 746 716 extern void ceph_queue_writeback(struct inode *inode); 747 717 748 - extern int ceph_do_getattr(struct inode *inode, int mask, bool force); 718 + extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page, 719 + int mask, bool force); 720 + static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) 721 + { 722 + return __ceph_do_getattr(inode, NULL, mask, force); 723 + } 749 724 extern int ceph_permission(struct inode *inode, int mask); 750 725 
extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); 751 726 extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, ··· 865 830 int mds, int drop, int unless); 866 831 867 832 extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, 868 - int *got, loff_t endoff); 833 + loff_t endoff, int *got, struct page **pinned_page); 869 834 870 835 /* for counting open files by mode */ 871 836 static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode) ··· 887 852 struct file *file, unsigned flags, umode_t mode, 888 853 int *opened); 889 854 extern int ceph_release(struct inode *inode, struct file *filp); 890 - 855 + extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, 856 + char *data, size_t len); 857 + int ceph_uninline_data(struct file *filp, struct page *locked_page); 891 858 /* dir.c */ 892 859 extern const struct file_operations ceph_dir_fops; 893 860 extern const struct inode_operations ceph_dir_iops;
+10
fs/ceph/super.h.rej
··· 1 + --- fs/ceph/super.h 2 + +++ fs/ceph/super.h 3 + @@ -254,6 +255,7 @@ 4 + spinlock_t i_ceph_lock; 5 + 6 + u64 i_version; 7 + + u64 i_inline_version; 8 + u32 i_time_warp_seq; 9 + 10 + unsigned i_ceph_flags;
+5 -2
fs/ceph/xattr.c
··· 854 854 struct ceph_pagelist *pagelist = NULL; 855 855 int err; 856 856 857 - if (value) { 857 + if (size > 0) { 858 858 /* copy value into pagelist */ 859 859 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); 860 860 if (!pagelist) ··· 864 864 err = ceph_pagelist_append(pagelist, value, size); 865 865 if (err) 866 866 goto out; 867 - } else { 867 + } else if (!value) { 868 868 flags |= CEPH_XATTR_REMOVE; 869 869 } 870 870 ··· 1000 1000 1001 1001 if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) 1002 1002 return generic_setxattr(dentry, name, value, size, flags); 1003 + 1004 + if (size == 0) 1005 + value = ""; /* empty EA, do not remove */ 1003 1006 1004 1007 return __ceph_setxattr(dentry, name, value, size, flags); 1005 1008 }
+26
include/linux/ceph/auth.h
··· 13 13 14 14 struct ceph_auth_client; 15 15 struct ceph_authorizer; 16 + struct ceph_msg; 16 17 17 18 struct ceph_auth_handshake { 18 19 struct ceph_authorizer *authorizer; ··· 21 20 size_t authorizer_buf_len; 22 21 void *authorizer_reply_buf; 23 22 size_t authorizer_reply_buf_len; 23 + int (*sign_message)(struct ceph_auth_handshake *auth, 24 + struct ceph_msg *msg); 25 + int (*check_message_signature)(struct ceph_auth_handshake *auth, 26 + struct ceph_msg *msg); 24 27 }; 25 28 26 29 struct ceph_auth_client_ops { ··· 71 66 void (*reset)(struct ceph_auth_client *ac); 72 67 73 68 void (*destroy)(struct ceph_auth_client *ac); 69 + 70 + int (*sign_message)(struct ceph_auth_handshake *auth, 71 + struct ceph_msg *msg); 72 + int (*check_message_signature)(struct ceph_auth_handshake *auth, 73 + struct ceph_msg *msg); 74 74 }; 75 75 76 76 struct ceph_auth_client { ··· 123 113 extern void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, 124 114 int peer_type); 125 115 116 + static inline int ceph_auth_sign_message(struct ceph_auth_handshake *auth, 117 + struct ceph_msg *msg) 118 + { 119 + if (auth->sign_message) 120 + return auth->sign_message(auth, msg); 121 + return 0; 122 + } 123 + 124 + static inline 125 + int ceph_auth_check_message_signature(struct ceph_auth_handshake *auth, 126 + struct ceph_msg *msg) 127 + { 128 + if (auth->check_message_signature) 129 + return auth->check_message_signature(auth, msg); 130 + return 0; 131 + } 126 132 #endif
+1 -2
include/linux/ceph/buffer.h
··· 10 10 /* 11 11 * a simple reference counted buffer. 12 12 * 13 - * use kmalloc for small sizes (<= one page), vmalloc for larger 14 - * sizes. 13 + * use kmalloc for smaller sizes, vmalloc for larger sizes. 15 14 */ 16 15 struct ceph_buffer { 17 16 struct kref kref;
+1
include/linux/ceph/ceph_features.h
··· 84 84 CEPH_FEATURE_PGPOOL3 | \ 85 85 CEPH_FEATURE_OSDENC | \ 86 86 CEPH_FEATURE_CRUSH_TUNABLES | \ 87 + CEPH_FEATURE_MSG_AUTH | \ 87 88 CEPH_FEATURE_CRUSH_TUNABLES2 | \ 88 89 CEPH_FEATURE_REPLY_CREATE_INODE | \ 89 90 CEPH_FEATURE_OSDHASHPSPOOL | \
+8 -2
include/linux/ceph/ceph_fs.h
··· 522 522 __le32 dist[]; 523 523 } __attribute__ ((packed)); 524 524 525 - #define CEPH_LOCK_FCNTL 1 526 - #define CEPH_LOCK_FLOCK 2 525 + #define CEPH_LOCK_FCNTL 1 526 + #define CEPH_LOCK_FLOCK 2 527 + #define CEPH_LOCK_FCNTL_INTR 3 528 + #define CEPH_LOCK_FLOCK_INTR 4 529 + 527 530 528 531 #define CEPH_LOCK_SHARED 1 529 532 #define CEPH_LOCK_EXCL 2 ··· 552 549 553 550 int ceph_flags_to_mode(int flags); 554 551 552 + #define CEPH_INLINE_NONE ((__u64)-1) 555 553 556 554 /* capability bits */ 557 555 #define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */ ··· 617 613 CEPH_CAP_LINK_SHARED | \ 618 614 CEPH_CAP_FILE_SHARED | \ 619 615 CEPH_CAP_XATTR_SHARED) 616 + #define CEPH_STAT_CAP_INLINE_DATA (CEPH_CAP_FILE_SHARED | \ 617 + CEPH_CAP_FILE_RD) 620 618 621 619 #define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \ 622 620 CEPH_CAP_LINK_SHARED | \
+1 -1
include/linux/ceph/libceph.h
··· 29 29 #define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */ 30 30 #define CEPH_OPT_MYIP (1<<2) /* specified my ip */ 31 31 #define CEPH_OPT_NOCRC (1<<3) /* no data crc on writes */ 32 + #define CEPH_OPT_NOMSGAUTH (1<<4) /* not require cephx message signature */ 32 33 33 34 #define CEPH_OPT_DEFAULT (0) 34 35 ··· 185 184 extern const char *ceph_msg_type_name(int type); 186 185 extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); 187 186 extern void *ceph_kvmalloc(size_t size, gfp_t flags); 188 - extern void ceph_kvfree(const void *ptr); 189 187 190 188 extern struct ceph_options *ceph_parse_options(char *options, 191 189 const char *dev_name, const char *dev_name_end,
+8 -1
include/linux/ceph/messenger.h
··· 42 42 struct ceph_msg * (*alloc_msg) (struct ceph_connection *con, 43 43 struct ceph_msg_header *hdr, 44 44 int *skip); 45 + int (*sign_message) (struct ceph_connection *con, struct ceph_msg *msg); 46 + 47 + int (*check_message_signature) (struct ceph_connection *con, 48 + struct ceph_msg *msg); 45 49 }; 46 50 47 51 /* use format string %s%d */ ··· 146 142 */ 147 143 struct ceph_msg { 148 144 struct ceph_msg_header hdr; /* header */ 149 - struct ceph_msg_footer footer; /* footer */ 145 + union { 146 + struct ceph_msg_footer footer; /* footer */ 147 + struct ceph_msg_footer_old old_footer; /* old format footer */ 148 + }; 150 149 struct kvec front; /* unaligned blobs of message */ 151 150 struct ceph_buffer *middle; 152 151
+10 -1
include/linux/ceph/msgr.h
··· 152 152 receiver: mask against ~PAGE_MASK */ 153 153 154 154 struct ceph_entity_name src; 155 - __le32 reserved; 155 + __le16 compat_version; 156 + __le16 reserved; 156 157 __le32 crc; /* header crc32c */ 157 158 } __attribute__ ((packed)); 158 159 ··· 165 164 /* 166 165 * follows data payload 167 166 */ 167 + struct ceph_msg_footer_old { 168 + __le32 front_crc, middle_crc, data_crc; 169 + __u8 flags; 170 + } __attribute__ ((packed)); 171 + 168 172 struct ceph_msg_footer { 169 173 __le32 front_crc, middle_crc, data_crc; 174 + // sig holds the 64 bits of the digital signature for the message PLR 175 + __le64 sig; 170 176 __u8 flags; 171 177 } __attribute__ ((packed)); 172 178 173 179 #define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */ 174 180 #define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */ 181 + #define CEPH_MSG_FOOTER_SIGNED (1<<2) /* msg was signed */ 175 182 176 183 177 184 #endif
+12 -1
include/linux/ceph/osd_client.h
··· 87 87 struct ceph_osd_data osd_data; 88 88 } extent; 89 89 struct { 90 + __le32 name_len; 91 + __le32 value_len; 92 + __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ 93 + __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ 94 + struct ceph_osd_data osd_data; 95 + } xattr; 96 + struct { 90 97 const char *class_name; 91 98 const char *method_name; 92 99 struct ceph_osd_data request_info; ··· 302 295 extern void osd_req_op_cls_init(struct ceph_osd_request *osd_req, 303 296 unsigned int which, u16 opcode, 304 297 const char *class, const char *method); 298 + extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, 299 + u16 opcode, const char *name, const void *value, 300 + size_t size, u8 cmp_op, u8 cmp_mode); 305 301 extern void osd_req_op_watch_init(struct ceph_osd_request *osd_req, 306 302 unsigned int which, u16 opcode, 307 303 u64 cookie, u64 version, int flag); ··· 328 318 struct ceph_file_layout *layout, 329 319 struct ceph_vino vino, 330 320 u64 offset, u64 *len, 331 - int num_ops, int opcode, int flags, 321 + unsigned int which, int num_ops, 322 + int opcode, int flags, 332 323 struct ceph_snap_context *snapc, 333 324 u32 truncate_seq, u64 truncate_size, 334 325 bool use_mempool);
+3 -1
include/linux/ceph/pagelist.h
··· 1 1 #ifndef __FS_CEPH_PAGELIST_H 2 2 #define __FS_CEPH_PAGELIST_H 3 3 4 - #include <linux/list.h> 4 + #include <asm/byteorder.h> 5 5 #include <linux/atomic.h> 6 + #include <linux/list.h> 7 + #include <linux/types.h> 6 8 7 9 struct ceph_pagelist { 8 10 struct list_head head;
+69 -7
net/ceph/auth_x.c
··· 8 8 9 9 #include <linux/ceph/decode.h> 10 10 #include <linux/ceph/auth.h> 11 + #include <linux/ceph/messenger.h> 11 12 12 13 #include "crypto.h" 13 14 #include "auth_x.h" ··· 294 293 dout("build_authorizer for %s %p\n", 295 294 ceph_entity_type_name(th->service), au); 296 295 296 + ceph_crypto_key_destroy(&au->session_key); 297 + ret = ceph_crypto_key_clone(&au->session_key, &th->session_key); 298 + if (ret) 299 + return ret; 300 + 297 301 maxlen = sizeof(*msg_a) + sizeof(msg_b) + 298 302 ceph_x_encrypt_buflen(ticket_blob_len); 299 303 dout(" need len %d\n", maxlen); ··· 308 302 } 309 303 if (!au->buf) { 310 304 au->buf = ceph_buffer_new(maxlen, GFP_NOFS); 311 - if (!au->buf) 305 + if (!au->buf) { 306 + ceph_crypto_key_destroy(&au->session_key); 312 307 return -ENOMEM; 308 + } 313 309 } 314 310 au->service = th->service; 315 311 au->secret_id = th->secret_id; ··· 337 329 get_random_bytes(&au->nonce, sizeof(au->nonce)); 338 330 msg_b.struct_v = 1; 339 331 msg_b.nonce = cpu_to_le64(au->nonce); 340 - ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b), 332 + ret = ceph_x_encrypt(&au->session_key, &msg_b, sizeof(msg_b), 341 333 p, end - p); 342 334 if (ret < 0) 343 335 goto out_buf; ··· 568 560 auth->authorizer_buf_len = au->buf->vec.iov_len; 569 561 auth->authorizer_reply_buf = au->reply_buf; 570 562 auth->authorizer_reply_buf_len = sizeof (au->reply_buf); 563 + auth->sign_message = ac->ops->sign_message; 564 + auth->check_message_signature = ac->ops->check_message_signature; 571 565 572 566 return 0; 573 567 } ··· 598 588 struct ceph_authorizer *a, size_t len) 599 589 { 600 590 struct ceph_x_authorizer *au = (void *)a; 601 - struct ceph_x_ticket_handler *th; 602 591 int ret = 0; 603 592 struct ceph_x_authorize_reply reply; 604 593 void *preply = &reply; 605 594 void *p = au->reply_buf; 606 595 void *end = p + sizeof(au->reply_buf); 607 596 608 - th = get_ticket_handler(ac, au->service); 609 - if (IS_ERR(th)) 610 - return PTR_ERR(th); 611 - ret = 
ceph_x_decrypt(&th->session_key, &p, end, &preply, sizeof(reply)); 597 + ret = ceph_x_decrypt(&au->session_key, &p, end, &preply, sizeof(reply)); 612 598 if (ret < 0) 613 599 return ret; 614 600 if (ret != sizeof(reply)) ··· 624 618 { 625 619 struct ceph_x_authorizer *au = (void *)a; 626 620 621 + ceph_crypto_key_destroy(&au->session_key); 627 622 ceph_buffer_put(au->buf); 628 623 kfree(au); 629 624 } ··· 670 663 memset(&th->validity, 0, sizeof(th->validity)); 671 664 } 672 665 666 + static int calcu_signature(struct ceph_x_authorizer *au, 667 + struct ceph_msg *msg, __le64 *sig) 668 + { 669 + int ret; 670 + char tmp_enc[40]; 671 + __le32 tmp[5] = { 672 + 16u, msg->hdr.crc, msg->footer.front_crc, 673 + msg->footer.middle_crc, msg->footer.data_crc, 674 + }; 675 + ret = ceph_x_encrypt(&au->session_key, &tmp, sizeof(tmp), 676 + tmp_enc, sizeof(tmp_enc)); 677 + if (ret < 0) 678 + return ret; 679 + *sig = *(__le64*)(tmp_enc + 4); 680 + return 0; 681 + } 682 + 683 + static int ceph_x_sign_message(struct ceph_auth_handshake *auth, 684 + struct ceph_msg *msg) 685 + { 686 + int ret; 687 + if (!auth->authorizer) 688 + return 0; 689 + ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer, 690 + msg, &msg->footer.sig); 691 + if (ret < 0) 692 + return ret; 693 + msg->footer.flags |= CEPH_MSG_FOOTER_SIGNED; 694 + return 0; 695 + } 696 + 697 + static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth, 698 + struct ceph_msg *msg) 699 + { 700 + __le64 sig_check; 701 + int ret; 702 + 703 + if (!auth->authorizer) 704 + return 0; 705 + ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer, 706 + msg, &sig_check); 707 + if (ret < 0) 708 + return ret; 709 + if (sig_check == msg->footer.sig) 710 + return 0; 711 + if (msg->footer.flags & CEPH_MSG_FOOTER_SIGNED) 712 + dout("ceph_x_check_message_signature %p has signature %llx " 713 + "expect %llx\n", msg, msg->footer.sig, sig_check); 714 + else 715 + dout("ceph_x_check_message_signature %p sender 
did not set " 716 + "CEPH_MSG_FOOTER_SIGNED\n", msg); 717 + return -EBADMSG; 718 + } 673 719 674 720 static const struct ceph_auth_client_ops ceph_x_ops = { 675 721 .name = "x", ··· 737 677 .invalidate_authorizer = ceph_x_invalidate_authorizer, 738 678 .reset = ceph_x_reset, 739 679 .destroy = ceph_x_destroy, 680 + .sign_message = ceph_x_sign_message, 681 + .check_message_signature = ceph_x_check_message_signature, 740 682 }; 741 683 742 684
+1
net/ceph/auth_x.h
··· 26 26 27 27 28 28 struct ceph_x_authorizer { 29 + struct ceph_crypto_key session_key; 29 30 struct ceph_buffer *buf; 30 31 unsigned int service; 31 32 u64 nonce;
+2 -2
net/ceph/buffer.c
··· 6 6 7 7 #include <linux/ceph/buffer.h> 8 8 #include <linux/ceph/decode.h> 9 - #include <linux/ceph/libceph.h> /* for ceph_kv{malloc,free} */ 9 + #include <linux/ceph/libceph.h> /* for ceph_kvmalloc */ 10 10 11 11 struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) 12 12 { ··· 35 35 struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref); 36 36 37 37 dout("buffer_release %p\n", b); 38 - ceph_kvfree(b->vec.iov_base); 38 + kvfree(b->vec.iov_base); 39 39 kfree(b); 40 40 } 41 41 EXPORT_SYMBOL(ceph_buffer_release);
+13 -8
net/ceph/ceph_common.c
··· 184 184 return __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); 185 185 } 186 186 187 - void ceph_kvfree(const void *ptr) 188 - { 189 - if (is_vmalloc_addr(ptr)) 190 - vfree(ptr); 191 - else 192 - kfree(ptr); 193 - } 194 - 195 187 196 188 static int parse_fsid(const char *str, struct ceph_fsid *fsid) 197 189 { ··· 237 245 Opt_noshare, 238 246 Opt_crc, 239 247 Opt_nocrc, 248 + Opt_cephx_require_signatures, 249 + Opt_nocephx_require_signatures, 240 250 }; 241 251 242 252 static match_table_t opt_tokens = { ··· 257 263 {Opt_noshare, "noshare"}, 258 264 {Opt_crc, "crc"}, 259 265 {Opt_nocrc, "nocrc"}, 266 + {Opt_cephx_require_signatures, "cephx_require_signatures"}, 267 + {Opt_nocephx_require_signatures, "nocephx_require_signatures"}, 260 268 {-1, NULL} 261 269 }; 262 270 ··· 457 461 case Opt_nocrc: 458 462 opt->flags |= CEPH_OPT_NOCRC; 459 463 break; 464 + case Opt_cephx_require_signatures: 465 + opt->flags &= ~CEPH_OPT_NOMSGAUTH; 466 + break; 467 + case Opt_nocephx_require_signatures: 468 + opt->flags |= CEPH_OPT_NOMSGAUTH; 469 + break; 460 470 461 471 default: 462 472 BUG_ON(token); ··· 505 503 mutex_init(&client->mount_mutex); 506 504 init_waitqueue_head(&client->auth_wq); 507 505 client->auth_err = 0; 506 + 507 + if (!ceph_test_opt(client, NOMSGAUTH)) 508 + required_features |= CEPH_FEATURE_MSG_AUTH; 508 509 509 510 client->extra_mon_dispatch = NULL; 510 511 client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT |
+30 -4
net/ceph/messenger.c
··· 1196 1196 dout("prepare_write_message_footer %p\n", con); 1197 1197 con->out_kvec_is_msg = true; 1198 1198 con->out_kvec[v].iov_base = &m->footer; 1199 - con->out_kvec[v].iov_len = sizeof(m->footer); 1200 - con->out_kvec_bytes += sizeof(m->footer); 1199 + if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { 1200 + if (con->ops->sign_message) 1201 + con->ops->sign_message(con, m); 1202 + else 1203 + m->footer.sig = 0; 1204 + con->out_kvec[v].iov_len = sizeof(m->footer); 1205 + con->out_kvec_bytes += sizeof(m->footer); 1206 + } else { 1207 + m->old_footer.flags = m->footer.flags; 1208 + con->out_kvec[v].iov_len = sizeof(m->old_footer); 1209 + con->out_kvec_bytes += sizeof(m->old_footer); 1210 + } 1201 1211 con->out_kvec_left++; 1202 1212 con->out_more = m->more_to_follow; 1203 1213 con->out_msg_done = true; ··· 2259 2249 int ret; 2260 2250 unsigned int front_len, middle_len, data_len; 2261 2251 bool do_datacrc = !con->msgr->nocrc; 2252 + bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH); 2262 2253 u64 seq; 2263 2254 u32 crc; 2264 2255 ··· 2372 2361 } 2373 2362 2374 2363 /* footer */ 2375 - size = sizeof (m->footer); 2364 + if (need_sign) 2365 + size = sizeof(m->footer); 2366 + else 2367 + size = sizeof(m->old_footer); 2368 + 2376 2369 end += size; 2377 2370 ret = read_partial(con, end, size, &m->footer); 2378 2371 if (ret <= 0) 2379 2372 return ret; 2373 + 2374 + if (!need_sign) { 2375 + m->footer.flags = m->old_footer.flags; 2376 + m->footer.sig = 0; 2377 + } 2380 2378 2381 2379 dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n", 2382 2380 m, front_len, m->footer.front_crc, middle_len, ··· 2407 2387 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) { 2408 2388 pr_err("read_partial_message %p data crc %u != exp. 
%u\n", m, 2409 2389 con->in_data_crc, le32_to_cpu(m->footer.data_crc)); 2390 + return -EBADMSG; 2391 + } 2392 + 2393 + if (need_sign && con->ops->check_message_signature && 2394 + con->ops->check_message_signature(con, m)) { 2395 + pr_err("read_partial_message %p signature check failed\n", m); 2410 2396 return -EBADMSG; 2411 2397 } 2412 2398 ··· 3314 3288 static void ceph_msg_free(struct ceph_msg *m) 3315 3289 { 3316 3290 dout("%s %p\n", __func__, m); 3317 - ceph_kvfree(m->front.iov_base); 3291 + kvfree(m->front.iov_base); 3318 3292 kmem_cache_free(ceph_msg_cache, m); 3319 3293 } 3320 3294
+88 -30
net/ceph/osd_client.c
··· 292 292 ceph_osd_data_release(&op->cls.request_data); 293 293 ceph_osd_data_release(&op->cls.response_data); 294 294 break; 295 + case CEPH_OSD_OP_SETXATTR: 296 + case CEPH_OSD_OP_CMPXATTR: 297 + ceph_osd_data_release(&op->xattr.osd_data); 298 + break; 295 299 default: 296 300 break; 297 301 } ··· 480 476 size_t payload_len = 0; 481 477 482 478 BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE && 483 - opcode != CEPH_OSD_OP_DELETE && opcode != CEPH_OSD_OP_ZERO && 484 - opcode != CEPH_OSD_OP_TRUNCATE); 479 + opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE); 485 480 486 481 op->extent.offset = offset; 487 482 op->extent.length = length; ··· 547 544 op->payload_len = payload_len; 548 545 } 549 546 EXPORT_SYMBOL(osd_req_op_cls_init); 547 + 548 + int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, 549 + u16 opcode, const char *name, const void *value, 550 + size_t size, u8 cmp_op, u8 cmp_mode) 551 + { 552 + struct ceph_osd_req_op *op = _osd_req_op_init(osd_req, which, opcode); 553 + struct ceph_pagelist *pagelist; 554 + size_t payload_len; 555 + 556 + BUG_ON(opcode != CEPH_OSD_OP_SETXATTR && opcode != CEPH_OSD_OP_CMPXATTR); 557 + 558 + pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); 559 + if (!pagelist) 560 + return -ENOMEM; 561 + 562 + ceph_pagelist_init(pagelist); 563 + 564 + payload_len = strlen(name); 565 + op->xattr.name_len = payload_len; 566 + ceph_pagelist_append(pagelist, name, payload_len); 567 + 568 + op->xattr.value_len = size; 569 + ceph_pagelist_append(pagelist, value, size); 570 + payload_len += size; 571 + 572 + op->xattr.cmp_op = cmp_op; 573 + op->xattr.cmp_mode = cmp_mode; 574 + 575 + ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist); 576 + op->payload_len = payload_len; 577 + return 0; 578 + } 579 + EXPORT_SYMBOL(osd_req_op_xattr_init); 550 580 551 581 void osd_req_op_watch_init(struct ceph_osd_request *osd_req, 552 582 unsigned int which, u16 opcode, ··· 662 626 case 
CEPH_OSD_OP_READ: 663 627 case CEPH_OSD_OP_WRITE: 664 628 case CEPH_OSD_OP_ZERO: 665 - case CEPH_OSD_OP_DELETE: 666 629 case CEPH_OSD_OP_TRUNCATE: 667 630 if (src->op == CEPH_OSD_OP_WRITE) 668 631 request_data_len = src->extent.length; ··· 711 676 dst->alloc_hint.expected_write_size = 712 677 cpu_to_le64(src->alloc_hint.expected_write_size); 713 678 break; 679 + case CEPH_OSD_OP_SETXATTR: 680 + case CEPH_OSD_OP_CMPXATTR: 681 + dst->xattr.name_len = cpu_to_le32(src->xattr.name_len); 682 + dst->xattr.value_len = cpu_to_le32(src->xattr.value_len); 683 + dst->xattr.cmp_op = src->xattr.cmp_op; 684 + dst->xattr.cmp_mode = src->xattr.cmp_mode; 685 + osd_data = &src->xattr.osd_data; 686 + ceph_osdc_msg_data_add(req->r_request, osd_data); 687 + request_data_len = osd_data->pagelist->length; 688 + break; 689 + case CEPH_OSD_OP_CREATE: 690 + case CEPH_OSD_OP_DELETE: 691 + break; 714 692 default: 715 693 pr_err("unsupported osd opcode %s\n", 716 694 ceph_osd_op_name(src->op)); ··· 753 705 struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, 754 706 struct ceph_file_layout *layout, 755 707 struct ceph_vino vino, 756 - u64 off, u64 *plen, int num_ops, 708 + u64 off, u64 *plen, 709 + unsigned int which, int num_ops, 757 710 int opcode, int flags, 758 711 struct ceph_snap_context *snapc, 759 712 u32 truncate_seq, ··· 765 716 u64 objnum = 0; 766 717 u64 objoff = 0; 767 718 u64 objlen = 0; 768 - u32 object_size; 769 - u64 object_base; 770 719 int r; 771 720 772 721 BUG_ON(opcode != CEPH_OSD_OP_READ && opcode != CEPH_OSD_OP_WRITE && 773 - opcode != CEPH_OSD_OP_DELETE && opcode != CEPH_OSD_OP_ZERO && 774 - opcode != CEPH_OSD_OP_TRUNCATE); 722 + opcode != CEPH_OSD_OP_ZERO && opcode != CEPH_OSD_OP_TRUNCATE && 723 + opcode != CEPH_OSD_OP_CREATE && opcode != CEPH_OSD_OP_DELETE); 775 724 776 725 req = ceph_osdc_alloc_request(osdc, snapc, num_ops, use_mempool, 777 726 GFP_NOFS); ··· 785 738 return ERR_PTR(r); 786 739 } 787 740 788 - object_size = 
le32_to_cpu(layout->fl_object_size); 789 - object_base = off - objoff; 790 - if (!(truncate_seq == 1 && truncate_size == -1ULL)) { 791 - if (truncate_size <= object_base) { 792 - truncate_size = 0; 793 - } else { 794 - truncate_size -= object_base; 795 - if (truncate_size > object_size) 796 - truncate_size = object_size; 741 + if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) { 742 + osd_req_op_init(req, which, opcode); 743 + } else { 744 + u32 object_size = le32_to_cpu(layout->fl_object_size); 745 + u32 object_base = off - objoff; 746 + if (!(truncate_seq == 1 && truncate_size == -1ULL)) { 747 + if (truncate_size <= object_base) { 748 + truncate_size = 0; 749 + } else { 750 + truncate_size -= object_base; 751 + if (truncate_size > object_size) 752 + truncate_size = object_size; 753 + } 797 754 } 755 + osd_req_op_extent_init(req, which, opcode, objoff, objlen, 756 + truncate_size, truncate_seq); 798 757 } 799 - 800 - osd_req_op_extent_init(req, 0, opcode, objoff, objlen, 801 - truncate_size, truncate_seq); 802 - 803 - /* 804 - * A second op in the ops array means the caller wants to 805 - * also issue a include a 'startsync' command so that the 806 - * osd will flush data quickly. 
807 - */ 808 - if (num_ops > 1) 809 - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); 810 758 811 759 req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout); 812 760 ··· 2668 2626 2669 2627 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino, 2670 2628 vino.snap, off, *plen); 2671 - req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 1, 2629 + req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1, 2672 2630 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 2673 2631 NULL, truncate_seq, truncate_size, 2674 2632 false); ··· 2711 2669 int page_align = off & ~PAGE_MASK; 2712 2670 2713 2671 BUG_ON(vino.snap != CEPH_NOSNAP); /* snapshots aren't writeable */ 2714 - req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 1, 2672 + req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, 2715 2673 CEPH_OSD_OP_WRITE, 2716 2674 CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, 2717 2675 snapc, truncate_seq, truncate_size, ··· 2962 2920 return ceph_monc_validate_auth(&osdc->client->monc); 2963 2921 } 2964 2922 2923 + static int sign_message(struct ceph_connection *con, struct ceph_msg *msg) 2924 + { 2925 + struct ceph_osd *o = con->private; 2926 + struct ceph_auth_handshake *auth = &o->o_auth; 2927 + return ceph_auth_sign_message(auth, msg); 2928 + } 2929 + 2930 + static int check_message_signature(struct ceph_connection *con, struct ceph_msg *msg) 2931 + { 2932 + struct ceph_osd *o = con->private; 2933 + struct ceph_auth_handshake *auth = &o->o_auth; 2934 + return ceph_auth_check_message_signature(auth, msg); 2935 + } 2936 + 2965 2937 static const struct ceph_connection_operations osd_con_ops = { 2966 2938 .get = get_osd_con, 2967 2939 .put = put_osd_con, ··· 2984 2928 .verify_authorizer_reply = verify_authorizer_reply, 2985 2929 .invalidate_authorizer = invalidate_authorizer, 2986 2930 .alloc_msg = alloc_msg, 2931 + .sign_message = sign_message, 2932 + .check_message_signature = check_message_signature, 2987 2933 .fault = osd_reset, 2988 2934 };