Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ceph: using hash value to compose dentry offset

If MDS sorts dentries in dirfrag in hash order, we use hash value to
compose dentry offset. dentry offset is:

(0xff << 52) | ((24 bits hash) << 28) |
(the nth entry has hash collision)

This offset is stable across directory fragmentation. This also means
there is no need to reset the readdir offset if the directory gets fragmented
in the middle of readdir.

Signed-off-by: Yan, Zheng <zyan@redhat.com>

authored by

Yan, Zheng and committed by
Ilya Dryomov
f3c4ebe6 076c40f1

+136 -47
+105 -35
fs/ceph/dir.c
··· 69 69 } 70 70 71 71 /* 72 - * for readdir, we encode the directory frag and offset within that 73 - * frag into f_pos. 72 + * for f_pos for readdir: 73 + * - hash order: 74 + * (0xff << 52) | ((24 bits hash) << 28) | 75 + * (the nth entry has hash collision); 76 + * - frag+name order; 77 + * ((frag value) << 28) | (the nth entry in frag); 74 78 */ 79 + #define OFFSET_BITS 28 80 + #define OFFSET_MASK ((1 << OFFSET_BITS) - 1) 81 + #define HASH_ORDER (0xffull << (OFFSET_BITS + 24)) 82 + loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order) 83 + { 84 + loff_t fpos = ((loff_t)high << 28) | (loff_t)off; 85 + if (hash_order) 86 + fpos |= HASH_ORDER; 87 + return fpos; 88 + } 89 + 90 + static bool is_hash_order(loff_t p) 91 + { 92 + return (p & HASH_ORDER) == HASH_ORDER; 93 + } 94 + 75 95 static unsigned fpos_frag(loff_t p) 76 96 { 77 - return p >> 32; 97 + return p >> OFFSET_BITS; 78 98 } 99 + 100 + static unsigned fpos_hash(loff_t p) 101 + { 102 + return ceph_frag_value(fpos_frag(p)); 103 + } 104 + 79 105 static unsigned fpos_off(loff_t p) 80 106 { 81 - return p & 0xffffffff; 107 + return p & OFFSET_MASK; 82 108 } 83 109 84 110 static int fpos_cmp(loff_t l, loff_t r) ··· 203 177 u64 idx = 0; 204 178 int err = 0; 205 179 206 - dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos); 180 + dout("__dcache_readdir %p v%u at %llx\n", dir, shared_gen, ctx->pos); 207 181 208 182 /* search start position */ 209 183 if (ctx->pos > 2) { ··· 260 234 spin_unlock(&dentry->d_lock); 261 235 262 236 if (emit_dentry) { 263 - dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, 237 + dout(" %llx dentry %p %pd %p\n", di->offset, 264 238 dentry, dentry, d_inode(dentry)); 265 239 ctx->pos = di->offset; 266 240 if (!dir_emit(ctx, dentry->d_name.name, ··· 295 269 return err; 296 270 } 297 271 272 + static bool need_send_readdir(struct ceph_file_info *fi, loff_t pos) 273 + { 274 + if (!fi->last_readdir) 275 + return true; 276 + if (is_hash_order(pos)) 277 
+ return !ceph_frag_contains_value(fi->frag, fpos_hash(pos)); 278 + else 279 + return fi->frag != fpos_frag(pos); 280 + } 281 + 298 282 static int ceph_readdir(struct file *file, struct dir_context *ctx) 299 283 { 300 284 struct ceph_file_info *fi = file->private_data; ··· 312 276 struct ceph_inode_info *ci = ceph_inode(inode); 313 277 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 314 278 struct ceph_mds_client *mdsc = fsc->mdsc; 315 - unsigned frag = fpos_frag(ctx->pos); 316 279 int i; 317 280 int err; 318 281 u32 ftype; ··· 352 317 err = __dcache_readdir(file, ctx, shared_gen); 353 318 if (err != -EAGAIN) 354 319 return err; 355 - frag = fpos_frag(ctx->pos); 356 320 } else { 357 321 spin_unlock(&ci->i_ceph_lock); 358 322 } ··· 359 325 /* proceed with a normal readdir */ 360 326 more: 361 327 /* do we have the correct frag content buffered? */ 362 - if (fi->frag != frag || fi->last_readdir == NULL) { 328 + if (need_send_readdir(fi, ctx->pos)) { 363 329 struct ceph_mds_request *req; 330 + unsigned frag; 364 331 int op = ceph_snap(inode) == CEPH_SNAPDIR ? 
365 332 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; 366 333 ··· 369 334 if (fi->last_readdir) { 370 335 ceph_mdsc_put_request(fi->last_readdir); 371 336 fi->last_readdir = NULL; 337 + } 338 + 339 + if (is_hash_order(ctx->pos)) { 340 + frag = ceph_choose_frag(ci, fpos_hash(ctx->pos), 341 + NULL, NULL); 342 + } else { 343 + frag = fpos_frag(ctx->pos); 372 344 } 373 345 374 346 dout("readdir fetching %llx.%llx frag %x offset '%s'\n", ··· 415 373 ceph_mdsc_put_request(req); 416 374 return err; 417 375 } 418 - dout("readdir got and parsed readdir result=%d" 419 - " on frag %x, end=%d, complete=%d\n", err, frag, 376 + dout("readdir got and parsed readdir result=%d on " 377 + "frag %x, end=%d, complete=%d, hash_order=%d\n", 378 + err, frag, 420 379 (int)req->r_reply_info.dir_end, 421 - (int)req->r_reply_info.dir_complete); 380 + (int)req->r_reply_info.dir_complete, 381 + (int)req->r_reply_info.hash_order); 422 382 423 - 424 - /* note next offset and last dentry name */ 425 383 rinfo = &req->r_reply_info; 426 384 if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { 427 385 frag = le32_to_cpu(rinfo->dir_dir->frag); 428 - fi->next_offset = req->r_readdir_offset; 429 - /* adjust ctx->pos to beginning of frag */ 430 - ctx->pos = ceph_make_fpos(frag, fi->next_offset); 386 + if (!rinfo->hash_order) { 387 + fi->next_offset = req->r_readdir_offset; 388 + /* adjust ctx->pos to beginning of frag */ 389 + ctx->pos = ceph_make_fpos(frag, 390 + fi->next_offset, 391 + false); 392 + } 431 393 } 432 394 433 395 fi->frag = frag; ··· 457 411 fi->dir_release_count = 0; 458 412 } 459 413 460 - if (req->r_reply_info.dir_end) { 461 - kfree(fi->last_name); 462 - fi->last_name = NULL; 463 - fi->next_offset = 2; 464 - } else { 414 + /* note next offset and last dentry name */ 415 + if (rinfo->dir_nr > 0) { 465 416 struct ceph_mds_reply_dir_entry *rde = 466 417 rinfo->dir_entries + (rinfo->dir_nr-1); 418 + unsigned next_offset = req->r_reply_info.dir_end ? 
419 + 2 : (fpos_off(rde->offset) + 1); 467 420 err = note_last_dentry(fi, rde->name, rde->name_len, 468 - fpos_off(rde->offset) + 1); 421 + next_offset); 469 422 if (err) 470 423 return err; 424 + } else if (req->r_reply_info.dir_end) { 425 + fi->next_offset = 2; 426 + /* keep last name */ 471 427 } 472 428 } 473 429 474 430 rinfo = &fi->last_readdir->r_reply_info; 475 431 dout("readdir frag %x num %d pos %llx chunk first %llx\n", 476 - frag, rinfo->dir_nr, ctx->pos, 432 + fi->frag, rinfo->dir_nr, ctx->pos, 477 433 rinfo->dir_nr ? rinfo->dir_entries[0].offset : 0LL); 478 434 479 435 i = 0; ··· 518 470 ctx->pos++; 519 471 } 520 472 521 - if (fi->last_name) { 473 + if (fi->next_offset > 2) { 522 474 ceph_mdsc_put_request(fi->last_readdir); 523 475 fi->last_readdir = NULL; 524 476 goto more; 525 477 } 526 478 527 479 /* more frags? */ 528 - if (!ceph_frag_is_rightmost(frag)) { 529 - frag = ceph_frag_next(frag); 530 - ctx->pos = ceph_make_fpos(frag, 2); 480 + if (!ceph_frag_is_rightmost(fi->frag)) { 481 + unsigned frag = ceph_frag_next(fi->frag); 482 + if (is_hash_order(ctx->pos)) { 483 + loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag), 484 + fi->next_offset, true); 485 + if (new_pos > ctx->pos) 486 + ctx->pos = new_pos; 487 + /* keep last_name */ 488 + } else { 489 + ctx->pos = ceph_make_fpos(frag, fi->next_offset, false); 490 + kfree(fi->last_name); 491 + fi->last_name = NULL; 492 + } 531 493 dout("readdir next frag is %x\n", frag); 532 494 goto more; 533 495 } ··· 590 532 static bool need_reset_readdir(struct ceph_file_info *fi, loff_t new_pos) 591 533 { 592 534 struct ceph_mds_reply_info_parsed *rinfo; 535 + loff_t chunk_offset; 593 536 if (new_pos == 0) 594 537 return true; 595 - if (fpos_frag(new_pos) != fi->frag) 538 + if (is_hash_order(new_pos)) { 539 + /* no need to reset last_name for a forward seek when 540 + * dentries are sotred in hash order */ 541 + } else if (fi->frag |= fpos_frag(new_pos)) { 596 542 return true; 543 + } 597 544 rinfo = 
fi->last_readdir ? &fi->last_readdir->r_reply_info : NULL; 598 545 if (!rinfo || !rinfo->dir_nr) 599 546 return true; 600 - return new_pos < rinfo->dir_entries[0].offset;; 547 + chunk_offset = rinfo->dir_entries[0].offset; 548 + return new_pos < chunk_offset || 549 + is_hash_order(new_pos) != is_hash_order(chunk_offset); 601 550 } 602 551 603 552 static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) ··· 627 562 } 628 563 629 564 if (offset >= 0) { 565 + if (need_reset_readdir(fi, offset)) { 566 + dout("dir_llseek dropping %p content\n", file); 567 + reset_readdir(fi); 568 + } else if (is_hash_order(offset) && offset > file->f_pos) { 569 + /* for hash offset, we don't know if a forward seek 570 + * is within same frag */ 571 + fi->dir_release_count = 0; 572 + fi->readdir_cache_idx = -1; 573 + } 574 + 630 575 if (offset != file->f_pos) { 631 576 file->f_pos = offset; 632 577 file->f_version = 0; 633 578 fi->flags &= ~CEPH_F_ATEND; 634 579 } 635 580 retval = offset; 636 - 637 - if (need_reset_readdir(fi, offset)) { 638 - dout("dir_llseek dropping %p content\n", file); 639 - reset_readdir(fi); 640 - } 641 581 } 642 582 out: 643 583 inode_unlock(inode);
+25 -6
fs/ceph/inode.c
··· 1387 1387 struct ceph_mds_session *session) 1388 1388 { 1389 1389 struct dentry *parent = req->r_dentry; 1390 + struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); 1390 1391 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; 1391 1392 struct qstr dname; 1392 1393 struct dentry *dn; ··· 1395 1394 int err = 0, skipped = 0, ret, i; 1396 1395 struct inode *snapdir = NULL; 1397 1396 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; 1398 - struct ceph_dentry_info *di; 1399 1397 u32 frag = le32_to_cpu(rhead->args.readdir.frag); 1398 + u32 last_hash = 0; 1399 + u32 fpos_offset; 1400 1400 struct ceph_readdir_cache_control cache_ctl = {}; 1401 1401 1402 1402 if (req->r_aborted) 1403 1403 return readdir_prepopulate_inodes_only(req, session); 1404 + 1405 + if (rinfo->hash_order && req->r_path2) { 1406 + last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, 1407 + req->r_path2, strlen(req->r_path2)); 1408 + last_hash = ceph_frag_value(last_hash); 1409 + } 1404 1410 1405 1411 if (rinfo->dir_dir && 1406 1412 le32_to_cpu(rinfo->dir_dir->frag) != frag) { 1407 1413 dout("readdir_prepopulate got new frag %x -> %x\n", 1408 1414 frag, le32_to_cpu(rinfo->dir_dir->frag)); 1409 1415 frag = le32_to_cpu(rinfo->dir_dir->frag); 1410 - req->r_readdir_offset = 2; 1416 + if (!rinfo->hash_order) 1417 + req->r_readdir_offset = 2; 1411 1418 } 1412 1419 1413 1420 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { ··· 1433 1424 if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) { 1434 1425 /* note dir version at start of readdir so we can tell 1435 1426 * if any dentries get dropped */ 1436 - struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); 1437 1427 req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); 1438 1428 req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count); 1439 1429 req->r_readdir_cache_idx = 0; 1440 1430 } 1441 1431 1442 1432 cache_ctl.index = req->r_readdir_cache_idx; 1433 + fpos_offset = 
req->r_readdir_offset; 1443 1434 1444 1435 /* FIXME: release caps/leases if error occurs */ 1445 1436 for (i = 0; i < rinfo->dir_nr; i++) { ··· 1452 1443 1453 1444 vino.ino = le64_to_cpu(rde->inode.in->ino); 1454 1445 vino.snap = le64_to_cpu(rde->inode.in->snapid); 1446 + 1447 + if (rinfo->hash_order) { 1448 + u32 hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, 1449 + rde->name, rde->name_len); 1450 + hash = ceph_frag_value(hash); 1451 + if (hash != last_hash) 1452 + fpos_offset = 2; 1453 + last_hash = hash; 1454 + rde->offset = ceph_make_fpos(hash, fpos_offset++, true); 1455 + } else { 1456 + rde->offset = ceph_make_fpos(frag, fpos_offset++, false); 1457 + } 1455 1458 1456 1459 retry_lookup: 1457 1460 dn = d_lookup(parent, &dname); ··· 1542 1521 dn = realdn; 1543 1522 } 1544 1523 1545 - di = dn->d_fsdata; 1546 - di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); 1547 - rde->offset = di->offset; 1524 + ceph_dentry(dn)->offset = rde->offset; 1548 1525 1549 1526 update_dentry_lease(dn, rde->lease, req->r_session, 1550 1527 req->r_request_started);
+1
fs/ceph/mds_client.c
··· 185 185 u16 flags = ceph_decode_16(p); 186 186 info->dir_end = !!(flags & CEPH_READDIR_FRAG_END); 187 187 info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE); 188 + info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER); 188 189 } 189 190 if (num == 0) 190 191 goto done;
+3 -1
fs/ceph/mds_client.h
··· 81 81 struct ceph_mds_reply_dirfrag *dir_dir; 82 82 size_t dir_buf_size; 83 83 int dir_nr; 84 - bool dir_complete, dir_end; 84 + bool dir_complete; 85 + bool dir_end; 86 + bool hash_order; 85 87 struct ceph_mds_reply_dir_entry *dir_entries; 86 88 }; 87 89
+1 -5
fs/ceph/super.h
··· 540 540 return (struct ceph_dentry_info *)dentry->d_fsdata; 541 541 } 542 542 543 - static inline loff_t ceph_make_fpos(unsigned frag, unsigned off) 544 - { 545 - return ((loff_t)frag << 32) | (loff_t)off; 546 - } 547 - 548 543 /* 549 544 * caps helpers 550 545 */ ··· 944 949 extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, 945 950 ceph_snapdir_dentry_ops; 946 951 952 + extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order); 947 953 extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); 948 954 extern int ceph_handle_snapdir(struct ceph_mds_request *req, 949 955 struct dentry *dentry, int err);
+1
include/linux/ceph/ceph_fs.h
··· 357 357 */ 358 358 #define CEPH_READDIR_FRAG_END (1<<0) 359 359 #define CEPH_READDIR_FRAG_COMPLETE (1<<8) 360 + #define CEPH_READDIR_HASH_ORDER (1<<9) 360 361 361 362 union ceph_mds_request_args { 362 363 struct {