Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

gfs2: change gfs2 readdir cookie

gfs2 currently returns 31 bits of filename hash as a cookie that readdir
uses for an offset into the directory. When there are a large number of
directory entries, the likelihood of a collision goes up way too
quickly. GFS2 will now return cookies that are guaranteed unique for a
while, and then fall back to using 30 bits of filename hash.
Specifically, the directory leaf blocks are divided up into chunks based
on the minimum size of a gfs2 directory entry (48 bytes). Each entry's
cookie is based off the chunk where it starts, in the linked list of
leaf blocks that it hashes to (there are 131072 hash buckets). Directory
entries will have unique cookies until they reach chunk 8192.
Assuming the largest filenames possible, and the least efficient spacing
possible, this new method will still be able to return unique cookies when
the previous method has statistically more than a 99% chance of a
collision. The non-unique cookies it falls back to are guaranteed to not
collide with the unique cookies.

unique cookies will be in this format:
- 1 bit "0" to make sure the returned cookie is positive
- 17 bits for the hash table index
- 1 bit for the mode "0"
- 13 bits for the offset

non-unique cookies will be in this format:
- 1 bit "0" to make sure the returned cookie is positive
- 17 bits for the hash table index
- 1 bit for the mode "1"
- 13 more bits of the name hash

Another benefit of location-based cookies is that once a directory's
exhash table is fully extended (so that multiple hash table indexes do
not use the same leaf blocks), gfs2 can skip sorting the directory
entries until it reaches the non-unique ones, and then it only needs to
sort these. This provides a significant speed up for directory reads of
very large directories.

The only issue is that for these cookies to continue to point to the
correct entry as files are added and removed from the directory, gfs2
must keep the entries at the same offset in the leaf block when they are
split (see my previous patch). This means that until all the nodes in a
cluster are running with code that will split the directory leaf blocks
this way, none of the nodes can use the new cookie code. To deal with
this, gfs2 now has the mount option loccookie, which, if set, will make
it return these new location based cookies. This option must not be set
until all nodes in the cluster are at least running this version of the
kernel code, and you have guaranteed that there are no outstanding
cookies required by other software, such as NFS.

gfs2 uses some of the extra space at the end of the gfs2_dirent
structure to store the calculated readdir cookies. This keeps us from
needing to allocate a separate array to hold these values. gfs2
recomputes the cookie stored in de_cookie for every readdir call. The
time it takes to do so is small, and if gfs2 expected this value to be
saved on disk, the new code wouldn't work correctly on filesystems
created with an earlier version of gfs2.

One issue with adding de_cookie to the union in the gfs2_dirent
structure is that it caused the union to align itself to a 4 byte
boundary, instead of its previous 2 byte boundary. This changed the
offset of de_rahead. To solve that, I pulled de_rahead out of the union,
since it does not need to be there.

Signed-off-by: Benjamin Marzinski <bmarzins@redhat.com>
Signed-off-by: Bob Peterson <rpeterso@redhat.com>

authored by

Benjamin Marzinski and committed by
Bob Peterson
471f3db2 34017472

+95 -23
+71 -20
fs/gfs2/dir.c
··· 82 82 83 83 #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) 84 84 #define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1)) 85 + #define GFS2_HASH_INDEX_MASK 0xffffc000 86 + #define GFS2_USE_HASH_FLAG 0x2000 85 87 86 88 struct qstr gfs2_qdot __read_mostly; 87 89 struct qstr gfs2_qdotdot __read_mostly; ··· 1225 1223 int ret = 0; 1226 1224 1227 1225 dent_a = *(const struct gfs2_dirent **)a; 1228 - hash_a = be32_to_cpu(dent_a->de_hash); 1226 + hash_a = dent_a->de_cookie; 1229 1227 1230 1228 dent_b = *(const struct gfs2_dirent **)b; 1231 - hash_b = be32_to_cpu(dent_b->de_hash); 1229 + hash_b = dent_b->de_cookie; 1232 1230 1233 1231 if (hash_a > hash_b) 1234 1232 ret = 1; ··· 1266 1264 */ 1267 1265 1268 1266 static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx, 1269 - const struct gfs2_dirent **darr, u32 entries, 1270 - int *copied) 1267 + struct gfs2_dirent **darr, u32 entries, 1268 + u32 sort_start, int *copied) 1271 1269 { 1272 1270 const struct gfs2_dirent *dent, *dent_next; 1273 1271 u64 off, off_next; 1274 1272 unsigned int x, y; 1275 1273 int run = 0; 1276 1274 1277 - sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL); 1275 + if (sort_start < entries) 1276 + sort(&darr[sort_start], entries - sort_start, 1277 + sizeof(struct gfs2_dirent *), compare_dents, NULL); 1278 1278 1279 1279 dent_next = darr[0]; 1280 - off_next = be32_to_cpu(dent_next->de_hash); 1281 - off_next = gfs2_disk_hash2offset(off_next); 1280 + off_next = dent_next->de_cookie; 1282 1281 1283 1282 for (x = 0, y = 1; x < entries; x++, y++) { 1284 1283 dent = dent_next; ··· 1287 1284 1288 1285 if (y < entries) { 1289 1286 dent_next = darr[y]; 1290 - off_next = be32_to_cpu(dent_next->de_hash); 1291 - off_next = gfs2_disk_hash2offset(off_next); 1287 + off_next = dent_next->de_cookie; 1292 1288 1293 1289 if (off < ctx->pos) 1294 1290 continue; ··· 1334 1332 return ptr; 1335 1333 } 1336 1334 1335 + 1336 + static int gfs2_set_cookies(struct gfs2_sbd *sdp, 
struct buffer_head *bh, 1337 + unsigned leaf_nr, struct gfs2_dirent **darr, 1338 + unsigned entries) 1339 + { 1340 + int sort_id = -1; 1341 + int i; 1342 + 1343 + for (i = 0; i < entries; i++) { 1344 + unsigned offset; 1345 + 1346 + darr[i]->de_cookie = be32_to_cpu(darr[i]->de_hash); 1347 + darr[i]->de_cookie = gfs2_disk_hash2offset(darr[i]->de_cookie); 1348 + 1349 + if (!sdp->sd_args.ar_loccookie) 1350 + continue; 1351 + offset = (char *)(darr[i]) - 1352 + (bh->b_data + gfs2_dirent_offset(bh->b_data)); 1353 + offset /= GFS2_MIN_DIRENT_SIZE; 1354 + offset += leaf_nr * sdp->sd_max_dents_per_leaf; 1355 + if (offset >= GFS2_USE_HASH_FLAG || 1356 + leaf_nr >= GFS2_USE_HASH_FLAG) { 1357 + darr[i]->de_cookie |= GFS2_USE_HASH_FLAG; 1358 + if (sort_id < 0) 1359 + sort_id = i; 1360 + continue; 1361 + } 1362 + darr[i]->de_cookie &= GFS2_HASH_INDEX_MASK; 1363 + darr[i]->de_cookie |= offset; 1364 + } 1365 + return sort_id; 1366 + } 1367 + 1368 + 1337 1369 static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx, 1338 1370 int *copied, unsigned *depth, 1339 1371 u64 leaf_no) ··· 1377 1341 struct buffer_head *bh; 1378 1342 struct gfs2_leaf *lf; 1379 1343 unsigned entries = 0, entries2 = 0; 1380 - unsigned leaves = 0; 1381 - const struct gfs2_dirent **darr, *dent; 1344 + unsigned leaves = 0, leaf = 0, offset, sort_offset; 1345 + struct gfs2_dirent **darr, *dent; 1382 1346 struct dirent_gather g; 1383 1347 struct buffer_head **larr; 1384 - int leaf = 0; 1385 - int error, i; 1348 + int error, i, need_sort = 0, sort_id; 1386 1349 u64 lfn = leaf_no; 1387 1350 1388 1351 do { ··· 1397 1362 brelse(bh); 1398 1363 } while(lfn); 1399 1364 1365 + if (*depth < GFS2_DIR_MAX_DEPTH || !sdp->sd_args.ar_loccookie) { 1366 + need_sort = 1; 1367 + sort_offset = 0; 1368 + } 1369 + 1400 1370 if (!entries) 1401 1371 return 0; 1402 1372 ··· 1415 1375 larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *)); 1416 1376 if (!larr) 1417 1377 goto out; 1418 - darr = (const 
struct gfs2_dirent **)(larr + leaves); 1419 - g.pdent = darr; 1378 + darr = (struct gfs2_dirent **)(larr + leaves); 1379 + g.pdent = (const struct gfs2_dirent **)darr; 1420 1380 g.offset = 0; 1421 1381 lfn = leaf_no; 1422 1382 ··· 1427 1387 lf = (struct gfs2_leaf *)bh->b_data; 1428 1388 lfn = be64_to_cpu(lf->lf_next); 1429 1389 if (lf->lf_entries) { 1390 + offset = g.offset; 1430 1391 entries2 += be16_to_cpu(lf->lf_entries); 1431 1392 dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, 1432 1393 gfs2_dirent_gather, NULL, &g); ··· 1445 1404 goto out_free; 1446 1405 } 1447 1406 error = 0; 1407 + sort_id = gfs2_set_cookies(sdp, bh, leaf, &darr[offset], 1408 + be16_to_cpu(lf->lf_entries)); 1409 + if (!need_sort && sort_id >= 0) { 1410 + need_sort = 1; 1411 + sort_offset = offset + sort_id; 1412 + } 1448 1413 larr[leaf++] = bh; 1449 1414 } else { 1415 + larr[leaf++] = NULL; 1450 1416 brelse(bh); 1451 1417 } 1452 1418 } while(lfn); 1453 1419 1454 1420 BUG_ON(entries2 != entries); 1455 - error = do_filldir_main(ip, ctx, darr, entries, copied); 1421 + error = do_filldir_main(ip, ctx, darr, entries, need_sort ? 
1422 + sort_offset : entries, copied); 1456 1423 out_free: 1457 1424 for(i = 0; i < leaf; i++) 1458 - brelse(larr[i]); 1425 + if (larr[i]) 1426 + brelse(larr[i]); 1459 1427 kvfree(larr); 1460 1428 out: 1461 1429 return error; ··· 1570 1520 struct gfs2_inode *dip = GFS2_I(inode); 1571 1521 struct gfs2_sbd *sdp = GFS2_SB(inode); 1572 1522 struct dirent_gather g; 1573 - const struct gfs2_dirent **darr, *dent; 1523 + struct gfs2_dirent **darr, *dent; 1574 1524 struct buffer_head *dibh; 1575 1525 int copied = 0; 1576 1526 int error; ··· 1594 1544 /* 96 is max number of dirents which can be stuffed into an inode */ 1595 1545 darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS); 1596 1546 if (darr) { 1597 - g.pdent = darr; 1547 + g.pdent = (const struct gfs2_dirent **)darr; 1598 1548 g.offset = 0; 1599 1549 dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size, 1600 1550 gfs2_dirent_gather, NULL, &g); ··· 1611 1561 error = -EIO; 1612 1562 goto out; 1613 1563 } 1564 + gfs2_set_cookies(sdp, dibh, 0, darr, dip->i_entries); 1614 1565 error = do_filldir_main(dip, ctx, darr, 1615 - dip->i_entries, &copied); 1566 + dip->i_entries, 0, &copied); 1616 1567 out: 1617 1568 kfree(darr); 1618 1569 }
+3
fs/gfs2/incore.h
··· 562 562 unsigned int ar_errors:2; /* errors=withdraw | panic */ 563 563 unsigned int ar_nobarrier:1; /* do not send barriers */ 564 564 unsigned int ar_rgrplvb:1; /* use lvbs for rgrp info */ 565 + unsigned int ar_loccookie:1; /* use location based readdir 566 + cookies */ 565 567 int ar_commit; /* Commit interval */ 566 568 int ar_statfs_quantum; /* The fast statfs interval */ 567 569 int ar_quota_quantum; /* The quota interval */ ··· 691 689 u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1]; 692 690 u32 sd_max_jheight; /* Max height of journaled file's meta tree */ 693 691 u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1]; 692 + u32 sd_max_dents_per_leaf; /* Max number of dirents in a leaf block */ 694 693 695 694 struct gfs2_args sd_args; /* Mount arguments */ 696 695 struct gfs2_tune sd_tune; /* Filesystem tuning structure */
+3
fs/gfs2/ops_fstype.c
··· 352 352 sdp->sd_jheightsize[x] = ~0; 353 353 gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT); 354 354 355 + sdp->sd_max_dents_per_leaf = (sdp->sd_sb.sb_bsize - 356 + sizeof(struct gfs2_leaf)) / 357 + GFS2_MIN_DIRENT_SIZE; 355 358 return 0; 356 359 } 357 360
+12
fs/gfs2/super.c
··· 83 83 Opt_nobarrier, 84 84 Opt_rgrplvb, 85 85 Opt_norgrplvb, 86 + Opt_loccookie, 87 + Opt_noloccookie, 86 88 Opt_error, 87 89 }; 88 90 ··· 124 122 {Opt_nobarrier, "nobarrier"}, 125 123 {Opt_rgrplvb, "rgrplvb"}, 126 124 {Opt_norgrplvb, "norgrplvb"}, 125 + {Opt_loccookie, "loccookie"}, 126 + {Opt_noloccookie, "noloccookie"}, 127 127 {Opt_error, NULL} 128 128 }; 129 129 ··· 281 277 break; 282 278 case Opt_norgrplvb: 283 279 args->ar_rgrplvb = 0; 280 + break; 281 + case Opt_loccookie: 282 + args->ar_loccookie = 1; 283 + break; 284 + case Opt_noloccookie: 285 + args->ar_loccookie = 0; 284 286 break; 285 287 case Opt_error: 286 288 default: ··· 1428 1418 seq_puts(s, ",demote_interface_used"); 1429 1419 if (args->ar_rgrplvb) 1430 1420 seq_puts(s, ",rgrplvb"); 1421 + if (args->ar_loccookie) 1422 + seq_puts(s, ",loccookie"); 1431 1423 return 0; 1432 1424 } 1433 1425
+6 -3
include/uapi/linux/gfs2_ondisk.h
··· 297 297 298 298 #define GFS2_FNAMESIZE 255 299 299 #define GFS2_DIRENT_SIZE(name_len) ((sizeof(struct gfs2_dirent) + (name_len) + 7) & ~7) 300 + #define GFS2_MIN_DIRENT_SIZE (GFS2_DIRENT_SIZE(1)) 301 + 300 302 301 303 struct gfs2_dirent { 302 304 struct gfs2_inum de_inum; ··· 306 304 __be16 de_rec_len; 307 305 __be16 de_name_len; 308 306 __be16 de_type; 307 + __be16 de_rahead; 309 308 union { 310 - __u8 __pad[14]; 309 + __u8 __pad[12]; 311 310 struct { 312 - __be16 de_rahead; 313 - __u8 pad2[12]; 311 + __u32 de_cookie; /* ondisk value not used */ 312 + __u8 pad3[8]; 314 313 }; 315 314 }; 316 315 };