Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

ext3: return 32/64-bit dir name hash according to usage type

This is based on commit d1f5273e9adb40724a85272f248f210dc4ce919a
("ext4: return 32/64-bit dir name hash according to usage type")
by Fan Yong <yong.fan@whamcloud.com>.

Traditionally ext2/3/4 have returned a 32-bit hash value from llseek()
to appease NFSv2, which can only handle a 32-bit cookie for seekdir()
and telldir(). However, this causes problems if there are 32-bit hash
collisions, since the NFSv2 server can get stuck resending the same
entries from the directory repeatedly.

Allow ext3 to return a full 64-bit hash (both major and minor) for
telldir to decrease the chance of hash collisions.

This patch implements a new ext3_dir_llseek op, because with 64-bit
hashes, NFS will attempt to seek to a hash "offset" which is much
larger than ext3's s_maxbytes. So for dx dirs, we call
generic_file_llseek_size() with the appropriate max hash value as the
maximum seekable size. Otherwise we just pass through to
generic_file_llseek().

Patch-updated-by: Bernd Schubert <bernd.schubert@itwm.fraunhofer.de>
Patch-updated-by: Eric Sandeen <sandeen@redhat.com>
(blame us if something is not correct)

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>

authored by

Eric Sandeen and committed by
Jan Kara
d7dab39b a80b12c3

+129 -48
+122 -45
fs/ext3/dir.c
··· 21 21 * 22 22 */ 23 23 24 + #include <linux/compat.h> 24 25 #include "ext3.h" 25 26 26 27 static unsigned char ext3_filetype_table[] = { 27 28 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 28 29 }; 29 30 30 - static int ext3_readdir(struct file *, void *, filldir_t); 31 31 static int ext3_dx_readdir(struct file * filp, 32 32 void * dirent, filldir_t filldir); 33 - static int ext3_release_dir (struct inode * inode, 34 - struct file * filp); 35 - 36 - const struct file_operations ext3_dir_operations = { 37 - .llseek = generic_file_llseek, 38 - .read = generic_read_dir, 39 - .readdir = ext3_readdir, /* we take BKL. needed?*/ 40 - .unlocked_ioctl = ext3_ioctl, 41 - #ifdef CONFIG_COMPAT 42 - .compat_ioctl = ext3_compat_ioctl, 43 - #endif 44 - .fsync = ext3_sync_file, /* BKL held */ 45 - .release = ext3_release_dir, 46 - }; 47 - 48 33 49 34 static unsigned char get_dtype(struct super_block *sb, int filetype) 50 35 { ··· 40 55 return (ext3_filetype_table[filetype]); 41 56 } 42 57 58 + /** 59 + * Check if the given dir-inode refers to an htree-indexed directory 60 + * (or a directory which chould potentially get coverted to use htree 61 + * indexing). 
62 + * 63 + * Return 1 if it is a dx dir, 0 if not 64 + */ 65 + static int is_dx_dir(struct inode *inode) 66 + { 67 + struct super_block *sb = inode->i_sb; 68 + 69 + if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb, 70 + EXT3_FEATURE_COMPAT_DIR_INDEX) && 71 + ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) || 72 + ((inode->i_size >> sb->s_blocksize_bits) == 1))) 73 + return 1; 74 + 75 + return 0; 76 + } 43 77 44 78 int ext3_check_dir_entry (const char * function, struct inode * dir, 45 79 struct ext3_dir_entry_2 * de, ··· 98 94 unsigned long offset; 99 95 int i, stored; 100 96 struct ext3_dir_entry_2 *de; 101 - struct super_block *sb; 102 97 int err; 103 98 struct inode *inode = filp->f_path.dentry->d_inode; 99 + struct super_block *sb = inode->i_sb; 104 100 int ret = 0; 105 101 int dir_has_error = 0; 106 102 107 - sb = inode->i_sb; 108 - 109 - if (EXT3_HAS_COMPAT_FEATURE(inode->i_sb, 110 - EXT3_FEATURE_COMPAT_DIR_INDEX) && 111 - ((EXT3_I(inode)->i_flags & EXT3_INDEX_FL) || 112 - ((inode->i_size >> sb->s_blocksize_bits) == 1))) { 103 + if (is_dx_dir(inode)) { 113 104 err = ext3_dx_readdir(filp, dirent, filldir); 114 105 if (err != ERR_BAD_DX_DIR) { 115 106 ret = err; ··· 226 227 return ret; 227 228 } 228 229 230 + static inline int is_32bit_api(void) 231 + { 232 + #ifdef CONFIG_COMPAT 233 + return is_compat_task(); 234 + #else 235 + return (BITS_PER_LONG == 32); 236 + #endif 237 + } 238 + 229 239 /* 230 240 * These functions convert from the major/minor hash to an f_pos 231 - * value. 241 + * value for dx directories 232 242 * 233 - * Currently we only use major hash numer. 
This is unfortunate, but 234 - * on 32-bit machines, the same VFS interface is used for lseek and 235 - * llseek, so if we use the 64 bit offset, then the 32-bit versions of 236 - * lseek/telldir/seekdir will blow out spectacularly, and from within 237 - * the ext2 low-level routine, we don't know if we're being called by 238 - * a 64-bit version of the system call or the 32-bit version of the 239 - * system call. Worse yet, NFSv2 only allows for a 32-bit readdir 240 - * cookie. Sigh. 243 + * Upper layer (for example NFS) should specify FMODE_32BITHASH or 244 + * FMODE_64BITHASH explicitly. On the other hand, we allow ext3 to be mounted 245 + * directly on both 32-bit and 64-bit nodes, under such case, neither 246 + * FMODE_32BITHASH nor FMODE_64BITHASH is specified. 241 247 */ 242 - #define hash2pos(major, minor) (major >> 1) 243 - #define pos2maj_hash(pos) ((pos << 1) & 0xffffffff) 244 - #define pos2min_hash(pos) (0) 248 + static inline loff_t hash2pos(struct file *filp, __u32 major, __u32 minor) 249 + { 250 + if ((filp->f_mode & FMODE_32BITHASH) || 251 + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) 252 + return major >> 1; 253 + else 254 + return ((__u64)(major >> 1) << 32) | (__u64)minor; 255 + } 256 + 257 + static inline __u32 pos2maj_hash(struct file *filp, loff_t pos) 258 + { 259 + if ((filp->f_mode & FMODE_32BITHASH) || 260 + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) 261 + return (pos << 1) & 0xffffffff; 262 + else 263 + return ((pos >> 32) << 1) & 0xffffffff; 264 + } 265 + 266 + static inline __u32 pos2min_hash(struct file *filp, loff_t pos) 267 + { 268 + if ((filp->f_mode & FMODE_32BITHASH) || 269 + (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) 270 + return 0; 271 + else 272 + return pos & 0xffffffff; 273 + } 274 + 275 + /* 276 + * Return 32- or 64-bit end-of-file for dx directories 277 + */ 278 + static inline loff_t ext3_get_htree_eof(struct file *filp) 279 + { 280 + if ((filp->f_mode & FMODE_32BITHASH) || 281 + 
(!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api())) 282 + return EXT3_HTREE_EOF_32BIT; 283 + else 284 + return EXT3_HTREE_EOF_64BIT; 285 + } 286 + 287 + 288 + /* 289 + * ext3_dir_llseek() calls generic_file_llseek[_size]() to handle both 290 + * non-htree and htree directories, where the "offset" is in terms 291 + * of the filename hash value instead of the byte offset. 292 + * 293 + * Because we may return a 64-bit hash that is well beyond s_maxbytes, 294 + * we need to pass the max hash as the maximum allowable offset in 295 + * the htree directory case. 296 + * 297 + * NOTE: offsets obtained *before* ext3_set_inode_flag(dir, EXT3_INODE_INDEX) 298 + * will be invalid once the directory was converted into a dx directory 299 + */ 300 + loff_t ext3_dir_llseek(struct file *file, loff_t offset, int origin) 301 + { 302 + struct inode *inode = file->f_mapping->host; 303 + int dx_dir = is_dx_dir(inode); 304 + 305 + if (likely(dx_dir)) 306 + return generic_file_llseek_size(file, offset, origin, 307 + ext3_get_htree_eof(file)); 308 + else 309 + return generic_file_llseek(file, offset, origin); 310 + } 245 311 246 312 /* 247 313 * This structure holds the nodes of the red-black tree used to store ··· 367 303 } 368 304 369 305 370 - static struct dir_private_info *ext3_htree_create_dir_info(loff_t pos) 306 + static struct dir_private_info *ext3_htree_create_dir_info(struct file *filp, 307 + loff_t pos) 371 308 { 372 309 struct dir_private_info *p; 373 310 374 311 p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); 375 312 if (!p) 376 313 return NULL; 377 - p->curr_hash = pos2maj_hash(pos); 378 - p->curr_minor_hash = pos2min_hash(pos); 314 + p->curr_hash = pos2maj_hash(filp, pos); 315 + p->curr_minor_hash = pos2min_hash(filp, pos); 379 316 return p; 380 317 } 381 318 ··· 466 401 printk("call_filldir: called with null fname?!?\n"); 467 402 return 0; 468 403 } 469 - curr_pos = hash2pos(fname->hash, fname->minor_hash); 404 + curr_pos = hash2pos(filp, fname->hash, 
fname->minor_hash); 470 405 while (fname) { 471 406 error = filldir(dirent, fname->name, 472 407 fname->name_len, curr_pos, ··· 491 426 int ret; 492 427 493 428 if (!info) { 494 - info = ext3_htree_create_dir_info(filp->f_pos); 429 + info = ext3_htree_create_dir_info(filp, filp->f_pos); 495 430 if (!info) 496 431 return -ENOMEM; 497 432 filp->private_data = info; 498 433 } 499 434 500 - if (filp->f_pos == EXT3_HTREE_EOF) 435 + if (filp->f_pos == ext3_get_htree_eof(filp)) 501 436 return 0; /* EOF */ 502 437 503 438 /* Some one has messed with f_pos; reset the world */ ··· 505 440 free_rb_tree_fname(&info->root); 506 441 info->curr_node = NULL; 507 442 info->extra_fname = NULL; 508 - info->curr_hash = pos2maj_hash(filp->f_pos); 509 - info->curr_minor_hash = pos2min_hash(filp->f_pos); 443 + info->curr_hash = pos2maj_hash(filp, filp->f_pos); 444 + info->curr_minor_hash = pos2min_hash(filp, filp->f_pos); 510 445 } 511 446 512 447 /* ··· 538 473 if (ret < 0) 539 474 return ret; 540 475 if (ret == 0) { 541 - filp->f_pos = EXT3_HTREE_EOF; 476 + filp->f_pos = ext3_get_htree_eof(filp); 542 477 break; 543 478 } 544 479 info->curr_node = rb_first(&info->root); ··· 558 493 info->curr_minor_hash = fname->minor_hash; 559 494 } else { 560 495 if (info->next_hash == ~0) { 561 - filp->f_pos = EXT3_HTREE_EOF; 496 + filp->f_pos = ext3_get_htree_eof(filp); 562 497 break; 563 498 } 564 499 info->curr_hash = info->next_hash; ··· 577 512 578 513 return 0; 579 514 } 515 + 516 + const struct file_operations ext3_dir_operations = { 517 + .llseek = ext3_dir_llseek, 518 + .read = generic_read_dir, 519 + .readdir = ext3_readdir, 520 + .unlocked_ioctl = ext3_ioctl, 521 + #ifdef CONFIG_COMPAT 522 + .compat_ioctl = ext3_compat_ioctl, 523 + #endif 524 + .fsync = ext3_sync_file, 525 + .release = ext3_release_dir, 526 + };
+5 -1
fs/ext3/ext3.h
··· 920 920 u32 *seed; 921 921 }; 922 922 923 - #define EXT3_HTREE_EOF 0x7fffffff 923 + 924 + /* 32 and 64 bit signed EOF for dx directories */ 925 + #define EXT3_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) 926 + #define EXT3_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) 927 + 924 928 925 929 /* 926 930 * Control parameters used by ext3_htree_next_block
+2 -2
fs/ext3/hash.c
··· 198 198 return -1; 199 199 } 200 200 hash = hash & ~1; 201 - if (hash == (EXT3_HTREE_EOF << 1)) 202 - hash = (EXT3_HTREE_EOF-1) << 1; 201 + if (hash == (EXT3_HTREE_EOF_32BIT << 1)) 202 + hash = (EXT3_HTREE_EOF_32BIT - 1) << 1; 203 203 hinfo->hash = hash; 204 204 hinfo->minor_hash = minor_hash; 205 205 return 0;