ceph: add dir_layout to inode

Add a ceph_dir_layout to the inode, and calculate dentry hash values based
on the parent directory's specified dir_hash function. This is needed
because the old default Linux dcache hash function is extremely weak and
leads to a poor distribution of files among dir fragments.

Signed-off-by: Sage Weil <sage@newdream.net>

Sage Weil 6c0f3af7 3c0eee3f

+41 -4
+20
fs/ceph/dir.c
··· 1216 } 1217 } 1218 1219 const struct file_operations ceph_dir_fops = { 1220 .read = ceph_read_dir, 1221 .readdir = ceph_readdir,
··· 1216 } 1217 } 1218 1219 + /* 1220 + * Return name hash for a given dentry. This is dependent on 1221 + * the parent directory's hash function. 1222 + */ 1223 + unsigned ceph_dentry_hash(struct dentry *dn) 1224 + { 1225 + struct inode *dir = dn->d_parent->d_inode; 1226 + struct ceph_inode_info *dci = ceph_inode(dir); 1227 + 1228 + switch (dci->i_dir_layout.dl_dir_hash) { 1229 + case 0: /* for backward compat */ 1230 + case CEPH_STR_HASH_LINUX: 1231 + return dn->d_name.hash; 1232 + 1233 + default: 1234 + return ceph_str_hash(dci->i_dir_layout.dl_dir_hash, 1235 + dn->d_name.name, dn->d_name.len); 1236 + } 1237 + } 1238 + 1239 const struct file_operations ceph_dir_fops = { 1240 .read = ceph_read_dir, 1241 .readdir = ceph_readdir,
+1 -1
fs/ceph/export.c
··· 59 dout("encode_fh %p connectable\n", dentry); 60 cfh->ino = ceph_ino(dentry->d_inode); 61 cfh->parent_ino = ceph_ino(parent->d_inode); 62 - cfh->parent_name_hash = parent->d_name.hash; 63 *max_len = connected_handle_length; 64 type = 2; 65 } else if (*max_len >= handle_length) {
··· 59 dout("encode_fh %p connectable\n", dentry); 60 cfh->ino = ceph_ino(dentry->d_inode); 61 cfh->parent_ino = ceph_ino(parent->d_inode); 62 + cfh->parent_name_hash = ceph_dentry_hash(parent); 63 *max_len = connected_handle_length; 64 type = 2; 65 } else if (*max_len >= handle_length) {
+2
fs/ceph/inode.c
··· 297 ci->i_release_count = 0; 298 ci->i_symlink = NULL; 299 300 ci->i_fragtree = RB_ROOT; 301 mutex_init(&ci->i_fragtree_mutex); 302
··· 297 ci->i_release_count = 0; 298 ci->i_symlink = NULL; 299 300 + memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); 301 + 302 ci->i_fragtree = RB_ROOT; 303 mutex_init(&ci->i_fragtree_mutex); 304
+2
fs/ceph/super.h
··· 239 unsigned i_ceph_flags; 240 unsigned long i_release_count; 241 242 struct ceph_file_layout i_layout; 243 char *i_symlink; 244 ··· 769 extern void ceph_dentry_lru_touch(struct dentry *dn); 770 extern void ceph_dentry_lru_del(struct dentry *dn); 771 extern void ceph_invalidate_dentry_lease(struct dentry *dentry); 772 773 /* 774 * our d_ops vary depending on whether the inode is live,
··· 239 unsigned i_ceph_flags; 240 unsigned long i_release_count; 241 242 + struct ceph_dir_layout i_dir_layout; 243 struct ceph_file_layout i_layout; 244 char *i_symlink; 245 ··· 768 extern void ceph_dentry_lru_touch(struct dentry *dn); 769 extern void ceph_dentry_lru_del(struct dentry *dn); 770 extern void ceph_invalidate_dentry_lease(struct dentry *dentry); 771 + extern unsigned ceph_dentry_hash(struct dentry *dn); 772 773 /* 774 * our d_ops vary depending on whether the inode is live,
+13 -3
include/linux/ceph/ceph_fs.h
··· 43 #define CEPH_FEATURE_NOSRCADDR (1<<1) 44 #define CEPH_FEATURE_MONCLOCKCHECK (1<<2) 45 #define CEPH_FEATURE_FLOCK (1<<3) 46 47 48 /* ··· 59 __le32 fl_stripe_count; /* over this many objects */ 60 __le32 fl_object_size; /* until objects are this big, then move to 61 new objects */ 62 - __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */ 63 64 /* pg -> disk layout */ 65 - __le32 fl_object_stripe_unit; /* for per-object parity, if any */ 66 67 /* object -> pg layout */ 68 __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */ ··· 73 74 int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); 75 76 77 /* crypto algorithms */ 78 #define CEPH_CRYPTO_NONE 0x0 ··· 467 struct ceph_timespec rctime; 468 struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */ 469 } __attribute__ ((packed)); 470 - /* followed by frag array, then symlink string, then xattr blob */ 471 472 /* reply_lease follows dname, and reply_inode */ 473 struct ceph_mds_reply_lease {
··· 43 #define CEPH_FEATURE_NOSRCADDR (1<<1) 44 #define CEPH_FEATURE_MONCLOCKCHECK (1<<2) 45 #define CEPH_FEATURE_FLOCK (1<<3) 46 + #define CEPH_FEATURE_SUBSCRIBE2 (1<<4) 47 + #define CEPH_FEATURE_MONNAMES (1<<5) 48 + #define CEPH_FEATURE_RECONNECT_SEQ (1<<6) 49 + #define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) 50 51 52 /* ··· 55 __le32 fl_stripe_count; /* over this many objects */ 56 __le32 fl_object_size; /* until objects are this big, then move to 57 new objects */ 58 + __le32 fl_cas_hash; /* UNUSED. 0 = none; 1 = sha256 */ 59 60 /* pg -> disk layout */ 61 + __le32 fl_object_stripe_unit; /* UNUSED. for per-object parity, if any */ 62 63 /* object -> pg layout */ 64 __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */ ··· 69 70 int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); 71 72 + struct ceph_dir_layout { 73 + __u8 dl_dir_hash; /* see ceph_hash.h for ids */ 74 + __u8 dl_unused1; 75 + __u16 dl_unused2; 76 + __u32 dl_unused3; 77 + } __attribute__ ((packed)); 78 79 /* crypto algorithms */ 80 #define CEPH_CRYPTO_NONE 0x0 ··· 457 struct ceph_timespec rctime; 458 struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */ 459 } __attribute__ ((packed)); 460 + /* followed by frag array, symlink string, dir layout, xattr blob */ 461 462 /* reply_lease follows dname, and reply_inode */ 463 struct ceph_mds_reply_lease {
+3
net/ceph/ceph_hash.c
··· 1 2 #include <linux/ceph/types.h> 3 4 /* 5 * Robert Jenkin's hash function. ··· 105 return -1; 106 } 107 } 108 109 const char *ceph_str_hash_name(int type) 110 { ··· 118 return "unknown"; 119 } 120 }
··· 1 2 #include <linux/ceph/types.h> 3 + #include <linux/module.h> 4 5 /* 6 * Robert Jenkin's hash function. ··· 104 return -1; 105 } 106 } 107 + EXPORT_SYMBOL(ceph_str_hash); 108 109 const char *ceph_str_hash_name(int type) 110 { ··· 116 return "unknown"; 117 } 118 } 119 + EXPORT_SYMBOL(ceph_str_hash_name);