Merge branch 'upstream-linus' of git://oss.oracle.com/home/sourcebo/git/ocfs2

+249 -142
+1
fs/ocfs2/cluster/masklog.c
···
 #define define_mask(_name) {            \
         .attr = {                       \
                 .name = #_name,         \
+                .owner = THIS_MODULE,   \
                 .mode = S_IRUGO | S_IWUSR, \
         },                              \
         .mask = ML_##_name,             \
+1 -1
fs/ocfs2/cluster/masklog.h
···
         } \
 } while (0)
 
-#if (BITS_PER_LONG == 32) || defined(CONFIG_X86_64)
+#if (BITS_PER_LONG == 32) || defined(CONFIG_X86_64) || (defined(CONFIG_UML_X86) && defined(CONFIG_64BIT))
 #define MLFi64 "lld"
 #define MLFu64 "llu"
 #define MLFx64 "llx"
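The MLF*64 macros select the printf length modifier matching how u64 is typedef'd on each build: "ll" wherever u64 is unsigned long long (all 32-bit builds, x86_64, and, with this fix, UML on x86_64, which inherits x86_64's types without defining CONFIG_X86_64); on the remaining 64-bit targets u64 is a plain unsigned long and the macros presumably select the "l" forms. A standalone illustration of the string-pasting trick these macros rely on (not ocfs2 code; the local MLFu64 define mimics the 32-bit/x86_64 branch):

    #include <stdio.h>

    #define MLFu64 "llu"    /* what the header picks when u64 is unsigned long long */

    int main(void)
    {
            unsigned long long blkno = 8193;
            /* adjacent string literals paste into "block %llu\n" */
            printf("block %"MLFu64"\n", blkno);
            return 0;
    }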
+3 -1
fs/ocfs2/cluster/nodemanager.c
···
         if (!ocfs2_table_header) {
                 printk(KERN_ERR "nodemanager: unable to register sysctl\n");
                 ret = -ENOMEM; /* or something. */
-                goto out;
+                goto out_o2net;
         }
 
         ret = o2net_register_hb_callbacks();
···
         o2net_unregister_hb_callbacks();
 out_sysctl:
         unregister_sysctl_table(ocfs2_table_header);
+out_o2net:
+        o2net_exit();
 out:
         return ret;
 }
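This is the standard kernel unwind ladder: each init step that can fail jumps to a label that tears down everything initialized before it, in reverse order. The bug was that a sysctl registration failure jumped straight to `out`, skipping the `o2net_exit()` that undoes an earlier `o2net_init()`. A generic sketch of the idiom, with stand-in names rather than ocfs2 APIs:

    /* Illustrative only: setup_*/teardown_* are stand-ins, not ocfs2 functions. */
    int setup_a(void); void teardown_a(void);
    int setup_b(void); void teardown_b(void);
    int setup_c(void);

    int example_init(void)
    {
            int ret;

            ret = setup_a();
            if (ret)
                    goto out;
            ret = setup_b();
            if (ret)
                    goto out_a;        /* must undo setup_a() */
            ret = setup_c();
            if (ret)
                    goto out_b;        /* must undo setup_b(), then setup_a() */
            return 0;

    out_b:
            teardown_b();
    out_a:
            teardown_a();
    out:
            return ret;
    }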
+12 -2
fs/ocfs2/cluster/tcp.c
···
 {
         struct o2net_node *nn = arg;
         struct o2net_sock_container *sc = NULL;
-        struct o2nm_node *node = NULL;
+        struct o2nm_node *node = NULL, *mynode = NULL;
         struct socket *sock = NULL;
         struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
         int ret = 0;
···
         /* watch for racing with tearing a node down */
         node = o2nm_get_node_by_num(o2net_num_from_nn(nn));
         if (node == NULL) {
+                ret = 0;
+                goto out;
+        }
+
+        mynode = o2nm_get_node_by_num(o2nm_this_node());
+        if (mynode == NULL) {
                 ret = 0;
                 goto out;
         }
···
         sock->sk->sk_allocation = GFP_ATOMIC;
 
         myaddr.sin_family = AF_INET;
+        myaddr.sin_addr.s_addr = (__force u32)mynode->nd_ipv4_address;
         myaddr.sin_port = (__force u16)htons(0); /* any port */
 
         ret = sock->ops->bind(sock, (struct sockaddr *)&myaddr,
                               sizeof(myaddr));
         if (ret) {
-                mlog(0, "bind failed: %d\n", ret);
+                mlog(ML_ERROR, "bind failed with %d at address %u.%u.%u.%u\n",
+                     ret, NIPQUAD(mynode->nd_ipv4_address));
                 goto out;
         }
···
         sc_put(sc);
         if (node)
                 o2nm_node_put(node);
+        if (mynode)
+                o2nm_node_put(mynode);
 
         return;
 }
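The substantive change here: the connecting socket is now bound to the local node's configured IPv4 address before connecting, so cluster traffic originates from the configured interface rather than whatever source address routing would pick, and a bind failure is now logged loudly with the address involved. A userspace analogue of the same bind-before-connect pattern (illustrative names, error paths condensed):

    #include <arpa/inet.h>
    #include <netinet/in.h>
    #include <sys/socket.h>
    #include <unistd.h>

    /* Connect to remote_ip:port, sourcing traffic from local_ip (dotted quads). */
    static int connect_from(const char *local_ip, const char *remote_ip,
                            unsigned short port)
    {
            struct sockaddr_in local = { 0 }, remote = { 0 };
            int fd = socket(AF_INET, SOCK_STREAM, 0);

            if (fd < 0)
                    return -1;

            local.sin_family = AF_INET;
            local.sin_addr.s_addr = inet_addr(local_ip);
            local.sin_port = htons(0);      /* any source port, as in the kernel code */
            if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) {
                    close(fd);
                    return -1;
            }

            remote.sin_family = AF_INET;
            remote.sin_addr.s_addr = inet_addr(remote_ip);
            remote.sin_port = htons(port);
            if (connect(fd, (struct sockaddr *)&remote, sizeof(remote)) < 0) {
                    close(fd);
                    return -1;
            }
            return fd;
    }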
-5
fs/ocfs2/cluster/tcp.h
···
         O2NET_DRIVER_READY,
 };
 
-int o2net_init_tcp_sock(struct inode *inode);
 int o2net_send_message(u32 msg_type, u32 key, void *data, u32 len,
                        u8 target_node, int *status);
 int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *vec,
                            size_t veclen, u8 target_node, int *status);
-int o2net_broadcast_message(u32 msg_type, u32 key, void *data, u32 len,
-                            struct inode *group);
 
 int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
                            o2net_msg_handler_func *func, void *data,
···
 
 int o2net_init(void);
 void o2net_exit(void);
-int o2net_proc_init(struct proc_dir_entry *parent);
-void o2net_proc_exit(struct proc_dir_entry *parent);
 
 #endif /* O2CLUSTER_TCP_H */
+3 -5
fs/ocfs2/dlm/dlmcommon.h
···
 #define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
 #define DLM_THREAD_MS 200 // flush at least every 200 ms
 
-#define DLM_HASH_BITS 7
-#define DLM_HASH_SIZE (1 << DLM_HASH_BITS)
-#define DLM_HASH_MASK (DLM_HASH_SIZE - 1)
+#define DLM_HASH_BUCKETS (PAGE_SIZE / sizeof(struct hlist_head))
 
 enum dlm_ast_type {
         DLM_AST = 0,
···
 struct dlm_ctxt
 {
         struct list_head list;
-        struct list_head *resources;
+        struct hlist_head *lockres_hash;
         struct list_head dirty_list;
         struct list_head purge_list;
         struct list_head pending_asts;
···
 {
         /* WARNING: Please see the comment in dlm_init_lockres before
          * adding fields here. */
-        struct list_head list;
+        struct hlist_node hash_node;
         struct kref refs;
 
         /* please keep these next 3 in this order
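The new bucket count is derived from the single page the hash table already occupies (see the __get_free_page() call in dlmdomain.c below) instead of a hard-coded shift. An hlist_head is one pointer, so with the usual 4 KiB page on a 64-bit machine this yields 512 buckets where DLM_HASH_BITS 7 gave only 128; list_heads are two pointers, so switching to hlists alone doubles the buckets that fit in the page. A back-of-envelope check, assuming 4 KiB pages and 8-byte pointers:

    #include <stdio.h>

    struct hlist_head { void *first; };     /* one pointer per bucket */

    int main(void)
    {
            unsigned long page_size = 4096;
            unsigned long buckets = page_size / sizeof(struct hlist_head);

            printf("%lu buckets per page (old DLM_HASH_SIZE was 1 << 7 = 128)\n",
                   buckets);        /* prints 512 on an LP64 build */
            return 0;
    }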
+5 -7
fs/ocfs2/dlm/dlmdebug.c
···
 void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
 {
         struct dlm_lock_resource *res;
-        struct list_head *iter;
-        struct list_head *bucket;
+        struct hlist_node *iter;
+        struct hlist_head *bucket;
         int i;
 
         mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n",
···
         }
 
         spin_lock(&dlm->spinlock);
-        for (i=0; i<DLM_HASH_SIZE; i++) {
-                bucket = &(dlm->resources[i]);
-                list_for_each(iter, bucket) {
-                        res = list_entry(iter, struct dlm_lock_resource, list);
+        for (i=0; i<DLM_HASH_BUCKETS; i++) {
+                bucket = &(dlm->lockres_hash[i]);
+                hlist_for_each_entry(res, iter, bucket, hash_node)
                         dlm_print_one_lock_resource(res);
-                }
         }
         spin_unlock(&dlm->spinlock);
 }
+19 -20
fs/ocfs2/dlm/dlmdomain.c
···
 
 void __dlm_unhash_lockres(struct dlm_lock_resource *lockres)
 {
-        list_del_init(&lockres->list);
+        hlist_del_init(&lockres->hash_node);
         dlm_lockres_put(lockres);
 }
 
 void __dlm_insert_lockres(struct dlm_ctxt *dlm,
                           struct dlm_lock_resource *res)
 {
-        struct list_head *bucket;
+        struct hlist_head *bucket;
         struct qstr *q;
 
         assert_spin_locked(&dlm->spinlock);
 
         q = &res->lockname;
         q->hash = full_name_hash(q->name, q->len);
-        bucket = &(dlm->resources[q->hash & DLM_HASH_MASK]);
+        bucket = &(dlm->lockres_hash[q->hash % DLM_HASH_BUCKETS]);
 
         /* get a reference for our hashtable */
         dlm_lockres_get(res);
 
-        list_add_tail(&res->list, bucket);
+        hlist_add_head(&res->hash_node, bucket);
 }
 
 struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
···
                                                 unsigned int len)
 {
         unsigned int hash;
-        struct list_head *iter;
+        struct hlist_node *iter;
         struct dlm_lock_resource *tmpres=NULL;
-        struct list_head *bucket;
+        struct hlist_head *bucket;
 
         mlog_entry("%.*s\n", len, name);
 
···
 
         hash = full_name_hash(name, len);
 
-        bucket = &(dlm->resources[hash & DLM_HASH_MASK]);
+        bucket = &(dlm->lockres_hash[hash % DLM_HASH_BUCKETS]);
 
         /* check for pre-existing lock */
-        list_for_each(iter, bucket) {
-                tmpres = list_entry(iter, struct dlm_lock_resource, list);
+        hlist_for_each(iter, bucket) {
+                tmpres = hlist_entry(iter, struct dlm_lock_resource, hash_node);
                 if (tmpres->lockname.len == len &&
                     memcmp(tmpres->lockname.name, name, len) == 0) {
                         dlm_lockres_get(tmpres);
···
 
 static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
 {
-        if (dlm->resources)
-                free_page((unsigned long) dlm->resources);
+        if (dlm->lockres_hash)
+                free_page((unsigned long) dlm->lockres_hash);
 
         if (dlm->name)
                 kfree(dlm->name);
···
         mlog(0, "Migrating locks from domain %s\n", dlm->name);
 restart:
         spin_lock(&dlm->spinlock);
-        for (i=0; i<DLM_HASH_SIZE; i++) {
-                while (!list_empty(&dlm->resources[i])) {
-                        res = list_entry(dlm->resources[i].next,
-                                         struct dlm_lock_resource, list);
+        for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+                while (!hlist_empty(&dlm->lockres_hash[i])) {
+                        res = hlist_entry(dlm->lockres_hash[i].first,
+                                          struct dlm_lock_resource, hash_node);
                         /* need reference when manually grabbing lockres */
                         dlm_lockres_get(res);
                         /* this should unhash the lockres
···
                 goto leave;
         }
 
-        dlm->resources = (struct list_head *) __get_free_page(GFP_KERNEL);
-        if (!dlm->resources) {
+        dlm->lockres_hash = (struct hlist_head *) __get_free_page(GFP_KERNEL);
+        if (!dlm->lockres_hash) {
                 mlog_errno(-ENOMEM);
                 kfree(dlm->name);
                 kfree(dlm);
                 dlm = NULL;
                 goto leave;
         }
-        memset(dlm->resources, 0, PAGE_SIZE);
 
-        for (i=0; i<DLM_HASH_SIZE; i++)
-                INIT_LIST_HEAD(&dlm->resources[i]);
+        for (i=0; i<DLM_HASH_BUCKETS; i++)
+                INIT_HLIST_HEAD(&dlm->lockres_hash[i]);
 
         strcpy(dlm->name, domain);
         dlm->key = key;
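With the table now built from hlist buckets, insertion pushes onto the head of the bucket chosen by hash % DLM_HASH_BUCKETS and lookup is a linear scan of that one bucket, both under dlm->spinlock. Note that hash % DLM_HASH_BUCKETS still reduces to a mask at compile time whenever the bucket count is a power of two, but stays correct even when it is not. A compact userspace rendition of the same open-hashing pattern (simplified stand-in types, not the kernel's hlist implementation):

    #include <stddef.h>
    #include <string.h>

    struct hlist_node { struct hlist_node *next, **pprev; };
    struct hlist_head { struct hlist_node *first; };

    #define NBUCKETS 512

    static struct hlist_head table[NBUCKETS];

    struct res {
            const char *name;
            struct hlist_node node;
    };

    /* equivalent of hlist_add_head(): new entries go to the bucket head */
    static void insert(struct res *r, unsigned int hash)
    {
            struct hlist_head *b = &table[hash % NBUCKETS];

            r->node.next = b->first;
            if (b->first)
                    b->first->pprev = &r->node.next;
            b->first = &r->node;
            r->node.pprev = &b->first;
    }

    /* linear scan of one bucket, recovering the container via offsetof */
    static struct res *lookup(const char *name, unsigned int hash)
    {
            struct hlist_node *n;

            for (n = table[hash % NBUCKETS].first; n; n = n->next) {
                    struct res *r = (struct res *)
                            ((char *)n - offsetof(struct res, node));
                    if (!strcmp(r->name, name))
                            return r;
            }
            return NULL;
    }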
+2 -2
fs/ocfs2/dlm/dlmmaster.c
···
 
         /* By the time we're ready to blow this guy away, we shouldn't
          * be on any lists. */
-        BUG_ON(!list_empty(&res->list));
+        BUG_ON(!hlist_unhashed(&res->hash_node));
         BUG_ON(!list_empty(&res->granted));
         BUG_ON(!list_empty(&res->converting));
         BUG_ON(!list_empty(&res->blocked));
···
 
         init_waitqueue_head(&res->wq);
         spin_lock_init(&res->spinlock);
-        INIT_LIST_HEAD(&res->list);
+        INIT_HLIST_NODE(&res->hash_node);
         INIT_LIST_HEAD(&res->granted);
         INIT_LIST_HEAD(&res->converting);
         INIT_LIST_HEAD(&res->blocked);
+12 -11
fs/ocfs2/dlm/dlmrecovery.c
···
                                 u8 dead_node, u8 new_master)
 {
         int i;
-        struct list_head *iter, *iter2, *bucket;
+        struct list_head *iter, *iter2;
+        struct hlist_node *hash_iter;
+        struct hlist_head *bucket;
+
         struct dlm_lock_resource *res;
 
         mlog_entry_void();
···
          * for now we need to run the whole hash, clear
          * the RECOVERING state and set the owner
          * if necessary */
-        for (i=0; i<DLM_HASH_SIZE; i++) {
-                bucket = &(dlm->resources[i]);
-                list_for_each(iter, bucket) {
-                        res = list_entry (iter, struct dlm_lock_resource, list);
+        for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+                bucket = &(dlm->lockres_hash[i]);
+                hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
                         if (res->state & DLM_LOCK_RES_RECOVERING) {
                                 if (res->owner == dead_node) {
                                         mlog(0, "(this=%u) res %.*s owner=%u "
···
 
 static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
 {
-        struct list_head *iter;
+        struct hlist_node *iter;
         struct dlm_lock_resource *res;
         int i;
-        struct list_head *bucket;
+        struct hlist_head *bucket;
         struct dlm_lock *lock;
 
 
···
          * can be kicked again to see if any ASTs or BASTs
          * need to be fired as a result.
          */
-        for (i=0; i<DLM_HASH_SIZE; i++) {
-                bucket = &(dlm->resources[i]);
-                list_for_each(iter, bucket) {
-                        res = list_entry (iter, struct dlm_lock_resource, list);
+        for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+                bucket = &(dlm->lockres_hash[i]);
+                hlist_for_each_entry(res, iter, bucket, hash_node) {
                         /* always prune any $RECOVERY entries for dead nodes,
                          * otherwise hangs can occur during later recovery */
                         if (dlm_is_recovery_lock(res->lockname.name,
+36 -2
fs/ocfs2/extent_map.c
···
         ret = -EBADR;
         if (rec_end > OCFS2_I(inode)->ip_clusters) {
                 mlog_errno(ret);
+                ocfs2_error(inode->i_sb,
+                            "Extent %d at e_blkno %"MLFu64" of inode %"MLFu64" goes past ip_clusters of %u\n",
+                            i,
+                            le64_to_cpu(rec->e_blkno),
+                            OCFS2_I(inode)->ip_blkno,
+                            OCFS2_I(inode)->ip_clusters);
                 goto out_free;
         }
 
···
         ret = -EBADR;
         if (blkno) {
                 mlog_errno(ret);
+                ocfs2_error(inode->i_sb,
+                            "Multiple extents for (cpos = %u, clusters = %u) on inode %"MLFu64"; e_blkno %"MLFu64" and rec %d at e_blkno %"MLFu64"\n",
+                            cpos, clusters,
+                            OCFS2_I(inode)->ip_blkno,
+                            blkno, i,
+                            le64_to_cpu(rec->e_blkno));
                 goto out_free;
         }
···
          */
         ret = -EBADR;
         if (!blkno) {
+                ocfs2_error(inode->i_sb,
+                            "No record found for (cpos = %u, clusters = %u) on inode %"MLFu64"\n",
+                            cpos, clusters,
+                            OCFS2_I(inode)->ip_blkno);
                 mlog_errno(ret);
                 goto out_free;
         }
···
 
         for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
                 rec = &el->l_recs[i];
+
+                if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
+                    OCFS2_I(inode)->ip_clusters) {
+                        ret = -EBADR;
+                        mlog_errno(ret);
+                        ocfs2_error(inode->i_sb,
+                                    "Extent %d at e_blkno %"MLFu64" of inode %"MLFu64" goes past ip_clusters of %u\n",
+                                    i,
+                                    le64_to_cpu(rec->e_blkno),
+                                    OCFS2_I(inode)->ip_blkno,
+                                    OCFS2_I(inode)->ip_clusters);
+                        return ret;
+                }
+
                 ret = ocfs2_extent_map_insert(inode, rec,
                                               le16_to_cpu(el->l_tree_depth));
                 if (ret) {
···
             OCFS2_I(inode)->ip_map.em_clusters) {
                 ret = -EBADR;
                 mlog_errno(ret);
+                ocfs2_error(inode->i_sb,
+                            "Zero e_clusters on non-tail extent record at e_blkno %"MLFu64" on inode %"MLFu64"\n",
+                            le64_to_cpu(rec->e_blkno),
+                            OCFS2_I(inode)->ip_blkno);
                 return ret;
         }
 
···
  * Existing record in the extent map:
  *
  * cpos = 10, len = 10
- *         |---------|
+ *      |---------|
  *
  * New Record:
  *
  * cpos = 10, len = 20
- *         |------------------|
+ *      |------------------|
  *
  * The passed record is the new on-disk record. The new_clusters value
  * is how many clusters were added to the file. If the append is a
+1 -50
fs/ocfs2/file.c
···
         struct file *filp = iocb->ki_filp;
         struct inode *inode = filp->f_dentry->d_inode;
         loff_t newsize, saved_pos;
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-#endif
 
         mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
                    (unsigned int)count,
···
                 mlog(0, "bad inode\n");
                 return -EIO;
         }
-
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-        /* ugh, work around some applications which open everything O_DIRECT +
-         * O_APPEND and really don't mean to use O_DIRECT. */
-        if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
-            (filp->f_flags & O_APPEND) && (filp->f_flags & O_DIRECT))
-                filp->f_flags &= ~O_DIRECT;
-#endif
 
         mutex_lock(&inode->i_mutex);
         /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
···
         /* communicate with ocfs2_dio_end_io */
         ocfs2_iocb_set_rw_locked(iocb);
 
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-        if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
-            filp->f_flags & O_DIRECT) {
-                unsigned int saved_flags = filp->f_flags;
-                int sector_size = 1 << osb->s_sectsize_bits;
-
-                if ((saved_pos & (sector_size - 1)) ||
-                    (count & (sector_size - 1)) ||
-                    ((unsigned long)buf & (sector_size - 1))) {
-                        filp->f_flags |= O_SYNC;
-                        filp->f_flags &= ~O_DIRECT;
-                }
-
-                ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
-                                                    &iocb->ki_pos);
-
-                filp->f_flags = saved_flags;
-        } else
-#endif
-                ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
-                                                    &iocb->ki_pos);
+        ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
 
         /* buffered aio wouldn't have proper lock coverage today */
         BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
···
         int ret = 0, rw_level = -1, have_alloc_sem = 0;
         struct file *filp = iocb->ki_filp;
         struct inode *inode = filp->f_dentry->d_inode;
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-#endif
 
         mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
                    (unsigned int)count,
···
                 mlog_errno(ret);
                 goto bail;
         }
-
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-        if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
-                if (filp->f_flags & O_DIRECT) {
-                        int sector_size = 1 << osb->s_sectsize_bits;
-
-                        if ((pos & (sector_size - 1)) ||
-                            (count & (sector_size - 1)) ||
-                            ((unsigned long)buf & (sector_size - 1)) ||
-                            (i_size_read(inode) & (sector_size -1))) {
-                                filp->f_flags &= ~O_DIRECT;
-                        }
-                }
-        }
-#endif
 
         /*
          * buffered reads protect themselves in ->readpage(). O_DIRECT reads
+1
fs/ocfs2/heartbeat.c
···
         ocfs2_node_map_init(&osb->mounted_map);
         ocfs2_node_map_init(&osb->recovery_map);
         ocfs2_node_map_init(&osb->umount_map);
+        ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs);
 }
 
 static void ocfs2_do_node_down(int node_num,
+45 -1
fs/ocfs2/inode.c
···
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "file.h"
+#include "heartbeat.h"
 #include "inode.h"
 #include "journal.h"
 #include "namei.h"
···
         return status;
 }
 
+/*
+ * Serialize with orphan dir recovery. If the process doing
+ * recovery on this orphan dir does an iget() with the dir
+ * i_mutex held, we'll deadlock here. Instead we detect this
+ * and exit early - recovery will wipe this inode for us.
+ */
+static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb,
+                                             int slot)
+{
+        int ret = 0;
+
+        spin_lock(&osb->osb_lock);
+        if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) {
+                mlog(0, "Recovery is happening on orphan dir %d, will skip "
+                     "this inode\n", slot);
+                ret = -EDEADLK;
+                goto out;
+        }
+        /* This signals to the orphan recovery process that it should
+         * wait for us to handle the wipe. */
+        osb->osb_orphan_wipes[slot]++;
+out:
+        spin_unlock(&osb->osb_lock);
+        return ret;
+}
+
+static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb,
+                                         int slot)
+{
+        spin_lock(&osb->osb_lock);
+        osb->osb_orphan_wipes[slot]--;
+        spin_unlock(&osb->osb_lock);
+
+        wake_up(&osb->osb_wipe_event);
+}
+
 static int ocfs2_wipe_inode(struct inode *inode,
                             struct buffer_head *di_bh)
 {
···
         /* We've already voted on this so it should be readonly - no
          * spinlock needed. */
         orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
+
+        status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
+        if (status)
+                return status;
+
         orphan_dir_inode = ocfs2_get_system_file_inode(osb,
                                                        ORPHAN_DIR_SYSTEM_INODE,
                                                        orphaned_slot);
···
         brelse(orphan_dir_bh);
 bail:
         iput(orphan_dir_inode);
+        ocfs2_signal_wipe_completion(osb, orphaned_slot);
 
         return status;
 }
···
 
         status = ocfs2_wipe_inode(inode, di_bh);
         if (status < 0) {
-                mlog_errno(status);
+                if (status != -EDEADLK)
+                        mlog_errno(status);
                 goto bail_unlock_inode;
         }
 
+93 -31
fs/ocfs2/journal.c
···
         return status;
 }
 
-static int ocfs2_recover_orphans(struct ocfs2_super *osb,
-                                 int slot)
+static int ocfs2_queue_orphans(struct ocfs2_super *osb,
+                               int slot,
+                               struct inode **head)
 {
-        int status = 0;
-        int have_disk_lock = 0;
-        struct inode *inode = NULL;
-        struct inode *iter;
+        int status;
         struct inode *orphan_dir_inode = NULL;
+        struct inode *iter;
         unsigned long offset, blk, local;
         struct buffer_head *bh = NULL;
         struct ocfs2_dir_entry *de;
         struct super_block *sb = osb->sb;
-        struct ocfs2_inode_info *oi;
-
-        mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);
 
         orphan_dir_inode = ocfs2_get_system_file_inode(osb,
                                                        ORPHAN_DIR_SYSTEM_INODE,
···
         if (!orphan_dir_inode) {
                 status = -ENOENT;
                 mlog_errno(status);
-                goto out;
-        }
+                return status;
+        }
 
         mutex_lock(&orphan_dir_inode->i_mutex);
         status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0);
         if (status < 0) {
-                mutex_unlock(&orphan_dir_inode->i_mutex);
                 mlog_errno(status);
                 goto out;
         }
-        have_disk_lock = 1;
 
         offset = 0;
         iter = NULL;
···
                 if (!bh)
                         status = -EINVAL;
                 if (status < 0) {
-                        mutex_unlock(&orphan_dir_inode->i_mutex);
                         if (bh)
                                 brelse(bh);
                         mlog_errno(status);
-                        goto out;
+                        goto out_unlock;
                 }
 
                 local = 0;
···
 
                         if (!ocfs2_check_dir_entry(orphan_dir_inode,
                                                    de, bh, local)) {
-                                mutex_unlock(&orphan_dir_inode->i_mutex);
                                 status = -EINVAL;
                                 mlog_errno(status);
                                 brelse(bh);
-                                goto out;
+                                goto out_unlock;
                         }
 
                         local += le16_to_cpu(de->rec_len);
···
 
                         mlog(0, "queue orphan %"MLFu64"\n",
                              OCFS2_I(iter)->ip_blkno);
-                        OCFS2_I(iter)->ip_next_orphan = inode;
-                        inode = iter;
+                        /* No locking is required for the next_orphan
+                         * queue as there is only ever a single
+                         * process doing orphan recovery. */
+                        OCFS2_I(iter)->ip_next_orphan = *head;
+                        *head = iter;
                 }
                 brelse(bh);
         }
-        mutex_unlock(&orphan_dir_inode->i_mutex);
 
+out_unlock:
         ocfs2_meta_unlock(orphan_dir_inode, 0);
-        have_disk_lock = 0;
-
+out:
+        mutex_unlock(&orphan_dir_inode->i_mutex);
         iput(orphan_dir_inode);
-        orphan_dir_inode = NULL;
+        return status;
+}
+
+static int ocfs2_orphan_recovery_can_continue(struct ocfs2_super *osb,
+                                              int slot)
+{
+        int ret;
+
+        spin_lock(&osb->osb_lock);
+        ret = !osb->osb_orphan_wipes[slot];
+        spin_unlock(&osb->osb_lock);
+        return ret;
+}
+
+static void ocfs2_mark_recovering_orphan_dir(struct ocfs2_super *osb,
+                                             int slot)
+{
+        spin_lock(&osb->osb_lock);
+        /* Mark ourselves such that new processes in delete_inode()
+         * know to quit early. */
+        ocfs2_node_map_set_bit(osb, &osb->osb_recovering_orphan_dirs, slot);
+        while (osb->osb_orphan_wipes[slot]) {
+                /* If any processes are already in the middle of an
+                 * orphan wipe on this dir, then we need to wait for
+                 * them. */
+                spin_unlock(&osb->osb_lock);
+                wait_event_interruptible(osb->osb_wipe_event,
+                                         ocfs2_orphan_recovery_can_continue(osb, slot));
+                spin_lock(&osb->osb_lock);
+        }
+        spin_unlock(&osb->osb_lock);
+}
+
+static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb,
+                                              int slot)
+{
+        ocfs2_node_map_clear_bit(osb, &osb->osb_recovering_orphan_dirs, slot);
+}
+
+/*
+ * Orphan recovery. Each mounted node has it's own orphan dir which we
+ * must run during recovery. Our strategy here is to build a list of
+ * the inodes in the orphan dir and iget/iput them. The VFS does
+ * (most) of the rest of the work.
+ *
+ * Orphan recovery can happen at any time, not just mount so we have a
+ * couple of extra considerations.
+ *
+ * - We grab as many inodes as we can under the orphan dir lock -
+ *   doing iget() outside the orphan dir risks getting a reference on
+ *   an invalid inode.
+ * - We must be sure not to deadlock with other processes on the
+ *   system wanting to run delete_inode(). This can happen when they go
+ *   to lock the orphan dir and the orphan recovery process attempts to
+ *   iget() inside the orphan dir lock. This can be avoided by
+ *   advertising our state to ocfs2_delete_inode().
+ */
+static int ocfs2_recover_orphans(struct ocfs2_super *osb,
+                                 int slot)
+{
+        int ret = 0;
+        struct inode *inode = NULL;
+        struct inode *iter;
+        struct ocfs2_inode_info *oi;
+
+        mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);
+
+        ocfs2_mark_recovering_orphan_dir(osb, slot);
+        ret = ocfs2_queue_orphans(osb, slot, &inode);
+        ocfs2_clear_recovering_orphan_dir(osb, slot);
+
+        /* Error here should be noted, but we want to continue with as
+         * many queued inodes as we've got. */
+        if (ret)
+                mlog_errno(ret);
 
         while (inode) {
                 oi = OCFS2_I(inode);
···
                 inode = iter;
         }
 
-out:
-        if (have_disk_lock)
-                ocfs2_meta_unlock(orphan_dir_inode, 0);
-
-        if (orphan_dir_inode)
-                iput(orphan_dir_inode);
-
-        return status;
+        return ret;
 }
 
 static int ocfs2_wait_on_mount(struct ocfs2_super *osb)
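Taken together with the inode.c hunks, the serialization is a small handshake over osb_lock: ocfs2_delete_inode() backs off with -EDEADLK if recovery has claimed the slot's bit in osb_recovering_orphan_dirs, and otherwise bumps the per-slot osb_orphan_wipes counter; recovery claims the slot first, then waits on osb_wipe_event until the counter drains. A userspace model of the same pattern, with pthreads standing in for osb_lock and the wait queue (illustrative, not ocfs2 code):

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t wipe_event = PTHREAD_COND_INITIALIZER;
    static int orphan_wipes;        /* wipes in flight for this slot */
    static bool recovering;         /* slot claimed by recovery */

    /* delete_inode() side: back off if recovery owns the slot. */
    static int begin_wipe(void)
    {
            pthread_mutex_lock(&lock);
            if (recovering) {
                    pthread_mutex_unlock(&lock);
                    return -1;      /* like -EDEADLK: recovery wipes it for us */
            }
            orphan_wipes++;
            pthread_mutex_unlock(&lock);
            return 0;
    }

    /* mirrors ocfs2_signal_wipe_completion() */
    static void end_wipe(void)
    {
            pthread_mutex_lock(&lock);
            orphan_wipes--;
            pthread_mutex_unlock(&lock);
            pthread_cond_broadcast(&wipe_event);
    }

    /* recovery side: claim the slot, then wait out in-flight wipes. */
    static void mark_recovering(void)
    {
            pthread_mutex_lock(&lock);
            recovering = true;
            while (orphan_wipes)
                    pthread_cond_wait(&wipe_event, &lock);
            pthread_mutex_unlock(&lock);
    }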
+4 -3
fs/ocfs2/ocfs2.h
···
         OCFS2_MOUNT_NOINTR = 1 << 2,   /* Don't catch signals */
         OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */
         OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-        OCFS2_MOUNT_COMPAT_OCFS = 1 << 30, /* ocfs1 compatibility mode */
-#endif
 };
 
 #define OCFS2_OSB_SOFT_RO 0x0001
···
         struct inode *osb_tl_inode;
         struct buffer_head *osb_tl_bh;
         struct work_struct osb_truncate_log_wq;
+
+        struct ocfs2_node_map osb_recovering_orphan_dirs;
+        unsigned int *osb_orphan_wipes;
+        wait_queue_head_t osb_wipe_event;
 };
 
 #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
-1
fs/ocfs2/ocfs2_fs.h
···
 
 /* Journal limits (in bytes) */
 #define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
-#define OCFS2_MAX_JOURNAL_SIZE (500 * 1024 * 1024)
 
 struct ocfs2_system_inode_info {
         char *si_name;
+11
fs/ocfs2/super.c
···
         }
         mlog(ML_NOTICE, "max_slots for this device: %u\n", osb->max_slots);
 
+        init_waitqueue_head(&osb->osb_wipe_event);
+        osb->osb_orphan_wipes = kcalloc(osb->max_slots,
+                                        sizeof(*osb->osb_orphan_wipes),
+                                        GFP_KERNEL);
+        if (!osb->osb_orphan_wipes) {
+                status = -ENOMEM;
+                mlog_errno(status);
+                goto bail;
+        }
+
         osb->s_feature_compat =
                 le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat);
         osb->s_feature_ro_compat =
···
         if (osb->slot_info)
                 ocfs2_free_slot_info(osb->slot_info);
 
+        kfree(osb->osb_orphan_wipes);
         /* FIXME
          * This belongs in journal shutdown, but because we have to
          * allocate osb->journal at the start of ocfs2_initalize_osb(),