fs/ocfs2/dlm/dlmdomain.c

···
 void __dlm_unhash_lockres(struct dlm_lock_resource *lockres)
 {
-	list_del_init(&lockres->list);
+	hlist_del_init(&lockres->hash_node);
 	dlm_lockres_put(lockres);
 }

 void __dlm_insert_lockres(struct dlm_ctxt *dlm,
 			  struct dlm_lock_resource *res)
 {
-	struct list_head *bucket;
+	struct hlist_head *bucket;
 	struct qstr *q;

 	assert_spin_locked(&dlm->spinlock);

 	q = &res->lockname;
 	q->hash = full_name_hash(q->name, q->len);
-	bucket = &(dlm->resources[q->hash & DLM_HASH_MASK]);
+	bucket = &(dlm->lockres_hash[q->hash % DLM_HASH_BUCKETS]);

 	/* get a reference for our hashtable */
 	dlm_lockres_get(res);

-	list_add_tail(&res->list, bucket);
+	hlist_add_head(&res->hash_node, bucket);
 }

 struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
···
 						unsigned int len)
 {
 	unsigned int hash;
-	struct list_head *iter;
+	struct hlist_node *iter;
 	struct dlm_lock_resource *tmpres=NULL;
-	struct list_head *bucket;
+	struct hlist_head *bucket;

 	mlog_entry("%.*s\n", len, name);
···
 	hash = full_name_hash(name, len);

-	bucket = &(dlm->resources[hash & DLM_HASH_MASK]);
+	bucket = &(dlm->lockres_hash[hash % DLM_HASH_BUCKETS]);

 	/* check for pre-existing lock */
-	list_for_each(iter, bucket) {
-		tmpres = list_entry(iter, struct dlm_lock_resource, list);
+	hlist_for_each(iter, bucket) {
+		tmpres = hlist_entry(iter, struct dlm_lock_resource, hash_node);
 		if (tmpres->lockname.len == len &&
 		    memcmp(tmpres->lockname.name, name, len) == 0) {
 			dlm_lockres_get(tmpres);
···
 static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
 {
-	if (dlm->resources)
-		free_page((unsigned long) dlm->resources);
+	if (dlm->lockres_hash)
+		free_page((unsigned long) dlm->lockres_hash);

 	if (dlm->name)
 		kfree(dlm->name);
···
 	mlog(0, "Migrating locks from domain %s\n", dlm->name);
 restart:
 	spin_lock(&dlm->spinlock);
-	for (i=0; i<DLM_HASH_SIZE; i++) {
-		while (!list_empty(&dlm->resources[i])) {
-			res = list_entry(dlm->resources[i].next,
-					 struct dlm_lock_resource, list);
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+		while (!hlist_empty(&dlm->lockres_hash[i])) {
+			res = hlist_entry(dlm->lockres_hash[i].first,
+					  struct dlm_lock_resource, hash_node);
 			/* need reference when manually grabbing lockres */
 			dlm_lockres_get(res);
 			/* this should unhash the lockres
···
 		goto leave;
 	}

-	dlm->resources = (struct list_head *) __get_free_page(GFP_KERNEL);
-	if (!dlm->resources) {
+	dlm->lockres_hash = (struct hlist_head *) __get_free_page(GFP_KERNEL);
+	if (!dlm->lockres_hash) {
 		mlog_errno(-ENOMEM);
 		kfree(dlm->name);
 		kfree(dlm);
 		dlm = NULL;
 		goto leave;
 	}
-	memset(dlm->resources, 0, PAGE_SIZE);

-	for (i=0; i<DLM_HASH_SIZE; i++)
-		INIT_LIST_HEAD(&dlm->resources[i]);
+	for (i=0; i<DLM_HASH_BUCKETS; i++)
+		INIT_HLIST_HEAD(&dlm->lockres_hash[i]);

 	strcpy(dlm->name, domain);
 	dlm->key = key;
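The point of the hlist conversion above is that an hlist_head is a single pointer where a list_head is two, so the one page the DLM allocates for its bucket array now holds twice as many bucket heads (hence DLM_HASH_BUCKETS replacing DLM_HASH_SIZE). Below is a minimal userspace sketch of the same bucket/insert/lookup shape; the hlist types here are simplified stand-ins for the ones in <linux/list.h>, and the lockres struct, table size, and hash function are illustrative only, not the DLM's.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Simplified hlist types: a one-pointer bucket head, and a node that
 * keeps a back-pointer to whatever points at it (head or prev node). */
struct hlist_node { struct hlist_node *next, **pprev; };
struct hlist_head { struct hlist_node *first; };

#define HLIST_BUCKETS 16	/* hypothetical; the DLM sizes this from a page */

static void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
{
	n->next = h->first;
	if (h->first)
		h->first->pprev = &n->next;
	h->first = n;
	n->pprev = &h->first;
}

struct lockres {		/* stand-in for struct dlm_lock_resource */
	const char *name;
	struct hlist_node hash_node;
};

static struct hlist_head table[HLIST_BUCKETS];

static unsigned int hash_name(const char *name)
{
	unsigned int h = 0;

	while (*name)
		h = h * 31 + (unsigned char)*name++;
	return h % HLIST_BUCKETS;	/* same bucket math as the patch */
}

static struct lockres *lookup(const char *name)
{
	struct hlist_node *iter;

	for (iter = table[hash_name(name)].first; iter; iter = iter->next) {
		/* open-coded hlist_entry(): container from member */
		struct lockres *res = (struct lockres *)
			((char *)iter - offsetof(struct lockres, hash_node));
		if (strcmp(res->name, name) == 0)
			return res;
	}
	return NULL;
}

int main(void)
{
	struct lockres a = { .name = "resA" };

	hlist_add_head(&a.hash_node, &table[hash_name(a.name)]);
	printf("found: %s\n", lookup("resA") ? "yes" : "no");
	return 0;
}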
fs/ocfs2/dlm/dlmmaster.c (+2, -2)
···
 	/* By the time we're ready to blow this guy away, we shouldn't
 	 * be on any lists. */
-	BUG_ON(!list_empty(&res->list));
+	BUG_ON(!hlist_unhashed(&res->hash_node));
 	BUG_ON(!list_empty(&res->granted));
 	BUG_ON(!list_empty(&res->converting));
 	BUG_ON(!list_empty(&res->blocked));
···
 	init_waitqueue_head(&res->wq);
 	spin_lock_init(&res->spinlock);
-	INIT_LIST_HEAD(&res->list);
+	INIT_HLIST_NODE(&res->hash_node);
 	INIT_LIST_HEAD(&res->granted);
 	INIT_LIST_HEAD(&res->converting);
 	INIT_LIST_HEAD(&res->blocked);
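The BUG_ON change relies on a property list_head has no direct equivalent for: an hlist_node whose pprev is NULL is known not to be on any list, without comparing against a list head. A short sketch of the relevant helpers, mirroring (but simplified from) the ones in <linux/list.h>:

#include <stddef.h>

struct hlist_node { struct hlist_node *next, **pprev; };

/* INIT_HLIST_NODE leaves both pointers NULL... */
static inline void INIT_HLIST_NODE(struct hlist_node *h)
{
	h->next = NULL;
	h->pprev = NULL;
}

/* ...so "unhashed" is simply "nothing points at me". */
static inline int hlist_unhashed(const struct hlist_node *h)
{
	return !h->pprev;
}

/* hlist_del_init unlinks and re-initializes, leaving the node in the
 * state __dlm_unhash_lockres() and the BUG_ON above expect. */
static inline void hlist_del_init(struct hlist_node *n)
{
	if (!hlist_unhashed(n)) {
		*n->pprev = n->next;
		if (n->next)
			n->next->pprev = n->pprev;
		INIT_HLIST_NODE(n);
	}
}

int main(void)
{
	struct hlist_node n;

	INIT_HLIST_NODE(&n);
	return !hlist_unhashed(&n);	/* exits 0: a fresh node is unhashed */
}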
fs/ocfs2/dlm/dlmrecovery.c (+12, -11)
···
 			      u8 dead_node, u8 new_master)
 {
 	int i;
-	struct list_head *iter, *iter2, *bucket;
+	struct list_head *iter, *iter2;
+	struct hlist_node *hash_iter;
+	struct hlist_head *bucket;
+
 	struct dlm_lock_resource *res;

 	mlog_entry_void();
···
 	 * for now we need to run the whole hash, clear
 	 * the RECOVERING state and set the owner
 	 * if necessary */
-	for (i=0; i<DLM_HASH_SIZE; i++) {
-		bucket = &(dlm->resources[i]);
-		list_for_each(iter, bucket) {
-			res = list_entry (iter, struct dlm_lock_resource, list);
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+		bucket = &(dlm->lockres_hash[i]);
+		hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
 			if (res->state & DLM_LOCK_RES_RECOVERING) {
 				if (res->owner == dead_node) {
 					mlog(0, "(this=%u) res %.*s owner=%u "
···

 static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
 {
-	struct list_head *iter;
+	struct hlist_node *iter;
 	struct dlm_lock_resource *res;
 	int i;
-	struct list_head *bucket;
+	struct hlist_head *bucket;
 	struct dlm_lock *lock;

···
 	 * can be kicked again to see if any ASTs or BASTs
 	 * need to be fired as a result.
 	 */
-	for (i=0; i<DLM_HASH_SIZE; i++) {
-		bucket = &(dlm->resources[i]);
-		list_for_each(iter, bucket) {
-			res = list_entry (iter, struct dlm_lock_resource, list);
+	for (i = 0; i < DLM_HASH_BUCKETS; i++) {
+		bucket = &(dlm->lockres_hash[i]);
+		hlist_for_each_entry(res, iter, bucket, hash_node) {
 			/* always prune any $RECOVERY entries for dead nodes,
 			 * otherwise hangs can occur during later recovery */
 			if (dlm_is_recovery_lock(res->lockname.name,
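Note that the recovery loops use the 2.6-era four-argument form of hlist_for_each_entry(), which needs an explicit struct hlist_node cursor alongside the typed entry pointer — that is why the hunks above add hash_iter/iter declarations. A simplified model of the macro follows (the real kernel macro also prefetch()es the next node; __typeof__ is a GNU C extension, as in the kernel):

#include <stddef.h>
#include <stdio.h>

struct hlist_node { struct hlist_node *next, **pprev; };
struct hlist_head { struct hlist_node *first; };

/* container_of in miniature */
#define hlist_entry(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* tpos walks the entries, pos walks the raw nodes */
#define hlist_for_each_entry(tpos, pos, head, member)                          \
	for (pos = (head)->first;                                              \
	     pos && ((tpos = hlist_entry(pos, __typeof__(*tpos), member)), 1); \
	     pos = pos->next)

struct res { int id; struct hlist_node hash_node; };

int main(void)
{
	struct res a = { .id = 1 }, b = { .id = 2 };
	struct hlist_head bucket = { .first = &a.hash_node };
	struct hlist_node *iter;
	struct res *r;

	a.hash_node.next = &b.hash_node;	/* hand-link two nodes */
	b.hash_node.next = NULL;

	hlist_for_each_entry(r, iter, &bucket, hash_node)
		printf("res %d\n", r->id);
	return 0;
}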
fs/ocfs2/extent_map.c (+36, -2)
···
 		ret = -EBADR;
 		if (rec_end > OCFS2_I(inode)->ip_clusters) {
 			mlog_errno(ret);
+			ocfs2_error(inode->i_sb,
+				    "Extent %d at e_blkno %"MLFu64" of inode %"MLFu64" goes past ip_clusters of %u\n",
+				    i,
+				    le64_to_cpu(rec->e_blkno),
+				    OCFS2_I(inode)->ip_blkno,
+				    OCFS2_I(inode)->ip_clusters);
 			goto out_free;
 		}
···
 		ret = -EBADR;
 		if (blkno) {
 			mlog_errno(ret);
+			ocfs2_error(inode->i_sb,
+				    "Multiple extents for (cpos = %u, clusters = %u) on inode %"MLFu64"; e_blkno %"MLFu64" and rec %d at e_blkno %"MLFu64"\n",
+				    cpos, clusters,
+				    OCFS2_I(inode)->ip_blkno,
+				    blkno, i,
+				    le64_to_cpu(rec->e_blkno));
 			goto out_free;
 		}
···
 	 */
 	ret = -EBADR;
 	if (!blkno) {
+		ocfs2_error(inode->i_sb,
+			    "No record found for (cpos = %u, clusters = %u) on inode %"MLFu64"\n",
+			    cpos, clusters,
+			    OCFS2_I(inode)->ip_blkno);
 		mlog_errno(ret);
 		goto out_free;
 	}
···

 	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
 		rec = &el->l_recs[i];
+
+		if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
+		    OCFS2_I(inode)->ip_clusters) {
+			ret = -EBADR;
+			mlog_errno(ret);
+			ocfs2_error(inode->i_sb,
+				    "Extent %d at e_blkno %"MLFu64" of inode %"MLFu64" goes past ip_clusters of %u\n",
+				    i,
+				    le64_to_cpu(rec->e_blkno),
+				    OCFS2_I(inode)->ip_blkno,
+				    OCFS2_I(inode)->ip_clusters);
+			return ret;
+		}
+
 		ret = ocfs2_extent_map_insert(inode, rec,
 					      le16_to_cpu(el->l_tree_depth));
 		if (ret) {
···
 	    OCFS2_I(inode)->ip_map.em_clusters) {
 		ret = -EBADR;
 		mlog_errno(ret);
+		ocfs2_error(inode->i_sb,
+			    "Zero e_clusters on non-tail extent record at e_blkno %"MLFu64" on inode %"MLFu64"\n",
+			    le64_to_cpu(rec->e_blkno),
+			    OCFS2_I(inode)->ip_blkno);
 		return ret;
 	}
···
 * Existing record in the extent map:
 *
 * cpos = 10, len = 10
- *    |---------|
+ *	|---------|
 *
 * New Record:
 *
 * cpos = 10, len = 20
- *    |------------------|
+ *	|------------------|
 *
 * The passed record is the new on-disk record.  The new_clusters value
 * is how many clusters were added to the file.  If the append is a
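All of the new ocfs2_error() calls enforce one invariant on the on-disk data: an extent starting at e_cpos and covering e_clusters clusters must end at or before the inode's ip_clusters. A tiny self-contained model of the arithmetic, with hypothetical field values (the struct here is a stand-in, with the little-endian conversion already done as le32_to_cpu() does in the patch):

#include <stdint.h>
#include <stdio.h>

struct extent_rec { uint32_t e_cpos, e_clusters; };

/* Returns nonzero if the record runs past the inode's cluster count,
 * i.e. the condition under which the patch calls ocfs2_error(). */
static int extent_past_eof(const struct extent_rec *rec, uint32_t ip_clusters)
{
	return rec->e_cpos + rec->e_clusters > ip_clusters;
}

int main(void)
{
	struct extent_rec rec = { .e_cpos = 10, .e_clusters = 20 };

	/* 10 + 20 = 30 > 25, so this record would be flagged as corrupt */
	printf("corrupt: %d\n", extent_past_eof(&rec, 25));
	return 0;
}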
fs/ocfs2/file.c (+1, -50)
···
 	struct file *filp = iocb->ki_filp;
 	struct inode *inode = filp->f_dentry->d_inode;
 	loff_t newsize, saved_pos;
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-#endif

 	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
 		   (unsigned int)count,
···
 		mlog(0, "bad inode\n");
 		return -EIO;
 	}
-
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-	/* ugh, work around some applications which open everything O_DIRECT +
-	 * O_APPEND and really don't mean to use O_DIRECT. */
-	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
-	    (filp->f_flags & O_APPEND) && (filp->f_flags & O_DIRECT))
-		filp->f_flags &= ~O_DIRECT;
-#endif

 	mutex_lock(&inode->i_mutex);
 	/* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
···
 	/* communicate with ocfs2_dio_end_io */
 	ocfs2_iocb_set_rw_locked(iocb);

-#ifdef OCFS2_ORACORE_WORKAROUNDS
-	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS &&
-	    filp->f_flags & O_DIRECT) {
-		unsigned int saved_flags = filp->f_flags;
-		int sector_size = 1 << osb->s_sectsize_bits;
-
-		if ((saved_pos & (sector_size - 1)) ||
-		    (count & (sector_size - 1)) ||
-		    ((unsigned long)buf & (sector_size - 1))) {
-			filp->f_flags |= O_SYNC;
-			filp->f_flags &= ~O_DIRECT;
-		}
-
-		ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
-						    &iocb->ki_pos);
-
-		filp->f_flags = saved_flags;
-	} else
-#endif
-		ret = generic_file_aio_write_nolock(iocb, &local_iov, 1,
-						    &iocb->ki_pos);
+	ret = generic_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);

 	/* buffered aio wouldn't have proper lock coverage today */
 	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
···
 	int ret = 0, rw_level = -1, have_alloc_sem = 0;
 	struct file *filp = iocb->ki_filp;
 	struct inode *inode = filp->f_dentry->d_inode;
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-#endif

 	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", filp, buf,
 		   (unsigned int)count,
···
 		mlog_errno(ret);
 		goto bail;
 	}
-
-#ifdef OCFS2_ORACORE_WORKAROUNDS
-	if (osb->s_mount_opt & OCFS2_MOUNT_COMPAT_OCFS) {
-		if (filp->f_flags & O_DIRECT) {
-			int sector_size = 1 << osb->s_sectsize_bits;
-
-			if ((pos & (sector_size - 1)) ||
-			    (count & (sector_size - 1)) ||
-			    ((unsigned long)buf & (sector_size - 1)) ||
-			    (i_size_read(inode) & (sector_size -1))) {
-				filp->f_flags &= ~O_DIRECT;
-			}
-		}
-	}
-#endif

 	/*
 	 * buffered reads protect themselves in ->readpage().  O_DIRECT reads
fs/ocfs2/inode.c

···
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "file.h"
+#include "heartbeat.h"
 #include "inode.h"
 #include "journal.h"
 #include "namei.h"
···
 	return status;
 }

+/*
+ * Serialize with orphan dir recovery. If the process doing
+ * recovery on this orphan dir does an iget() with the dir
+ * i_mutex held, we'll deadlock here. Instead we detect this
+ * and exit early - recovery will wipe this inode for us.
+ */
+static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb,
+					     int slot)
+{
+	int ret = 0;
+
+	spin_lock(&osb->osb_lock);
+	if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) {
+		mlog(0, "Recovery is happening on orphan dir %d, will skip "
+		     "this inode\n", slot);
+		ret = -EDEADLK;
+		goto out;
+	}
+	/* This signals to the orphan recovery process that it should
+	 * wait for us to handle the wipe. */
+	osb->osb_orphan_wipes[slot]++;
+out:
+	spin_unlock(&osb->osb_lock);
+	return ret;
+}
+
+static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb,
+					 int slot)
+{
+	spin_lock(&osb->osb_lock);
+	osb->osb_orphan_wipes[slot]--;
+	spin_unlock(&osb->osb_lock);
+
+	wake_up(&osb->osb_wipe_event);
+}
+
 static int ocfs2_wipe_inode(struct inode *inode,
 			    struct buffer_head *di_bh)
 {
···
 	/* We've already voted on this so it should be readonly - no
 	 * spinlock needed. */
 	orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
+
+	status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
+	if (status)
+		return status;
+
 	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
 						       ORPHAN_DIR_SYSTEM_INODE,
 						       orphaned_slot);
···
 	brelse(orphan_dir_bh);
 bail:
 	iput(orphan_dir_inode);
+	ocfs2_signal_wipe_completion(osb, orphaned_slot);

 	return status;
 }
···

 	status = ocfs2_wipe_inode(inode, di_bh);
 	if (status < 0) {
-		mlog_errno(status);
+		if (status != -EDEADLK)
+			mlog_errno(status);
 		goto bail_unlock_inode;
 	}
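The flag-plus-counter protocol introduced here (this is the wiper side; the recovery side appears in the journal.c hunks below) is worth seeing in isolation. The sketch below is a hypothetical userspace model using pthreads: "recovering" plays the role of the osb_recovering_orphan_dirs bit for one slot, "wipes_in_flight" the role of osb_orphan_wipes[slot], and the condition variable the role of osb_wipe_event. Names and structure are illustrative, not OCFS2's.

#include <errno.h>
#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wipe_event = PTHREAD_COND_INITIALIZER;
static int recovering;
static int wipes_in_flight;

/* delete_inode() side: back off if recovery owns the orphan dir,
 * otherwise advertise an in-flight wipe. */
static int begin_wipe(void)
{
	int ret = 0;

	pthread_mutex_lock(&lock);
	if (recovering)
		ret = -EDEADLK;		/* recovery will wipe this inode for us */
	else
		wipes_in_flight++;
	pthread_mutex_unlock(&lock);
	return ret;
}

static void end_wipe(void)
{
	pthread_mutex_lock(&lock);
	wipes_in_flight--;
	pthread_mutex_unlock(&lock);
	pthread_cond_broadcast(&wipe_event);	/* models wake_up() */
}

/* recovery side: raise the flag so new wipers bail out, then wait
 * for the wipers already in flight to drain. */
static void mark_recovering(void)
{
	pthread_mutex_lock(&lock);
	recovering = 1;
	while (wipes_in_flight)
		pthread_cond_wait(&wipe_event, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	/* single-threaded smoke test of the happy path */
	if (begin_wipe() == 0)
		end_wipe();
	mark_recovering();
	return begin_wipe() != -EDEADLK;	/* wipers must now back off */
}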
fs/ocfs2/journal.c (+93, -31)
···
 	return status;
 }

-static int ocfs2_recover_orphans(struct ocfs2_super *osb,
-				 int slot)
+static int ocfs2_queue_orphans(struct ocfs2_super *osb,
+			       int slot,
+			       struct inode **head)
 {
-	int status = 0;
-	int have_disk_lock = 0;
-	struct inode *inode = NULL;
-	struct inode *iter;
+	int status;
 	struct inode *orphan_dir_inode = NULL;
+	struct inode *iter;
 	unsigned long offset, blk, local;
 	struct buffer_head *bh = NULL;
 	struct ocfs2_dir_entry *de;
 	struct super_block *sb = osb->sb;
-	struct ocfs2_inode_info *oi;
-
-	mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);

 	orphan_dir_inode = ocfs2_get_system_file_inode(osb,
 						       ORPHAN_DIR_SYSTEM_INODE,
···
 	if (!orphan_dir_inode) {
 		status = -ENOENT;
 		mlog_errno(status);
-		goto out;
-	}
+		return status;
+	}

 	mutex_lock(&orphan_dir_inode->i_mutex);
 	status = ocfs2_meta_lock(orphan_dir_inode, NULL, NULL, 0);
 	if (status < 0) {
-		mutex_unlock(&orphan_dir_inode->i_mutex);
 		mlog_errno(status);
 		goto out;
 	}
-	have_disk_lock = 1;

 	offset = 0;
 	iter = NULL;
···
 		if (!bh)
 			status = -EINVAL;
 		if (status < 0) {
-			mutex_unlock(&orphan_dir_inode->i_mutex);
 			if (bh)
 				brelse(bh);
 			mlog_errno(status);
-			goto out;
+			goto out_unlock;
 		}

 		local = 0;
···
 			if (!ocfs2_check_dir_entry(orphan_dir_inode,
 						  de, bh, local)) {
-				mutex_unlock(&orphan_dir_inode->i_mutex);
 				status = -EINVAL;
 				mlog_errno(status);
 				brelse(bh);
-				goto out;
+				goto out_unlock;
 			}

 			local += le16_to_cpu(de->rec_len);
···
 			mlog(0, "queue orphan %"MLFu64"\n",
 			     OCFS2_I(iter)->ip_blkno);
-			OCFS2_I(iter)->ip_next_orphan = inode;
-			inode = iter;
+			/* No locking is required for the next_orphan
+			 * queue as there is only ever a single
+			 * process doing orphan recovery. */
+			OCFS2_I(iter)->ip_next_orphan = *head;
+			*head = iter;
 		}
 		brelse(bh);
 	}
-	mutex_unlock(&orphan_dir_inode->i_mutex);

+out_unlock:
 	ocfs2_meta_unlock(orphan_dir_inode, 0);
-	have_disk_lock = 0;
-
+out:
+	mutex_unlock(&orphan_dir_inode->i_mutex);
 	iput(orphan_dir_inode);
-	orphan_dir_inode = NULL;
+	return status;
+}
+
+static int ocfs2_orphan_recovery_can_continue(struct ocfs2_super *osb,
+					      int slot)
+{
+	int ret;
+
+	spin_lock(&osb->osb_lock);
+	ret = !osb->osb_orphan_wipes[slot];
+	spin_unlock(&osb->osb_lock);
+	return ret;
+}
+
+static void ocfs2_mark_recovering_orphan_dir(struct ocfs2_super *osb,
+					     int slot)
+{
+	spin_lock(&osb->osb_lock);
+	/* Mark ourselves such that new processes in delete_inode()
+	 * know to quit early. */
+	ocfs2_node_map_set_bit(osb, &osb->osb_recovering_orphan_dirs, slot);
+	while (osb->osb_orphan_wipes[slot]) {
+		/* If any processes are already in the middle of an
+		 * orphan wipe on this dir, then we need to wait for
+		 * them. */
+		spin_unlock(&osb->osb_lock);
+		wait_event_interruptible(osb->osb_wipe_event,
+					 ocfs2_orphan_recovery_can_continue(osb, slot));
+		spin_lock(&osb->osb_lock);
+	}
+	spin_unlock(&osb->osb_lock);
+}
+
+static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb,
+					      int slot)
+{
+	ocfs2_node_map_clear_bit(osb, &osb->osb_recovering_orphan_dirs, slot);
+}
+
+/*
+ * Orphan recovery. Each mounted node has its own orphan dir which we
+ * must run during recovery. Our strategy here is to build a list of
+ * the inodes in the orphan dir and iget/iput them. The VFS does
+ * (most) of the rest of the work.
+ *
+ * Orphan recovery can happen at any time, not just mount so we have a
+ * couple of extra considerations.
+ *
+ * - We grab as many inodes as we can under the orphan dir lock -
+ *   doing iget() outside the orphan dir risks getting a reference on
+ *   an invalid inode.
+ * - We must be sure not to deadlock with other processes on the
+ *   system wanting to run delete_inode(). This can happen when they go
+ *   to lock the orphan dir and the orphan recovery process attempts to
+ *   iget() inside the orphan dir lock. This can be avoided by
+ *   advertising our state to ocfs2_delete_inode().
+ */
+static int ocfs2_recover_orphans(struct ocfs2_super *osb,
+				 int slot)
+{
+	int ret = 0;
+	struct inode *inode = NULL;
+	struct inode *iter;
+	struct ocfs2_inode_info *oi;
+
+	mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);
+
+	ocfs2_mark_recovering_orphan_dir(osb, slot);
+	ret = ocfs2_queue_orphans(osb, slot, &inode);
+	ocfs2_clear_recovering_orphan_dir(osb, slot);
+
+	/* Error here should be noted, but we want to continue with as
+	 * many queued inodes as we've got. */
+	if (ret)
+		mlog_errno(ret);

 	while (inode) {
 		oi = OCFS2_I(inode);
···
 		inode = iter;
 	}

-out:
-	if (have_disk_lock)
-		ocfs2_meta_unlock(orphan_dir_inode, 0);
-
-	if (orphan_dir_inode)
-		iput(orphan_dir_inode);
-
-	return status;
+	return ret;
 }

 static int ocfs2_wait_on_mount(struct ocfs2_super *osb)
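One small pattern in the split above: ocfs2_queue_orphans() threads the orphan inodes into an intrusive singly linked list through their own ip_next_orphan field (via the struct inode **head argument), so queueing needs no allocation, and ocfs2_recover_orphans() then drains that list. A minimal model with hypothetical stand-in types:

#include <stddef.h>
#include <stdio.h>

/* Like ocfs2_inode_info, each object carries its own next pointer. */
struct orphan {
	long blkno;
	struct orphan *next_orphan;	/* models ip_next_orphan */
};

/* Push onto the head of the caller's list, as ocfs2_queue_orphans()
 * does with its struct inode **head argument. */
static void queue_orphan(struct orphan **head, struct orphan *o)
{
	o->next_orphan = *head;
	*head = o;
}

int main(void)
{
	struct orphan a = { .blkno = 100 }, b = { .blkno = 200 };
	struct orphan *head = NULL, *iter;

	queue_orphan(&head, &a);
	queue_orphan(&head, &b);

	/* Drain loop, shaped like ocfs2_recover_orphans(): save the
	 * next pointer before "processing" (iput()) the current one. */
	while (head) {
		iter = head->next_orphan;
		printf("wipe orphan %ld\n", head->blkno);
		head = iter;
	}
	return 0;
}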
fs/ocfs2/super.c

···
 	}
 	mlog(ML_NOTICE, "max_slots for this device: %u\n", osb->max_slots);

+	init_waitqueue_head(&osb->osb_wipe_event);
+	osb->osb_orphan_wipes = kcalloc(osb->max_slots,
+					sizeof(*osb->osb_orphan_wipes),
+					GFP_KERNEL);
+	if (!osb->osb_orphan_wipes) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
+
 	osb->s_feature_compat =
 		le32_to_cpu(OCFS2_RAW_SB(di)->s_feature_compat);
 	osb->s_feature_ro_compat =
···
 	if (osb->slot_info)
 		ocfs2_free_slot_info(osb->slot_info);

+	kfree(osb->osb_orphan_wipes);
 	/* FIXME
 	 * This belongs in journal shutdown, but because we have to
 	 * allocate osb->journal at the start of ocfs2_initalize_osb(),