Merge branch 'upstream-linus' of git://oss.oracle.com/home/sourcebo/git/ocfs2

7 files changed, 88 insertions(+), 11 deletions(-)
fs/ocfs2/dlm/dlmcommon.h | +4
 #define DLM_LOCK_RES_IN_PROGRESS   0x00000010
 #define DLM_LOCK_RES_MIGRATING     0x00000020
 
+/* max milliseconds to wait to sync up a network failure with a node death */
+#define DLM_NODE_DEATH_WAIT_MAX    (5 * 1000)
+
 #define DLM_PURGE_INTERVAL_MS      (8 * 1000)
 
 struct dlm_lock_resource
···
 void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
 void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
 int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
+int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
 
 void dlm_put(struct dlm_ctxt *dlm);
 struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
fs/ocfs2/dlm/dlmconvert.c | +9 -3
 } else {
     mlog_errno(tmpret);
     if (dlm_is_host_down(tmpret)) {
+        /* instead of logging the same network error over
+         * and over, sleep here and wait for the heartbeat
+         * to notice the node is dead. times out after 5s. */
+        dlm_wait_for_node_death(dlm, res->owner,
+                                DLM_NODE_DEATH_WAIT_MAX);
         ret = DLM_RECOVERING;
         mlog(0, "node %u died so returning DLM_RECOVERING "
              "from convert message!\n", res->owner);
···
 struct dlm_lockstatus *lksb;
 enum dlm_status status = DLM_NORMAL;
 u32 flags;
-int call_ast = 0, kick_thread = 0;
+int call_ast = 0, kick_thread = 0, ast_reserved = 0;
 
 if (!dlm_grab(dlm)) {
     dlm_error(DLM_REJECTED);
···
 status = __dlm_lockres_state_to_status(res);
 if (status == DLM_NORMAL) {
     __dlm_lockres_reserve_ast(res);
+    ast_reserved = 1;
     res->state |= DLM_LOCK_RES_IN_PROGRESS;
     status = __dlmconvert_master(dlm, res, lock, flags,
                                  cnv->requested_type,
···
 else
     dlm_lock_put(lock);
 
-/* either queue the ast or release it */
+/* either queue the ast or release it, if reserved */
 if (call_ast)
     dlm_queue_ast(dlm, lock);
-else
+else if (ast_reserved)
     dlm_lockres_release_ast(dlm, res);
 
 if (kick_thread)
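Two fixes in this file: a convert request that fails because the master is unreachable now waits (up to DLM_NODE_DEATH_WAIT_MAX) for heartbeat to actually declare the node dead before returning DLM_RECOVERING, and the AST reservation taken on the DLM_NORMAL path is now only released if it was actually taken. A minimal userspace sketch of that second, "only release what you reserved" pattern; every name below is a stand-in invented for illustration, not code from the patch:

#include <stdio.h>

struct resource { int ast_reservations; };

static void reserve_ast(struct resource *res) { res->ast_reservations++; }
static void release_ast(struct resource *res) { res->ast_reservations--; }
/* delivering a queued AST later consumes the reservation */
static void queue_ast(struct resource *res)   { res->ast_reservations--; }

static int convert(struct resource *res, int lock_found)
{
    int status = 0, call_ast = 0, ast_reserved = 0;

    if (!lock_found) {
        status = -1;
        goto leave;             /* error before anything was reserved */
    }

    reserve_ast(res);
    ast_reserved = 1;           /* remember that we now hold a reservation */
    /* ... the master-side convert work would set call_ast as needed ... */
    call_ast = 1;

leave:
    if (call_ast)
        queue_ast(res);         /* the queued AST consumes the reservation */
    else if (ast_reserved)
        release_ast(res);       /* give back an unused reservation */
    /* without the flag, the else branch would release a reservation
     * that the early-error path never took */
    return status;
}

int main(void)
{
    struct resource res = { 0 };

    convert(&res, 0);           /* early error: nothing to release */
    convert(&res, 1);           /* normal path: reservation consumed */
    printf("outstanding reservations: %d\n", res.ast_reservations);
    return 0;
}

The ast_reserved flag plays the same role as the new variable in the handler: the shared exit path stays shared, only the cleanup becomes conditional.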
fs/ocfs2/dlm/dlmlock.c | +24 -1
     dlm_error(status);
     dlm_revert_pending_lock(res, lock);
     dlm_lock_put(lock);
+} else if (dlm_is_recovery_lock(res->lockname.name,
+                                res->lockname.len)) {
+    /* special case for the $RECOVERY lock.
+     * there will never be an AST delivered to put
+     * this lock on the proper secondary queue
+     * (granted), so do it manually. */
+    mlog(0, "%s: $RECOVERY lock for this node (%u) is "
+         "mastered by %u; got lock, manually granting (no ast)\n",
+         dlm->name, dlm->node_num, res->owner);
+    list_del_init(&lock->list);
+    list_add_tail(&lock->list, &res->granted);
 }
 spin_unlock(&res->spinlock);
···
 mlog(0, "retrying lock with migration/"
      "recovery/in progress\n");
 msleep(100);
-dlm_wait_for_recovery(dlm);
+/* no waiting for dlm_reco_thread */
+if (recovery) {
+    if (status == DLM_RECOVERING) {
+        mlog(0, "%s: got RECOVERING "
+             "for $RECOVERY lock, master "
+             "was %u\n", dlm->name,
+             res->owner);
+        dlm_wait_for_node_death(dlm, res->owner,
+                                DLM_NODE_DEATH_WAIT_MAX);
+    }
+} else {
+    dlm_wait_for_recovery(dlm);
+}
 goto retry_lock;
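Both hunks here special-case the $RECOVERY lock: when it is granted remotely it is moved onto the granted queue by hand because no AST will ever be delivered for it, and the retry path must not sit in dlm_wait_for_recovery(), since taking this lock is what allows recovery to proceed at all; instead the caller waits (bounded at 5 seconds) for the old master to be declared dead and retries. A compilable sketch of that retry policy; the helper names below are stubs invented for illustration, only the 5-second bound comes from the patch:

#include <stdbool.h>
#include <stdio.h>

enum status { ST_OK, ST_RECOVERING, ST_IN_PROGRESS };

/* stubbed stand-ins for the real dlm calls (hypothetical names) */
static enum status send_remote_lock_request(void)         { return ST_OK; }
static void wait_for_recovery(void)                       { }
static void wait_for_node_death(int node, int timeout_ms) { (void)node; (void)timeout_ms; }

static enum status acquire(bool is_recovery_lock, int master_node)
{
    enum status st;

retry:
    st = send_remote_lock_request();
    if (st == ST_RECOVERING || st == ST_IN_PROGRESS) {
        if (is_recovery_lock) {
            /* the $RECOVERY lock is what drives recovery itself, so
             * blocking until "recovery finished" here would deadlock;
             * wait for the old master to be declared dead and retry */
            if (st == ST_RECOVERING)
                wait_for_node_death(master_node, 5 * 1000);
        } else {
            wait_for_recovery();
        }
        goto retry;
    }
    return st;
}

int main(void)
{
    printf("status=%d\n", acquire(true, 3));
    return 0;
}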
fs/ocfs2/dlm/dlmmaster.c | +6 -1
     atomic_set(&mle->woken, 1);
     spin_unlock(&mle->spinlock);
     wake_up(&mle->wq);
-    /* final put will take care of list removal */
+    /* do not need events any longer, so detach
+     * from heartbeat */
+    __dlm_mle_detach_hb_events(dlm, mle);
     __dlm_put_mle(mle);
 }
 continue;
···
 dlm_move_lockres_to_recovery_list(dlm, res);
 spin_unlock(&res->spinlock);
 dlm_lockres_put(res);
+
+/* about to get rid of mle, detach from heartbeat */
+__dlm_mle_detach_hb_events(dlm, mle);
 
 /* dump the mle */
 spin_lock(&dlm->master_lock);
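Both hunks enforce the same teardown order: before an mle gives up what may be its last reference, it is detached from heartbeat so that no further node-up/node-down events are delivered to (or keep alive) an object that is going away. A tiny illustrative model of that ordering; the names here are invented, the real code uses __dlm_mle_detach_hb_events() and __dlm_put_mle():

#include <stdio.h>
#include <stdlib.h>

struct mle {                            /* stand-in "master list entry" */
    int refs;
};

static struct mle *hb_callback_target;  /* where heartbeat events would go */

static void mle_detach_hb_events(struct mle *m)
{
    if (hb_callback_target == m)
        hb_callback_target = NULL;      /* stop event delivery to m */
}

static void mle_put(struct mle *m)
{
    if (--m->refs == 0) {
        /* if m were still registered for heartbeat events here, a later
         * event could touch freed memory or pin the object forever */
        free(m);
    }
}

int main(void)
{
    struct mle *m = calloc(1, sizeof(*m));

    m->refs = 1;
    hb_callback_target = m;

    /* teardown order mirroring the dlmmaster.c change:
     * detach from heartbeat first, then drop the final reference */
    mle_detach_hb_events(m);
    mle_put(m);
    printf("callback target cleared: %s\n",
           hb_callback_target == NULL ? "yes" : "no");
    return 0;
}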
fs/ocfs2/dlm/dlmrecovery.c | +42
     return dead;
 }
 
+int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
+{
+    if (timeout) {
+        mlog(ML_NOTICE, "%s: waiting %dms for notification of "
+             "death of node %u\n", dlm->name, timeout, node);
+        wait_event_timeout(dlm->dlm_reco_thread_wq,
+                           dlm_is_node_dead(dlm, node),
+                           msecs_to_jiffies(timeout));
+    } else {
+        mlog(ML_NOTICE, "%s: waiting indefinitely for notification "
+             "of death of node %u\n", dlm->name, node);
+        wait_event(dlm->dlm_reco_thread_wq,
+                   dlm_is_node_dead(dlm, node));
+    }
+    /* for now, return 0 */
+    return 0;
+}
+
 /* callers of the top-level api calls (dlmlock/dlmunlock) should
  * block on the dlm->reco.event when recovery is in progress.
  * the dlm recovery thread will set this state when it begins
···
         dlm->reco.new_master);
     status = -EEXIST;
 } else {
+    status = 0;
+
+    /* see if recovery was already finished elsewhere */
+    spin_lock(&dlm->spinlock);
+    if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
+        status = -EINVAL;
+        mlog(0, "%s: got reco EX lock, but "
+             "node got recovered already\n", dlm->name);
+        if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
+            mlog(ML_ERROR, "%s: new master is %u "
+                 "but no dead node!\n",
+                 dlm->name, dlm->reco.new_master);
+            BUG();
+        }
+    }
+    spin_unlock(&dlm->spinlock);
+}
+
+/* if this node has actually become the recovery master,
+ * set the master and send the messages to begin recovery */
+if (!status) {
+    mlog(0, "%s: dead=%u, this=%u, sending "
+         "begin_reco now\n", dlm->name,
+         dlm->reco.dead_node, dlm->node_num);
     status = dlm_send_begin_reco_message(dlm,
                                          dlm->reco.dead_node);
     /* this always succeeds */
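The new dlm_wait_for_node_death() simply parks the caller on the recovery thread's wait queue until heartbeat marks the node dead, using wait_event_timeout() when a timeout is given and wait_event() otherwise; the second hunk is a separate hardening step that re-checks, under dlm->spinlock, whether another node already finished the recovery before this one sends begin_reco. A userspace analogue of the timeout/no-timeout wait using POSIX condition variables; all names below are illustrative, only the 5-second figure corresponds to DLM_NODE_DEATH_WAIT_MAX:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static bool node_dead;                  /* set by the "heartbeat" side */

/* wait until node_dead is set; timeout_ms == 0 means wait indefinitely,
 * mirroring the timeout/no-timeout split in the kernel helper */
static int wait_for_node_death(int timeout_ms)
{
    int rc = 0;

    pthread_mutex_lock(&lock);
    if (timeout_ms) {
        struct timespec deadline;

        clock_gettime(CLOCK_REALTIME, &deadline);
        deadline.tv_sec  += timeout_ms / 1000;
        deadline.tv_nsec += (long)(timeout_ms % 1000) * 1000000L;
        if (deadline.tv_nsec >= 1000000000L) {
            deadline.tv_sec++;
            deadline.tv_nsec -= 1000000000L;
        }
        while (!node_dead && rc == 0)
            rc = pthread_cond_timedwait(&cond, &lock, &deadline);
    } else {
        while (!node_dead)
            pthread_cond_wait(&cond, &lock);
    }
    pthread_mutex_unlock(&lock);

    return 0;       /* like the kernel helper: for now, always 0 */
}

static void *heartbeat(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&lock);
    node_dead = true;               /* "node declared dead" */
    pthread_cond_broadcast(&cond);
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, heartbeat, NULL);
    wait_for_node_death(5 * 1000);  /* analogous to DLM_NODE_DEATH_WAIT_MAX */
    pthread_join(t, NULL);
    puts("observed node death (or timed out)");
    return 0;
}

As in the kernel helper, the result of the wait is not propagated; callers only care that they did not spin logging the same network error.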
fs/ocfs2/journal.c | +3 -4
 while (!(kthread_should_stop() &&
          atomic_read(&journal->j_num_trans) == 0)) {
 
-    wait_event_interruptible_timeout(osb->checkpoint_event,
-                                     atomic_read(&journal->j_num_trans)
-                                     || kthread_should_stop(),
-                                     OCFS2_CHECKPOINT_INTERVAL);
+    wait_event_interruptible(osb->checkpoint_event,
+                             atomic_read(&journal->j_num_trans)
+                             || kthread_should_stop());
 
     status = ocfs2_commit_cache(osb);
     if (status < 0)
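The commit thread previously woke up every OCFS2_CHECKPOINT_INTERVAL (8 seconds) whether or not there was anything to commit; it now sleeps until a transaction is pending or the thread is asked to stop, which is also why the interval macro is removed from journal.h below. A userspace sketch of that wakeup discipline; the names are made up, and the key point is that every producer of work must signal the event now that there is no periodic poll to fall back on:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  checkpoint_event = PTHREAD_COND_INITIALIZER;
static int  num_trans;          /* pending transactions to checkpoint */
static bool should_stop;

/* producer side: anyone queueing work must also signal the event,
 * since the consumer no longer wakes up on a timer */
static void add_transaction(void)
{
    pthread_mutex_lock(&lock);
    num_trans++;
    pthread_cond_signal(&checkpoint_event);
    pthread_mutex_unlock(&lock);
}

static void *commit_thread(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&lock);
    while (!(should_stop && num_trans == 0)) {
        /* sleep until there is work or a stop request; no periodic
         * poll as in the old OCFS2_CHECKPOINT_INTERVAL code */
        while (num_trans == 0 && !should_stop)
            pthread_cond_wait(&checkpoint_event, &lock);
        num_trans = 0;          /* "commit the cache" */
    }
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, commit_thread, NULL);
    add_transaction();
    pthread_mutex_lock(&lock);
    should_stop = true;
    pthread_cond_signal(&checkpoint_event);
    pthread_mutex_unlock(&lock);
    pthread_join(t, NULL);
    puts("commit thread exited");
    return 0;
}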
fs/ocfs2/journal.h | -2
 #include <linux/fs.h>
 #include <linux/jbd.h>
 
-#define OCFS2_CHECKPOINT_INTERVAL        (8 * HZ)
-
 enum ocfs2_journal_state {
     OCFS2_JOURNAL_FREE = 0,
     OCFS2_JOURNAL_LOADED,