fs/dlm/lock.c at v6.0-rc2 · tjh.dev/kernel

tjh.dev / kernel
fork atom
Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux
fork atom
kernel / fs / dlm / lock.c
at v6.0-rc2 6407 lines 166 kB view raw
wrap content
   1// SPDX-License-Identifier: GPL-2.0-only
   2/******************************************************************************
   3*******************************************************************************
   4**
   5**  Copyright (C) 2005-2010 Red Hat, Inc.  All rights reserved.
   6**
   7**
   8*******************************************************************************
   9******************************************************************************/
  10
  11/* Central locking logic has four stages:
  12
  13   dlm_lock()
  14   dlm_unlock()
  15
  16   request_lock(ls, lkb)
  17   convert_lock(ls, lkb)
  18   unlock_lock(ls, lkb)
  19   cancel_lock(ls, lkb)
  20
  21   _request_lock(r, lkb)
  22   _convert_lock(r, lkb)
  23   _unlock_lock(r, lkb)
  24   _cancel_lock(r, lkb)
  25
  26   do_request(r, lkb)
  27   do_convert(r, lkb)
  28   do_unlock(r, lkb)
  29   do_cancel(r, lkb)
  30
  31   Stage 1 (lock, unlock) is mainly about checking input args and
  32   splitting into one of the four main operations:
  33
  34       dlm_lock          = request_lock
  35       dlm_lock+CONVERT  = convert_lock
  36       dlm_unlock        = unlock_lock
  37       dlm_unlock+CANCEL = cancel_lock
  38
  39   Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is
  40   provided to the next stage.
  41
  42   Stage 3, _xxxx_lock(), determines if the operation is local or remote.
  43   When remote, it calls send_xxxx(), when local it calls do_xxxx().
  44
  45   Stage 4, do_xxxx(), is the guts of the operation.  It manipulates the
  46   given rsb and lkb and queues callbacks.
  47
  48   For remote operations, send_xxxx() results in the corresponding do_xxxx()
  49   function being executed on the remote node.  The connecting send/receive
  50   calls on local (L) and remote (R) nodes:
  51
  52   L: send_xxxx()              ->  R: receive_xxxx()
  53                                   R: do_xxxx()
  54   L: receive_xxxx_reply()     <-  R: send_xxxx_reply()
  55*/
  56#include <trace/events/dlm.h>
  57
  58#include <linux/types.h>
  59#include <linux/rbtree.h>
  60#include <linux/slab.h>
  61#include "dlm_internal.h"
  62#include <linux/dlm_device.h>
  63#include "memory.h"
  64#include "midcomms.h"
  65#include "requestqueue.h"
  66#include "util.h"
  67#include "dir.h"
  68#include "member.h"
  69#include "lockspace.h"
  70#include "ast.h"
  71#include "lock.h"
  72#include "rcom.h"
  73#include "recover.h"
  74#include "lvb_table.h"
  75#include "user.h"
  76#include "config.h"
  77
  78static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb);
  79static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb);
  80static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb);
  81static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb);
  82static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb);
  83static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode);
  84static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb);
  85static int send_remove(struct dlm_rsb *r);
  86static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
  87static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb);
  88static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
  89				    struct dlm_message *ms);
  90static int receive_extralen(struct dlm_message *ms);
  91static void do_purge(struct dlm_ls *ls, int nodeid, int pid);
  92static void del_timeout(struct dlm_lkb *lkb);
  93static void toss_rsb(struct kref *kref);
  94
  95/*
  96 * Lock compatibilty matrix - thanks Steve
  97 * UN = Unlocked state. Not really a state, used as a flag
  98 * PD = Padding. Used to make the matrix a nice power of two in size
  99 * Other states are the same as the VMS DLM.
 100 * Usage: matrix[grmode+1][rqmode+1]  (although m[rq+1][gr+1] is the same)
 101 */
 102
 103static const int __dlm_compat_matrix[8][8] = {
 104      /* UN NL CR CW PR PW EX PD */
 105        {1, 1, 1, 1, 1, 1, 1, 0},       /* UN */
 106        {1, 1, 1, 1, 1, 1, 1, 0},       /* NL */
 107        {1, 1, 1, 1, 1, 1, 0, 0},       /* CR */
 108        {1, 1, 1, 1, 0, 0, 0, 0},       /* CW */
 109        {1, 1, 1, 0, 1, 0, 0, 0},       /* PR */
 110        {1, 1, 1, 0, 0, 0, 0, 0},       /* PW */
 111        {1, 1, 0, 0, 0, 0, 0, 0},       /* EX */
 112        {0, 0, 0, 0, 0, 0, 0, 0}        /* PD */
 113};
 114
 115/*
 116 * This defines the direction of transfer of LVB data.
 117 * Granted mode is the row; requested mode is the column.
 118 * Usage: matrix[grmode+1][rqmode+1]
 119 * 1 = LVB is returned to the caller
 120 * 0 = LVB is written to the resource
 121 * -1 = nothing happens to the LVB
 122 */
 123
 124const int dlm_lvb_operations[8][8] = {
 125        /* UN   NL  CR  CW  PR  PW  EX  PD*/
 126        {  -1,  1,  1,  1,  1,  1,  1, -1 }, /* UN */
 127        {  -1,  1,  1,  1,  1,  1,  1,  0 }, /* NL */
 128        {  -1, -1,  1,  1,  1,  1,  1,  0 }, /* CR */
 129        {  -1, -1, -1,  1,  1,  1,  1,  0 }, /* CW */
 130        {  -1, -1, -1, -1,  1,  1,  1,  0 }, /* PR */
 131        {  -1,  0,  0,  0,  0,  0,  1,  0 }, /* PW */
 132        {  -1,  0,  0,  0,  0,  0,  0,  0 }, /* EX */
 133        {  -1,  0,  0,  0,  0,  0,  0,  0 }  /* PD */
 134};
 135
 136#define modes_compat(gr, rq) \
 137	__dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1]
 138
 139int dlm_modes_compat(int mode1, int mode2)
 140{
 141	return __dlm_compat_matrix[mode1 + 1][mode2 + 1];
 142}
 143
 144/*
 145 * Compatibility matrix for conversions with QUECVT set.
 146 * Granted mode is the row; requested mode is the column.
 147 * Usage: matrix[grmode+1][rqmode+1]
 148 */
 149
 150static const int __quecvt_compat_matrix[8][8] = {
 151      /* UN NL CR CW PR PW EX PD */
 152        {0, 0, 0, 0, 0, 0, 0, 0},       /* UN */
 153        {0, 0, 1, 1, 1, 1, 1, 0},       /* NL */
 154        {0, 0, 0, 1, 1, 1, 1, 0},       /* CR */
 155        {0, 0, 0, 0, 1, 1, 1, 0},       /* CW */
 156        {0, 0, 0, 1, 0, 1, 1, 0},       /* PR */
 157        {0, 0, 0, 0, 0, 0, 1, 0},       /* PW */
 158        {0, 0, 0, 0, 0, 0, 0, 0},       /* EX */
 159        {0, 0, 0, 0, 0, 0, 0, 0}        /* PD */
 160};
 161
 162void dlm_print_lkb(struct dlm_lkb *lkb)
 163{
 164	printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x "
 165	       "sts %d rq %d gr %d wait_type %d wait_nodeid %d seq %llu\n",
 166	       lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags,
 167	       lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode,
 168	       lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_wait_nodeid,
 169	       (unsigned long long)lkb->lkb_recover_seq);
 170}
 171
 172static void dlm_print_rsb(struct dlm_rsb *r)
 173{
 174	printk(KERN_ERR "rsb: nodeid %d master %d dir %d flags %lx first %x "
 175	       "rlc %d name %s\n",
 176	       r->res_nodeid, r->res_master_nodeid, r->res_dir_nodeid,
 177	       r->res_flags, r->res_first_lkid, r->res_recover_locks_count,
 178	       r->res_name);
 179}
 180
 181void dlm_dump_rsb(struct dlm_rsb *r)
 182{
 183	struct dlm_lkb *lkb;
 184
 185	dlm_print_rsb(r);
 186
 187	printk(KERN_ERR "rsb: root_list empty %d recover_list empty %d\n",
 188	       list_empty(&r->res_root_list), list_empty(&r->res_recover_list));
 189	printk(KERN_ERR "rsb lookup list\n");
 190	list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup)
 191		dlm_print_lkb(lkb);
 192	printk(KERN_ERR "rsb grant queue:\n");
 193	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue)
 194		dlm_print_lkb(lkb);
 195	printk(KERN_ERR "rsb convert queue:\n");
 196	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue)
 197		dlm_print_lkb(lkb);
 198	printk(KERN_ERR "rsb wait queue:\n");
 199	list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue)
 200		dlm_print_lkb(lkb);
 201}
 202
 203/* Threads cannot use the lockspace while it's being recovered */
 204
 205static inline void dlm_lock_recovery(struct dlm_ls *ls)
 206{
 207	down_read(&ls->ls_in_recovery);
 208}
 209
 210void dlm_unlock_recovery(struct dlm_ls *ls)
 211{
 212	up_read(&ls->ls_in_recovery);
 213}
 214
 215int dlm_lock_recovery_try(struct dlm_ls *ls)
 216{
 217	return down_read_trylock(&ls->ls_in_recovery);
 218}
 219
 220static inline int can_be_queued(struct dlm_lkb *lkb)
 221{
 222	return !(lkb->lkb_exflags & DLM_LKF_NOQUEUE);
 223}
 224
 225static inline int force_blocking_asts(struct dlm_lkb *lkb)
 226{
 227	return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST);
 228}
 229
 230static inline int is_demoted(struct dlm_lkb *lkb)
 231{
 232	return (lkb->lkb_sbflags & DLM_SBF_DEMOTED);
 233}
 234
 235static inline int is_altmode(struct dlm_lkb *lkb)
 236{
 237	return (lkb->lkb_sbflags & DLM_SBF_ALTMODE);
 238}
 239
 240static inline int is_granted(struct dlm_lkb *lkb)
 241{
 242	return (lkb->lkb_status == DLM_LKSTS_GRANTED);
 243}
 244
 245static inline int is_remote(struct dlm_rsb *r)
 246{
 247	DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
 248	return !!r->res_nodeid;
 249}
 250
 251static inline int is_process_copy(struct dlm_lkb *lkb)
 252{
 253	return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY));
 254}
 255
 256static inline int is_master_copy(struct dlm_lkb *lkb)
 257{
 258	return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? 1 : 0;
 259}
 260
 261static inline int middle_conversion(struct dlm_lkb *lkb)
 262{
 263	if ((lkb->lkb_grmode==DLM_LOCK_PR && lkb->lkb_rqmode==DLM_LOCK_CW) ||
 264	    (lkb->lkb_rqmode==DLM_LOCK_PR && lkb->lkb_grmode==DLM_LOCK_CW))
 265		return 1;
 266	return 0;
 267}
 268
 269static inline int down_conversion(struct dlm_lkb *lkb)
 270{
 271	return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode);
 272}
 273
 274static inline int is_overlap_unlock(struct dlm_lkb *lkb)
 275{
 276	return lkb->lkb_flags & DLM_IFL_OVERLAP_UNLOCK;
 277}
 278
 279static inline int is_overlap_cancel(struct dlm_lkb *lkb)
 280{
 281	return lkb->lkb_flags & DLM_IFL_OVERLAP_CANCEL;
 282}
 283
 284static inline int is_overlap(struct dlm_lkb *lkb)
 285{
 286	return (lkb->lkb_flags & (DLM_IFL_OVERLAP_UNLOCK |
 287				  DLM_IFL_OVERLAP_CANCEL));
 288}
 289
 290static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
 291{
 292	if (is_master_copy(lkb))
 293		return;
 294
 295	del_timeout(lkb);
 296
 297	DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););
 298
 299#ifdef CONFIG_DLM_DEPRECATED_API
 300	/* if the operation was a cancel, then return -DLM_ECANCEL, if a
 301	   timeout caused the cancel then return -ETIMEDOUT */
 302	if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_TIMEOUT_CANCEL)) {
 303		lkb->lkb_flags &= ~DLM_IFL_TIMEOUT_CANCEL;
 304		rv = -ETIMEDOUT;
 305	}
 306#endif
 307
 308	if (rv == -DLM_ECANCEL && (lkb->lkb_flags & DLM_IFL_DEADLOCK_CANCEL)) {
 309		lkb->lkb_flags &= ~DLM_IFL_DEADLOCK_CANCEL;
 310		rv = -EDEADLK;
 311	}
 312
 313	dlm_add_cb(lkb, DLM_CB_CAST, lkb->lkb_grmode, rv, lkb->lkb_sbflags);
 314}
 315
 316static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb)
 317{
 318	queue_cast(r, lkb,
 319		   is_overlap_unlock(lkb) ? -DLM_EUNLOCK : -DLM_ECANCEL);
 320}
 321
 322static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
 323{
 324	if (is_master_copy(lkb)) {
 325		send_bast(r, lkb, rqmode);
 326	} else {
 327		dlm_add_cb(lkb, DLM_CB_BAST, rqmode, 0, 0);
 328	}
 329}
 330
 331/*
 332 * Basic operations on rsb's and lkb's
 333 */
 334
 335/* This is only called to add a reference when the code already holds
 336   a valid reference to the rsb, so there's no need for locking. */
 337
 338static inline void hold_rsb(struct dlm_rsb *r)
 339{
 340	kref_get(&r->res_ref);
 341}
 342
 343void dlm_hold_rsb(struct dlm_rsb *r)
 344{
 345	hold_rsb(r);
 346}
 347
 348/* When all references to the rsb are gone it's transferred to
 349   the tossed list for later disposal. */
 350
 351static void put_rsb(struct dlm_rsb *r)
 352{
 353	struct dlm_ls *ls = r->res_ls;
 354	uint32_t bucket = r->res_bucket;
 355	int rv;
 356
 357	rv = kref_put_lock(&r->res_ref, toss_rsb,
 358			   &ls->ls_rsbtbl[bucket].lock);
 359	if (rv)
 360		spin_unlock(&ls->ls_rsbtbl[bucket].lock);
 361}
 362
 363void dlm_put_rsb(struct dlm_rsb *r)
 364{
 365	put_rsb(r);
 366}
 367
 368static int pre_rsb_struct(struct dlm_ls *ls)
 369{
 370	struct dlm_rsb *r1, *r2;
 371	int count = 0;
 372
 373	spin_lock(&ls->ls_new_rsb_spin);
 374	if (ls->ls_new_rsb_count > dlm_config.ci_new_rsb_count / 2) {
 375		spin_unlock(&ls->ls_new_rsb_spin);
 376		return 0;
 377	}
 378	spin_unlock(&ls->ls_new_rsb_spin);
 379
 380	r1 = dlm_allocate_rsb(ls);
 381	r2 = dlm_allocate_rsb(ls);
 382
 383	spin_lock(&ls->ls_new_rsb_spin);
 384	if (r1) {
 385		list_add(&r1->res_hashchain, &ls->ls_new_rsb);
 386		ls->ls_new_rsb_count++;
 387	}
 388	if (r2) {
 389		list_add(&r2->res_hashchain, &ls->ls_new_rsb);
 390		ls->ls_new_rsb_count++;
 391	}
 392	count = ls->ls_new_rsb_count;
 393	spin_unlock(&ls->ls_new_rsb_spin);
 394
 395	if (!count)
 396		return -ENOMEM;
 397	return 0;
 398}
 399
 400/* If ls->ls_new_rsb is empty, return -EAGAIN, so the caller can
 401   unlock any spinlocks, go back and call pre_rsb_struct again.
 402   Otherwise, take an rsb off the list and return it. */
 403
 404static int get_rsb_struct(struct dlm_ls *ls, char *name, int len,
 405			  struct dlm_rsb **r_ret)
 406{
 407	struct dlm_rsb *r;
 408	int count;
 409
 410	spin_lock(&ls->ls_new_rsb_spin);
 411	if (list_empty(&ls->ls_new_rsb)) {
 412		count = ls->ls_new_rsb_count;
 413		spin_unlock(&ls->ls_new_rsb_spin);
 414		log_debug(ls, "find_rsb retry %d %d %s",
 415			  count, dlm_config.ci_new_rsb_count, name);
 416		return -EAGAIN;
 417	}
 418
 419	r = list_first_entry(&ls->ls_new_rsb, struct dlm_rsb, res_hashchain);
 420	list_del(&r->res_hashchain);
 421	/* Convert the empty list_head to a NULL rb_node for tree usage: */
 422	memset(&r->res_hashnode, 0, sizeof(struct rb_node));
 423	ls->ls_new_rsb_count--;
 424	spin_unlock(&ls->ls_new_rsb_spin);
 425
 426	r->res_ls = ls;
 427	r->res_length = len;
 428	memcpy(r->res_name, name, len);
 429	mutex_init(&r->res_mutex);
 430
 431	INIT_LIST_HEAD(&r->res_lookup);
 432	INIT_LIST_HEAD(&r->res_grantqueue);
 433	INIT_LIST_HEAD(&r->res_convertqueue);
 434	INIT_LIST_HEAD(&r->res_waitqueue);
 435	INIT_LIST_HEAD(&r->res_root_list);
 436	INIT_LIST_HEAD(&r->res_recover_list);
 437
 438	*r_ret = r;
 439	return 0;
 440}
 441
 442static int rsb_cmp(struct dlm_rsb *r, const char *name, int nlen)
 443{
 444	char maxname[DLM_RESNAME_MAXLEN];
 445
 446	memset(maxname, 0, DLM_RESNAME_MAXLEN);
 447	memcpy(maxname, name, nlen);
 448	return memcmp(r->res_name, maxname, DLM_RESNAME_MAXLEN);
 449}
 450
 451int dlm_search_rsb_tree(struct rb_root *tree, char *name, int len,
 452			struct dlm_rsb **r_ret)
 453{
 454	struct rb_node *node = tree->rb_node;
 455	struct dlm_rsb *r;
 456	int rc;
 457
 458	while (node) {
 459		r = rb_entry(node, struct dlm_rsb, res_hashnode);
 460		rc = rsb_cmp(r, name, len);
 461		if (rc < 0)
 462			node = node->rb_left;
 463		else if (rc > 0)
 464			node = node->rb_right;
 465		else
 466			goto found;
 467	}
 468	*r_ret = NULL;
 469	return -EBADR;
 470
 471 found:
 472	*r_ret = r;
 473	return 0;
 474}
 475
 476static int rsb_insert(struct dlm_rsb *rsb, struct rb_root *tree)
 477{
 478	struct rb_node **newn = &tree->rb_node;
 479	struct rb_node *parent = NULL;
 480	int rc;
 481
 482	while (*newn) {
 483		struct dlm_rsb *cur = rb_entry(*newn, struct dlm_rsb,
 484					       res_hashnode);
 485
 486		parent = *newn;
 487		rc = rsb_cmp(cur, rsb->res_name, rsb->res_length);
 488		if (rc < 0)
 489			newn = &parent->rb_left;
 490		else if (rc > 0)
 491			newn = &parent->rb_right;
 492		else {
 493			log_print("rsb_insert match");
 494			dlm_dump_rsb(rsb);
 495			dlm_dump_rsb(cur);
 496			return -EEXIST;
 497		}
 498	}
 499
 500	rb_link_node(&rsb->res_hashnode, parent, newn);
 501	rb_insert_color(&rsb->res_hashnode, tree);
 502	return 0;
 503}
 504
 505/*
 506 * Find rsb in rsbtbl and potentially create/add one
 507 *
 508 * Delaying the release of rsb's has a similar benefit to applications keeping
 509 * NL locks on an rsb, but without the guarantee that the cached master value
 510 * will still be valid when the rsb is reused.  Apps aren't always smart enough
 511 * to keep NL locks on an rsb that they may lock again shortly; this can lead
 512 * to excessive master lookups and removals if we don't delay the release.
 513 *
 514 * Searching for an rsb means looking through both the normal list and toss
 515 * list.  When found on the toss list the rsb is moved to the normal list with
 516 * ref count of 1; when found on normal list the ref count is incremented.
 517 *
 518 * rsb's on the keep list are being used locally and refcounted.
 519 * rsb's on the toss list are not being used locally, and are not refcounted.
 520 *
 521 * The toss list rsb's were either
 522 * - previously used locally but not any more (were on keep list, then
 523 *   moved to toss list when last refcount dropped)
 524 * - created and put on toss list as a directory record for a lookup
 525 *   (we are the dir node for the res, but are not using the res right now,
 526 *   but some other node is)
 527 *
 528 * The purpose of find_rsb() is to return a refcounted rsb for local use.
 529 * So, if the given rsb is on the toss list, it is moved to the keep list
 530 * before being returned.
 531 *
 532 * toss_rsb() happens when all local usage of the rsb is done, i.e. no
 533 * more refcounts exist, so the rsb is moved from the keep list to the
 534 * toss list.
 535 *
  536 * rsb's on both keep and toss lists are used for doing name to master
 537 * lookups.  rsb's that are in use locally (and being refcounted) are on
 538 * the keep list, rsb's that are not in use locally (not refcounted) and
 539 * only exist for name/master lookups are on the toss list.
 540 *
  541 * rsb's on the toss list whose dir_nodeid is not local can have stale
 542 * name/master mappings.  So, remote requests on such rsb's can potentially
 543 * return with an error, which means the mapping is stale and needs to
 544 * be updated with a new lookup.  (The idea behind MASTER UNCERTAIN and
 545 * first_lkid is to keep only a single outstanding request on an rsb
 546 * while that rsb has a potentially stale master.)
 547 */
 548
 549static int find_rsb_dir(struct dlm_ls *ls, char *name, int len,
 550			uint32_t hash, uint32_t b,
 551			int dir_nodeid, int from_nodeid,
 552			unsigned int flags, struct dlm_rsb **r_ret)
 553{
 554	struct dlm_rsb *r = NULL;
 555	int our_nodeid = dlm_our_nodeid();
 556	int from_local = 0;
 557	int from_other = 0;
 558	int from_dir = 0;
 559	int create = 0;
 560	int error;
 561
 562	if (flags & R_RECEIVE_REQUEST) {
 563		if (from_nodeid == dir_nodeid)
 564			from_dir = 1;
 565		else
 566			from_other = 1;
 567	} else if (flags & R_REQUEST) {
 568		from_local = 1;
 569	}
 570
 571	/*
 572	 * flags & R_RECEIVE_RECOVER is from dlm_recover_master_copy, so
 573	 * from_nodeid has sent us a lock in dlm_recover_locks, believing
 574	 * we're the new master.  Our local recovery may not have set
 575	 * res_master_nodeid to our_nodeid yet, so allow either.  Don't
 576	 * create the rsb; dlm_recover_process_copy() will handle EBADR
 577	 * by resending.
 578	 *
 579	 * If someone sends us a request, we are the dir node, and we do
 580	 * not find the rsb anywhere, then recreate it.  This happens if
 581	 * someone sends us a request after we have removed/freed an rsb
 582	 * from our toss list.  (They sent a request instead of lookup
 583	 * because they are using an rsb from their toss list.)
 584	 */
 585
 586	if (from_local || from_dir ||
 587	    (from_other && (dir_nodeid == our_nodeid))) {
 588		create = 1;
 589	}
 590
 591 retry:
 592	if (create) {
 593		error = pre_rsb_struct(ls);
 594		if (error < 0)
 595			goto out;
 596	}
 597
 598	spin_lock(&ls->ls_rsbtbl[b].lock);
 599
 600	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
 601	if (error)
 602		goto do_toss;
 603	
 604	/*
 605	 * rsb is active, so we can't check master_nodeid without lock_rsb.
 606	 */
 607
 608	kref_get(&r->res_ref);
 609	goto out_unlock;
 610
 611
 612 do_toss:
 613	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
 614	if (error)
 615		goto do_new;
 616
 617	/*
 618	 * rsb found inactive (master_nodeid may be out of date unless
 619	 * we are the dir_nodeid or were the master)  No other thread
 620	 * is using this rsb because it's on the toss list, so we can
 621	 * look at or update res_master_nodeid without lock_rsb.
 622	 */
 623
 624	if ((r->res_master_nodeid != our_nodeid) && from_other) {
 625		/* our rsb was not master, and another node (not the dir node)
 626		   has sent us a request */
 627		log_debug(ls, "find_rsb toss from_other %d master %d dir %d %s",
 628			  from_nodeid, r->res_master_nodeid, dir_nodeid,
 629			  r->res_name);
 630		error = -ENOTBLK;
 631		goto out_unlock;
 632	}
 633
 634	if ((r->res_master_nodeid != our_nodeid) && from_dir) {
 635		/* don't think this should ever happen */
 636		log_error(ls, "find_rsb toss from_dir %d master %d",
 637			  from_nodeid, r->res_master_nodeid);
 638		dlm_print_rsb(r);
 639		/* fix it and go on */
 640		r->res_master_nodeid = our_nodeid;
 641		r->res_nodeid = 0;
 642		rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
 643		r->res_first_lkid = 0;
 644	}
 645
 646	if (from_local && (r->res_master_nodeid != our_nodeid)) {
 647		/* Because we have held no locks on this rsb,
 648		   res_master_nodeid could have become stale. */
 649		rsb_set_flag(r, RSB_MASTER_UNCERTAIN);
 650		r->res_first_lkid = 0;
 651	}
 652
 653	rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
 654	error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
 655	goto out_unlock;
 656
 657
 658 do_new:
 659	/*
 660	 * rsb not found
 661	 */
 662
 663	if (error == -EBADR && !create)
 664		goto out_unlock;
 665
 666	error = get_rsb_struct(ls, name, len, &r);
 667	if (error == -EAGAIN) {
 668		spin_unlock(&ls->ls_rsbtbl[b].lock);
 669		goto retry;
 670	}
 671	if (error)
 672		goto out_unlock;
 673
 674	r->res_hash = hash;
 675	r->res_bucket = b;
 676	r->res_dir_nodeid = dir_nodeid;
 677	kref_init(&r->res_ref);
 678
 679	if (from_dir) {
 680		/* want to see how often this happens */
 681		log_debug(ls, "find_rsb new from_dir %d recreate %s",
 682			  from_nodeid, r->res_name);
 683		r->res_master_nodeid = our_nodeid;
 684		r->res_nodeid = 0;
 685		goto out_add;
 686	}
 687
 688	if (from_other && (dir_nodeid != our_nodeid)) {
 689		/* should never happen */
 690		log_error(ls, "find_rsb new from_other %d dir %d our %d %s",
 691			  from_nodeid, dir_nodeid, our_nodeid, r->res_name);
 692		dlm_free_rsb(r);
 693		r = NULL;
 694		error = -ENOTBLK;
 695		goto out_unlock;
 696	}
 697
 698	if (from_other) {
 699		log_debug(ls, "find_rsb new from_other %d dir %d %s",
 700			  from_nodeid, dir_nodeid, r->res_name);
 701	}
 702
 703	if (dir_nodeid == our_nodeid) {
 704		/* When we are the dir nodeid, we can set the master
 705		   node immediately */
 706		r->res_master_nodeid = our_nodeid;
 707		r->res_nodeid = 0;
 708	} else {
 709		/* set_master will send_lookup to dir_nodeid */
 710		r->res_master_nodeid = 0;
 711		r->res_nodeid = -1;
 712	}
 713
 714 out_add:
 715	error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
 716 out_unlock:
 717	spin_unlock(&ls->ls_rsbtbl[b].lock);
 718 out:
 719	*r_ret = r;
 720	return error;
 721}
 722
 723/* During recovery, other nodes can send us new MSTCPY locks (from
 724   dlm_recover_locks) before we've made ourself master (in
 725   dlm_recover_masters). */
 726
 727static int find_rsb_nodir(struct dlm_ls *ls, char *name, int len,
 728			  uint32_t hash, uint32_t b,
 729			  int dir_nodeid, int from_nodeid,
 730			  unsigned int flags, struct dlm_rsb **r_ret)
 731{
 732	struct dlm_rsb *r = NULL;
 733	int our_nodeid = dlm_our_nodeid();
 734	int recover = (flags & R_RECEIVE_RECOVER);
 735	int error;
 736
 737 retry:
 738	error = pre_rsb_struct(ls);
 739	if (error < 0)
 740		goto out;
 741
 742	spin_lock(&ls->ls_rsbtbl[b].lock);
 743
 744	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
 745	if (error)
 746		goto do_toss;
 747
 748	/*
 749	 * rsb is active, so we can't check master_nodeid without lock_rsb.
 750	 */
 751
 752	kref_get(&r->res_ref);
 753	goto out_unlock;
 754
 755
 756 do_toss:
 757	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
 758	if (error)
 759		goto do_new;
 760
 761	/*
 762	 * rsb found inactive. No other thread is using this rsb because
 763	 * it's on the toss list, so we can look at or update
 764	 * res_master_nodeid without lock_rsb.
 765	 */
 766
 767	if (!recover && (r->res_master_nodeid != our_nodeid) && from_nodeid) {
 768		/* our rsb is not master, and another node has sent us a
 769		   request; this should never happen */
 770		log_error(ls, "find_rsb toss from_nodeid %d master %d dir %d",
 771			  from_nodeid, r->res_master_nodeid, dir_nodeid);
 772		dlm_print_rsb(r);
 773		error = -ENOTBLK;
 774		goto out_unlock;
 775	}
 776
 777	if (!recover && (r->res_master_nodeid != our_nodeid) &&
 778	    (dir_nodeid == our_nodeid)) {
 779		/* our rsb is not master, and we are dir; may as well fix it;
 780		   this should never happen */
 781		log_error(ls, "find_rsb toss our %d master %d dir %d",
 782			  our_nodeid, r->res_master_nodeid, dir_nodeid);
 783		dlm_print_rsb(r);
 784		r->res_master_nodeid = our_nodeid;
 785		r->res_nodeid = 0;
 786	}
 787
 788	rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
 789	error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
 790	goto out_unlock;
 791
 792
 793 do_new:
 794	/*
 795	 * rsb not found
 796	 */
 797
 798	error = get_rsb_struct(ls, name, len, &r);
 799	if (error == -EAGAIN) {
 800		spin_unlock(&ls->ls_rsbtbl[b].lock);
 801		goto retry;
 802	}
 803	if (error)
 804		goto out_unlock;
 805
 806	r->res_hash = hash;
 807	r->res_bucket = b;
 808	r->res_dir_nodeid = dir_nodeid;
 809	r->res_master_nodeid = dir_nodeid;
 810	r->res_nodeid = (dir_nodeid == our_nodeid) ? 0 : dir_nodeid;
 811	kref_init(&r->res_ref);
 812
 813	error = rsb_insert(r, &ls->ls_rsbtbl[b].keep);
 814 out_unlock:
 815	spin_unlock(&ls->ls_rsbtbl[b].lock);
 816 out:
 817	*r_ret = r;
 818	return error;
 819}
 820
 821static int find_rsb(struct dlm_ls *ls, char *name, int len, int from_nodeid,
 822		    unsigned int flags, struct dlm_rsb **r_ret)
 823{
 824	uint32_t hash, b;
 825	int dir_nodeid;
 826
 827	if (len > DLM_RESNAME_MAXLEN)
 828		return -EINVAL;
 829
 830	hash = jhash(name, len, 0);
 831	b = hash & (ls->ls_rsbtbl_size - 1);
 832
 833	dir_nodeid = dlm_hash2nodeid(ls, hash);
 834
 835	if (dlm_no_directory(ls))
 836		return find_rsb_nodir(ls, name, len, hash, b, dir_nodeid,
 837				      from_nodeid, flags, r_ret);
 838	else
 839		return find_rsb_dir(ls, name, len, hash, b, dir_nodeid,
 840				      from_nodeid, flags, r_ret);
 841}
 842
 843/* we have received a request and found that res_master_nodeid != our_nodeid,
 844   so we need to return an error or make ourself the master */
 845
 846static int validate_master_nodeid(struct dlm_ls *ls, struct dlm_rsb *r,
 847				  int from_nodeid)
 848{
 849	if (dlm_no_directory(ls)) {
 850		log_error(ls, "find_rsb keep from_nodeid %d master %d dir %d",
 851			  from_nodeid, r->res_master_nodeid,
 852			  r->res_dir_nodeid);
 853		dlm_print_rsb(r);
 854		return -ENOTBLK;
 855	}
 856
 857	if (from_nodeid != r->res_dir_nodeid) {
 858		/* our rsb is not master, and another node (not the dir node)
 859	   	   has sent us a request.  this is much more common when our
 860	   	   master_nodeid is zero, so limit debug to non-zero.  */
 861
 862		if (r->res_master_nodeid) {
 863			log_debug(ls, "validate master from_other %d master %d "
 864				  "dir %d first %x %s", from_nodeid,
 865				  r->res_master_nodeid, r->res_dir_nodeid,
 866				  r->res_first_lkid, r->res_name);
 867		}
 868		return -ENOTBLK;
 869	} else {
 870		/* our rsb is not master, but the dir nodeid has sent us a
 871	   	   request; this could happen with master 0 / res_nodeid -1 */
 872
 873		if (r->res_master_nodeid) {
 874			log_error(ls, "validate master from_dir %d master %d "
 875				  "first %x %s",
 876				  from_nodeid, r->res_master_nodeid,
 877				  r->res_first_lkid, r->res_name);
 878		}
 879
 880		r->res_master_nodeid = dlm_our_nodeid();
 881		r->res_nodeid = 0;
 882		return 0;
 883	}
 884}
 885
 886static void __dlm_master_lookup(struct dlm_ls *ls, struct dlm_rsb *r, int our_nodeid,
 887				int from_nodeid, bool toss_list, unsigned int flags,
 888				int *r_nodeid, int *result)
 889{
 890	int fix_master = (flags & DLM_LU_RECOVER_MASTER);
 891	int from_master = (flags & DLM_LU_RECOVER_DIR);
 892
 893	if (r->res_dir_nodeid != our_nodeid) {
 894		/* should not happen, but may as well fix it and carry on */
 895		log_error(ls, "%s res_dir %d our %d %s", __func__,
 896			  r->res_dir_nodeid, our_nodeid, r->res_name);
 897		r->res_dir_nodeid = our_nodeid;
 898	}
 899
 900	if (fix_master && dlm_is_removed(ls, r->res_master_nodeid)) {
 901		/* Recovery uses this function to set a new master when
 902		 * the previous master failed.  Setting NEW_MASTER will
 903		 * force dlm_recover_masters to call recover_master on this
 904		 * rsb even though the res_nodeid is no longer removed.
 905		 */
 906
 907		r->res_master_nodeid = from_nodeid;
 908		r->res_nodeid = from_nodeid;
 909		rsb_set_flag(r, RSB_NEW_MASTER);
 910
 911		if (toss_list) {
 912			/* I don't think we should ever find it on toss list. */
 913			log_error(ls, "%s fix_master on toss", __func__);
 914			dlm_dump_rsb(r);
 915		}
 916	}
 917
 918	if (from_master && (r->res_master_nodeid != from_nodeid)) {
 919		/* this will happen if from_nodeid became master during
 920		 * a previous recovery cycle, and we aborted the previous
 921		 * cycle before recovering this master value
 922		 */
 923
 924		log_limit(ls, "%s from_master %d master_nodeid %d res_nodeid %d first %x %s",
 925			  __func__, from_nodeid, r->res_master_nodeid,
 926			  r->res_nodeid, r->res_first_lkid, r->res_name);
 927
 928		if (r->res_master_nodeid == our_nodeid) {
 929			log_error(ls, "from_master %d our_master", from_nodeid);
 930			dlm_dump_rsb(r);
 931			goto ret_assign;
 932		}
 933
 934		r->res_master_nodeid = from_nodeid;
 935		r->res_nodeid = from_nodeid;
 936		rsb_set_flag(r, RSB_NEW_MASTER);
 937	}
 938
 939	if (!r->res_master_nodeid) {
 940		/* this will happen if recovery happens while we're looking
 941		 * up the master for this rsb
 942		 */
 943
 944		log_debug(ls, "%s master 0 to %d first %x %s", __func__,
 945			  from_nodeid, r->res_first_lkid, r->res_name);
 946		r->res_master_nodeid = from_nodeid;
 947		r->res_nodeid = from_nodeid;
 948	}
 949
 950	if (!from_master && !fix_master &&
 951	    (r->res_master_nodeid == from_nodeid)) {
 952		/* this can happen when the master sends remove, the dir node
 953		 * finds the rsb on the keep list and ignores the remove,
 954		 * and the former master sends a lookup
 955		 */
 956
 957		log_limit(ls, "%s from master %d flags %x first %x %s",
 958			  __func__, from_nodeid, flags, r->res_first_lkid,
 959			  r->res_name);
 960	}
 961
 962 ret_assign:
 963	*r_nodeid = r->res_master_nodeid;
 964	if (result)
 965		*result = DLM_LU_MATCH;
 966}
 967
 968/*
 969 * We're the dir node for this res and another node wants to know the
 970 * master nodeid.  During normal operation (non recovery) this is only
 971 * called from receive_lookup(); master lookups when the local node is
 972 * the dir node are done by find_rsb().
 973 *
 974 * normal operation, we are the dir node for a resource
 975 * . _request_lock
 976 * . set_master
 977 * . send_lookup
 978 * . receive_lookup
 979 * . dlm_master_lookup flags 0
 980 *
 981 * recover directory, we are rebuilding dir for all resources
 982 * . dlm_recover_directory
 983 * . dlm_rcom_names
 984 *   remote node sends back the rsb names it is master of and we are dir of
 985 * . dlm_master_lookup RECOVER_DIR (fix_master 0, from_master 1)
 986 *   we either create new rsb setting remote node as master, or find existing
 987 *   rsb and set master to be the remote node.
 988 *
 989 * recover masters, we are finding the new master for resources
 990 * . dlm_recover_masters
 991 * . recover_master
 992 * . dlm_send_rcom_lookup
 993 * . receive_rcom_lookup
 994 * . dlm_master_lookup RECOVER_MASTER (fix_master 1, from_master 0)
 995 */
 996
/*
 * dlm_master_lookup() - answer a master lookup for @name as the dir node
 *
 * @ls:          lockspace the resource belongs to
 * @from_nodeid: node asking; must not be our own nodeid
 * @name, @len:  resource name; @len may not exceed DLM_RESNAME_MAXLEN
 * @flags:       passed through to __dlm_master_lookup()
 * @r_nodeid:    out, master nodeid (-1 when we are not the dir node)
 * @result:      optional out, DLM_LU_ADD when a new record is created here,
 *               otherwise filled in by __dlm_master_lookup()
 *
 * Returns 0 on success, -EINVAL for a too-long name, a lookup from
 * ourself or a name we are not the directory node for, or the error
 * returned by pre_rsb_struct()/get_rsb_struct().
 */
int dlm_master_lookup(struct dlm_ls *ls, int from_nodeid, char *name, int len,
		      unsigned int flags, int *r_nodeid, int *result)
{
	struct dlm_rsb *r = NULL;
	uint32_t hash, b;
	int our_nodeid = dlm_our_nodeid();
	int dir_nodeid, error;

	if (len > DLM_RESNAME_MAXLEN)
		return -EINVAL;

	if (from_nodeid == our_nodeid) {
		log_error(ls, "dlm_master_lookup from our_nodeid %d flags %x",
			  our_nodeid, flags);
		return -EINVAL;
	}

	/* the name hash selects both the rsbtbl bucket and the dir node */
	hash = jhash(name, len, 0);
	b = hash & (ls->ls_rsbtbl_size - 1);

	dir_nodeid = dlm_hash2nodeid(ls, hash);
	if (dir_nodeid != our_nodeid) {
		log_error(ls, "dlm_master_lookup from %d dir %d our %d h %x %d",
			  from_nodeid, dir_nodeid, our_nodeid, hash,
			  ls->ls_num_nodes);
		*r_nodeid = -1;
		return -EINVAL;
	}

 retry:
	/* called before the bucket spinlock is taken; get_rsb_struct()
	 * below may return -EAGAIN, which sends us back here
	 */
	error = pre_rsb_struct(ls);
	if (error < 0)
		return error;

	spin_lock(&ls->ls_rsbtbl[b].lock);
	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
	if (!error) {
		/* because the rsb is active, we need to lock_rsb before
		 * checking/changing res_master_nodeid
		 */

		/* take a reference so the rsb survives dropping the
		 * bucket lock, then switch to the rsb's own lock
		 */
		hold_rsb(r);
		spin_unlock(&ls->ls_rsbtbl[b].lock);
		lock_rsb(r);

		__dlm_master_lookup(ls, r, our_nodeid, from_nodeid, false,
				    flags, r_nodeid, result);

		/* the rsb was active */
		unlock_rsb(r);
		put_rsb(r);

		return 0;
	}

	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
	if (error)
		goto not_found;

	/* because the rsb is inactive (on toss list), it's not refcounted
	 * and lock_rsb is not used, but is protected by the rsbtbl lock
	 */

	__dlm_master_lookup(ls, r, our_nodeid, from_nodeid, true, flags,
			    r_nodeid, result);

	/* refresh the toss timestamp; shrink_bucket() compares it
	 * against ci_toss_secs before freeing a tossed rsb
	 */
	r->res_toss_time = jiffies;
	/* the rsb was inactive (on toss list) */
	spin_unlock(&ls->ls_rsbtbl[b].lock);

	return 0;

 not_found:
	/* no record in either tree: create one naming the asking node as
	 * master and place it directly on the toss tree
	 */
	error = get_rsb_struct(ls, name, len, &r);
	if (error == -EAGAIN) {
		spin_unlock(&ls->ls_rsbtbl[b].lock);
		goto retry;
	}
	if (error)
		goto out_unlock;

	r->res_hash = hash;
	r->res_bucket = b;
	r->res_dir_nodeid = our_nodeid;
	r->res_master_nodeid = from_nodeid;
	r->res_nodeid = from_nodeid;
	kref_init(&r->res_ref);
	r->res_toss_time = jiffies;

	error = rsb_insert(r, &ls->ls_rsbtbl[b].toss);
	if (error) {
		/* should never happen */
		dlm_free_rsb(r);
		spin_unlock(&ls->ls_rsbtbl[b].lock);
		goto retry;
	}

	if (result)
		*result = DLM_LU_ADD;
	*r_nodeid = from_nodeid;
 out_unlock:
	spin_unlock(&ls->ls_rsbtbl[b].lock);
	return error;
}
1101
1102static void dlm_dump_rsb_hash(struct dlm_ls *ls, uint32_t hash)
1103{
1104	struct rb_node *n;
1105	struct dlm_rsb *r;
1106	int i;
1107
1108	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
1109		spin_lock(&ls->ls_rsbtbl[i].lock);
1110		for (n = rb_first(&ls->ls_rsbtbl[i].keep); n; n = rb_next(n)) {
1111			r = rb_entry(n, struct dlm_rsb, res_hashnode);
1112			if (r->res_hash == hash)
1113				dlm_dump_rsb(r);
1114		}
1115		spin_unlock(&ls->ls_rsbtbl[i].lock);
1116	}
1117}
1118
1119void dlm_dump_rsb_name(struct dlm_ls *ls, char *name, int len)
1120{
1121	struct dlm_rsb *r = NULL;
1122	uint32_t hash, b;
1123	int error;
1124
1125	hash = jhash(name, len, 0);
1126	b = hash & (ls->ls_rsbtbl_size - 1);
1127
1128	spin_lock(&ls->ls_rsbtbl[b].lock);
1129	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
1130	if (!error)
1131		goto out_dump;
1132
1133	error = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
1134	if (error)
1135		goto out;
1136 out_dump:
1137	dlm_dump_rsb(r);
1138 out:
1139	spin_unlock(&ls->ls_rsbtbl[b].lock);
1140}
1141
1142static void toss_rsb(struct kref *kref)
1143{
1144	struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
1145	struct dlm_ls *ls = r->res_ls;
1146
1147	DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
1148	kref_init(&r->res_ref);
1149	rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[r->res_bucket].keep);
1150	rsb_insert(r, &ls->ls_rsbtbl[r->res_bucket].toss);
1151	r->res_toss_time = jiffies;
1152	ls->ls_rsbtbl[r->res_bucket].flags |= DLM_RTF_SHRINK;
1153	if (r->res_lvbptr) {
1154		dlm_free_lvb(r->res_lvbptr);
1155		r->res_lvbptr = NULL;
1156	}
1157}
1158
1159/* See comment for unhold_lkb */
1160
1161static void unhold_rsb(struct dlm_rsb *r)
1162{
1163	int rv;
1164	rv = kref_put(&r->res_ref, toss_rsb);
1165	DLM_ASSERT(!rv, dlm_dump_rsb(r););
1166}
1167
/* kref release function used when a tossed rsb is about to be freed.
   It only sanity-checks that the rsb is no longer linked on any of its
   lists; it neither unlinks nor frees anything itself. */

static void kill_rsb(struct kref *kref)
{
	struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);

	/* All work is done after the return from kref_put(): the caller
	   (see shrink_bucket) does the rb_erase and dlm_free_rsb itself
	   once kref_put() reports that the last reference is gone. */

	DLM_ASSERT(list_empty(&r->res_lookup), dlm_dump_rsb(r););
	DLM_ASSERT(list_empty(&r->res_grantqueue), dlm_dump_rsb(r););
	DLM_ASSERT(list_empty(&r->res_convertqueue), dlm_dump_rsb(r););
	DLM_ASSERT(list_empty(&r->res_waitqueue), dlm_dump_rsb(r););
	DLM_ASSERT(list_empty(&r->res_root_list), dlm_dump_rsb(r););
	DLM_ASSERT(list_empty(&r->res_recover_list), dlm_dump_rsb(r););
}
1182
1183/* Attaching/detaching lkb's from rsb's is for rsb reference counting.
1184   The rsb must exist as long as any lkb's for it do. */
1185
/* Bind @lkb to @r: take a reference on the rsb with hold_rsb() and record
   it in lkb->lkb_resource.  detach_lkb() is the inverse. */
static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	hold_rsb(r);
	lkb->lkb_resource = r;
}
1191
1192static void detach_lkb(struct dlm_lkb *lkb)
1193{
1194	if (lkb->lkb_resource) {
1195		put_rsb(lkb->lkb_resource);
1196		lkb->lkb_resource = NULL;
1197	}
1198}
1199
1200static int _create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret,
1201		       int start, int end)
1202{
1203	struct dlm_lkb *lkb;
1204	int rv;
1205
1206	lkb = dlm_allocate_lkb(ls);
1207	if (!lkb)
1208		return -ENOMEM;
1209
1210	lkb->lkb_nodeid = -1;
1211	lkb->lkb_grmode = DLM_LOCK_IV;
1212	kref_init(&lkb->lkb_ref);
1213	INIT_LIST_HEAD(&lkb->lkb_ownqueue);
1214	INIT_LIST_HEAD(&lkb->lkb_rsb_lookup);
1215#ifdef CONFIG_DLM_DEPRECATED_API
1216	INIT_LIST_HEAD(&lkb->lkb_time_list);
1217#endif
1218	INIT_LIST_HEAD(&lkb->lkb_cb_list);
1219	mutex_init(&lkb->lkb_cb_mutex);
1220	INIT_WORK(&lkb->lkb_cb_work, dlm_callback_work);
1221
1222	idr_preload(GFP_NOFS);
1223	spin_lock(&ls->ls_lkbidr_spin);
1224	rv = idr_alloc(&ls->ls_lkbidr, lkb, start, end, GFP_NOWAIT);
1225	if (rv >= 0)
1226		lkb->lkb_id = rv;
1227	spin_unlock(&ls->ls_lkbidr_spin);
1228	idr_preload_end();
1229
1230	if (rv < 0) {
1231		log_error(ls, "create_lkb idr error %d", rv);
1232		dlm_free_lkb(lkb);
1233		return rv;
1234	}
1235
1236	*lkb_ret = lkb;
1237	return 0;
1238}
1239
/* Create an lkb with an id taken from the range start=1, end=0.
   NOTE(review): per the idr_alloc() documentation an end <= 0 means
   "no upper limit", so this should hand out any id >= 1 -- confirm. */
static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
{
	return _create_lkb(ls, lkb_ret, 1, 0);
}
1244
1245static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
1246{
1247	struct dlm_lkb *lkb;
1248
1249	spin_lock(&ls->ls_lkbidr_spin);
1250	lkb = idr_find(&ls->ls_lkbidr, lkid);
1251	if (lkb)
1252		kref_get(&lkb->lkb_ref);
1253	spin_unlock(&ls->ls_lkbidr_spin);
1254
1255	*lkb_ret = lkb;
1256	return lkb ? 0 : -ENOENT;
1257}
1258
/* kref release function for an lkb.  It only asserts that the lkb is no
   longer on a status queue (lkb_status == 0); the teardown itself is
   done by __put_lkb(). */
static void kill_lkb(struct kref *kref)
{
	struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);

	/* All work is done after the return from kref_put_lock() so that
	   __put_lkb() can drop ls_lkbidr_spin before the detach_lkb */

	DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
}
1268
1269/* __put_lkb() is used when an lkb may not have an rsb attached to
1270   it so we need to provide the lockspace explicitly */
1271
1272static int __put_lkb(struct dlm_ls *ls, struct dlm_lkb *lkb)
1273{
1274	uint32_t lkid = lkb->lkb_id;
1275	int rv;
1276
1277	rv = kref_put_lock(&lkb->lkb_ref, kill_lkb,
1278			   &ls->ls_lkbidr_spin);
1279	if (rv) {
1280		idr_remove(&ls->ls_lkbidr, lkid);
1281		spin_unlock(&ls->ls_lkbidr_spin);
1282
1283		detach_lkb(lkb);
1284
1285		/* for local/process lkbs, lvbptr points to caller's lksb */
1286		if (lkb->lkb_lvbptr && is_master_copy(lkb))
1287			dlm_free_lvb(lkb->lkb_lvbptr);
1288		dlm_free_lkb(lkb);
1289	}
1290
1291	return rv;
1292}
1293
1294int dlm_put_lkb(struct dlm_lkb *lkb)
1295{
1296	struct dlm_ls *ls;
1297
1298	DLM_ASSERT(lkb->lkb_resource, dlm_print_lkb(lkb););
1299	DLM_ASSERT(lkb->lkb_resource->res_ls, dlm_print_lkb(lkb););
1300
1301	ls = lkb->lkb_resource->res_ls;
1302	return __put_lkb(ls, lkb);
1303}
1304
1305/* This is only called to add a reference when the code already holds
1306   a valid reference to the lkb, so there's no need for locking. */
1307
static inline void hold_lkb(struct dlm_lkb *lkb)
{
	/* plain kref_get(): unlike find_lkb(), ls_lkbidr_spin is not taken */
	kref_get(&lkb->lkb_ref);
}
1312
/* kref release function passed by unhold_lkb().  Reaching it means
   unhold_lkb() dropped the last reference, which its callers promise
   never happens, so it asserts unconditionally. */
static void unhold_lkb_assert(struct kref *kref)
{
	struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);

	DLM_ASSERT(false, dlm_print_lkb(lkb););
}
1319
1320/* This is called when we need to remove a reference and are certain
1321   it's not the last ref.  e.g. del_lkb is always called between a
1322   find_lkb/put_lkb and is always the inverse of a previous add_lkb.
1323   put_lkb would work fine, but would involve unnecessary locking */
1324
static inline void unhold_lkb(struct dlm_lkb *lkb)
{
	/* the release callback asserts: this must never be the final put */
	kref_put(&lkb->lkb_ref, unhold_lkb_assert);
}
1329
1330static void lkb_add_ordered(struct list_head *new, struct list_head *head,
1331			    int mode)
1332{
1333	struct dlm_lkb *lkb = NULL, *iter;
1334
1335	list_for_each_entry(iter, head, lkb_statequeue)
1336		if (iter->lkb_rqmode < mode) {
1337			lkb = iter;
1338			list_add_tail(new, &iter->lkb_statequeue);
1339			break;
1340		}
1341
1342	if (!lkb)
1343		list_add_tail(new, head);
1344}
1345
1346/* add/remove lkb to rsb's grant/convert/wait queue */
1347
1348static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status)
1349{
1350	kref_get(&lkb->lkb_ref);
1351
1352	DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
1353
1354	lkb->lkb_timestamp = ktime_get();
1355
1356	lkb->lkb_status = status;
1357
1358	switch (status) {
1359	case DLM_LKSTS_WAITING:
1360		if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
1361			list_add(&lkb->lkb_statequeue, &r->res_waitqueue);
1362		else
1363			list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue);
1364		break;
1365	case DLM_LKSTS_GRANTED:
1366		/* convention says granted locks kept in order of grmode */
1367		lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue,
1368				lkb->lkb_grmode);
1369		break;
1370	case DLM_LKSTS_CONVERT:
1371		if (lkb->lkb_exflags & DLM_LKF_HEADQUE)
1372			list_add(&lkb->lkb_statequeue, &r->res_convertqueue);
1373		else
1374			list_add_tail(&lkb->lkb_statequeue,
1375				      &r->res_convertqueue);
1376		break;
1377	default:
1378		DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status););
1379	}
1380}
1381
/* Take @lkb off whichever status queue it is on: clear lkb_status, unlink
   lkb_statequeue and drop the reference add_lkb() took.  @r is not used
   by the body. */
static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	lkb->lkb_status = 0;
	list_del(&lkb->lkb_statequeue);
	unhold_lkb(lkb);
}
1388
/* Move @lkb from its current status queue to the queue for @sts. */
static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts)
{
	/* del_lkb() drops a reference through unhold_lkb(), which asserts
	   that it is never the last one; the temporary hold taken here
	   guarantees that while the lkb is between queues */
	hold_lkb(lkb);
	del_lkb(r, lkb);
	add_lkb(r, lkb, sts);
	unhold_lkb(lkb);
}
1396
1397static int msg_reply_type(int mstype)
1398{
1399	switch (mstype) {
1400	case DLM_MSG_REQUEST:
1401		return DLM_MSG_REQUEST_REPLY;
1402	case DLM_MSG_CONVERT:
1403		return DLM_MSG_CONVERT_REPLY;
1404	case DLM_MSG_UNLOCK:
1405		return DLM_MSG_UNLOCK_REPLY;
1406	case DLM_MSG_CANCEL:
1407		return DLM_MSG_CANCEL_REPLY;
1408	case DLM_MSG_LOOKUP:
1409		return DLM_MSG_LOOKUP_REPLY;
1410	}
1411	return -1;
1412}
1413
1414/* add/remove lkb from global waiters list of lkb's waiting for
1415   a reply from a remote node */
1416
/*
 * Record that @lkb is waiting for a reply to a message of type @mstype
 * sent to @to_nodeid.  Runs under ls_waiters_mutex.
 *
 * Returns 0 on success.
 * Returns -EINVAL when an overlapping unlock is already pending, or when
 * an overlapping cancel is pending and @mstype is another cancel.
 * Returns -EBUSY when the lkb is already waiting and @mstype is neither
 * an unlock nor a cancel.
 * Every error is also logged.
 */
static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
{
	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
	int error = 0;

	mutex_lock(&ls->ls_waiters_mutex);

	if (is_overlap_unlock(lkb) ||
	    (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL))) {
		error = -EINVAL;
		goto out;
	}

	/* an operation is already outstanding on this lkb: only an unlock
	   or a cancel may be stacked on top of it, recorded as an overlap
	   flag plus an extra wait count and reference */
	if (lkb->lkb_wait_type || is_overlap_cancel(lkb)) {
		switch (mstype) {
		case DLM_MSG_UNLOCK:
			lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
			break;
		case DLM_MSG_CANCEL:
			lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
			break;
		default:
			error = -EBUSY;
			goto out;
		}
		lkb->lkb_wait_count++;
		hold_lkb(lkb);

		log_debug(ls, "addwait %x cur %d overlap %d count %d f %x",
			  lkb->lkb_id, lkb->lkb_wait_type, mstype,
			  lkb->lkb_wait_count, lkb->lkb_flags);
		goto out;
	}

	/* first outstanding operation: nothing may be counted yet */
	DLM_ASSERT(!lkb->lkb_wait_count,
		   dlm_print_lkb(lkb);
		   printk("wait_count %d\n", lkb->lkb_wait_count););

	lkb->lkb_wait_count++;
	lkb->lkb_wait_type = mstype;
	lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */
	/* the waiters list holds its own reference on the lkb */
	hold_lkb(lkb);
	list_add(&lkb->lkb_wait_reply, &ls->ls_waiters);
 out:
	if (error)
		log_error(ls, "addwait error %x %d flags %x %d %d %s",
			  lkb->lkb_id, error, lkb->lkb_flags, mstype,
			  lkb->lkb_wait_type, lkb->lkb_resource->res_name);
	mutex_unlock(&ls->ls_waiters_mutex);
	return error;
}
1468
1469/* We clear the RESEND flag because we might be taking an lkb off the waiters
1470   list as part of process_requestqueue (e.g. a lookup that has an optimized
1471   request reply on the requestqueue) between dlm_recover_waiters_pre() which
1472   set RESEND and dlm_recover_waiters_post() */
1473
/*
 * Account for a reply of type @mstype received for @lkb: clear the
 * matching wait/overlap state, drop one wait count and its reference, and
 * unlink the lkb from the waiters list once the count reaches zero.
 *
 * @ms is the reply message and may be NULL (remove_from_waiters() passes
 * NULL).  This function takes no lock itself; the wrappers in this file
 * take ls_waiters_mutex around it, except for stub messages.
 *
 * Returns 0 when wait state was removed, -1 when the reply did not match
 * anything the lkb was waiting for.
 */
static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype,
				struct dlm_message *ms)
{
	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
	int overlap_done = 0;

	if (is_overlap_unlock(lkb) && (mstype == DLM_MSG_UNLOCK_REPLY)) {
		log_debug(ls, "remwait %x unlock_reply overlap", lkb->lkb_id);
		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
		overlap_done = 1;
		goto out_del;
	}

	if (is_overlap_cancel(lkb) && (mstype == DLM_MSG_CANCEL_REPLY)) {
		log_debug(ls, "remwait %x cancel_reply overlap", lkb->lkb_id);
		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
		overlap_done = 1;
		goto out_del;
	}

	/* Cancel state was preemptively cleared by a successful convert,
	   see next comment, nothing to do. */

	if ((mstype == DLM_MSG_CANCEL_REPLY) &&
	    (lkb->lkb_wait_type != DLM_MSG_CANCEL)) {
		log_debug(ls, "remwait %x cancel_reply wait_type %d",
			  lkb->lkb_id, lkb->lkb_wait_type);
		return -1;
	}

	/* Remove for the convert reply, and preemptively remove for the
	   cancel reply.  A convert has been granted while there's still
	   an outstanding cancel on it (the cancel is moot and the result
	   in the cancel reply should be 0).  We preempt the cancel reply
	   because the app gets the convert result and then can follow up
	   with another op, like convert.  This subsequent op would see the
	   lingering state of the cancel and fail with -EBUSY. */

	if ((mstype == DLM_MSG_CONVERT_REPLY) &&
	    (lkb->lkb_wait_type == DLM_MSG_CONVERT) &&
	    is_overlap_cancel(lkb) && ms && !ms->m_result) {
		log_debug(ls, "remwait %x convert_reply zap overlap_cancel",
			  lkb->lkb_id);
		lkb->lkb_wait_type = 0;
		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
		/* this count/reference pair belonged to the cancel; the
		   convert's own pair is dropped at out_del */
		lkb->lkb_wait_count--;
		unhold_lkb(lkb);
		goto out_del;
	}

	/* N.B. type of reply may not always correspond to type of original
	   msg due to lookup->request optimization, verify others? */

	if (lkb->lkb_wait_type) {
		lkb->lkb_wait_type = 0;
		goto out_del;
	}

	log_error(ls, "remwait error %x remote %d %x msg %d flags %x no wait",
		  lkb->lkb_id, ms ? le32_to_cpu(ms->m_header.h_nodeid) : 0,
		  lkb->lkb_remid, mstype, lkb->lkb_flags);
	return -1;

 out_del:
	/* the force-unlock/cancel has completed and we haven't recvd a reply
	   to the op that was in progress prior to the unlock/cancel; we
	   give up on any reply to the earlier op.  FIXME: not sure when/how
	   this would happen */

	if (overlap_done && lkb->lkb_wait_type) {
		log_error(ls, "remwait error %x reply %d wait_type %d overlap",
			  lkb->lkb_id, mstype, lkb->lkb_wait_type);
		lkb->lkb_wait_count--;
		unhold_lkb(lkb);
		lkb->lkb_wait_type = 0;
	}

	DLM_ASSERT(lkb->lkb_wait_count, dlm_print_lkb(lkb););

	lkb->lkb_flags &= ~DLM_IFL_RESEND;
	lkb->lkb_wait_count--;
	/* only the final count takes the lkb off the waiters list */
	if (!lkb->lkb_wait_count)
		list_del_init(&lkb->lkb_wait_reply);
	unhold_lkb(lkb);
	return 0;
}
1560
/* Locked wrapper around _remove_from_waiters() for callers that have no
   reply message: takes ls_waiters_mutex and passes a NULL ms.  Returns
   the _remove_from_waiters() result (0 or -1). */
static int remove_from_waiters(struct dlm_lkb *lkb, int mstype)
{
	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
	int error;

	mutex_lock(&ls->ls_waiters_mutex);
	error = _remove_from_waiters(lkb, mstype, NULL);
	mutex_unlock(&ls->ls_waiters_mutex);
	return error;
}
1571
1572/* Handles situations where we might be processing a "fake" or "stub" reply in
1573   which we can't try to take waiters_mutex again. */
1574
1575static int remove_from_waiters_ms(struct dlm_lkb *lkb, struct dlm_message *ms)
1576{
1577	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
1578	int error;
1579
1580	if (ms->m_flags != cpu_to_le32(DLM_IFL_STUB_MS))
1581		mutex_lock(&ls->ls_waiters_mutex);
1582	error = _remove_from_waiters(lkb, le32_to_cpu(ms->m_type), ms);
1583	if (ms->m_flags != cpu_to_le32(DLM_IFL_STUB_MS))
1584		mutex_unlock(&ls->ls_waiters_mutex);
1585	return error;
1586}
1587
1588/* If there's an rsb for the same resource being removed, ensure
1589 * that the remove message is sent before the new lookup message.
1590 */
1591
/* True while a remove is pending in @ls (ls_remove_len is non-zero) and
 * rsb_cmp() returns 0 for rsb @r against the pending remove name.
 * Both arguments are parenthesized so that expression arguments expand
 * safely.
 */
#define DLM_WAIT_PENDING_COND(ls, r)		\
	((ls)->ls_remove_len &&			\
	 !rsb_cmp((r), (ls)->ls_remove_name,	\
		  (ls)->ls_remove_len))
1596
1597static void wait_pending_remove(struct dlm_rsb *r)
1598{
1599	struct dlm_ls *ls = r->res_ls;
1600 restart:
1601	spin_lock(&ls->ls_remove_spin);
1602	if (DLM_WAIT_PENDING_COND(ls, r)) {
1603		log_debug(ls, "delay lookup for remove dir %d %s",
1604			  r->res_dir_nodeid, r->res_name);
1605		spin_unlock(&ls->ls_remove_spin);
1606		wait_event(ls->ls_remove_wait, !DLM_WAIT_PENDING_COND(ls, r));
1607		goto restart;
1608	}
1609	spin_unlock(&ls->ls_remove_spin);
1610}
1611
1612/*
1613 * ls_remove_spin protects ls_remove_name and ls_remove_len which are
1614 * read by other threads in wait_pending_remove.  ls_remove_names
1615 * and ls_remove_lens are only used by the scan thread, so they do
1616 * not need protection.
1617 */
1618
/*
 * Free expired rsbs from the toss tree of bucket @b.
 *
 * Pass 1 runs under the bucket lock.  It frees tossed rsbs that need no
 * message, and records the names of up to DLM_REMOVE_NAMES_MAX rsbs that
 * we master but whose directory node is remote.
 *
 * Pass 2 looks each recorded name up again, re-checks it, takes it off
 * the toss tree and calls send_remove() while the name is published in
 * ls_remove_name.
 *
 * Does nothing unless the bucket has DLM_RTF_SHRINK set.
 */
static void shrink_bucket(struct dlm_ls *ls, int b)
{
	struct rb_node *n, *next;
	struct dlm_rsb *r;
	char *name;
	int our_nodeid = dlm_our_nodeid();
	int remote_count = 0;
	int need_shrink = 0;
	int i, len, rv;

	memset(&ls->ls_remove_lens, 0, sizeof(int) * DLM_REMOVE_NAMES_MAX);

	spin_lock(&ls->ls_rsbtbl[b].lock);

	if (!(ls->ls_rsbtbl[b].flags & DLM_RTF_SHRINK)) {
		spin_unlock(&ls->ls_rsbtbl[b].lock);
		return;
	}

	/* next is fetched first because r may be erased and freed below */
	for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = next) {
		next = rb_next(n);
		r = rb_entry(n, struct dlm_rsb, res_hashnode);

		/* If we're the directory record for this rsb, and
		   we're not the master of it, then we need to wait
		   for the master node to send us a dir remove
		   before removing the dir record. */

		if (!dlm_no_directory(ls) &&
		    (r->res_master_nodeid != our_nodeid) &&
		    (dlm_dir_nodeid(r) == our_nodeid)) {
			continue;
		}

		/* at least one rsb here can be freed by us, now or later */
		need_shrink = 1;

		/* not tossed for ci_toss_secs yet */
		if (!time_after_eq(jiffies, r->res_toss_time +
				   dlm_config.ci_toss_secs * HZ)) {
			continue;
		}

		if (!dlm_no_directory(ls) &&
		    (r->res_master_nodeid == our_nodeid) &&
		    (dlm_dir_nodeid(r) != our_nodeid)) {

			/* We're the master of this rsb but we're not
			   the directory record, so we need to tell the
			   dir node to remove the dir record. */

			ls->ls_remove_lens[remote_count] = r->res_length;
			memcpy(ls->ls_remove_names[remote_count], r->res_name,
			       DLM_RESNAME_MAXLEN);
			remote_count++;

			/* name table full; the rest waits for a later scan */
			if (remote_count >= DLM_REMOVE_NAMES_MAX)
				break;
			continue;
		}

		if (!kref_put(&r->res_ref, kill_rsb)) {
			log_error(ls, "tossed rsb in use %s", r->res_name);
			continue;
		}

		rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
		dlm_free_rsb(r);
	}

	if (need_shrink)
		ls->ls_rsbtbl[b].flags |= DLM_RTF_SHRINK;
	else
		ls->ls_rsbtbl[b].flags &= ~DLM_RTF_SHRINK;
	spin_unlock(&ls->ls_rsbtbl[b].lock);

	/*
	 * While searching for rsb's to free, we found some that require
	 * remote removal.  We leave them in place and find them again here
	 * so there is a very small gap between removing them from the toss
	 * list and sending the removal.  Keeping this gap small is
	 * important to keep us (the master node) from being out of sync
	 * with the remote dir node for very long.
	 *
	 * From the time the rsb is removed from toss until just after
	 * send_remove, the rsb name is saved in ls_remove_name.  A new
	 * lookup checks this to ensure that a new lookup message for the
	 * same resource name is not sent just before the remove message.
	 */

	for (i = 0; i < remote_count; i++) {
		name = ls->ls_remove_names[i];
		len = ls->ls_remove_lens[i];

		/* the bucket lock was dropped since pass 1, so every
		   condition is checked again before acting */
		spin_lock(&ls->ls_rsbtbl[b].lock);
		rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
		if (rv) {
			spin_unlock(&ls->ls_rsbtbl[b].lock);
			log_debug(ls, "remove_name not toss %s", name);
			continue;
		}

		if (r->res_master_nodeid != our_nodeid) {
			spin_unlock(&ls->ls_rsbtbl[b].lock);
			log_debug(ls, "remove_name master %d dir %d our %d %s",
				  r->res_master_nodeid, r->res_dir_nodeid,
				  our_nodeid, name);
			continue;
		}

		if (r->res_dir_nodeid == our_nodeid) {
			/* should never happen */
			spin_unlock(&ls->ls_rsbtbl[b].lock);
			log_error(ls, "remove_name dir %d master %d our %d %s",
				  r->res_dir_nodeid, r->res_master_nodeid,
				  our_nodeid, name);
			continue;
		}

		if (!time_after_eq(jiffies, r->res_toss_time +
				   dlm_config.ci_toss_secs * HZ)) {
			spin_unlock(&ls->ls_rsbtbl[b].lock);
			log_debug(ls, "remove_name toss_time %lu now %lu %s",
				  r->res_toss_time, jiffies, name);
			continue;
		}

		if (!kref_put(&r->res_ref, kill_rsb)) {
			spin_unlock(&ls->ls_rsbtbl[b].lock);
			log_error(ls, "remove_name in use %s", name);
			continue;
		}

		rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);

		/* block lookup of same name until we've sent remove */
		spin_lock(&ls->ls_remove_spin);
		ls->ls_remove_len = len;
		memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN);
		spin_unlock(&ls->ls_remove_spin);
		spin_unlock(&ls->ls_rsbtbl[b].lock);

		send_remove(r);

		/* allow lookup of name again */
		spin_lock(&ls->ls_remove_spin);
		ls->ls_remove_len = 0;
		memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN);
		spin_unlock(&ls->ls_remove_spin);
		/* release anyone sleeping in wait_pending_remove() */
		wake_up(&ls->ls_remove_wait);

		dlm_free_rsb(r);
	}
}
1771
1772void dlm_scan_rsbs(struct dlm_ls *ls)
1773{
1774	int i;
1775
1776	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
1777		shrink_bucket(ls, i);
1778		if (dlm_locking_stopped(ls))
1779			break;
1780		cond_resched();
1781	}
1782}
1783
1784#ifdef CONFIG_DLM_DEPRECATED_API
1785static void add_timeout(struct dlm_lkb *lkb)
1786{
1787	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
1788
1789	if (is_master_copy(lkb))
1790		return;
1791
1792	if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) &&
1793	    !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) {
1794		lkb->lkb_flags |= DLM_IFL_WATCH_TIMEWARN;
1795		goto add_it;
1796	}
1797	if (lkb->lkb_exflags & DLM_LKF_TIMEOUT)
1798		goto add_it;
1799	return;
1800
1801 add_it:
1802	DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb););
1803	mutex_lock(&ls->ls_timeout_mutex);
1804	hold_lkb(lkb);
1805	list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout);
1806	mutex_unlock(&ls->ls_timeout_mutex);
1807}
1808
/* Take @lkb off the lockspace timeout list, if it is on it, and drop the
   reference add_timeout() took.  Safe to call for an lkb that was never
   added: an empty lkb_time_list is left alone. */
static void del_timeout(struct dlm_lkb *lkb)
{
	struct dlm_ls *ls = lkb->lkb_resource->res_ls;

	mutex_lock(&ls->ls_timeout_mutex);
	if (!list_empty(&lkb->lkb_time_list)) {
		/* list_del_init keeps the list_empty() test above valid
		   for later calls */
		list_del_init(&lkb->lkb_time_list);
		unhold_lkb(lkb);
	}
	mutex_unlock(&ls->ls_timeout_mutex);
}
1820
1821/* FIXME: is it safe to look at lkb_exflags, lkb_flags, lkb_timestamp, and
1822   lkb_lksb_timeout without lock_rsb?  Note: we can't lock timeout_mutex
1823   and then lock rsb because of lock ordering in add_timeout.  We may need
1824   to specify some special timeout-related bits in the lkb that are just to
1825   be accessed under the timeout_mutex. */
1826
1827void dlm_scan_timeout(struct dlm_ls *ls)
1828{
1829	struct dlm_rsb *r;
1830	struct dlm_lkb *lkb = NULL, *iter;
1831	int do_cancel, do_warn;
1832	s64 wait_us;
1833
1834	for (;;) {
1835		if (dlm_locking_stopped(ls))
1836			break;
1837
1838		do_cancel = 0;
1839		do_warn = 0;
1840		mutex_lock(&ls->ls_timeout_mutex);
1841		list_for_each_entry(iter, &ls->ls_timeout, lkb_time_list) {
1842
1843			wait_us = ktime_to_us(ktime_sub(ktime_get(),
1844							iter->lkb_timestamp));
1845
1846			if ((iter->lkb_exflags & DLM_LKF_TIMEOUT) &&
1847			    wait_us >= (iter->lkb_timeout_cs * 10000))
1848				do_cancel = 1;
1849
1850			if ((iter->lkb_flags & DLM_IFL_WATCH_TIMEWARN) &&
1851			    wait_us >= dlm_config.ci_timewarn_cs * 10000)
1852				do_warn = 1;
1853
1854			if (!do_cancel && !do_warn)
1855				continue;
1856			hold_lkb(iter);
1857			lkb = iter;
1858			break;
1859		}
1860		mutex_unlock(&ls->ls_timeout_mutex);
1861
1862		if (!lkb)
1863			break;
1864
1865		r = lkb->lkb_resource;
1866		hold_rsb(r);
1867		lock_rsb(r);
1868
1869		if (do_warn) {
1870			/* clear flag so we only warn once */
1871			lkb->lkb_flags &= ~DLM_IFL_WATCH_TIMEWARN;
1872			if (!(lkb->lkb_exflags & DLM_LKF_TIMEOUT))
1873				del_timeout(lkb);
1874			dlm_timeout_warn(lkb);
1875		}
1876
1877		if (do_cancel) {
1878			log_debug(ls, "timeout cancel %x node %d %s",
1879				  lkb->lkb_id, lkb->lkb_nodeid, r->res_name);
1880			lkb->lkb_flags &= ~DLM_IFL_WATCH_TIMEWARN;
1881			lkb->lkb_flags |= DLM_IFL_TIMEOUT_CANCEL;
1882			del_timeout(lkb);
1883			_cancel_lock(r, lkb);
1884		}
1885
1886		unlock_rsb(r);
1887		unhold_rsb(r);
1888		dlm_put_lkb(lkb);
1889	}
1890}
1891
1892/* This is only called by dlm_recoverd, and we rely on dlm_ls_stop() stopping
1893   dlm_recoverd before checking/setting ls_recover_begin. */
1894
1895void dlm_adjust_timeouts(struct dlm_ls *ls)
1896{
1897	struct dlm_lkb *lkb;
1898	u64 adj_us = jiffies_to_usecs(jiffies - ls->ls_recover_begin);
1899
1900	ls->ls_recover_begin = 0;
1901	mutex_lock(&ls->ls_timeout_mutex);
1902	list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list)
1903		lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us);
1904	mutex_unlock(&ls->ls_timeout_mutex);
1905}
1906#else
/* no-op stub used when CONFIG_DLM_DEPRECATED_API is not set */
static void add_timeout(struct dlm_lkb *lkb) { }
/* no-op stub used when CONFIG_DLM_DEPRECATED_API is not set */
static void del_timeout(struct dlm_lkb *lkb) { }
1909#endif
1910
1911/* lkb is master or local copy */
1912
1913static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1914{
1915	int b, len = r->res_ls->ls_lvblen;
1916
1917	/* b=1 lvb returned to caller
1918	   b=0 lvb written to rsb or invalidated
1919	   b=-1 do nothing */
1920
1921	b =  dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
1922
1923	if (b == 1) {
1924		if (!lkb->lkb_lvbptr)
1925			return;
1926
1927		if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
1928			return;
1929
1930		if (!r->res_lvbptr)
1931			return;
1932
1933		memcpy(lkb->lkb_lvbptr, r->res_lvbptr, len);
1934		lkb->lkb_lvbseq = r->res_lvbseq;
1935
1936	} else if (b == 0) {
1937		if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
1938			rsb_set_flag(r, RSB_VALNOTVALID);
1939			return;
1940		}
1941
1942		if (!lkb->lkb_lvbptr)
1943			return;
1944
1945		if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
1946			return;
1947
1948		if (!r->res_lvbptr)
1949			r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
1950
1951		if (!r->res_lvbptr)
1952			return;
1953
1954		memcpy(r->res_lvbptr, lkb->lkb_lvbptr, len);
1955		r->res_lvbseq++;
1956		lkb->lkb_lvbseq = r->res_lvbseq;
1957		rsb_clear_flag(r, RSB_VALNOTVALID);
1958	}
1959
1960	if (rsb_flag(r, RSB_VALNOTVALID))
1961		lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID;
1962}
1963
1964static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
1965{
1966	if (lkb->lkb_grmode < DLM_LOCK_PW)
1967		return;
1968
1969	if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
1970		rsb_set_flag(r, RSB_VALNOTVALID);
1971		return;
1972	}
1973
1974	if (!lkb->lkb_lvbptr)
1975		return;
1976
1977	if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
1978		return;
1979
1980	if (!r->res_lvbptr)
1981		r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
1982
1983	if (!r->res_lvbptr)
1984		return;
1985
1986	memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
1987	r->res_lvbseq++;
1988	rsb_clear_flag(r, RSB_VALNOTVALID);
1989}
1990
1991/* lkb is process copy (pc) */
1992
1993static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
1994			    struct dlm_message *ms)
1995{
1996	int b;
1997
1998	if (!lkb->lkb_lvbptr)
1999		return;
2000
2001	if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
2002		return;
2003
2004	b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
2005	if (b == 1) {
2006		int len = receive_extralen(ms);
2007		if (len > r->res_ls->ls_lvblen)
2008			len = r->res_ls->ls_lvblen;
2009		memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
2010		lkb->lkb_lvbseq = le32_to_cpu(ms->m_lvbseq);
2011	}
2012}
2013
2014/* Manipulate lkb's on rsb's convert/granted/waiting queues
2015   remove_lock -- used for unlock, removes lkb from granted
2016   revert_lock -- used for cancel, moves lkb from convert to granted
2017   grant_lock  -- used for request and convert, adds lkb to granted or
2018                  moves lkb from convert or waiting to granted
2019
2020   Each of these is used for master or local copy lkb's.  There is
2021   also a _pc() variation used to make the corresponding change on
2022   a process copy (pc) lkb. */
2023
/* Common part of remove_lock()/remove_lock_pc(): take the lkb off its
   status queue, reset its granted mode to DLM_LOCK_IV and drop the
   creation reference. */
static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	del_lkb(r, lkb);
	lkb->lkb_grmode = DLM_LOCK_IV;
	/* this unhold undoes the original ref from create_lkb()
	   so this leads to the lkb being freed */
	unhold_lkb(lkb);
}
2032
/* Unlock on a master or local copy lkb: apply the unlock-time lvb rules
   to the rsb first, then remove the lock. */
static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	set_lvb_unlock(r, lkb);
	_remove_lock(r, lkb);
}
2038
/* Unlock on a process copy lkb: same as remove_lock() but without the
   set_lvb_unlock() step. */
static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	_remove_lock(r, lkb);
}
2043
2044/* returns: 0 did nothing
2045	    1 moved lock to granted
2046	   -1 removed lock */
2047
2048static int revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2049{
2050	int rv = 0;
2051
2052	lkb->lkb_rqmode = DLM_LOCK_IV;
2053
2054	switch (lkb->lkb_status) {
2055	case DLM_LKSTS_GRANTED:
2056		break;
2057	case DLM_LKSTS_CONVERT:
2058		move_lkb(r, lkb, DLM_LKSTS_GRANTED);
2059		rv = 1;
2060		break;
2061	case DLM_LKSTS_WAITING:
2062		del_lkb(r, lkb);
2063		lkb->lkb_grmode = DLM_LOCK_IV;
2064		/* this unhold undoes the original ref from create_lkb()
2065		   so this leads to the lkb being freed */
2066		unhold_lkb(lkb);
2067		rv = -1;
2068		break;
2069	default:
2070		log_print("invalid status for revert %d", lkb->lkb_status);
2071	}
2072	return rv;
2073}
2074
/* Process copy variant of revert_lock(); same behavior and return values. */
static int revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	int rv = revert_lock(r, lkb);

	return rv;
}
2079
2080static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
2081{
2082	if (lkb->lkb_grmode != lkb->lkb_rqmode) {
2083		lkb->lkb_grmode = lkb->lkb_rqmode;
2084		if (lkb->lkb_status)
2085			move_lkb(r, lkb, DLM_LKSTS_GRANTED);
2086		else
2087			add_lkb(r, lkb, DLM_LKSTS_GRANTED);
2088	}
2089
2090	lkb->lkb_rqmode = DLM_LOCK_IV;
2091	lkb->lkb_highbast = 0;
2092}
2093
/*
 * Grant a master or local copy lkb: run set_lvb_lock() first, then do the
 * common grant.
 */
static void grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	set_lvb_lock(r, lkb);

	_grant_lock(r, lkb);
}
2099
/*
 * Grant a process copy lkb: the lvb is taken from message ms by
 * set_lvb_lock_pc(), then the common grant is done.
 */
static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
			  struct dlm_message *ms)
{
	set_lvb_lock_pc(r, lkb, ms);

	_grant_lock(r, lkb);
}
2106
/*
 * Called by grant_pending_locks(), which means the grant is asynchronous:
 * besides granting the lock, a grant message must be sent to the requesting
 * node if the lkb is a master copy (i.e. it belongs to a remote node).
 * Otherwise the completion ast is queued locally.
 */

static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	grant_lock(r, lkb);

	if (!is_master_copy(lkb)) {
		queue_cast(r, lkb, 0);
		return;
	}

	send_grant(r, lkb);
}
2119
2120/* The special CONVDEADLK, ALTPR and ALTCW flags allow the master to
2121   change the granted/requested modes.  We're munging things accordingly in
2122   the process copy.
2123   CONVDEADLK: our grmode may have been forced down to NL to resolve a
2124   conversion deadlock
2125   ALTPR/ALTCW: our rqmode may have been changed to PR or CW to become
2126   compatible with other granted locks */
2127
2128static void munge_demoted(struct dlm_lkb *lkb)
2129{
2130	if (lkb->lkb_rqmode == DLM_LOCK_IV || lkb->lkb_grmode == DLM_LOCK_IV) {
2131		log_print("munge_demoted %x invalid modes gr %d rq %d",
2132			  lkb->lkb_id, lkb->lkb_grmode, lkb->lkb_rqmode);
2133		return;
2134	}
2135
2136	lkb->lkb_grmode = DLM_LOCK_NL;
2137}
2138
2139static void munge_altmode(struct dlm_lkb *lkb, struct dlm_message *ms)
2140{
2141	if (ms->m_type != cpu_to_le32(DLM_MSG_REQUEST_REPLY) &&
2142	    ms->m_type != cpu_to_le32(DLM_MSG_GRANT)) {
2143		log_print("munge_altmode %x invalid reply type %d",
2144			  lkb->lkb_id, le32_to_cpu(ms->m_type));
2145		return;
2146	}
2147
2148	if (lkb->lkb_exflags & DLM_LKF_ALTPR)
2149		lkb->lkb_rqmode = DLM_LOCK_PR;
2150	else if (lkb->lkb_exflags & DLM_LKF_ALTCW)
2151		lkb->lkb_rqmode = DLM_LOCK_CW;
2152	else {
2153		log_print("munge_altmode invalid exflags %x", lkb->lkb_exflags);
2154		dlm_print_lkb(lkb);
2155	}
2156}
2157
2158static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head)
2159{
2160	struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb,
2161					   lkb_statequeue);
2162	if (lkb->lkb_id == first->lkb_id)
2163		return 1;
2164
2165	return 0;
2166}
2167
2168/* Check if the given lkb conflicts with another lkb on the queue. */
2169
2170static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb)
2171{
2172	struct dlm_lkb *this;
2173
2174	list_for_each_entry(this, head, lkb_statequeue) {
2175		if (this == lkb)
2176			continue;
2177		if (!modes_compat(this, lkb))
2178			return 1;
2179	}
2180	return 0;
2181}
2182
2183/*
2184 * "A conversion deadlock arises with a pair of lock requests in the converting
2185 * queue for one resource.  The granted mode of each lock blocks the requested
2186 * mode of the other lock."
2187 *
2188 * Part 2: if the granted mode of lkb is preventing an earlier lkb in the
2189 * convert queue from being granted, then deadlk/demote lkb.
2190 *
2191 * Example:
2192 * Granted Queue: empty
2193 * Convert Queue: NL->EX (first lock)
2194 *                PR->EX (second lock)
2195 *
2196 * The first lock can't be granted because of the granted mode of the second
2197 * lock and the second lock can't be granted because it's not first in the
2198 * list.  We either cancel lkb's conversion (PR->EX) and return EDEADLK, or we
2199 * demote the granted mode of lkb (from PR to NL) if it has the CONVDEADLK
2200 * flag set and return DEMOTED in the lksb flags.
2201 *
2202 * Originally, this function detected conv-deadlk in a more limited scope:
2203 * - if !modes_compat(lkb1, lkb2) && !modes_compat(lkb2, lkb1), or
2204 * - if lkb1 was the first entry in the queue (not just earlier), and was
2205 *   blocked by the granted mode of lkb2, and there was nothing on the
2206 *   granted queue preventing lkb1 from being granted immediately, i.e.
2207 *   lkb2 was the only thing preventing lkb1 from being granted.
2208 *
2209 * That second condition meant we'd only say there was conv-deadlk if
2210 * resolving it (by demotion) would lead to the first lock on the convert
2211 * queue being granted right away.  It allowed conversion deadlocks to exist
2212 * between locks on the convert queue while they couldn't be granted anyway.
2213 *
2214 * Now, we detect and take action on conversion deadlocks immediately when
2215 * they're created, even if they may not be immediately consequential.  If
2216 * lkb1 exists anywhere in the convert queue and lkb2 comes in with a granted
2217 * mode that would prevent lkb1's conversion from being granted, we do a
2218 * deadlk/demote on lkb2 right away and don't let it onto the convert queue.
2219 * I think this means that the lkb_is_ahead condition below should always
2220 * be zero, i.e. there will never be conv-deadlk between two locks that are
2221 * both already on the convert queue.
2222 */
2223
2224static int conversion_deadlock_detect(struct dlm_rsb *r, struct dlm_lkb *lkb2)
2225{
2226	struct dlm_lkb *lkb1;
2227	int lkb_is_ahead = 0;
2228
2229	list_for_each_entry(lkb1, &r->res_convertqueue, lkb_statequeue) {
2230		if (lkb1 == lkb2) {
2231			lkb_is_ahead = 1;
2232			continue;
2233		}
2234
2235		if (!lkb_is_ahead) {
2236			if (!modes_compat(lkb2, lkb1))
2237				return 1;
2238		} else {
2239			if (!modes_compat(lkb2, lkb1) &&
2240			    !modes_compat(lkb1, lkb2))
2241				return 1;
2242		}
2243	}
2244	return 0;
2245}
2246
2247/*
2248 * Return 1 if the lock can be granted, 0 otherwise.
2249 * Also detect and resolve conversion deadlocks.
2250 *
2251 * lkb is the lock to be granted
2252 *
2253 * now is 1 if the function is being called in the context of the
2254 * immediate request, it is 0 if called later, after the lock has been
2255 * queued.
2256 *
2257 * recover is 1 if dlm_recover_grant() is trying to grant conversions
2258 * after recovery.
2259 *
2260 * References are from chapter 6 of "VAXcluster Principles" by Roy Davis
2261 */
2262
2263static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now,
2264			   int recover)
2265{
2266	int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV);
2267
2268	/*
2269	 * 6-10: Version 5.4 introduced an option to address the phenomenon of
2270	 * a new request for a NL mode lock being blocked.
2271	 *
2272	 * 6-11: If the optional EXPEDITE flag is used with the new NL mode
2273	 * request, then it would be granted.  In essence, the use of this flag
2274	 * tells the Lock Manager to expedite theis request by not considering
2275	 * what may be in the CONVERTING or WAITING queues...  As of this
2276	 * writing, the EXPEDITE flag can be used only with new requests for NL
2277	 * mode locks.  This flag is not valid for conversion requests.
2278	 *
2279	 * A shortcut.  Earlier checks return an error if EXPEDITE is used in a
2280	 * conversion or used with a non-NL requested mode.  We also know an
2281	 * EXPEDITE request is always granted immediately, so now must always
2282	 * be 1.  The full condition to grant an expedite request: (now &&
2283	 * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can
2284	 * therefore be shortened to just checking the flag.
2285	 */
2286
2287	if (lkb->lkb_exflags & DLM_LKF_EXPEDITE)
2288		return 1;
2289
2290	/*
2291	 * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be
2292	 * added to the remaining conditions.
2293	 */
2294
2295	if (queue_conflict(&r->res_grantqueue, lkb))
2296		return 0;
2297
2298	/*
2299	 * 6-3: By default, a conversion request is immediately granted if the
2300	 * requested mode is compatible with the modes of all other granted
2301	 * locks
2302	 */
2303
2304	if (queue_conflict(&r->res_convertqueue, lkb))
2305		return 0;
2306
2307	/*
2308	 * The RECOVER_GRANT flag means dlm_recover_grant() is granting
2309	 * locks for a recovered rsb, on which lkb's have been rebuilt.
2310	 * The lkb's may have been rebuilt on the queues in a different
2311	 * order than they were in on the previous master.  So, granting
2312	 * queued conversions in order after recovery doesn't make sense
2313	 * since the order hasn't been preserved anyway.  The new order
2314	 * could also have created a new "in place" conversion deadlock.
2315	 * (e.g. old, failed master held granted EX, with PR->EX, NL->EX.
2316	 * After recovery, there would be no granted locks, and possibly
2317	 * NL->EX, PR->EX, an in-place conversion deadlock.)  So, after
2318	 * recovery, grant conversions without considering order.
2319	 */
2320
2321	if (conv && recover)
2322		return 1;
2323
2324	/*
2325	 * 6-5: But the default algorithm for deciding whether to grant or
2326	 * queue conversion requests does not by itself guarantee that such
2327	 * requests are serviced on a "first come first serve" basis.  This, in
2328	 * turn, can lead to a phenomenon known as "indefinate postponement".
2329	 *
2330	 * 6-7: This issue is dealt with by using the optional QUECVT flag with
2331	 * the system service employed to request a lock conversion.  This flag
2332	 * forces certain conversion requests to be queued, even if they are
2333	 * compatible with the granted modes of other locks on the same
2334	 * resource.  Thus, the use of this flag results in conversion requests
2335	 * being ordered on a "first come first servce" basis.
2336	 *
2337	 * DCT: This condition is all about new conversions being able to occur
2338	 * "in place" while the lock remains on the granted queue (assuming
2339	 * nothing else conflicts.)  IOW if QUECVT isn't set, a conversion
2340	 * doesn't _have_ to go onto the convert queue where it's processed in
2341	 * order.  The "now" variable is necessary to distinguish converts
2342	 * being received and processed for the first time now, because once a
2343	 * convert is moved to the conversion queue the condition below applies
2344	 * requiring fifo granting.
2345	 */
2346
2347	if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT))
2348		return 1;
2349
2350	/*
2351	 * Even if the convert is compat with all granted locks,
2352	 * QUECVT forces it behind other locks on the convert queue.
2353	 */
2354
2355	if (now && conv && (lkb->lkb_exflags & DLM_LKF_QUECVT)) {
2356		if (list_empty(&r->res_convertqueue))
2357			return 1;
2358		else
2359			return 0;
2360	}
2361
2362	/*
2363	 * The NOORDER flag is set to avoid the standard vms rules on grant
2364	 * order.
2365	 */
2366
2367	if (lkb->lkb_exflags & DLM_LKF_NOORDER)
2368		return 1;
2369
2370	/*
2371	 * 6-3: Once in that queue [CONVERTING], a conversion request cannot be
2372	 * granted until all other conversion requests ahead of it are granted
2373	 * and/or canceled.
2374	 */
2375
2376	if (!now && conv && first_in_list(lkb, &r->res_convertqueue))
2377		return 1;
2378
2379	/*
2380	 * 6-4: By default, a new request is immediately granted only if all
2381	 * three of the following conditions are satisfied when the request is
2382	 * issued:
2383	 * - The queue of ungranted conversion requests for the resource is
2384	 *   empty.
2385	 * - The queue of ungranted new requests for the resource is empty.
2386	 * - The mode of the new request is compatible with the most
2387	 *   restrictive mode of all granted locks on the resource.
2388	 */
2389
2390	if (now && !conv && list_empty(&r->res_convertqueue) &&
2391	    list_empty(&r->res_waitqueue))
2392		return 1;
2393
2394	/*
2395	 * 6-4: Once a lock request is in the queue of ungranted new requests,
2396	 * it cannot be granted until the queue of ungranted conversion
2397	 * requests is empty, all ungranted new requests ahead of it are
2398	 * granted and/or canceled, and it is compatible with the granted mode
2399	 * of the most restrictive lock granted on the resource.
2400	 */
2401
2402	if (!now && !conv && list_empty(&r->res_convertqueue) &&
2403	    first_in_list(lkb, &r->res_waitqueue))
2404		return 1;
2405
2406	return 0;
2407}
2408
2409static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now,
2410			  int recover, int *err)
2411{
2412	int rv;
2413	int8_t alt = 0, rqmode = lkb->lkb_rqmode;
2414	int8_t is_convert = (lkb->lkb_grmode != DLM_LOCK_IV);
2415
2416	if (err)
2417		*err = 0;
2418
2419	rv = _can_be_granted(r, lkb, now, recover);
2420	if (rv)
2421		goto out;
2422
2423	/*
2424	 * The CONVDEADLK flag is non-standard and tells the dlm to resolve
2425	 * conversion deadlocks by demoting grmode to NL, otherwise the dlm
2426	 * cancels one of the locks.
2427	 */
2428
2429	if (is_convert && can_be_queued(lkb) &&
2430	    conversion_deadlock_detect(r, lkb)) {
2431		if (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) {
2432			lkb->lkb_grmode = DLM_LOCK_NL;
2433			lkb->lkb_sbflags |= DLM_SBF_DEMOTED;
2434		} else if (err) {
2435			*err = -EDEADLK;
2436		} else {
2437			log_print("can_be_granted deadlock %x now %d",
2438				  lkb->lkb_id, now);
2439			dlm_dump_rsb(r);
2440		}
2441		goto out;
2442	}
2443
2444	/*
2445	 * The ALTPR and ALTCW flags are non-standard and tell the dlm to try
2446	 * to grant a request in a mode other than the normal rqmode.  It's a
2447	 * simple way to provide a big optimization to applications that can
2448	 * use them.
2449	 */
2450
2451	if (rqmode != DLM_LOCK_PR && (lkb->lkb_exflags & DLM_LKF_ALTPR))
2452		alt = DLM_LOCK_PR;
2453	else if (rqmode != DLM_LOCK_CW && (lkb->lkb_exflags & DLM_LKF_ALTCW))
2454		alt = DLM_LOCK_CW;
2455
2456	if (alt) {
2457		lkb->lkb_rqmode = alt;
2458		rv = _can_be_granted(r, lkb, now, 0);
2459		if (rv)
2460			lkb->lkb_sbflags |= DLM_SBF_ALTMODE;
2461		else
2462			lkb->lkb_rqmode = rqmode;
2463	}
2464 out:
2465	return rv;
2466}
2467
2468/* Returns the highest requested mode of all blocked conversions; sets
2469   cw if there's a blocked conversion to DLM_LOCK_CW. */
2470
2471static int grant_pending_convert(struct dlm_rsb *r, int high, int *cw,
2472				 unsigned int *count)
2473{
2474	struct dlm_lkb *lkb, *s;
2475	int recover = rsb_flag(r, RSB_RECOVER_GRANT);
2476	int hi, demoted, quit, grant_restart, demote_restart;
2477	int deadlk;
2478
2479	quit = 0;
2480 restart:
2481	grant_restart = 0;
2482	demote_restart = 0;
2483	hi = DLM_LOCK_IV;
2484
2485	list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) {
2486		demoted = is_demoted(lkb);
2487		deadlk = 0;
2488
2489		if (can_be_granted(r, lkb, 0, recover, &deadlk)) {
2490			grant_lock_pending(r, lkb);
2491			grant_restart = 1;
2492			if (count)
2493				(*count)++;
2494			continue;
2495		}
2496
2497		if (!demoted && is_demoted(lkb)) {
2498			log_print("WARN: pending demoted %x node %d %s",
2499				  lkb->lkb_id, lkb->lkb_nodeid, r->res_name);
2500			demote_restart = 1;
2501			continue;
2502		}
2503
2504		if (deadlk) {
2505			/*
2506			 * If DLM_LKB_NODLKWT flag is set and conversion
2507			 * deadlock is detected, we request blocking AST and
2508			 * down (or cancel) conversion.
2509			 */
2510			if (lkb->lkb_exflags & DLM_LKF_NODLCKWT) {
2511				if (lkb->lkb_highbast < lkb->lkb_rqmode) {
2512					queue_bast(r, lkb, lkb->lkb_rqmode);
2513					lkb->lkb_highbast = lkb->lkb_rqmode;
2514				}
2515			} else {
2516				log_print("WARN: pending deadlock %x node %d %s",
2517					  lkb->lkb_id, lkb->lkb_nodeid,
2518					  r->res_name);
2519				dlm_dump_rsb(r);
2520			}
2521			continue;
2522		}
2523
2524		hi = max_t(int, lkb->lkb_rqmode, hi);
2525
2526		if (cw && lkb->lkb_rqmode == DLM_LOCK_CW)
2527			*cw = 1;
2528	}
2529
2530	if (grant_restart)
2531		goto restart;
2532	if (demote_restart && !quit) {
2533		quit = 1;
2534		goto restart;
2535	}
2536
2537	return max_t(int, high, hi);
2538}
2539
2540static int grant_pending_wait(struct dlm_rsb *r, int high, int *cw,
2541			      unsigned int *count)
2542{
2543	struct dlm_lkb *lkb, *s;
2544
2545	list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) {
2546		if (can_be_granted(r, lkb, 0, 0, NULL)) {
2547			grant_lock_pending(r, lkb);
2548			if (count)
2549				(*count)++;
2550		} else {
2551			high = max_t(int, lkb->lkb_rqmode, high);
2552			if (lkb->lkb_rqmode == DLM_LOCK_CW)
2553				*cw = 1;
2554		}
2555	}
2556
2557	return high;
2558}
2559
2560/* cw of 1 means there's a lock with a rqmode of DLM_LOCK_CW that's blocked
2561   on either the convert or waiting queue.
2562   high is the largest rqmode of all locks blocked on the convert or
2563   waiting queue. */
2564
2565static int lock_requires_bast(struct dlm_lkb *gr, int high, int cw)
2566{
2567	if (gr->lkb_grmode == DLM_LOCK_PR && cw) {
2568		if (gr->lkb_highbast < DLM_LOCK_EX)
2569			return 1;
2570		return 0;
2571	}
2572
2573	if (gr->lkb_highbast < high &&
2574	    !__dlm_compat_matrix[gr->lkb_grmode+1][high+1])
2575		return 1;
2576	return 0;
2577}
2578
2579static void grant_pending_locks(struct dlm_rsb *r, unsigned int *count)
2580{
2581	struct dlm_lkb *lkb, *s;
2582	int high = DLM_LOCK_IV;
2583	int cw = 0;
2584
2585	if (!is_master(r)) {
2586		log_print("grant_pending_locks r nodeid %d", r->res_nodeid);
2587		dlm_dump_rsb(r);
2588		return;
2589	}
2590
2591	high = grant_pending_convert(r, high, &cw, count);
2592	high = grant_pending_wait(r, high, &cw, count);
2593
2594	if (high == DLM_LOCK_IV)
2595		return;
2596
2597	/*
2598	 * If there are locks left on the wait/convert queue then send blocking
2599	 * ASTs to granted locks based on the largest requested mode (high)
2600	 * found above.
2601	 */
2602
2603	list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) {
2604		if (lkb->lkb_bastfn && lock_requires_bast(lkb, high, cw)) {
2605			if (cw && high == DLM_LOCK_PR &&
2606			    lkb->lkb_grmode == DLM_LOCK_PR)
2607				queue_bast(r, lkb, DLM_LOCK_CW);
2608			else
2609				queue_bast(r, lkb, high);
2610			lkb->lkb_highbast = high;
2611		}
2612	}
2613}
2614
2615static int modes_require_bast(struct dlm_lkb *gr, struct dlm_lkb *rq)
2616{
2617	if ((gr->lkb_grmode == DLM_LOCK_PR && rq->lkb_rqmode == DLM_LOCK_CW) ||
2618	    (gr->lkb_grmode == DLM_LOCK_CW && rq->lkb_rqmode == DLM_LOCK_PR)) {
2619		if (gr->lkb_highbast < DLM_LOCK_EX)
2620			return 1;
2621		return 0;
2622	}
2623
2624	if (gr->lkb_highbast < rq->lkb_rqmode && !modes_compat(gr, rq))
2625		return 1;
2626	return 0;
2627}
2628
2629static void send_bast_queue(struct dlm_rsb *r, struct list_head *head,
2630			    struct dlm_lkb *lkb)
2631{
2632	struct dlm_lkb *gr;
2633
2634	list_for_each_entry(gr, head, lkb_statequeue) {
2635		/* skip self when sending basts to convertqueue */
2636		if (gr == lkb)
2637			continue;
2638		if (gr->lkb_bastfn && modes_require_bast(gr, lkb)) {
2639			queue_bast(r, gr, lkb->lkb_rqmode);
2640			gr->lkb_highbast = lkb->lkb_rqmode;
2641		}
2642	}
2643}
2644
2645static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb)
2646{
2647	send_bast_queue(r, &r->res_grantqueue, lkb);
2648}
2649
2650static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb)
2651{
2652	send_bast_queue(r, &r->res_grantqueue, lkb);
2653	send_bast_queue(r, &r->res_convertqueue, lkb);
2654}
2655
2656/* set_master(r, lkb) -- set the master nodeid of a resource
2657
2658   The purpose of this function is to set the nodeid field in the given
2659   lkb using the nodeid field in the given rsb.  If the rsb's nodeid is
2660   known, it can just be copied to the lkb and the function will return
2661   0.  If the rsb's nodeid is _not_ known, it needs to be looked up
2662   before it can be copied to the lkb.
2663
2664   When the rsb nodeid is being looked up remotely, the initial lkb
2665   causing the lookup is kept on the ls_waiters list waiting for the
2666   lookup reply.  Other lkb's waiting for the same rsb lookup are kept
2667   on the rsb's res_lookup list until the master is verified.
2668
2669   Return values:
2670   0: nodeid is set in rsb/lkb and the caller should go ahead and use it
2671   1: the rsb master is not available and the lkb has been placed on
2672      a wait queue
2673*/
2674
2675static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb)
2676{
2677	int our_nodeid = dlm_our_nodeid();
2678
2679	if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) {
2680		rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
2681		r->res_first_lkid = lkb->lkb_id;
2682		lkb->lkb_nodeid = r->res_nodeid;
2683		return 0;
2684	}
2685
2686	if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) {
2687		list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup);
2688		return 1;
2689	}
2690
2691	if (r->res_master_nodeid == our_nodeid) {
2692		lkb->lkb_nodeid = 0;
2693		return 0;
2694	}
2695
2696	if (r->res_master_nodeid) {
2697		lkb->lkb_nodeid = r->res_master_nodeid;
2698		return 0;
2699	}
2700
2701	if (dlm_dir_nodeid(r) == our_nodeid) {
2702		/* This is a somewhat unusual case; find_rsb will usually
2703		   have set res_master_nodeid when dir nodeid is local, but
2704		   there are cases where we become the dir node after we've
2705		   past find_rsb and go through _request_lock again.
2706		   confirm_master() or process_lookup_list() needs to be
2707		   called after this. */
2708		log_debug(r->res_ls, "set_master %x self master %d dir %d %s",
2709			  lkb->lkb_id, r->res_master_nodeid, r->res_dir_nodeid,
2710			  r->res_name);
2711		r->res_master_nodeid = our_nodeid;
2712		r->res_nodeid = 0;
2713		lkb->lkb_nodeid = 0;
2714		return 0;
2715	}
2716
2717	wait_pending_remove(r);
2718
2719	r->res_first_lkid = lkb->lkb_id;
2720	send_lookup(r, lkb);
2721	return 1;
2722}
2723
2724static void process_lookup_list(struct dlm_rsb *r)
2725{
2726	struct dlm_lkb *lkb, *safe;
2727
2728	list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) {
2729		list_del_init(&lkb->lkb_rsb_lookup);
2730		_request_lock(r, lkb);
2731		schedule();
2732	}
2733}
2734
2735/* confirm_master -- confirm (or deny) an rsb's master nodeid */
2736
2737static void confirm_master(struct dlm_rsb *r, int error)
2738{
2739	struct dlm_lkb *lkb;
2740
2741	if (!r->res_first_lkid)
2742		return;
2743
2744	switch (error) {
2745	case 0:
2746	case -EINPROGRESS:
2747		r->res_first_lkid = 0;
2748		process_lookup_list(r);
2749		break;
2750
2751	case -EAGAIN:
2752	case -EBADR:
2753	case -ENOTBLK:
2754		/* the remote request failed and won't be retried (it was
2755		   a NOQUEUE, or has been canceled/unlocked); make a waiting
2756		   lkb the first_lkid */
2757
2758		r->res_first_lkid = 0;
2759
2760		if (!list_empty(&r->res_lookup)) {
2761			lkb = list_entry(r->res_lookup.next, struct dlm_lkb,
2762					 lkb_rsb_lookup);
2763			list_del_init(&lkb->lkb_rsb_lookup);
2764			r->res_first_lkid = lkb->lkb_id;
2765			_request_lock(r, lkb);
2766		}
2767		break;
2768
2769	default:
2770		log_error(r->res_ls, "confirm_master unknown error %d", error);
2771	}
2772}
2773
2774#ifdef CONFIG_DLM_DEPRECATED_API
2775static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
2776			 int namelen, unsigned long timeout_cs,
2777			 void (*ast) (void *astparam),
2778			 void *astparam,
2779			 void (*bast) (void *astparam, int mode),
2780			 struct dlm_args *args)
2781#else
2782static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags,
2783			 int namelen, void (*ast)(void *astparam),
2784			 void *astparam,
2785			 void (*bast)(void *astparam, int mode),
2786			 struct dlm_args *args)
2787#endif
2788{
2789	int rv = -EINVAL;
2790
2791	/* check for invalid arg usage */
2792
2793	if (mode < 0 || mode > DLM_LOCK_EX)
2794		goto out;
2795
2796	if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN))
2797		goto out;
2798
2799	if (flags & DLM_LKF_CANCEL)
2800		goto out;
2801
2802	if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT))
2803		goto out;
2804
2805	if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT))
2806		goto out;
2807
2808	if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE)
2809		goto out;
2810
2811	if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT)
2812		goto out;
2813
2814	if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT)
2815		goto out;
2816
2817	if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE)
2818		goto out;
2819
2820	if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL)
2821		goto out;
2822
2823	if (!ast || !lksb)
2824		goto out;
2825
2826	if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr)
2827		goto out;
2828
2829	if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid)
2830		goto out;
2831
2832	/* these args will be copied to the lkb in validate_lock_args,
2833	   it cannot be done now because when converting locks, fields in
2834	   an active lkb cannot be modified before locking the rsb */
2835
2836	args->flags = flags;
2837	args->astfn = ast;
2838	args->astparam = astparam;
2839	args->bastfn = bast;
2840#ifdef CONFIG_DLM_DEPRECATED_API
2841	args->timeout = timeout_cs;
2842#endif
2843	args->mode = mode;
2844	args->lksb = lksb;
2845	rv = 0;
2846 out:
2847	return rv;
2848}
2849
2850static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
2851{
2852	if (flags & ~(DLM_LKF_CANCEL | DLM_LKF_VALBLK | DLM_LKF_IVVALBLK |
2853 		      DLM_LKF_FORCEUNLOCK))
2854		return -EINVAL;
2855
2856	if (flags & DLM_LKF_CANCEL && flags & DLM_LKF_FORCEUNLOCK)
2857		return -EINVAL;
2858
2859	args->flags = flags;
2860	args->astparam = astarg;
2861	return 0;
2862}
2863
2864static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
2865			      struct dlm_args *args)
2866{
2867	int rv = -EINVAL;
2868
2869	if (args->flags & DLM_LKF_CONVERT) {
2870		if (lkb->lkb_flags & DLM_IFL_MSTCPY)
2871			goto out;
2872
2873		if (args->flags & DLM_LKF_QUECVT &&
2874		    !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
2875			goto out;
2876
2877		rv = -EBUSY;
2878		if (lkb->lkb_status != DLM_LKSTS_GRANTED)
2879			goto out;
2880
2881		/* lock not allowed if there's any op in progress */
2882		if (lkb->lkb_wait_type || lkb->lkb_wait_count)
2883			goto out;
2884
2885		if (is_overlap(lkb))
2886			goto out;
2887	}
2888
2889	lkb->lkb_exflags = args->flags;
2890	lkb->lkb_sbflags = 0;
2891	lkb->lkb_astfn = args->astfn;
2892	lkb->lkb_astparam = args->astparam;
2893	lkb->lkb_bastfn = args->bastfn;
2894	lkb->lkb_rqmode = args->mode;
2895	lkb->lkb_lksb = args->lksb;
2896	lkb->lkb_lvbptr = args->lksb->sb_lvbptr;
2897	lkb->lkb_ownpid = (int) current->pid;
2898#ifdef CONFIG_DLM_DEPRECATED_API
2899	lkb->lkb_timeout_cs = args->timeout;
2900#endif
2901	rv = 0;
2902 out:
2903	if (rv)
2904		log_debug(ls, "validate_lock_args %d %x %x %x %d %d %s",
2905			  rv, lkb->lkb_id, lkb->lkb_flags, args->flags,
2906			  lkb->lkb_status, lkb->lkb_wait_type,
2907			  lkb->lkb_resource->res_name);
2908	return rv;
2909}
2910
2911/* when dlm_unlock() sees -EBUSY with CANCEL/FORCEUNLOCK it returns 0
2912   for success */
2913
/* note: it's valid for lkb_nodeid/res_nodeid to be -1 when we get here
   because there may be a lookup in progress and it's valid to do a
   cancel or force-unlock on it */
2917
2918static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
2919{
2920	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
2921	int rv = -EINVAL;
2922
2923	if (lkb->lkb_flags & DLM_IFL_MSTCPY) {
2924		log_error(ls, "unlock on MSTCPY %x", lkb->lkb_id);
2925		dlm_print_lkb(lkb);
2926		goto out;
2927	}
2928
2929	/* an lkb may still exist even though the lock is EOL'ed due to a
2930	   cancel, unlock or failed noqueue request; an app can't use these
2931	   locks; return same error as if the lkid had not been found at all */
2932
2933	if (lkb->lkb_flags & DLM_IFL_ENDOFLIFE) {
2934		log_debug(ls, "unlock on ENDOFLIFE %x", lkb->lkb_id);
2935		rv = -ENOENT;
2936		goto out;
2937	}
2938
2939	/* an lkb may be waiting for an rsb lookup to complete where the
2940	   lookup was initiated by another lock */
2941
2942	if (!list_empty(&lkb->lkb_rsb_lookup)) {
2943		if (args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) {
2944			log_debug(ls, "unlock on rsb_lookup %x", lkb->lkb_id);
2945			list_del_init(&lkb->lkb_rsb_lookup);
2946			queue_cast(lkb->lkb_resource, lkb,
2947				   args->flags & DLM_LKF_CANCEL ?
2948				   -DLM_ECANCEL : -DLM_EUNLOCK);
2949			unhold_lkb(lkb); /* undoes create_lkb() */
2950		}
2951		/* caller changes -EBUSY to 0 for CANCEL and FORCEUNLOCK */
2952		rv = -EBUSY;
2953		goto out;
2954	}
2955
2956	/* cancel not allowed with another cancel/unlock in progress */
2957
2958	if (args->flags & DLM_LKF_CANCEL) {
2959		if (lkb->lkb_exflags & DLM_LKF_CANCEL)
2960			goto out;
2961
2962		if (is_overlap(lkb))
2963			goto out;
2964
2965		/* don't let scand try to do a cancel */
2966		del_timeout(lkb);
2967
2968		if (lkb->lkb_flags & DLM_IFL_RESEND) {
2969			lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
2970			rv = -EBUSY;
2971			goto out;
2972		}
2973
2974		/* there's nothing to cancel */
2975		if (lkb->lkb_status == DLM_LKSTS_GRANTED &&
2976		    !lkb->lkb_wait_type) {
2977			rv = -EBUSY;
2978			goto out;
2979		}
2980
2981		switch (lkb->lkb_wait_type) {
2982		case DLM_MSG_LOOKUP:
2983		case DLM_MSG_REQUEST:
2984			lkb->lkb_flags |= DLM_IFL_OVERLAP_CANCEL;
2985			rv = -EBUSY;
2986			goto out;
2987		case DLM_MSG_UNLOCK:
2988		case DLM_MSG_CANCEL:
2989			goto out;
2990		}
2991		/* add_to_waiters() will set OVERLAP_CANCEL */
2992		goto out_ok;
2993	}
2994
2995	/* do we need to allow a force-unlock if there's a normal unlock
2996	   already in progress?  in what conditions could the normal unlock
2997	   fail such that we'd want to send a force-unlock to be sure? */
2998
2999	if (args->flags & DLM_LKF_FORCEUNLOCK) {
3000		if (lkb->lkb_exflags & DLM_LKF_FORCEUNLOCK)
3001			goto out;
3002
3003		if (is_overlap_unlock(lkb))
3004			goto out;
3005
3006		/* don't let scand try to do a cancel */
3007		del_timeout(lkb);
3008
3009		if (lkb->lkb_flags & DLM_IFL_RESEND) {
3010			lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
3011			rv = -EBUSY;
3012			goto out;
3013		}
3014
3015		switch (lkb->lkb_wait_type) {
3016		case DLM_MSG_LOOKUP:
3017		case DLM_MSG_REQUEST:
3018			lkb->lkb_flags |= DLM_IFL_OVERLAP_UNLOCK;
3019			rv = -EBUSY;
3020			goto out;
3021		case DLM_MSG_UNLOCK:
3022			goto out;
3023		}
3024		/* add_to_waiters() will set OVERLAP_UNLOCK */
3025		goto out_ok;
3026	}
3027
3028	/* normal unlock not allowed if there's any op in progress */
3029	rv = -EBUSY;
3030	if (lkb->lkb_wait_type || lkb->lkb_wait_count)
3031		goto out;
3032
3033 out_ok:
3034	/* an overlapping op shouldn't blow away exflags from other op */
3035	lkb->lkb_exflags |= args->flags;
3036	lkb->lkb_sbflags = 0;
3037	lkb->lkb_astparam = args->astparam;
3038	rv = 0;
3039 out:
3040	if (rv)
3041		log_debug(ls, "validate_unlock_args %d %x %x %x %x %d %s", rv,
3042			  lkb->lkb_id, lkb->lkb_flags, lkb->lkb_exflags,
3043			  args->flags, lkb->lkb_wait_type,
3044			  lkb->lkb_resource->res_name);
3045	return rv;
3046}
3047
3048/*
3049 * Four stage 4 varieties:
3050 * do_request(), do_convert(), do_unlock(), do_cancel()
3051 * These are called on the master node for the given lock and
3052 * from the central locking logic.
3053 */
3054
3055static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
3056{
3057	int error = 0;
3058
3059	if (can_be_granted(r, lkb, 1, 0, NULL)) {
3060		grant_lock(r, lkb);
3061		queue_cast(r, lkb, 0);
3062		goto out;
3063	}
3064
3065	if (can_be_queued(lkb)) {
3066		error = -EINPROGRESS;
3067		add_lkb(r, lkb, DLM_LKSTS_WAITING);
3068		add_timeout(lkb);
3069		goto out;
3070	}
3071
3072	error = -EAGAIN;
3073	queue_cast(r, lkb, -EAGAIN);
3074 out:
3075	return error;
3076}
3077
3078static void do_request_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
3079			       int error)
3080{
3081	switch (error) {
3082	case -EAGAIN:
3083		if (force_blocking_asts(lkb))
3084			send_blocking_asts_all(r, lkb);
3085		break;
3086	case -EINPROGRESS:
3087		send_blocking_asts(r, lkb);
3088		break;
3089	}
3090}
3091
3092static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
3093{
3094	int error = 0;
3095	int deadlk = 0;
3096
3097	/* changing an existing lock may allow others to be granted */
3098
3099	if (can_be_granted(r, lkb, 1, 0, &deadlk)) {
3100		grant_lock(r, lkb);
3101		queue_cast(r, lkb, 0);
3102		goto out;
3103	}
3104
3105	/* can_be_granted() detected that this lock would block in a conversion
3106	   deadlock, so we leave it on the granted queue and return EDEADLK in
3107	   the ast for the convert. */
3108
3109	if (deadlk && !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) {
3110		/* it's left on the granted queue */
3111		revert_lock(r, lkb);
3112		queue_cast(r, lkb, -EDEADLK);
3113		error = -EDEADLK;
3114		goto out;
3115	}
3116
3117	/* is_demoted() means the can_be_granted() above set the grmode
3118	   to NL, and left us on the granted queue.  This auto-demotion
3119	   (due to CONVDEADLK) might mean other locks, and/or this lock, are
3120	   now grantable.  We have to try to grant other converting locks
3121	   before we try again to grant this one. */
3122
3123	if (is_demoted(lkb)) {
3124		grant_pending_convert(r, DLM_LOCK_IV, NULL, NULL);
3125		if (_can_be_granted(r, lkb, 1, 0)) {
3126			grant_lock(r, lkb);
3127			queue_cast(r, lkb, 0);
3128			goto out;
3129		}
3130		/* else fall through and move to convert queue */
3131	}
3132
3133	if (can_be_queued(lkb)) {
3134		error = -EINPROGRESS;
3135		del_lkb(r, lkb);
3136		add_lkb(r, lkb, DLM_LKSTS_CONVERT);
3137		add_timeout(lkb);
3138		goto out;
3139	}
3140
3141	error = -EAGAIN;
3142	queue_cast(r, lkb, -EAGAIN);
3143 out:
3144	return error;
3145}
3146
3147static void do_convert_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
3148			       int error)
3149{
3150	switch (error) {
3151	case 0:
3152		grant_pending_locks(r, NULL);
3153		/* grant_pending_locks also sends basts */
3154		break;
3155	case -EAGAIN:
3156		if (force_blocking_asts(lkb))
3157			send_blocking_asts_all(r, lkb);
3158		break;
3159	case -EINPROGRESS:
3160		send_blocking_asts(r, lkb);
3161		break;
3162	}
3163}
3164
3165static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
3166{
3167	remove_lock(r, lkb);
3168	queue_cast(r, lkb, -DLM_EUNLOCK);
3169	return -DLM_EUNLOCK;
3170}
3171
/*
 * Follow-up work for do_unlock(): try to grant locks that were pending on
 * the rsb.  The error argument is accepted for symmetry with the other
 * do_xxxx_effects() functions but is not examined.
 */
static void do_unlock_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
			      int error)
{
	grant_pending_locks(r, NULL);
}
3177
3178/* returns: 0 did nothing, -DLM_ECANCEL canceled lock */
3179
3180static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
3181{
3182	int error;
3183
3184	error = revert_lock(r, lkb);
3185	if (error) {
3186		queue_cast(r, lkb, -DLM_ECANCEL);
3187		return -DLM_ECANCEL;
3188	}
3189	return 0;
3190}
3191
3192static void do_cancel_effects(struct dlm_rsb *r, struct dlm_lkb *lkb,
3193			      int error)
3194{
3195	if (error)
3196		grant_pending_locks(r, NULL);
3197}
3198
3199/*
3200 * Four stage 3 varieties:
3201 * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock()
3202 */
3203
/*
 * Stage 3 of a request: add a new lkb to a possibly new rsb, called by
 * the requesting process.
 *
 * set_master() sets the lkb nodeid from r.  A negative result from it is
 * returned as the error; a positive result ends the request here with 0.
 * Otherwise the request is sent to the master when the rsb is remote, or
 * carried out locally with do_request() and do_request_effects().
 */

static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	int rv;

	rv = set_master(r, lkb);
	if (rv < 0)
		return rv;
	if (rv > 0)
		return 0;

	/* receive_request() calls do_request() on remote node */
	if (is_remote(r))
		return send_request(r, lkb);

	/* for remote locks the request_reply is sent
	   between do_request and do_request_effects */
	rv = do_request(r, lkb);
	do_request_effects(r, lkb, rv);
	return rv;
}
3232
/*
 * Stage 3 of a convert: change some property of an existing lkb, e.g.
 * mode.  Sends the convert to the master when the rsb is remote,
 * otherwise performs it locally.  Returns the result of whichever path
 * was taken.
 */

static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	int rv;

	/* receive_convert() calls do_convert() on remote node */
	if (is_remote(r))
		return send_convert(r, lkb);

	/* for remote locks the convert_reply is sent
	   between do_convert and do_convert_effects */
	rv = do_convert(r, lkb);
	do_convert_effects(r, lkb, rv);
	return rv;
}
3251
/*
 * Stage 3 of an unlock: remove an existing lkb from the granted queue.
 * Sends the unlock to the master when the rsb is remote, otherwise
 * performs it locally.  Returns the result of whichever path was taken.
 */

static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	int rv;

	/* receive_unlock() calls do_unlock() on remote node */
	if (is_remote(r))
		return send_unlock(r, lkb);

	/* for remote locks the unlock_reply is sent
	   between do_unlock and do_unlock_effects */
	rv = do_unlock(r, lkb);
	do_unlock_effects(r, lkb, rv);
	return rv;
}
3270
/*
 * Stage 3 of a cancel: remove an existing lkb from the convert or wait
 * queue.  Sends the cancel to the master when the rsb is remote,
 * otherwise performs it locally.  Returns the result of whichever path
 * was taken.
 */

static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	int rv;

	/* receive_cancel() calls do_cancel() on remote node */
	if (is_remote(r))
		return send_cancel(r, lkb);

	/* for remote locks the cancel_reply is sent
	   between do_cancel and do_cancel_effects */
	rv = do_cancel(r, lkb);
	do_cancel_effects(r, lkb, rv);
	return rv;
}
3289
3290/*
3291 * Four stage 2 varieties:
3292 * request_lock(), convert_lock(), unlock_lock(), cancel_lock()
3293 */
3294
3295static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char *name,
3296			int len, struct dlm_args *args)
3297{
3298	struct dlm_rsb *r;
3299	int error;
3300
3301	error = validate_lock_args(ls, lkb, args);
3302	if (error)
3303		return error;
3304
3305	error = find_rsb(ls, name, len, 0, R_REQUEST, &r);
3306	if (error)
3307		return error;
3308
3309	lock_rsb(r);
3310
3311	attach_lkb(r, lkb);
3312	lkb->lkb_lksb->sb_lkid = lkb->lkb_id;
3313
3314	error = _request_lock(r, lkb);
3315
3316	unlock_rsb(r);
3317	put_rsb(r);
3318	return error;
3319}
3320
3321static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
3322			struct dlm_args *args)
3323{
3324	struct dlm_rsb *r;
3325	int error;
3326
3327	r = lkb->lkb_resource;
3328
3329	hold_rsb(r);
3330	lock_rsb(r);
3331
3332	error = validate_lock_args(ls, lkb, args);
3333	if (error)
3334		goto out;
3335
3336	error = _convert_lock(r, lkb);
3337 out:
3338	unlock_rsb(r);
3339	put_rsb(r);
3340	return error;
3341}
3342
3343static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
3344		       struct dlm_args *args)
3345{
3346	struct dlm_rsb *r;
3347	int error;
3348
3349	r = lkb->lkb_resource;
3350
3351	hold_rsb(r);
3352	lock_rsb(r);
3353
3354	error = validate_unlock_args(lkb, args);
3355	if (error)
3356		goto out;
3357
3358	error = _unlock_lock(r, lkb);
3359 out:
3360	unlock_rsb(r);
3361	put_rsb(r);
3362	return error;
3363}
3364
3365static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
3366		       struct dlm_args *args)
3367{
3368	struct dlm_rsb *r;
3369	int error;
3370
3371	r = lkb->lkb_resource;
3372
3373	hold_rsb(r);
3374	lock_rsb(r);
3375
3376	error = validate_unlock_args(lkb, args);
3377	if (error)
3378		goto out;
3379
3380	error = _cancel_lock(r, lkb);
3381 out:
3382	unlock_rsb(r);
3383	put_rsb(r);
3384	return error;
3385}
3386
3387/*
3388 * Two stage 1 varieties:  dlm_lock() and dlm_unlock()
3389 */
3390
/*
 * Stage 1 entry point for acquiring a new lock or, with DLM_LKF_CONVERT
 * in flags, converting an existing one.
 *
 * For a convert the lkb is looked up from lksb->sb_lkid; otherwise a new
 * lkb is created.  The arguments are packed with set_lock_args() and
 * handed to convert_lock() or request_lock().  parent_lkid is not used
 * in this function.
 *
 * Returns -EINVAL when the lockspace cannot be found, the error from the
 * lkb lookup/creation or from set_lock_args(), or the result of the
 * stage 2 call with -EINPROGRESS, -EAGAIN and -EDEADLK all reported to
 * the caller as 0.
 */
int dlm_lock(dlm_lockspace_t *lockspace,
	     int mode,
	     struct dlm_lksb *lksb,
	     uint32_t flags,
	     void *name,
	     unsigned int namelen,
	     uint32_t parent_lkid,
	     void (*ast) (void *astarg),
	     void *astarg,
	     void (*bast) (void *astarg, int mode))
{
	struct dlm_ls *ls;
	struct dlm_lkb *lkb;
	struct dlm_args args;
	int error, convert = flags & DLM_LKF_CONVERT;

	ls = dlm_find_lockspace_local(lockspace);
	if (!ls)
		return -EINVAL;

	dlm_lock_recovery(ls);

	if (convert)
		error = find_lkb(ls, lksb->sb_lkid, &lkb);
	else
		error = create_lkb(ls, &lkb);

	if (error)
		goto out;

	trace_dlm_lock_start(ls, lkb, name, namelen, mode, flags);

	/* set_lock_args() takes one extra argument (passed as 0 here) when
	   the deprecated API is configured */
#ifdef CONFIG_DLM_DEPRECATED_API
	error = set_lock_args(mode, lksb, flags, namelen, 0, ast,
			      astarg, bast, &args);
#else
	error = set_lock_args(mode, lksb, flags, namelen, ast, astarg, bast,
			      &args);
#endif
	if (error)
		goto out_put;

	if (convert)
		error = convert_lock(ls, lkb, &args);
	else
		error = request_lock(ls, lkb, name, namelen, &args);

	/* a queued operation is a success from the caller's point of view */
	if (error == -EINPROGRESS)
		error = 0;
 out_put:
	trace_dlm_lock_end(ls, lkb, name, namelen, mode, flags, error);

	/* the put decision is made before -EAGAIN/-EDEADLK are cleared
	   below, so those results still drop the lkb reference here */
	if (convert || error)
		__put_lkb(ls, lkb);
	if (error == -EAGAIN || error == -EDEADLK)
		error = 0;
 out:
	dlm_unlock_recovery(ls);
	dlm_put_lockspace(ls);
	return error;
}
3452
/*
 * Stage 1 entry point for releasing a lock or, with DLM_LKF_CANCEL in
 * flags, cancelling an operation on it.
 *
 * The lkb is looked up by lkid, the arguments are packed with
 * set_unlock_args() and handed to cancel_lock() or unlock_lock().  The
 * lksb parameter is not used in this function.
 *
 * Returns -EINVAL when the lockspace cannot be found, the error from
 * find_lkb() or set_unlock_args(), or the result of the stage 2 call
 * with -DLM_EUNLOCK and -DLM_ECANCEL reported as 0.  -EBUSY is also
 * reported as 0 when DLM_LKF_CANCEL or DLM_LKF_FORCEUNLOCK was given.
 */
int dlm_unlock(dlm_lockspace_t *lockspace,
	       uint32_t lkid,
	       uint32_t flags,
	       struct dlm_lksb *lksb,
	       void *astarg)
{
	struct dlm_ls *ls;
	struct dlm_lkb *lkb;
	struct dlm_args args;
	int error;

	ls = dlm_find_lockspace_local(lockspace);
	if (!ls)
		return -EINVAL;

	dlm_lock_recovery(ls);

	error = find_lkb(ls, lkid, &lkb);
	if (error)
		goto out;

	trace_dlm_unlock_start(ls, lkb, flags);

	error = set_unlock_args(flags, astarg, &args);
	if (error)
		goto out_put;

	if (flags & DLM_LKF_CANCEL)
		error = cancel_lock(ls, lkb, &args);
	else
		error = unlock_lock(ls, lkb, &args);

	/* these are the "it worked" results of the stage 4 functions */
	if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
		error = 0;
	if (error == -EBUSY && (flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)))
		error = 0;
 out_put:
	trace_dlm_unlock_end(ls, lkb, flags, error);

	/* drop the reference taken by find_lkb() above */
	dlm_put_lkb(lkb);
 out:
	dlm_unlock_recovery(ls);
	dlm_put_lockspace(ls);
	return error;
}
3498
3499/*
3500 * send/receive routines for remote operations and replies
3501 *
3502 * send_args
3503 * send_common
3504 * send_request			receive_request
3505 * send_convert			receive_convert
3506 * send_unlock			receive_unlock
3507 * send_cancel			receive_cancel
3508 * send_grant			receive_grant
3509 * send_bast			receive_bast
3510 * send_lookup			receive_lookup
3511 * send_remove			receive_remove
3512 *
3513 * 				send_common_reply
3514 * receive_request_reply	send_request_reply
3515 * receive_convert_reply	send_convert_reply
3516 * receive_unlock_reply		send_unlock_reply
3517 * receive_cancel_reply		send_cancel_reply
3518 * receive_lookup_reply		send_lookup_reply
3519 */
3520
3521static int _create_message(struct dlm_ls *ls, int mb_len,
3522			   int to_nodeid, int mstype,
3523			   struct dlm_message **ms_ret,
3524			   struct dlm_mhandle **mh_ret)
3525{
3526	struct dlm_message *ms;
3527	struct dlm_mhandle *mh;
3528	char *mb;
3529
3530	/* get_buffer gives us a message handle (mh) that we need to
3531	   pass into midcomms_commit and a message buffer (mb) that we
3532	   write our data into */
3533
3534	mh = dlm_midcomms_get_mhandle(to_nodeid, mb_len, GFP_NOFS, &mb);
3535	if (!mh)
3536		return -ENOBUFS;
3537
3538	ms = (struct dlm_message *) mb;
3539
3540	ms->m_header.h_version = cpu_to_le32(DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
3541	ms->m_header.u.h_lockspace = cpu_to_le32(ls->ls_global_id);
3542	ms->m_header.h_nodeid = cpu_to_le32(dlm_our_nodeid());
3543	ms->m_header.h_length = cpu_to_le16(mb_len);
3544	ms->m_header.h_cmd = DLM_MSG;
3545
3546	ms->m_type = cpu_to_le32(mstype);
3547
3548	*mh_ret = mh;
3549	*ms_ret = ms;
3550	return 0;
3551}
3552
3553static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
3554			  int to_nodeid, int mstype,
3555			  struct dlm_message **ms_ret,
3556			  struct dlm_mhandle **mh_ret)
3557{
3558	int mb_len = sizeof(struct dlm_message);
3559
3560	switch (mstype) {
3561	case DLM_MSG_REQUEST:
3562	case DLM_MSG_LOOKUP:
3563	case DLM_MSG_REMOVE:
3564		mb_len += r->res_length;
3565		break;
3566	case DLM_MSG_CONVERT:
3567	case DLM_MSG_UNLOCK:
3568	case DLM_MSG_REQUEST_REPLY:
3569	case DLM_MSG_CONVERT_REPLY:
3570	case DLM_MSG_GRANT:
3571		if (lkb && lkb->lkb_lvbptr)
3572			mb_len += r->res_ls->ls_lvblen;
3573		break;
3574	}
3575
3576	return _create_message(r->res_ls, mb_len, to_nodeid, mstype,
3577			       ms_ret, mh_ret);
3578}
3579
3580/* further lowcomms enhancements or alternate implementations may make
3581   the return value from this function useful at some point */
3582
3583static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
3584{
3585	dlm_midcomms_commit_mhandle(mh);
3586	return 0;
3587}
3588
3589static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
3590		      struct dlm_message *ms)
3591{
3592	ms->m_nodeid   = cpu_to_le32(lkb->lkb_nodeid);
3593	ms->m_pid      = cpu_to_le32(lkb->lkb_ownpid);
3594	ms->m_lkid     = cpu_to_le32(lkb->lkb_id);
3595	ms->m_remid    = cpu_to_le32(lkb->lkb_remid);
3596	ms->m_exflags  = cpu_to_le32(lkb->lkb_exflags);
3597	ms->m_sbflags  = cpu_to_le32(lkb->lkb_sbflags);
3598	ms->m_flags    = cpu_to_le32(lkb->lkb_flags);
3599	ms->m_lvbseq   = cpu_to_le32(lkb->lkb_lvbseq);
3600	ms->m_status   = cpu_to_le32(lkb->lkb_status);
3601	ms->m_grmode   = cpu_to_le32(lkb->lkb_grmode);
3602	ms->m_rqmode   = cpu_to_le32(lkb->lkb_rqmode);
3603	ms->m_hash     = cpu_to_le32(r->res_hash);
3604
3605	/* m_result and m_bastmode are set from function args,
3606	   not from lkb fields */
3607
3608	if (lkb->lkb_bastfn)
3609		ms->m_asts |= cpu_to_le32(DLM_CB_BAST);
3610	if (lkb->lkb_astfn)
3611		ms->m_asts |= cpu_to_le32(DLM_CB_CAST);
3612
3613	/* compare with switch in create_message; send_remove() doesn't
3614	   use send_args() */
3615
3616	switch (ms->m_type) {
3617	case cpu_to_le32(DLM_MSG_REQUEST):
3618	case cpu_to_le32(DLM_MSG_LOOKUP):
3619		memcpy(ms->m_extra, r->res_name, r->res_length);
3620		break;
3621	case cpu_to_le32(DLM_MSG_CONVERT):
3622	case cpu_to_le32(DLM_MSG_UNLOCK):
3623	case cpu_to_le32(DLM_MSG_REQUEST_REPLY):
3624	case cpu_to_le32(DLM_MSG_CONVERT_REPLY):
3625	case cpu_to_le32(DLM_MSG_GRANT):
3626		if (!lkb->lkb_lvbptr)
3627			break;
3628		memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
3629		break;
3630	}
3631}
3632
3633static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
3634{
3635	struct dlm_message *ms;
3636	struct dlm_mhandle *mh;
3637	int to_nodeid, error;
3638
3639	to_nodeid = r->res_nodeid;
3640
3641	error = add_to_waiters(lkb, mstype, to_nodeid);
3642	if (error)
3643		return error;
3644
3645	error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
3646	if (error)
3647		goto fail;
3648
3649	send_args(r, lkb, ms);
3650
3651	error = send_message(mh, ms);
3652	if (error)
3653		goto fail;
3654	return 0;
3655
3656 fail:
3657	remove_from_waiters(lkb, msg_reply_type(mstype));
3658	return error;
3659}
3660
/* Send a DLM_MSG_REQUEST for lkb through send_common(); returns its
   result. */
static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	return send_common(r, lkb, DLM_MSG_REQUEST);
}
3665
3666static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
3667{
3668	int error;
3669
3670	error = send_common(r, lkb, DLM_MSG_CONVERT);
3671
3672	/* down conversions go without a reply from the master */
3673	if (!error && down_conversion(lkb)) {
3674		remove_from_waiters(lkb, DLM_MSG_CONVERT_REPLY);
3675		r->res_ls->ls_stub_ms.m_flags = cpu_to_le32(DLM_IFL_STUB_MS);
3676		r->res_ls->ls_stub_ms.m_type = cpu_to_le32(DLM_MSG_CONVERT_REPLY);
3677		r->res_ls->ls_stub_ms.m_result = 0;
3678		__receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
3679	}
3680
3681	return error;
3682}
3683
3684/* FIXME: if this lkb is the only lock we hold on the rsb, then set
3685   MASTER_UNCERTAIN to force the next request on the rsb to confirm
3686   that the master is still correct. */
3687
/* Send a DLM_MSG_UNLOCK for lkb through send_common(); returns its
   result. */
static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	return send_common(r, lkb, DLM_MSG_UNLOCK);
}
3692
/* Send a DLM_MSG_CANCEL for lkb through send_common(); returns its
   result. */
static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	return send_common(r, lkb, DLM_MSG_CANCEL);
}
3697
3698static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
3699{
3700	struct dlm_message *ms;
3701	struct dlm_mhandle *mh;
3702	int to_nodeid, error;
3703
3704	to_nodeid = lkb->lkb_nodeid;
3705
3706	error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh);
3707	if (error)
3708		goto out;
3709
3710	send_args(r, lkb, ms);
3711
3712	ms->m_result = 0;
3713
3714	error = send_message(mh, ms);
3715 out:
3716	return error;
3717}
3718
3719static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
3720{
3721	struct dlm_message *ms;
3722	struct dlm_mhandle *mh;
3723	int to_nodeid, error;
3724
3725	to_nodeid = lkb->lkb_nodeid;
3726
3727	error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh);
3728	if (error)
3729		goto out;
3730
3731	send_args(r, lkb, ms);
3732
3733	ms->m_bastmode = cpu_to_le32(mode);
3734
3735	error = send_message(mh, ms);
3736 out:
3737	return error;
3738}
3739
3740static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
3741{
3742	struct dlm_message *ms;
3743	struct dlm_mhandle *mh;
3744	int to_nodeid, error;
3745
3746	to_nodeid = dlm_dir_nodeid(r);
3747
3748	error = add_to_waiters(lkb, DLM_MSG_LOOKUP, to_nodeid);
3749	if (error)
3750		return error;
3751
3752	error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
3753	if (error)
3754		goto fail;
3755
3756	send_args(r, lkb, ms);
3757
3758	error = send_message(mh, ms);
3759	if (error)
3760		goto fail;
3761	return 0;
3762
3763 fail:
3764	remove_from_waiters(lkb, DLM_MSG_LOOKUP_REPLY);
3765	return error;
3766}
3767
3768static int send_remove(struct dlm_rsb *r)
3769{
3770	struct dlm_message *ms;
3771	struct dlm_mhandle *mh;
3772	int to_nodeid, error;
3773
3774	to_nodeid = dlm_dir_nodeid(r);
3775
3776	error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh);
3777	if (error)
3778		goto out;
3779
3780	memcpy(ms->m_extra, r->res_name, r->res_length);
3781	ms->m_hash = cpu_to_le32(r->res_hash);
3782
3783	error = send_message(mh, ms);
3784 out:
3785	return error;
3786}
3787
3788static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
3789			     int mstype, int rv)
3790{
3791	struct dlm_message *ms;
3792	struct dlm_mhandle *mh;
3793	int to_nodeid, error;
3794
3795	to_nodeid = lkb->lkb_nodeid;
3796
3797	error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
3798	if (error)
3799		goto out;
3800
3801	send_args(r, lkb, ms);
3802
3803	ms->m_result = cpu_to_le32(to_dlm_errno(rv));
3804
3805	error = send_message(mh, ms);
3806 out:
3807	return error;
3808}
3809
/* Send a DLM_MSG_REQUEST_REPLY with result rv through
   send_common_reply(); returns its result. */
static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
{
	return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv);
}
3814
/* Send a DLM_MSG_CONVERT_REPLY with result rv through
   send_common_reply(); returns its result. */
static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
{
	return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv);
}
3819
/* Send a DLM_MSG_UNLOCK_REPLY with result rv through
   send_common_reply(); returns its result. */
static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
{
	return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv);
}
3824
/* Send a DLM_MSG_CANCEL_REPLY with result rv through
   send_common_reply(); returns its result. */
static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
{
	return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv);
}
3829
3830static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
3831			     int ret_nodeid, int rv)
3832{
3833	struct dlm_rsb *r = &ls->ls_stub_rsb;
3834	struct dlm_message *ms;
3835	struct dlm_mhandle *mh;
3836	int error, nodeid = le32_to_cpu(ms_in->m_header.h_nodeid);
3837
3838	error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
3839	if (error)
3840		goto out;
3841
3842	ms->m_lkid = ms_in->m_lkid;
3843	ms->m_result = cpu_to_le32(to_dlm_errno(rv));
3844	ms->m_nodeid = cpu_to_le32(ret_nodeid);
3845
3846	error = send_message(mh, ms);
3847 out:
3848	return error;
3849}
3850
3851/* which args we save from a received message depends heavily on the type
3852   of message, unlike the send side where we can safely send everything about
3853   the lkb for any type of message */
3854
/*
 * Take the flags from a received message into the lkb: exflags and
 * sbflags are overwritten, while lkb_flags keeps its own upper 16 bits
 * and takes only the lower 16 bits from the message.
 */
static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
{
	lkb->lkb_exflags = le32_to_cpu(ms->m_exflags);
	lkb->lkb_sbflags = le32_to_cpu(ms->m_sbflags);
	lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
			  (le32_to_cpu(ms->m_flags) & 0x0000FFFF);
}
3862
/*
 * Take the flags from a received reply into the lkb.  Unlike
 * receive_flags(), exflags is left alone: only sbflags and the lower 16
 * bits of lkb_flags are updated.  A locally fabricated stub message
 * (m_flags equal to DLM_IFL_STUB_MS) carries no real flags and is
 * ignored.
 */
static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
{
	if (ms->m_flags == cpu_to_le32(DLM_IFL_STUB_MS))
		return;

	lkb->lkb_sbflags = le32_to_cpu(ms->m_sbflags);
	lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
			 (le32_to_cpu(ms->m_flags) & 0x0000FFFF);
}
3872
3873static int receive_extralen(struct dlm_message *ms)
3874{
3875	return (le16_to_cpu(ms->m_header.h_length) -
3876		sizeof(struct dlm_message));
3877}
3878
3879static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
3880		       struct dlm_message *ms)
3881{
3882	int len;
3883
3884	if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
3885		if (!lkb->lkb_lvbptr)
3886			lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
3887		if (!lkb->lkb_lvbptr)
3888			return -ENOMEM;
3889		len = receive_extralen(ms);
3890		if (len > ls->ls_lvblen)
3891			len = ls->ls_lvblen;
3892		memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
3893	}
3894	return 0;
3895}
3896
/* Placeholder blocking-ast callback that receive_request_args() installs
   on an lkb when the requester's message has DLM_CB_BAST set.  It only
   logs a message if it is ever invoked. */
static void fake_bastfn(void *astparam, int mode)
{
	log_print("fake_bastfn should not be called");
}
3901
/* Placeholder completion-ast callback that receive_request_args()
   installs on an lkb when the requester's message has DLM_CB_CAST set.
   It only logs a message if it is ever invoked. */
static void fake_astfn(void *astparam)
{
	log_print("fake_astfn should not be called");
}
3906
3907static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3908				struct dlm_message *ms)
3909{
3910	lkb->lkb_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
3911	lkb->lkb_ownpid = le32_to_cpu(ms->m_pid);
3912	lkb->lkb_remid = le32_to_cpu(ms->m_lkid);
3913	lkb->lkb_grmode = DLM_LOCK_IV;
3914	lkb->lkb_rqmode = le32_to_cpu(ms->m_rqmode);
3915
3916	lkb->lkb_bastfn = (ms->m_asts & cpu_to_le32(DLM_CB_BAST)) ? &fake_bastfn : NULL;
3917	lkb->lkb_astfn = (ms->m_asts & cpu_to_le32(DLM_CB_CAST)) ? &fake_astfn : NULL;
3918
3919	if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
3920		/* lkb was just created so there won't be an lvb yet */
3921		lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
3922		if (!lkb->lkb_lvbptr)
3923			return -ENOMEM;
3924	}
3925
3926	return 0;
3927}
3928
3929static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3930				struct dlm_message *ms)
3931{
3932	if (lkb->lkb_status != DLM_LKSTS_GRANTED)
3933		return -EBUSY;
3934
3935	if (receive_lvb(ls, lkb, ms))
3936		return -ENOMEM;
3937
3938	lkb->lkb_rqmode = le32_to_cpu(ms->m_rqmode);
3939	lkb->lkb_lvbseq = le32_to_cpu(ms->m_lvbseq);
3940
3941	return 0;
3942}
3943
3944static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
3945			       struct dlm_message *ms)
3946{
3947	if (receive_lvb(ls, lkb, ms))
3948		return -ENOMEM;
3949	return 0;
3950}
3951
3952/* We fill in the stub-lkb fields with the info that send_xxxx_reply()
3953   uses to send a reply and that the remote end uses to process the reply. */
3954
3955static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
3956{
3957	struct dlm_lkb *lkb = &ls->ls_stub_lkb;
3958	lkb->lkb_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
3959	lkb->lkb_remid = le32_to_cpu(ms->m_lkid);
3960}
3961
/* This is called after the rsb is locked so that we can safely inspect
   fields in the lkb.

   Checks that a received message is plausible for the lkb it names:
   - a message flagged DLM_IFL_USER is rejected for an lkb without that
     flag;
   - convert/unlock/cancel must target a master copy owned by the sender;
   - convert/unlock/cancel replies, grant and bast must target a process
     copy whose nodeid is the sender;
   - a request reply must target a process copy whose nodeid is either
     still -1 or the sender;
   - every other message type is rejected.

   Returns 0 when the message is acceptable, -EINVAL (after logging)
   otherwise. */

static int validate_message(struct dlm_lkb *lkb, struct dlm_message *ms)
{
	int from = le32_to_cpu(ms->m_header.h_nodeid);
	int error = 0;

	/* currently mixing of user/kernel locks are not supported */
	if (ms->m_flags & cpu_to_le32(DLM_IFL_USER) &&
	    ~lkb->lkb_flags & DLM_IFL_USER) {
		log_error(lkb->lkb_resource->res_ls,
			  "got user dlm message for a kernel lock");
		error = -EINVAL;
		goto out;
	}

	switch (ms->m_type) {
	case cpu_to_le32(DLM_MSG_CONVERT):
	case cpu_to_le32(DLM_MSG_UNLOCK):
	case cpu_to_le32(DLM_MSG_CANCEL):
		if (!is_master_copy(lkb) || lkb->lkb_nodeid != from)
			error = -EINVAL;
		break;

	case cpu_to_le32(DLM_MSG_CONVERT_REPLY):
	case cpu_to_le32(DLM_MSG_UNLOCK_REPLY):
	case cpu_to_le32(DLM_MSG_CANCEL_REPLY):
	case cpu_to_le32(DLM_MSG_GRANT):
	case cpu_to_le32(DLM_MSG_BAST):
		if (!is_process_copy(lkb) || lkb->lkb_nodeid != from)
			error = -EINVAL;
		break;

	case cpu_to_le32(DLM_MSG_REQUEST_REPLY):
		if (!is_process_copy(lkb))
			error = -EINVAL;
		else if (lkb->lkb_nodeid != -1 && lkb->lkb_nodeid != from)
			error = -EINVAL;
		break;

	default:
		error = -EINVAL;
	}

out:
	if (error)
		log_error(lkb->lkb_resource->res_ls,
			  "ignore invalid message %d from %d %x %x %x %d",
			  le32_to_cpu(ms->m_type), from, lkb->lkb_id,
			  lkb->lkb_remid, lkb->lkb_flags, lkb->lkb_nodeid);
	return error;
}
4015
4016static void send_repeat_remove(struct dlm_ls *ls, char *ms_name, int len)
4017{
4018	char name[DLM_RESNAME_MAXLEN + 1];
4019	struct dlm_message *ms;
4020	struct dlm_mhandle *mh;
4021	struct dlm_rsb *r;
4022	uint32_t hash, b;
4023	int rv, dir_nodeid;
4024
4025	memset(name, 0, sizeof(name));
4026	memcpy(name, ms_name, len);
4027
4028	hash = jhash(name, len, 0);
4029	b = hash & (ls->ls_rsbtbl_size - 1);
4030
4031	dir_nodeid = dlm_hash2nodeid(ls, hash);
4032
4033	log_error(ls, "send_repeat_remove dir %d %s", dir_nodeid, name);
4034
4035	spin_lock(&ls->ls_rsbtbl[b].lock);
4036	rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
4037	if (!rv) {
4038		spin_unlock(&ls->ls_rsbtbl[b].lock);
4039		log_error(ls, "repeat_remove on keep %s", name);
4040		return;
4041	}
4042
4043	rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
4044	if (!rv) {
4045		spin_unlock(&ls->ls_rsbtbl[b].lock);
4046		log_error(ls, "repeat_remove on toss %s", name);
4047		return;
4048	}
4049
4050	/* use ls->remove_name2 to avoid conflict with shrink? */
4051
4052	spin_lock(&ls->ls_remove_spin);
4053	ls->ls_remove_len = len;
4054	memcpy(ls->ls_remove_name, name, DLM_RESNAME_MAXLEN);
4055	spin_unlock(&ls->ls_remove_spin);
4056	spin_unlock(&ls->ls_rsbtbl[b].lock);
4057
4058	rv = _create_message(ls, sizeof(struct dlm_message) + len,
4059			     dir_nodeid, DLM_MSG_REMOVE, &ms, &mh);
4060	if (rv)
4061		goto out;
4062
4063	memcpy(ms->m_extra, name, len);
4064	ms->m_hash = cpu_to_le32(hash);
4065
4066	send_message(mh, ms);
4067
4068out:
4069	spin_lock(&ls->ls_remove_spin);
4070	ls->ls_remove_len = 0;
4071	memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN);
4072	spin_unlock(&ls->ls_remove_spin);
4073	wake_up(&ls->ls_remove_wait);
4074}
4075
/*
 * Handle a DLM_MSG_REQUEST from another node: create a master-copy lkb
 * for it, find the rsb named in the message, run do_request() and send
 * the request reply, then apply do_request_effects().
 *
 * Returns 0 once do_request() has been run, whatever its result.  When
 * the lkb or rsb cannot be set up, or validate_master_nodeid() rejects
 * the sender, a reply carrying the error is sent using the stub rsb/lkb
 * and the error is returned.
 */
static int receive_request(struct dlm_ls *ls, struct dlm_message *ms)
{
	struct dlm_lkb *lkb;
	struct dlm_rsb *r;
	int from_nodeid;
	int error, namelen = 0;

	from_nodeid = le32_to_cpu(ms->m_header.h_nodeid);

	error = create_lkb(ls, &lkb);
	if (error)
		goto fail;

	receive_flags(lkb, ms);
	lkb->lkb_flags |= DLM_IFL_MSTCPY;
	error = receive_request_args(ls, lkb, ms);
	if (error) {
		__put_lkb(ls, lkb);
		goto fail;
	}

	/* The dir node is the authority on whether we are the master
	   for this rsb or not, so if the master sends us a request, we should
	   recreate the rsb if we've destroyed it.   This race happens when we
	   send a remove message to the dir node at the same time that the dir
	   node sends us a request for the rsb. */

	namelen = receive_extralen(ms);

	error = find_rsb(ls, ms->m_extra, namelen, from_nodeid,
			 R_RECEIVE_REQUEST, &r);
	if (error) {
		__put_lkb(ls, lkb);
		goto fail;
	}

	lock_rsb(r);

	if (r->res_master_nodeid != dlm_our_nodeid()) {
		error = validate_master_nodeid(ls, r, from_nodeid);
		if (error) {
			unlock_rsb(r);
			put_rsb(r);
			__put_lkb(ls, lkb);
			goto fail;
		}
	}

	attach_lkb(r, lkb);
	error = do_request(r, lkb);
	send_request_reply(r, lkb, error);
	do_request_effects(r, lkb, error);

	unlock_rsb(r);
	put_rsb(r);

	/* a queued request keeps its lkb; any other non-zero result
	   drops the reference */
	if (error == -EINPROGRESS)
		error = 0;
	if (error)
		dlm_put_lkb(lkb);
	return 0;

 fail:
	/* TODO: instead of returning ENOTBLK, add the lkb to res_lookup
	   and do this receive_request again from process_lookup_list once
	   we get the lookup reply.  This would avoid many repeated
	   ENOTBLK request failures when the lookup reply designating us
	   as master is delayed. */

	/* We could repeatedly return -EBADR here if our send_remove() is
	   delayed in being sent/arriving/being processed on the dir node.
	   Another node would repeatedly look up the master, and the dir
	   node would continue returning our nodeid until our send_remove
	   took effect.

	   We send another remove message in case our previous send_remove
	   was lost/ignored/missed somehow. */

	if (error != -ENOTBLK) {
		log_limit(ls, "receive_request %x from %d %d",
			  le32_to_cpu(ms->m_lkid), from_nodeid, error);
	}

	/* namelen is only non-zero once the name length has been read
	   from the message above */
	if (namelen && error == -EBADR) {
		send_repeat_remove(ls, ms->m_extra, namelen);
		msleep(1000);
	}

	setup_stub_lkb(ls, ms);
	send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
	return error;
}
4168
/*
 * Handle a DLM_MSG_CONVERT from another node: look up the lkb named by
 * m_remid, check that it belongs to the sender's lock, then validate the
 * message, take its arguments and run do_convert() with the rsb locked.
 * A convert reply is sent unless the conversion is a down conversion.
 *
 * Returns 0 once the lkb has been found and matched, even when the
 * message is then rejected by validate_message() (no reply is sent in
 * that case) or by receive_convert_args() (a reply with the error is
 * sent).  When the lkb is not found or its remid does not match, a reply
 * carrying the error is sent using the stub rsb/lkb and the error is
 * returned.
 */
static int receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
{
	struct dlm_lkb *lkb;
	struct dlm_rsb *r;
	int error, reply = 1;

	error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
	if (error)
		goto fail;

	/* the sender's lock id must match what we recorded for this lkb */
	if (lkb->lkb_remid != le32_to_cpu(ms->m_lkid)) {
		log_error(ls, "receive_convert %x remid %x recover_seq %llu "
			  "remote %d %x", lkb->lkb_id, lkb->lkb_remid,
			  (unsigned long long)lkb->lkb_recover_seq,
			  le32_to_cpu(ms->m_header.h_nodeid),
			  le32_to_cpu(ms->m_lkid));
		error = -ENOENT;
		dlm_put_lkb(lkb);
		goto fail;
	}

	r = lkb->lkb_resource;

	hold_rsb(r);
	lock_rsb(r);

	error = validate_message(lkb, ms);
	if (error)
		goto out;

	receive_flags(lkb, ms);

	error = receive_convert_args(ls, lkb, ms);
	if (error) {
		send_convert_reply(r, lkb, error);
		goto out;
	}

	/* decided before do_convert() runs and changes the lkb */
	reply = !down_conversion(lkb);

	error = do_convert(r, lkb);
	if (reply)
		send_convert_reply(r, lkb, error);
	do_convert_effects(r, lkb, error);
 out:
	unlock_rsb(r);
	put_rsb(r);
	dlm_put_lkb(lkb);
	return 0;

 fail:
	setup_stub_lkb(ls, ms);
	send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
	return error;
}
4224
/*
 * Handle a DLM_MSG_UNLOCK from another node: look up the lkb named by
 * m_remid, check that it belongs to the sender's lock, then validate the
 * message, take its arguments and run do_unlock() with the rsb locked,
 * sending the unlock reply before do_unlock_effects().
 *
 * Returns 0 once the lkb has been found and matched, even when the
 * message is then rejected by validate_message() (no reply is sent in
 * that case) or by receive_unlock_args() (a reply with the error is
 * sent).  When the lkb is not found or its remid does not match, a reply
 * carrying the error is sent using the stub rsb/lkb and the error is
 * returned.
 */
static int receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
{
	struct dlm_lkb *lkb;
	struct dlm_rsb *r;
	int error;

	error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
	if (error)
		goto fail;

	/* the sender's lock id must match what we recorded for this lkb */
	if (lkb->lkb_remid != le32_to_cpu(ms->m_lkid)) {
		log_error(ls, "receive_unlock %x remid %x remote %d %x",
			  lkb->lkb_id, lkb->lkb_remid,
			  le32_to_cpu(ms->m_header.h_nodeid),
			  le32_to_cpu(ms->m_lkid));
		error = -ENOENT;
		dlm_put_lkb(lkb);
		goto fail;
	}

	r = lkb->lkb_resource;

	hold_rsb(r);
	lock_rsb(r);

	error = validate_message(lkb, ms);
	if (error)
		goto out;

	receive_flags(lkb, ms);

	error = receive_unlock_args(ls, lkb, ms);
	if (error) {
		send_unlock_reply(r, lkb, error);
		goto out;
	}

	error = do_unlock(r, lkb);
	send_unlock_reply(r, lkb, error);
	do_unlock_effects(r, lkb, error);
 out:
	unlock_rsb(r);
	put_rsb(r);
	dlm_put_lkb(lkb);
	return 0;

 fail:
	setup_stub_lkb(ls, ms);
	send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
	return error;
}
4276
4277static int receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
4278{
4279	struct dlm_lkb *lkb;
4280	struct dlm_rsb *r;
4281	int error;
4282
4283	error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
4284	if (error)
4285		goto fail;
4286
4287	receive_flags(lkb, ms);
4288
4289	r = lkb->lkb_resource;
4290
4291	hold_rsb(r);
4292	lock_rsb(r);
4293
4294	error = validate_message(lkb, ms);
4295	if (error)
4296		goto out;
4297
4298	error = do_cancel(r, lkb);
4299	send_cancel_reply(r, lkb, error);
4300	do_cancel_effects(r, lkb, error);
4301 out:
4302	unlock_rsb(r);
4303	put_rsb(r);
4304	dlm_put_lkb(lkb);
4305	return 0;
4306
4307 fail:
4308	setup_stub_lkb(ls, ms);
4309	send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
4310	return error;
4311}
4312
4313static int receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
4314{
4315	struct dlm_lkb *lkb;
4316	struct dlm_rsb *r;
4317	int error;
4318
4319	error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
4320	if (error)
4321		return error;
4322
4323	r = lkb->lkb_resource;
4324
4325	hold_rsb(r);
4326	lock_rsb(r);
4327
4328	error = validate_message(lkb, ms);
4329	if (error)
4330		goto out;
4331
4332	receive_flags_reply(lkb, ms);
4333	if (is_altmode(lkb))
4334		munge_altmode(lkb, ms);
4335	grant_lock_pc(r, lkb, ms);
4336	queue_cast(r, lkb, 0);
4337 out:
4338	unlock_rsb(r);
4339	put_rsb(r);
4340	dlm_put_lkb(lkb);
4341	return 0;
4342}
4343
4344static int receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
4345{
4346	struct dlm_lkb *lkb;
4347	struct dlm_rsb *r;
4348	int error;
4349
4350	error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
4351	if (error)
4352		return error;
4353
4354	r = lkb->lkb_resource;
4355
4356	hold_rsb(r);
4357	lock_rsb(r);
4358
4359	error = validate_message(lkb, ms);
4360	if (error)
4361		goto out;
4362
4363	queue_bast(r, lkb, le32_to_cpu(ms->m_bastmode));
4364	lkb->lkb_highbast = le32_to_cpu(ms->m_bastmode);
4365 out:
4366	unlock_rsb(r);
4367	put_rsb(r);
4368	dlm_put_lkb(lkb);
4369	return 0;
4370}
4371
4372static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
4373{
4374	int len, error, ret_nodeid, from_nodeid, our_nodeid;
4375
4376	from_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
4377	our_nodeid = dlm_our_nodeid();
4378
4379	len = receive_extralen(ms);
4380
4381	error = dlm_master_lookup(ls, from_nodeid, ms->m_extra, len, 0,
4382				  &ret_nodeid, NULL);
4383
4384	/* Optimization: we're master so treat lookup as a request */
4385	if (!error && ret_nodeid == our_nodeid) {
4386		receive_request(ls, ms);
4387		return;
4388	}
4389	send_lookup_reply(ls, ms, ret_nodeid, error);
4390}
4391
4392static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
4393{
4394	char name[DLM_RESNAME_MAXLEN+1];
4395	struct dlm_rsb *r;
4396	uint32_t hash, b;
4397	int rv, len, dir_nodeid, from_nodeid;
4398
4399	from_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
4400
4401	len = receive_extralen(ms);
4402
4403	if (len > DLM_RESNAME_MAXLEN) {
4404		log_error(ls, "receive_remove from %d bad len %d",
4405			  from_nodeid, len);
4406		return;
4407	}
4408
4409	dir_nodeid = dlm_hash2nodeid(ls, le32_to_cpu(ms->m_hash));
4410	if (dir_nodeid != dlm_our_nodeid()) {
4411		log_error(ls, "receive_remove from %d bad nodeid %d",
4412			  from_nodeid, dir_nodeid);
4413		return;
4414	}
4415
4416	/* Look for name on rsbtbl.toss, if it's there, kill it.
4417	   If it's on rsbtbl.keep, it's being used, and we should ignore this
4418	   message.  This is an expected race between the dir node sending a
4419	   request to the master node at the same time as the master node sends
4420	   a remove to the dir node.  The resolution to that race is for the
4421	   dir node to ignore the remove message, and the master node to
4422	   recreate the master rsb when it gets a request from the dir node for
4423	   an rsb it doesn't have. */
4424
4425	memset(name, 0, sizeof(name));
4426	memcpy(name, ms->m_extra, len);
4427
4428	hash = jhash(name, len, 0);
4429	b = hash & (ls->ls_rsbtbl_size - 1);
4430
4431	spin_lock(&ls->ls_rsbtbl[b].lock);
4432
4433	rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].toss, name, len, &r);
4434	if (rv) {
4435		/* verify the rsb is on keep list per comment above */
4436		rv = dlm_search_rsb_tree(&ls->ls_rsbtbl[b].keep, name, len, &r);
4437		if (rv) {
4438			/* should not happen */
4439			log_error(ls, "receive_remove from %d not found %s",
4440				  from_nodeid, name);
4441			spin_unlock(&ls->ls_rsbtbl[b].lock);
4442			return;
4443		}
4444		if (r->res_master_nodeid != from_nodeid) {
4445			/* should not happen */
4446			log_error(ls, "receive_remove keep from %d master %d",
4447				  from_nodeid, r->res_master_nodeid);
4448			dlm_print_rsb(r);
4449			spin_unlock(&ls->ls_rsbtbl[b].lock);
4450			return;
4451		}
4452
4453		log_debug(ls, "receive_remove from %d master %d first %x %s",
4454			  from_nodeid, r->res_master_nodeid, r->res_first_lkid,
4455			  name);
4456		spin_unlock(&ls->ls_rsbtbl[b].lock);
4457		return;
4458	}
4459
4460	if (r->res_master_nodeid != from_nodeid) {
4461		log_error(ls, "receive_remove toss from %d master %d",
4462			  from_nodeid, r->res_master_nodeid);
4463		dlm_print_rsb(r);
4464		spin_unlock(&ls->ls_rsbtbl[b].lock);
4465		return;
4466	}
4467
4468	if (kref_put(&r->res_ref, kill_rsb)) {
4469		rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss);
4470		spin_unlock(&ls->ls_rsbtbl[b].lock);
4471		dlm_free_rsb(r);
4472	} else {
4473		log_error(ls, "receive_remove from %d rsb ref error",
4474			  from_nodeid);
4475		dlm_print_rsb(r);
4476		spin_unlock(&ls->ls_rsbtbl[b].lock);
4477	}
4478}
4479
4480static void receive_purge(struct dlm_ls *ls, struct dlm_message *ms)
4481{
4482	do_purge(ls, le32_to_cpu(ms->m_nodeid), le32_to_cpu(ms->m_pid));
4483}
4484
4485static int receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
4486{
4487	struct dlm_lkb *lkb;
4488	struct dlm_rsb *r;
4489	int error, mstype, result;
4490	int from_nodeid = le32_to_cpu(ms->m_header.h_nodeid);
4491
4492	error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
4493	if (error)
4494		return error;
4495
4496	r = lkb->lkb_resource;
4497	hold_rsb(r);
4498	lock_rsb(r);
4499
4500	error = validate_message(lkb, ms);
4501	if (error)
4502		goto out;
4503
4504	mstype = lkb->lkb_wait_type;
4505	error = remove_from_waiters(lkb, DLM_MSG_REQUEST_REPLY);
4506	if (error) {
4507		log_error(ls, "receive_request_reply %x remote %d %x result %d",
4508			  lkb->lkb_id, from_nodeid, le32_to_cpu(ms->m_lkid),
4509			  from_dlm_errno(le32_to_cpu(ms->m_result)));
4510		dlm_dump_rsb(r);
4511		goto out;
4512	}
4513
4514	/* Optimization: the dir node was also the master, so it took our
4515	   lookup as a request and sent request reply instead of lookup reply */
4516	if (mstype == DLM_MSG_LOOKUP) {
4517		r->res_master_nodeid = from_nodeid;
4518		r->res_nodeid = from_nodeid;
4519		lkb->lkb_nodeid = from_nodeid;
4520	}
4521
4522	/* this is the value returned from do_request() on the master */
4523	result = from_dlm_errno(le32_to_cpu(ms->m_result));
4524
4525	switch (result) {
4526	case -EAGAIN:
4527		/* request would block (be queued) on remote master */
4528		queue_cast(r, lkb, -EAGAIN);
4529		confirm_master(r, -EAGAIN);
4530		unhold_lkb(lkb); /* undoes create_lkb() */
4531		break;
4532
4533	case -EINPROGRESS:
4534	case 0:
4535		/* request was queued or granted on remote master */
4536		receive_flags_reply(lkb, ms);
4537		lkb->lkb_remid = le32_to_cpu(ms->m_lkid);
4538		if (is_altmode(lkb))
4539			munge_altmode(lkb, ms);
4540		if (result) {
4541			add_lkb(r, lkb, DLM_LKSTS_WAITING);
4542			add_timeout(lkb);
4543		} else {
4544			grant_lock_pc(r, lkb, ms);
4545			queue_cast(r, lkb, 0);
4546		}
4547		confirm_master(r, result);
4548		break;
4549
4550	case -EBADR:
4551	case -ENOTBLK:
4552		/* find_rsb failed to find rsb or rsb wasn't master */
4553		log_limit(ls, "receive_request_reply %x from %d %d "
4554			  "master %d dir %d first %x %s", lkb->lkb_id,
4555			  from_nodeid, result, r->res_master_nodeid,
4556			  r->res_dir_nodeid, r->res_first_lkid, r->res_name);
4557
4558		if (r->res_dir_nodeid != dlm_our_nodeid() &&
4559		    r->res_master_nodeid != dlm_our_nodeid()) {
4560			/* cause _request_lock->set_master->send_lookup */
4561			r->res_master_nodeid = 0;
4562			r->res_nodeid = -1;
4563			lkb->lkb_nodeid = -1;
4564		}
4565
4566		if (is_overlap(lkb)) {
4567			/* we'll ignore error in cancel/unlock reply */
4568			queue_cast_overlap(r, lkb);
4569			confirm_master(r, result);
4570			unhold_lkb(lkb); /* undoes create_lkb() */
4571		} else {
4572			_request_lock(r, lkb);
4573
4574			if (r->res_master_nodeid == dlm_our_nodeid())
4575				confirm_master(r, 0);
4576		}
4577		break;
4578
4579	default:
4580		log_error(ls, "receive_request_reply %x error %d",
4581			  lkb->lkb_id, result);
4582	}
4583
4584	if (is_overlap_unlock(lkb) && (result == 0 || result == -EINPROGRESS)) {
4585		log_debug(ls, "receive_request_reply %x result %d unlock",
4586			  lkb->lkb_id, result);
4587		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
4588		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
4589		send_unlock(r, lkb);
4590	} else if (is_overlap_cancel(lkb) && (result == -EINPROGRESS)) {
4591		log_debug(ls, "receive_request_reply %x cancel", lkb->lkb_id);
4592		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
4593		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
4594		send_cancel(r, lkb);
4595	} else {
4596		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
4597		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
4598	}
4599 out:
4600	unlock_rsb(r);
4601	put_rsb(r);
4602	dlm_put_lkb(lkb);
4603	return 0;
4604}
4605
4606static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
4607				    struct dlm_message *ms)
4608{
4609	/* this is the value returned from do_convert() on the master */
4610	switch (from_dlm_errno(le32_to_cpu(ms->m_result))) {
4611	case -EAGAIN:
4612		/* convert would block (be queued) on remote master */
4613		queue_cast(r, lkb, -EAGAIN);
4614		break;
4615
4616	case -EDEADLK:
4617		receive_flags_reply(lkb, ms);
4618		revert_lock_pc(r, lkb);
4619		queue_cast(r, lkb, -EDEADLK);
4620		break;
4621
4622	case -EINPROGRESS:
4623		/* convert was queued on remote master */
4624		receive_flags_reply(lkb, ms);
4625		if (is_demoted(lkb))
4626			munge_demoted(lkb);
4627		del_lkb(r, lkb);
4628		add_lkb(r, lkb, DLM_LKSTS_CONVERT);
4629		add_timeout(lkb);
4630		break;
4631
4632	case 0:
4633		/* convert was granted on remote master */
4634		receive_flags_reply(lkb, ms);
4635		if (is_demoted(lkb))
4636			munge_demoted(lkb);
4637		grant_lock_pc(r, lkb, ms);
4638		queue_cast(r, lkb, 0);
4639		break;
4640
4641	default:
4642		log_error(r->res_ls, "receive_convert_reply %x remote %d %x %d",
4643			  lkb->lkb_id, le32_to_cpu(ms->m_header.h_nodeid),
4644			  le32_to_cpu(ms->m_lkid),
4645			  from_dlm_errno(le32_to_cpu(ms->m_result)));
4646		dlm_print_rsb(r);
4647		dlm_print_lkb(lkb);
4648	}
4649}
4650
4651static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
4652{
4653	struct dlm_rsb *r = lkb->lkb_resource;
4654	int error;
4655
4656	hold_rsb(r);
4657	lock_rsb(r);
4658
4659	error = validate_message(lkb, ms);
4660	if (error)
4661		goto out;
4662
4663	/* stub reply can happen with waiters_mutex held */
4664	error = remove_from_waiters_ms(lkb, ms);
4665	if (error)
4666		goto out;
4667
4668	__receive_convert_reply(r, lkb, ms);
4669 out:
4670	unlock_rsb(r);
4671	put_rsb(r);
4672}
4673
4674static int receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
4675{
4676	struct dlm_lkb *lkb;
4677	int error;
4678
4679	error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
4680	if (error)
4681		return error;
4682
4683	_receive_convert_reply(lkb, ms);
4684	dlm_put_lkb(lkb);
4685	return 0;
4686}
4687
4688static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
4689{
4690	struct dlm_rsb *r = lkb->lkb_resource;
4691	int error;
4692
4693	hold_rsb(r);
4694	lock_rsb(r);
4695
4696	error = validate_message(lkb, ms);
4697	if (error)
4698		goto out;
4699
4700	/* stub reply can happen with waiters_mutex held */
4701	error = remove_from_waiters_ms(lkb, ms);
4702	if (error)
4703		goto out;
4704
4705	/* this is the value returned from do_unlock() on the master */
4706
4707	switch (from_dlm_errno(le32_to_cpu(ms->m_result))) {
4708	case -DLM_EUNLOCK:
4709		receive_flags_reply(lkb, ms);
4710		remove_lock_pc(r, lkb);
4711		queue_cast(r, lkb, -DLM_EUNLOCK);
4712		break;
4713	case -ENOENT:
4714		break;
4715	default:
4716		log_error(r->res_ls, "receive_unlock_reply %x error %d",
4717			  lkb->lkb_id, from_dlm_errno(le32_to_cpu(ms->m_result)));
4718	}
4719 out:
4720	unlock_rsb(r);
4721	put_rsb(r);
4722}
4723
4724static int receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
4725{
4726	struct dlm_lkb *lkb;
4727	int error;
4728
4729	error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
4730	if (error)
4731		return error;
4732
4733	_receive_unlock_reply(lkb, ms);
4734	dlm_put_lkb(lkb);
4735	return 0;
4736}
4737
4738static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
4739{
4740	struct dlm_rsb *r = lkb->lkb_resource;
4741	int error;
4742
4743	hold_rsb(r);
4744	lock_rsb(r);
4745
4746	error = validate_message(lkb, ms);
4747	if (error)
4748		goto out;
4749
4750	/* stub reply can happen with waiters_mutex held */
4751	error = remove_from_waiters_ms(lkb, ms);
4752	if (error)
4753		goto out;
4754
4755	/* this is the value returned from do_cancel() on the master */
4756
4757	switch (from_dlm_errno(le32_to_cpu(ms->m_result))) {
4758	case -DLM_ECANCEL:
4759		receive_flags_reply(lkb, ms);
4760		revert_lock_pc(r, lkb);
4761		queue_cast(r, lkb, -DLM_ECANCEL);
4762		break;
4763	case 0:
4764		break;
4765	default:
4766		log_error(r->res_ls, "receive_cancel_reply %x error %d",
4767			  lkb->lkb_id,
4768			  from_dlm_errno(le32_to_cpu(ms->m_result)));
4769	}
4770 out:
4771	unlock_rsb(r);
4772	put_rsb(r);
4773}
4774
4775static int receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
4776{
4777	struct dlm_lkb *lkb;
4778	int error;
4779
4780	error = find_lkb(ls, le32_to_cpu(ms->m_remid), &lkb);
4781	if (error)
4782		return error;
4783
4784	_receive_cancel_reply(lkb, ms);
4785	dlm_put_lkb(lkb);
4786	return 0;
4787}
4788
4789static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
4790{
4791	struct dlm_lkb *lkb;
4792	struct dlm_rsb *r;
4793	int error, ret_nodeid;
4794	int do_lookup_list = 0;
4795
4796	error = find_lkb(ls, le32_to_cpu(ms->m_lkid), &lkb);
4797	if (error) {
4798		log_error(ls, "%s no lkid %x", __func__,
4799			  le32_to_cpu(ms->m_lkid));
4800		return;
4801	}
4802
4803	/* ms->m_result is the value returned by dlm_master_lookup on dir node
4804	   FIXME: will a non-zero error ever be returned? */
4805
4806	r = lkb->lkb_resource;
4807	hold_rsb(r);
4808	lock_rsb(r);
4809
4810	error = remove_from_waiters(lkb, DLM_MSG_LOOKUP_REPLY);
4811	if (error)
4812		goto out;
4813
4814	ret_nodeid = le32_to_cpu(ms->m_nodeid);
4815
4816	/* We sometimes receive a request from the dir node for this
4817	   rsb before we've received the dir node's loookup_reply for it.
4818	   The request from the dir node implies we're the master, so we set
4819	   ourself as master in receive_request_reply, and verify here that
4820	   we are indeed the master. */
4821
4822	if (r->res_master_nodeid && (r->res_master_nodeid != ret_nodeid)) {
4823		/* This should never happen */
4824		log_error(ls, "receive_lookup_reply %x from %d ret %d "
4825			  "master %d dir %d our %d first %x %s",
4826			  lkb->lkb_id, le32_to_cpu(ms->m_header.h_nodeid),
4827			  ret_nodeid, r->res_master_nodeid, r->res_dir_nodeid,
4828			  dlm_our_nodeid(), r->res_first_lkid, r->res_name);
4829	}
4830
4831	if (ret_nodeid == dlm_our_nodeid()) {
4832		r->res_master_nodeid = ret_nodeid;
4833		r->res_nodeid = 0;
4834		do_lookup_list = 1;
4835		r->res_first_lkid = 0;
4836	} else if (ret_nodeid == -1) {
4837		/* the remote node doesn't believe it's the dir node */
4838		log_error(ls, "receive_lookup_reply %x from %d bad ret_nodeid",
4839			  lkb->lkb_id, le32_to_cpu(ms->m_header.h_nodeid));
4840		r->res_master_nodeid = 0;
4841		r->res_nodeid = -1;
4842		lkb->lkb_nodeid = -1;
4843	} else {
4844		/* set_master() will set lkb_nodeid from r */
4845		r->res_master_nodeid = ret_nodeid;
4846		r->res_nodeid = ret_nodeid;
4847	}
4848
4849	if (is_overlap(lkb)) {
4850		log_debug(ls, "receive_lookup_reply %x unlock %x",
4851			  lkb->lkb_id, lkb->lkb_flags);
4852		queue_cast_overlap(r, lkb);
4853		unhold_lkb(lkb); /* undoes create_lkb() */
4854		goto out_list;
4855	}
4856
4857	_request_lock(r, lkb);
4858
4859 out_list:
4860	if (do_lookup_list)
4861		process_lookup_list(r);
4862 out:
4863	unlock_rsb(r);
4864	put_rsb(r);
4865	dlm_put_lkb(lkb);
4866}
4867
4868static void _receive_message(struct dlm_ls *ls, struct dlm_message *ms,
4869			     uint32_t saved_seq)
4870{
4871	int error = 0, noent = 0;
4872
4873	if (!dlm_is_member(ls, le32_to_cpu(ms->m_header.h_nodeid))) {
4874		log_limit(ls, "receive %d from non-member %d %x %x %d",
4875			  le32_to_cpu(ms->m_type),
4876			  le32_to_cpu(ms->m_header.h_nodeid),
4877			  le32_to_cpu(ms->m_lkid), le32_to_cpu(ms->m_remid),
4878			  from_dlm_errno(le32_to_cpu(ms->m_result)));
4879		return;
4880	}
4881
4882	switch (ms->m_type) {
4883
4884	/* messages sent to a master node */
4885
4886	case cpu_to_le32(DLM_MSG_REQUEST):
4887		error = receive_request(ls, ms);
4888		break;
4889
4890	case cpu_to_le32(DLM_MSG_CONVERT):
4891		error = receive_convert(ls, ms);
4892		break;
4893
4894	case cpu_to_le32(DLM_MSG_UNLOCK):
4895		error = receive_unlock(ls, ms);
4896		break;
4897
4898	case cpu_to_le32(DLM_MSG_CANCEL):
4899		noent = 1;
4900		error = receive_cancel(ls, ms);
4901		break;
4902
4903	/* messages sent from a master node (replies to above) */
4904
4905	case cpu_to_le32(DLM_MSG_REQUEST_REPLY):
4906		error = receive_request_reply(ls, ms);
4907		break;
4908
4909	case cpu_to_le32(DLM_MSG_CONVERT_REPLY):
4910		error = receive_convert_reply(ls, ms);
4911		break;
4912
4913	case cpu_to_le32(DLM_MSG_UNLOCK_REPLY):
4914		error = receive_unlock_reply(ls, ms);
4915		break;
4916
4917	case cpu_to_le32(DLM_MSG_CANCEL_REPLY):
4918		error = receive_cancel_reply(ls, ms);
4919		break;
4920
4921	/* messages sent from a master node (only two types of async msg) */
4922
4923	case cpu_to_le32(DLM_MSG_GRANT):
4924		noent = 1;
4925		error = receive_grant(ls, ms);
4926		break;
4927
4928	case cpu_to_le32(DLM_MSG_BAST):
4929		noent = 1;
4930		error = receive_bast(ls, ms);
4931		break;
4932
4933	/* messages sent to a dir node */
4934
4935	case cpu_to_le32(DLM_MSG_LOOKUP):
4936		receive_lookup(ls, ms);
4937		break;
4938
4939	case cpu_to_le32(DLM_MSG_REMOVE):
4940		receive_remove(ls, ms);
4941		break;
4942
4943	/* messages sent from a dir node (remove has no reply) */
4944
4945	case cpu_to_le32(DLM_MSG_LOOKUP_REPLY):
4946		receive_lookup_reply(ls, ms);
4947		break;
4948
4949	/* other messages */
4950
4951	case cpu_to_le32(DLM_MSG_PURGE):
4952		receive_purge(ls, ms);
4953		break;
4954
4955	default:
4956		log_error(ls, "unknown message type %d",
4957			  le32_to_cpu(ms->m_type));
4958	}
4959
4960	/*
4961	 * When checking for ENOENT, we're checking the result of
4962	 * find_lkb(m_remid):
4963	 *
4964	 * The lock id referenced in the message wasn't found.  This may
4965	 * happen in normal usage for the async messages and cancel, so
4966	 * only use log_debug for them.
4967	 *
4968	 * Some errors are expected and normal.
4969	 */
4970
4971	if (error == -ENOENT && noent) {
4972		log_debug(ls, "receive %d no %x remote %d %x saved_seq %u",
4973			  le32_to_cpu(ms->m_type), le32_to_cpu(ms->m_remid),
4974			  le32_to_cpu(ms->m_header.h_nodeid),
4975			  le32_to_cpu(ms->m_lkid), saved_seq);
4976	} else if (error == -ENOENT) {
4977		log_error(ls, "receive %d no %x remote %d %x saved_seq %u",
4978			  le32_to_cpu(ms->m_type), le32_to_cpu(ms->m_remid),
4979			  le32_to_cpu(ms->m_header.h_nodeid),
4980			  le32_to_cpu(ms->m_lkid), saved_seq);
4981
4982		if (ms->m_type == cpu_to_le32(DLM_MSG_CONVERT))
4983			dlm_dump_rsb_hash(ls, le32_to_cpu(ms->m_hash));
4984	}
4985
4986	if (error == -EINVAL) {
4987		log_error(ls, "receive %d inval from %d lkid %x remid %x "
4988			  "saved_seq %u",
4989			  le32_to_cpu(ms->m_type),
4990			  le32_to_cpu(ms->m_header.h_nodeid),
4991			  le32_to_cpu(ms->m_lkid), le32_to_cpu(ms->m_remid),
4992			  saved_seq);
4993	}
4994}
4995
4996/* If the lockspace is in recovery mode (locking stopped), then normal
4997   messages are saved on the requestqueue for processing after recovery is
4998   done.  When not in recovery mode, we wait for dlm_recoverd to drain saved
4999   messages off the requestqueue before we process new ones. This occurs right
5000   after recovery completes when we transition from saving all messages on
5001   requestqueue, to processing all the saved messages, to processing new
5002   messages as they arrive. */
5003
5004static void dlm_receive_message(struct dlm_ls *ls, struct dlm_message *ms,
5005				int nodeid)
5006{
5007	if (dlm_locking_stopped(ls)) {
5008		/* If we were a member of this lockspace, left, and rejoined,
5009		   other nodes may still be sending us messages from the
5010		   lockspace generation before we left. */
5011		if (!ls->ls_generation) {
5012			log_limit(ls, "receive %d from %d ignore old gen",
5013				  le32_to_cpu(ms->m_type), nodeid);
5014			return;
5015		}
5016
5017		dlm_add_requestqueue(ls, nodeid, ms);
5018	} else {
5019		dlm_wait_requestqueue(ls);
5020		_receive_message(ls, ms, 0);
5021	}
5022}
5023
5024/* This is called by dlm_recoverd to process messages that were saved on
5025   the requestqueue. */
5026
5027void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms,
5028			       uint32_t saved_seq)
5029{
5030	_receive_message(ls, ms, saved_seq);
5031}
5032
5033/* This is called by the midcomms layer when something is received for
5034   the lockspace.  It could be either a MSG (normal message sent as part of
5035   standard locking activity) or an RCOM (recovery message sent as part of
5036   lockspace recovery). */
5037
5038void dlm_receive_buffer(union dlm_packet *p, int nodeid)
5039{
5040	struct dlm_header *hd = &p->header;
5041	struct dlm_ls *ls;
5042	int type = 0;
5043
5044	switch (hd->h_cmd) {
5045	case DLM_MSG:
5046		type = le32_to_cpu(p->message.m_type);
5047		break;
5048	case DLM_RCOM:
5049		type = le32_to_cpu(p->rcom.rc_type);
5050		break;
5051	default:
5052		log_print("invalid h_cmd %d from %u", hd->h_cmd, nodeid);
5053		return;
5054	}
5055
5056	if (le32_to_cpu(hd->h_nodeid) != nodeid) {
5057		log_print("invalid h_nodeid %d from %d lockspace %x",
5058			  le32_to_cpu(hd->h_nodeid), nodeid,
5059			  le32_to_cpu(hd->u.h_lockspace));
5060		return;
5061	}
5062
5063	ls = dlm_find_lockspace_global(le32_to_cpu(hd->u.h_lockspace));
5064	if (!ls) {
5065		if (dlm_config.ci_log_debug) {
5066			printk_ratelimited(KERN_DEBUG "dlm: invalid lockspace "
5067				"%u from %d cmd %d type %d\n",
5068				le32_to_cpu(hd->u.h_lockspace), nodeid,
5069				hd->h_cmd, type);
5070		}
5071
5072		if (hd->h_cmd == DLM_RCOM && type == DLM_RCOM_STATUS)
5073			dlm_send_ls_not_ready(nodeid, &p->rcom);
5074		return;
5075	}
5076
5077	/* this rwsem allows dlm_ls_stop() to wait for all dlm_recv threads to
5078	   be inactive (in this ls) before transitioning to recovery mode */
5079
5080	down_read(&ls->ls_recv_active);
5081	if (hd->h_cmd == DLM_MSG)
5082		dlm_receive_message(ls, &p->message, nodeid);
5083	else
5084		dlm_receive_rcom(ls, &p->rcom, nodeid);
5085	up_read(&ls->ls_recv_active);
5086
5087	dlm_put_lockspace(ls);
5088}
5089
5090static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb,
5091				   struct dlm_message *ms_stub)
5092{
5093	if (middle_conversion(lkb)) {
5094		hold_lkb(lkb);
5095		memset(ms_stub, 0, sizeof(struct dlm_message));
5096		ms_stub->m_flags = cpu_to_le32(DLM_IFL_STUB_MS);
5097		ms_stub->m_type = cpu_to_le32(DLM_MSG_CONVERT_REPLY);
5098		ms_stub->m_result = cpu_to_le32(to_dlm_errno(-EINPROGRESS));
5099		ms_stub->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid);
5100		_receive_convert_reply(lkb, ms_stub);
5101
5102		/* Same special case as in receive_rcom_lock_args() */
5103		lkb->lkb_grmode = DLM_LOCK_IV;
5104		rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT);
5105		unhold_lkb(lkb);
5106
5107	} else if (lkb->lkb_rqmode >= lkb->lkb_grmode) {
5108		lkb->lkb_flags |= DLM_IFL_RESEND;
5109	}
5110
5111	/* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down
5112	   conversions are async; there's no reply from the remote master */
5113}
5114
5115/* A waiting lkb needs recovery if the master node has failed, or
5116   the master node is changing (only when no directory is used) */
5117
5118static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb,
5119				 int dir_nodeid)
5120{
5121	if (dlm_no_directory(ls))
5122		return 1;
5123
5124	if (dlm_is_removed(ls, lkb->lkb_wait_nodeid))
5125		return 1;
5126
5127	return 0;
5128}
5129
5130/* Recovery for locks that are waiting for replies from nodes that are now
5131   gone.  We can just complete unlocks and cancels by faking a reply from the
5132   dead node.  Requests and up-conversions we flag to be resent after
5133   recovery.  Down-conversions can just be completed with a fake reply like
5134   unlocks.  Conversions between PR and CW need special attention. */
5135
5136void dlm_recover_waiters_pre(struct dlm_ls *ls)
5137{
5138	struct dlm_lkb *lkb, *safe;
5139	struct dlm_message *ms_stub;
5140	int wait_type, stub_unlock_result, stub_cancel_result;
5141	int dir_nodeid;
5142
5143	ms_stub = kmalloc(sizeof(*ms_stub), GFP_KERNEL);
5144	if (!ms_stub)
5145		return;
5146
5147	mutex_lock(&ls->ls_waiters_mutex);
5148
5149	list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
5150
5151		dir_nodeid = dlm_dir_nodeid(lkb->lkb_resource);
5152
5153		/* exclude debug messages about unlocks because there can be so
5154		   many and they aren't very interesting */
5155
5156		if (lkb->lkb_wait_type != DLM_MSG_UNLOCK) {
5157			log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d "
5158				  "lkb_nodeid %d wait_nodeid %d dir_nodeid %d",
5159				  lkb->lkb_id,
5160				  lkb->lkb_remid,
5161				  lkb->lkb_wait_type,
5162				  lkb->lkb_resource->res_nodeid,
5163				  lkb->lkb_nodeid,
5164				  lkb->lkb_wait_nodeid,
5165				  dir_nodeid);
5166		}
5167
5168		/* all outstanding lookups, regardless of destination  will be
5169		   resent after recovery is done */
5170
5171		if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) {
5172			lkb->lkb_flags |= DLM_IFL_RESEND;
5173			continue;
5174		}
5175
5176		if (!waiter_needs_recovery(ls, lkb, dir_nodeid))
5177			continue;
5178
5179		wait_type = lkb->lkb_wait_type;
5180		stub_unlock_result = -DLM_EUNLOCK;
5181		stub_cancel_result = -DLM_ECANCEL;
5182
5183		/* Main reply may have been received leaving a zero wait_type,
5184		   but a reply for the overlapping op may not have been
5185		   received.  In that case we need to fake the appropriate
5186		   reply for the overlap op. */
5187
5188		if (!wait_type) {
5189			if (is_overlap_cancel(lkb)) {
5190				wait_type = DLM_MSG_CANCEL;
5191				if (lkb->lkb_grmode == DLM_LOCK_IV)
5192					stub_cancel_result = 0;
5193			}
5194			if (is_overlap_unlock(lkb)) {
5195				wait_type = DLM_MSG_UNLOCK;
5196				if (lkb->lkb_grmode == DLM_LOCK_IV)
5197					stub_unlock_result = -ENOENT;
5198			}
5199
5200			log_debug(ls, "rwpre overlap %x %x %d %d %d",
5201				  lkb->lkb_id, lkb->lkb_flags, wait_type,
5202				  stub_cancel_result, stub_unlock_result);
5203		}
5204
5205		switch (wait_type) {
5206
5207		case DLM_MSG_REQUEST:
5208			lkb->lkb_flags |= DLM_IFL_RESEND;
5209			break;
5210
5211		case DLM_MSG_CONVERT:
5212			recover_convert_waiter(ls, lkb, ms_stub);
5213			break;
5214
5215		case DLM_MSG_UNLOCK:
5216			hold_lkb(lkb);
5217			memset(ms_stub, 0, sizeof(struct dlm_message));
5218			ms_stub->m_flags = cpu_to_le32(DLM_IFL_STUB_MS);
5219			ms_stub->m_type = cpu_to_le32(DLM_MSG_UNLOCK_REPLY);
5220			ms_stub->m_result = cpu_to_le32(to_dlm_errno(stub_unlock_result));
5221			ms_stub->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid);
5222			_receive_unlock_reply(lkb, ms_stub);
5223			dlm_put_lkb(lkb);
5224			break;
5225
5226		case DLM_MSG_CANCEL:
5227			hold_lkb(lkb);
5228			memset(ms_stub, 0, sizeof(struct dlm_message));
5229			ms_stub->m_flags = cpu_to_le32(DLM_IFL_STUB_MS);
5230			ms_stub->m_type = cpu_to_le32(DLM_MSG_CANCEL_REPLY);
5231			ms_stub->m_result = cpu_to_le32(to_dlm_errno(stub_cancel_result));
5232			ms_stub->m_header.h_nodeid = cpu_to_le32(lkb->lkb_nodeid);
5233			_receive_cancel_reply(lkb, ms_stub);
5234			dlm_put_lkb(lkb);
5235			break;
5236
5237		default:
5238			log_error(ls, "invalid lkb wait_type %d %d",
5239				  lkb->lkb_wait_type, wait_type);
5240		}
5241		schedule();
5242	}
5243	mutex_unlock(&ls->ls_waiters_mutex);
5244	kfree(ms_stub);
5245}
5246
5247static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls)
5248{
5249	struct dlm_lkb *lkb = NULL, *iter;
5250
5251	mutex_lock(&ls->ls_waiters_mutex);
5252	list_for_each_entry(iter, &ls->ls_waiters, lkb_wait_reply) {
5253		if (iter->lkb_flags & DLM_IFL_RESEND) {
5254			hold_lkb(iter);
5255			lkb = iter;
5256			break;
5257		}
5258	}
5259	mutex_unlock(&ls->ls_waiters_mutex);
5260
5261	return lkb;
5262}
5263
5264/* Deal with lookups and lkb's marked RESEND from _pre.  We may now be the
5265   master or dir-node for r.  Processing the lkb may result in it being placed
5266   back on waiters. */
5267
5268/* We do this after normal locking has been enabled and any saved messages
5269   (in requestqueue) have been processed.  We should be confident that at
5270   this point we won't get or process a reply to any of these waiting
5271   operations.  But, new ops may be coming in on the rsbs/locks here from
5272   userspace or remotely. */
5273
5274/* there may have been an overlap unlock/cancel prior to recovery or after
5275   recovery.  if before, the lkb may still have a pos wait_count; if after, the
5276   overlap flag would just have been set and nothing new sent.  we can be
5277   confident here than any replies to either the initial op or overlap ops
5278   prior to recovery have been received. */
5279
5280int dlm_recover_waiters_post(struct dlm_ls *ls)
5281{
5282	struct dlm_lkb *lkb;
5283	struct dlm_rsb *r;
5284	int error = 0, mstype, err, oc, ou;
5285
5286	while (1) {
5287		if (dlm_locking_stopped(ls)) {
5288			log_debug(ls, "recover_waiters_post aborted");
5289			error = -EINTR;
5290			break;
5291		}
5292
5293		lkb = find_resend_waiter(ls);
5294		if (!lkb)
5295			break;
5296
5297		r = lkb->lkb_resource;
5298		hold_rsb(r);
5299		lock_rsb(r);
5300
5301		mstype = lkb->lkb_wait_type;
5302		oc = is_overlap_cancel(lkb);
5303		ou = is_overlap_unlock(lkb);
5304		err = 0;
5305
5306		log_debug(ls, "waiter %x remote %x msg %d r_nodeid %d "
5307			  "lkb_nodeid %d wait_nodeid %d dir_nodeid %d "
5308			  "overlap %d %d", lkb->lkb_id, lkb->lkb_remid, mstype,
5309			  r->res_nodeid, lkb->lkb_nodeid, lkb->lkb_wait_nodeid,
5310			  dlm_dir_nodeid(r), oc, ou);
5311
5312		/* At this point we assume that we won't get a reply to any
5313		   previous op or overlap op on this lock.  First, do a big
5314		   remove_from_waiters() for all previous ops. */
5315
5316		lkb->lkb_flags &= ~DLM_IFL_RESEND;
5317		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
5318		lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
5319		lkb->lkb_wait_type = 0;
5320		/* drop all wait_count references we still
5321		 * hold a reference for this iteration.
5322		 */
5323		while (lkb->lkb_wait_count) {
5324			lkb->lkb_wait_count--;
5325			unhold_lkb(lkb);
5326		}
5327		mutex_lock(&ls->ls_waiters_mutex);
5328		list_del_init(&lkb->lkb_wait_reply);
5329		mutex_unlock(&ls->ls_waiters_mutex);
5330
5331		if (oc || ou) {
5332			/* do an unlock or cancel instead of resending */
5333			switch (mstype) {
5334			case DLM_MSG_LOOKUP:
5335			case DLM_MSG_REQUEST:
5336				queue_cast(r, lkb, ou ? -DLM_EUNLOCK :
5337							-DLM_ECANCEL);
5338				unhold_lkb(lkb); /* undoes create_lkb() */
5339				break;
5340			case DLM_MSG_CONVERT:
5341				if (oc) {
5342					queue_cast(r, lkb, -DLM_ECANCEL);
5343				} else {
5344					lkb->lkb_exflags |= DLM_LKF_FORCEUNLOCK;
5345					_unlock_lock(r, lkb);
5346				}
5347				break;
5348			default:
5349				err = 1;
5350			}
5351		} else {
5352			switch (mstype) {
5353			case DLM_MSG_LOOKUP:
5354			case DLM_MSG_REQUEST:
5355				_request_lock(r, lkb);
5356				if (is_master(r))
5357					confirm_master(r, 0);
5358				break;
5359			case DLM_MSG_CONVERT:
5360				_convert_lock(r, lkb);
5361				break;
5362			default:
5363				err = 1;
5364			}
5365		}
5366
5367		if (err) {
5368			log_error(ls, "waiter %x msg %d r_nodeid %d "
5369				  "dir_nodeid %d overlap %d %d",
5370				  lkb->lkb_id, mstype, r->res_nodeid,
5371				  dlm_dir_nodeid(r), oc, ou);
5372		}
5373		unlock_rsb(r);
5374		put_rsb(r);
5375		dlm_put_lkb(lkb);
5376	}
5377
5378	return error;
5379}
5380
5381static void purge_mstcpy_list(struct dlm_ls *ls, struct dlm_rsb *r,
5382			      struct list_head *list)
5383{
5384	struct dlm_lkb *lkb, *safe;
5385
5386	list_for_each_entry_safe(lkb, safe, list, lkb_statequeue) {
5387		if (!is_master_copy(lkb))
5388			continue;
5389
5390		/* don't purge lkbs we've added in recover_master_copy for
5391		   the current recovery seq */
5392
5393		if (lkb->lkb_recover_seq == ls->ls_recover_seq)
5394			continue;
5395
5396		del_lkb(r, lkb);
5397
5398		/* this put should free the lkb */
5399		if (!dlm_put_lkb(lkb))
5400			log_error(ls, "purged mstcpy lkb not released");
5401	}
5402}
5403
5404void dlm_purge_mstcpy_locks(struct dlm_rsb *r)
5405{
5406	struct dlm_ls *ls = r->res_ls;
5407
5408	purge_mstcpy_list(ls, r, &r->res_grantqueue);
5409	purge_mstcpy_list(ls, r, &r->res_convertqueue);
5410	purge_mstcpy_list(ls, r, &r->res_waitqueue);
5411}
5412
5413static void purge_dead_list(struct dlm_ls *ls, struct dlm_rsb *r,
5414			    struct list_head *list,
5415			    int nodeid_gone, unsigned int *count)
5416{
5417	struct dlm_lkb *lkb, *safe;
5418
5419	list_for_each_entry_safe(lkb, safe, list, lkb_statequeue) {
5420		if (!is_master_copy(lkb))
5421			continue;
5422
5423		if ((lkb->lkb_nodeid == nodeid_gone) ||
5424		    dlm_is_removed(ls, lkb->lkb_nodeid)) {
5425
5426			/* tell recover_lvb to invalidate the lvb
5427			   because a node holding EX/PW failed */
5428			if ((lkb->lkb_exflags & DLM_LKF_VALBLK) &&
5429			    (lkb->lkb_grmode >= DLM_LOCK_PW)) {
5430				rsb_set_flag(r, RSB_RECOVER_LVB_INVAL);
5431			}
5432
5433			del_lkb(r, lkb);
5434
5435			/* this put should free the lkb */
5436			if (!dlm_put_lkb(lkb))
5437				log_error(ls, "purged dead lkb not released");
5438
5439			rsb_set_flag(r, RSB_RECOVER_GRANT);
5440
5441			(*count)++;
5442		}
5443	}
5444}
5445
5446/* Get rid of locks held by nodes that are gone. */
5447
5448void dlm_recover_purge(struct dlm_ls *ls)
5449{
5450	struct dlm_rsb *r;
5451	struct dlm_member *memb;
5452	int nodes_count = 0;
5453	int nodeid_gone = 0;
5454	unsigned int lkb_count = 0;
5455
5456	/* cache one removed nodeid to optimize the common
5457	   case of a single node removed */
5458
5459	list_for_each_entry(memb, &ls->ls_nodes_gone, list) {
5460		nodes_count++;
5461		nodeid_gone = memb->nodeid;
5462	}
5463
5464	if (!nodes_count)
5465		return;
5466
5467	down_write(&ls->ls_root_sem);
5468	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
5469		hold_rsb(r);
5470		lock_rsb(r);
5471		if (is_master(r)) {
5472			purge_dead_list(ls, r, &r->res_grantqueue,
5473					nodeid_gone, &lkb_count);
5474			purge_dead_list(ls, r, &r->res_convertqueue,
5475					nodeid_gone, &lkb_count);
5476			purge_dead_list(ls, r, &r->res_waitqueue,
5477					nodeid_gone, &lkb_count);
5478		}
5479		unlock_rsb(r);
5480		unhold_rsb(r);
5481		cond_resched();
5482	}
5483	up_write(&ls->ls_root_sem);
5484
5485	if (lkb_count)
5486		log_rinfo(ls, "dlm_recover_purge %u locks for %u nodes",
5487			  lkb_count, nodes_count);
5488}
5489
5490static struct dlm_rsb *find_grant_rsb(struct dlm_ls *ls, int bucket)
5491{
5492	struct rb_node *n;
5493	struct dlm_rsb *r;
5494
5495	spin_lock(&ls->ls_rsbtbl[bucket].lock);
5496	for (n = rb_first(&ls->ls_rsbtbl[bucket].keep); n; n = rb_next(n)) {
5497		r = rb_entry(n, struct dlm_rsb, res_hashnode);
5498
5499		if (!rsb_flag(r, RSB_RECOVER_GRANT))
5500			continue;
5501		if (!is_master(r)) {
5502			rsb_clear_flag(r, RSB_RECOVER_GRANT);
5503			continue;
5504		}
5505		hold_rsb(r);
5506		spin_unlock(&ls->ls_rsbtbl[bucket].lock);
5507		return r;
5508	}
5509	spin_unlock(&ls->ls_rsbtbl[bucket].lock);
5510	return NULL;
5511}
5512
5513/*
5514 * Attempt to grant locks on resources that we are the master of.
5515 * Locks may have become grantable during recovery because locks
5516 * from departed nodes have been purged (or not rebuilt), allowing
5517 * previously blocked locks to now be granted.  The subset of rsb's
5518 * we are interested in are those with lkb's on either the convert or
5519 * waiting queues.
5520 *
5521 * Simplest would be to go through each master rsb and check for non-empty
5522 * convert or waiting queues, and attempt to grant on those rsbs.
5523 * Checking the queues requires lock_rsb, though, for which we'd need
5524 * to release the rsbtbl lock.  This would make iterating through all
5525 * rsb's very inefficient.  So, we rely on earlier recovery routines
5526 * to set RECOVER_GRANT on any rsb's that we should attempt to grant
5527 * locks for.
5528 */
5529
5530void dlm_recover_grant(struct dlm_ls *ls)
5531{
5532	struct dlm_rsb *r;
5533	int bucket = 0;
5534	unsigned int count = 0;
5535	unsigned int rsb_count = 0;
5536	unsigned int lkb_count = 0;
5537
5538	while (1) {
5539		r = find_grant_rsb(ls, bucket);
5540		if (!r) {
5541			if (bucket == ls->ls_rsbtbl_size - 1)
5542				break;
5543			bucket++;
5544			continue;
5545		}
5546		rsb_count++;
5547		count = 0;
5548		lock_rsb(r);
5549		/* the RECOVER_GRANT flag is checked in the grant path */
5550		grant_pending_locks(r, &count);
5551		rsb_clear_flag(r, RSB_RECOVER_GRANT);
5552		lkb_count += count;
5553		confirm_master(r, 0);
5554		unlock_rsb(r);
5555		put_rsb(r);
5556		cond_resched();
5557	}
5558
5559	if (lkb_count)
5560		log_rinfo(ls, "dlm_recover_grant %u locks on %u resources",
5561			  lkb_count, rsb_count);
5562}
5563
5564static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid,
5565					 uint32_t remid)
5566{
5567	struct dlm_lkb *lkb;
5568
5569	list_for_each_entry(lkb, head, lkb_statequeue) {
5570		if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid)
5571			return lkb;
5572	}
5573	return NULL;
5574}
5575
5576static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid,
5577				    uint32_t remid)
5578{
5579	struct dlm_lkb *lkb;
5580
5581	lkb = search_remid_list(&r->res_grantqueue, nodeid, remid);
5582	if (lkb)
5583		return lkb;
5584	lkb = search_remid_list(&r->res_convertqueue, nodeid, remid);
5585	if (lkb)
5586		return lkb;
5587	lkb = search_remid_list(&r->res_waitqueue, nodeid, remid);
5588	if (lkb)
5589		return lkb;
5590	return NULL;
5591}
5592
5593/* needs at least dlm_rcom + rcom_lock */
5594static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
5595				  struct dlm_rsb *r, struct dlm_rcom *rc)
5596{
5597	struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
5598
5599	lkb->lkb_nodeid = le32_to_cpu(rc->rc_header.h_nodeid);
5600	lkb->lkb_ownpid = le32_to_cpu(rl->rl_ownpid);
5601	lkb->lkb_remid = le32_to_cpu(rl->rl_lkid);
5602	lkb->lkb_exflags = le32_to_cpu(rl->rl_exflags);
5603	lkb->lkb_flags = le32_to_cpu(rl->rl_flags) & 0x0000FFFF;
5604	lkb->lkb_flags |= DLM_IFL_MSTCPY;
5605	lkb->lkb_lvbseq = le32_to_cpu(rl->rl_lvbseq);
5606	lkb->lkb_rqmode = rl->rl_rqmode;
5607	lkb->lkb_grmode = rl->rl_grmode;
5608	/* don't set lkb_status because add_lkb wants to itself */
5609
5610	lkb->lkb_bastfn = (rl->rl_asts & DLM_CB_BAST) ? &fake_bastfn : NULL;
5611	lkb->lkb_astfn = (rl->rl_asts & DLM_CB_CAST) ? &fake_astfn : NULL;
5612
5613	if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
5614		int lvblen = le16_to_cpu(rc->rc_header.h_length) -
5615			sizeof(struct dlm_rcom) - sizeof(struct rcom_lock);
5616		if (lvblen > ls->ls_lvblen)
5617			return -EINVAL;
5618		lkb->lkb_lvbptr = dlm_allocate_lvb(ls);
5619		if (!lkb->lkb_lvbptr)
5620			return -ENOMEM;
5621		memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen);
5622	}
5623
5624	/* Conversions between PR and CW (middle modes) need special handling.
5625	   The real granted mode of these converting locks cannot be determined
5626	   until all locks have been rebuilt on the rsb (recover_conversion) */
5627
5628	if (rl->rl_wait_type == cpu_to_le16(DLM_MSG_CONVERT) &&
5629	    middle_conversion(lkb)) {
5630		rl->rl_status = DLM_LKSTS_CONVERT;
5631		lkb->lkb_grmode = DLM_LOCK_IV;
5632		rsb_set_flag(r, RSB_RECOVER_CONVERT);
5633	}
5634
5635	return 0;
5636}
5637
5638/* This lkb may have been recovered in a previous aborted recovery so we need
5639   to check if the rsb already has an lkb with the given remote nodeid/lkid.
5640   If so we just send back a standard reply.  If not, we create a new lkb with
5641   the given values and send back our lkid.  We send back our lkid by sending
5642   back the rcom_lock struct we got but with the remid field filled in. */
5643
5644/* needs at least dlm_rcom + rcom_lock */
5645int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
5646{
5647	struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
5648	struct dlm_rsb *r;
5649	struct dlm_lkb *lkb;
5650	uint32_t remid = 0;
5651	int from_nodeid = le32_to_cpu(rc->rc_header.h_nodeid);
5652	int error;
5653
5654	if (rl->rl_parent_lkid) {
5655		error = -EOPNOTSUPP;
5656		goto out;
5657	}
5658
5659	remid = le32_to_cpu(rl->rl_lkid);
5660
5661	/* In general we expect the rsb returned to be R_MASTER, but we don't
5662	   have to require it.  Recovery of masters on one node can overlap
5663	   recovery of locks on another node, so one node can send us MSTCPY
5664	   locks before we've made ourselves master of this rsb.  We can still
5665	   add new MSTCPY locks that we receive here without any harm; when
5666	   we make ourselves master, dlm_recover_masters() won't touch the
5667	   MSTCPY locks we've received early. */
5668
5669	error = find_rsb(ls, rl->rl_name, le16_to_cpu(rl->rl_namelen),
5670			 from_nodeid, R_RECEIVE_RECOVER, &r);
5671	if (error)
5672		goto out;
5673
5674	lock_rsb(r);
5675
5676	if (dlm_no_directory(ls) && (dlm_dir_nodeid(r) != dlm_our_nodeid())) {
5677		log_error(ls, "dlm_recover_master_copy remote %d %x not dir",
5678			  from_nodeid, remid);
5679		error = -EBADR;
5680		goto out_unlock;
5681	}
5682
5683	lkb = search_remid(r, from_nodeid, remid);
5684	if (lkb) {
5685		error = -EEXIST;
5686		goto out_remid;
5687	}
5688
5689	error = create_lkb(ls, &lkb);
5690	if (error)
5691		goto out_unlock;
5692
5693	error = receive_rcom_lock_args(ls, lkb, r, rc);
5694	if (error) {
5695		__put_lkb(ls, lkb);
5696		goto out_unlock;
5697	}
5698
5699	attach_lkb(r, lkb);
5700	add_lkb(r, lkb, rl->rl_status);
5701	ls->ls_recover_locks_in++;
5702
5703	if (!list_empty(&r->res_waitqueue) || !list_empty(&r->res_convertqueue))
5704		rsb_set_flag(r, RSB_RECOVER_GRANT);
5705
5706 out_remid:
5707	/* this is the new value returned to the lock holder for
5708	   saving in its process-copy lkb */
5709	rl->rl_remid = cpu_to_le32(lkb->lkb_id);
5710
5711	lkb->lkb_recover_seq = ls->ls_recover_seq;
5712
5713 out_unlock:
5714	unlock_rsb(r);
5715	put_rsb(r);
5716 out:
5717	if (error && error != -EEXIST)
5718		log_rinfo(ls, "dlm_recover_master_copy remote %d %x error %d",
5719			  from_nodeid, remid, error);
5720	rl->rl_result = cpu_to_le32(error);
5721	return error;
5722}
5723
5724/* needs at least dlm_rcom + rcom_lock */
5725int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc)
5726{
5727	struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf;
5728	struct dlm_rsb *r;
5729	struct dlm_lkb *lkb;
5730	uint32_t lkid, remid;
5731	int error, result;
5732
5733	lkid = le32_to_cpu(rl->rl_lkid);
5734	remid = le32_to_cpu(rl->rl_remid);
5735	result = le32_to_cpu(rl->rl_result);
5736
5737	error = find_lkb(ls, lkid, &lkb);
5738	if (error) {
5739		log_error(ls, "dlm_recover_process_copy no %x remote %d %x %d",
5740			  lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid,
5741			  result);
5742		return error;
5743	}
5744
5745	r = lkb->lkb_resource;
5746	hold_rsb(r);
5747	lock_rsb(r);
5748
5749	if (!is_process_copy(lkb)) {
5750		log_error(ls, "dlm_recover_process_copy bad %x remote %d %x %d",
5751			  lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid,
5752			  result);
5753		dlm_dump_rsb(r);
5754		unlock_rsb(r);
5755		put_rsb(r);
5756		dlm_put_lkb(lkb);
5757		return -EINVAL;
5758	}
5759
5760	switch (result) {
5761	case -EBADR:
5762		/* There's a chance the new master received our lock before
5763		   dlm_recover_master_reply(), this wouldn't happen if we did
5764		   a barrier between recover_masters and recover_locks. */
5765
5766		log_debug(ls, "dlm_recover_process_copy %x remote %d %x %d",
5767			  lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid,
5768			  result);
5769	
5770		dlm_send_rcom_lock(r, lkb);
5771		goto out;
5772	case -EEXIST:
5773	case 0:
5774		lkb->lkb_remid = remid;
5775		break;
5776	default:
5777		log_error(ls, "dlm_recover_process_copy %x remote %d %x %d unk",
5778			  lkid, le32_to_cpu(rc->rc_header.h_nodeid), remid,
5779			  result);
5780	}
5781
5782	/* an ack for dlm_recover_locks() which waits for replies from
5783	   all the locks it sends to new masters */
5784	dlm_recovered_lock(r);
5785 out:
5786	unlock_rsb(r);
5787	put_rsb(r);
5788	dlm_put_lkb(lkb);
5789
5790	return 0;
5791}
5792
5793#ifdef CONFIG_DLM_DEPRECATED_API
5794int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
5795		     int mode, uint32_t flags, void *name, unsigned int namelen,
5796		     unsigned long timeout_cs)
5797#else
5798int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
5799		     int mode, uint32_t flags, void *name, unsigned int namelen)
5800#endif
5801{
5802	struct dlm_lkb *lkb;
5803	struct dlm_args args;
5804	int error;
5805
5806	dlm_lock_recovery(ls);
5807
5808	error = create_lkb(ls, &lkb);
5809	if (error) {
5810		kfree(ua);
5811		goto out;
5812	}
5813
5814	if (flags & DLM_LKF_VALBLK) {
5815		ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
5816		if (!ua->lksb.sb_lvbptr) {
5817			kfree(ua);
5818			__put_lkb(ls, lkb);
5819			error = -ENOMEM;
5820			goto out;
5821		}
5822	}
5823#ifdef CONFIG_DLM_DEPRECATED_API
5824	error = set_lock_args(mode, &ua->lksb, flags, namelen, timeout_cs,
5825			      fake_astfn, ua, fake_bastfn, &args);
5826#else
5827	error = set_lock_args(mode, &ua->lksb, flags, namelen, fake_astfn, ua,
5828			      fake_bastfn, &args);
5829#endif
5830	if (error) {
5831		kfree(ua->lksb.sb_lvbptr);
5832		ua->lksb.sb_lvbptr = NULL;
5833		kfree(ua);
5834		__put_lkb(ls, lkb);
5835		goto out;
5836	}
5837
5838	/* After ua is attached to lkb it will be freed by dlm_free_lkb().
5839	   When DLM_IFL_USER is set, the dlm knows that this is a userspace
5840	   lock and that lkb_astparam is the dlm_user_args structure. */
5841	lkb->lkb_flags |= DLM_IFL_USER;
5842	error = request_lock(ls, lkb, name, namelen, &args);
5843
5844	switch (error) {
5845	case 0:
5846		break;
5847	case -EINPROGRESS:
5848		error = 0;
5849		break;
5850	case -EAGAIN:
5851		error = 0;
5852		fallthrough;
5853	default:
5854		__put_lkb(ls, lkb);
5855		goto out;
5856	}
5857
5858	/* add this new lkb to the per-process list of locks */
5859	spin_lock(&ua->proc->locks_spin);
5860	hold_lkb(lkb);
5861	list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
5862	spin_unlock(&ua->proc->locks_spin);
5863 out:
5864	dlm_unlock_recovery(ls);
5865	return error;
5866}
5867
5868#ifdef CONFIG_DLM_DEPRECATED_API
5869int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
5870		     int mode, uint32_t flags, uint32_t lkid, char *lvb_in,
5871		     unsigned long timeout_cs)
5872#else
5873int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
5874		     int mode, uint32_t flags, uint32_t lkid, char *lvb_in)
5875#endif
5876{
5877	struct dlm_lkb *lkb;
5878	struct dlm_args args;
5879	struct dlm_user_args *ua;
5880	int error;
5881
5882	dlm_lock_recovery(ls);
5883
5884	error = find_lkb(ls, lkid, &lkb);
5885	if (error)
5886		goto out;
5887
5888	/* user can change the params on its lock when it converts it, or
5889	   add an lvb that didn't exist before */
5890
5891	ua = lkb->lkb_ua;
5892
5893	if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) {
5894		ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
5895		if (!ua->lksb.sb_lvbptr) {
5896			error = -ENOMEM;
5897			goto out_put;
5898		}
5899	}
5900	if (lvb_in && ua->lksb.sb_lvbptr)
5901		memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
5902
5903	ua->xid = ua_tmp->xid;
5904	ua->castparam = ua_tmp->castparam;
5905	ua->castaddr = ua_tmp->castaddr;
5906	ua->bastparam = ua_tmp->bastparam;
5907	ua->bastaddr = ua_tmp->bastaddr;
5908	ua->user_lksb = ua_tmp->user_lksb;
5909
5910#ifdef CONFIG_DLM_DEPRECATED_API
5911	error = set_lock_args(mode, &ua->lksb, flags, 0, timeout_cs,
5912			      fake_astfn, ua, fake_bastfn, &args);
5913#else
5914	error = set_lock_args(mode, &ua->lksb, flags, 0, fake_astfn, ua,
5915			      fake_bastfn, &args);
5916#endif
5917	if (error)
5918		goto out_put;
5919
5920	error = convert_lock(ls, lkb, &args);
5921
5922	if (error == -EINPROGRESS || error == -EAGAIN || error == -EDEADLK)
5923		error = 0;
5924 out_put:
5925	dlm_put_lkb(lkb);
5926 out:
5927	dlm_unlock_recovery(ls);
5928	kfree(ua_tmp);
5929	return error;
5930}
5931
5932/*
5933 * The caller asks for an orphan lock on a given resource with a given mode.
5934 * If a matching lock exists, it's moved to the owner's list of locks and
5935 * the lkid is returned.
5936 */
5937
5938int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
5939		     int mode, uint32_t flags, void *name, unsigned int namelen,
5940		     uint32_t *lkid)
5941{
5942	struct dlm_lkb *lkb = NULL, *iter;
5943	struct dlm_user_args *ua;
5944	int found_other_mode = 0;
5945	int rv = 0;
5946
5947	mutex_lock(&ls->ls_orphans_mutex);
5948	list_for_each_entry(iter, &ls->ls_orphans, lkb_ownqueue) {
5949		if (iter->lkb_resource->res_length != namelen)
5950			continue;
5951		if (memcmp(iter->lkb_resource->res_name, name, namelen))
5952			continue;
5953		if (iter->lkb_grmode != mode) {
5954			found_other_mode = 1;
5955			continue;
5956		}
5957
5958		lkb = iter;
5959		list_del_init(&iter->lkb_ownqueue);
5960		iter->lkb_flags &= ~DLM_IFL_ORPHAN;
5961		*lkid = iter->lkb_id;
5962		break;
5963	}
5964	mutex_unlock(&ls->ls_orphans_mutex);
5965
5966	if (!lkb && found_other_mode) {
5967		rv = -EAGAIN;
5968		goto out;
5969	}
5970
5971	if (!lkb) {
5972		rv = -ENOENT;
5973		goto out;
5974	}
5975
5976	lkb->lkb_exflags = flags;
5977	lkb->lkb_ownpid = (int) current->pid;
5978
5979	ua = lkb->lkb_ua;
5980
5981	ua->proc = ua_tmp->proc;
5982	ua->xid = ua_tmp->xid;
5983	ua->castparam = ua_tmp->castparam;
5984	ua->castaddr = ua_tmp->castaddr;
5985	ua->bastparam = ua_tmp->bastparam;
5986	ua->bastaddr = ua_tmp->bastaddr;
5987	ua->user_lksb = ua_tmp->user_lksb;
5988
5989	/*
5990	 * The lkb reference from the ls_orphans list was not
5991	 * removed above, and is now considered the reference
5992	 * for the proc locks list.
5993	 */
5994
5995	spin_lock(&ua->proc->locks_spin);
5996	list_add_tail(&lkb->lkb_ownqueue, &ua->proc->locks);
5997	spin_unlock(&ua->proc->locks_spin);
5998 out:
5999	kfree(ua_tmp);
6000	return rv;
6001}
6002
6003int dlm_user_unlock(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
6004		    uint32_t flags, uint32_t lkid, char *lvb_in)
6005{
6006	struct dlm_lkb *lkb;
6007	struct dlm_args args;
6008	struct dlm_user_args *ua;
6009	int error;
6010
6011	dlm_lock_recovery(ls);
6012
6013	error = find_lkb(ls, lkid, &lkb);
6014	if (error)
6015		goto out;
6016
6017	ua = lkb->lkb_ua;
6018
6019	if (lvb_in && ua->lksb.sb_lvbptr)
6020		memcpy(ua->lksb.sb_lvbptr, lvb_in, DLM_USER_LVB_LEN);
6021	if (ua_tmp->castparam)
6022		ua->castparam = ua_tmp->castparam;
6023	ua->user_lksb = ua_tmp->user_lksb;
6024
6025	error = set_unlock_args(flags, ua, &args);
6026	if (error)
6027		goto out_put;
6028
6029	error = unlock_lock(ls, lkb, &args);
6030
6031	if (error == -DLM_EUNLOCK)
6032		error = 0;
6033	/* from validate_unlock_args() */
6034	if (error == -EBUSY && (flags & DLM_LKF_FORCEUNLOCK))
6035		error = 0;
6036	if (error)
6037		goto out_put;
6038
6039	spin_lock(&ua->proc->locks_spin);
6040	/* dlm_user_add_cb() may have already taken lkb off the proc list */
6041	if (!list_empty(&lkb->lkb_ownqueue))
6042		list_move(&lkb->lkb_ownqueue, &ua->proc->unlocking);
6043	spin_unlock(&ua->proc->locks_spin);
6044 out_put:
6045	dlm_put_lkb(lkb);
6046 out:
6047	dlm_unlock_recovery(ls);
6048	kfree(ua_tmp);
6049	return error;
6050}
6051
6052int dlm_user_cancel(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
6053		    uint32_t flags, uint32_t lkid)
6054{
6055	struct dlm_lkb *lkb;
6056	struct dlm_args args;
6057	struct dlm_user_args *ua;
6058	int error;
6059
6060	dlm_lock_recovery(ls);
6061
6062	error = find_lkb(ls, lkid, &lkb);
6063	if (error)
6064		goto out;
6065
6066	ua = lkb->lkb_ua;
6067	if (ua_tmp->castparam)
6068		ua->castparam = ua_tmp->castparam;
6069	ua->user_lksb = ua_tmp->user_lksb;
6070
6071	error = set_unlock_args(flags, ua, &args);
6072	if (error)
6073		goto out_put;
6074
6075	error = cancel_lock(ls, lkb, &args);
6076
6077	if (error == -DLM_ECANCEL)
6078		error = 0;
6079	/* from validate_unlock_args() */
6080	if (error == -EBUSY)
6081		error = 0;
6082 out_put:
6083	dlm_put_lkb(lkb);
6084 out:
6085	dlm_unlock_recovery(ls);
6086	kfree(ua_tmp);
6087	return error;
6088}
6089
6090int dlm_user_deadlock(struct dlm_ls *ls, uint32_t flags, uint32_t lkid)
6091{
6092	struct dlm_lkb *lkb;
6093	struct dlm_args args;
6094	struct dlm_user_args *ua;
6095	struct dlm_rsb *r;
6096	int error;
6097
6098	dlm_lock_recovery(ls);
6099
6100	error = find_lkb(ls, lkid, &lkb);
6101	if (error)
6102		goto out;
6103
6104	ua = lkb->lkb_ua;
6105
6106	error = set_unlock_args(flags, ua, &args);
6107	if (error)
6108		goto out_put;
6109
6110	/* same as cancel_lock(), but set DEADLOCK_CANCEL after lock_rsb */
6111
6112	r = lkb->lkb_resource;
6113	hold_rsb(r);
6114	lock_rsb(r);
6115
6116	error = validate_unlock_args(lkb, &args);
6117	if (error)
6118		goto out_r;
6119	lkb->lkb_flags |= DLM_IFL_DEADLOCK_CANCEL;
6120
6121	error = _cancel_lock(r, lkb);
6122 out_r:
6123	unlock_rsb(r);
6124	put_rsb(r);
6125
6126	if (error == -DLM_ECANCEL)
6127		error = 0;
6128	/* from validate_unlock_args() */
6129	if (error == -EBUSY)
6130		error = 0;
6131 out_put:
6132	dlm_put_lkb(lkb);
6133 out:
6134	dlm_unlock_recovery(ls);
6135	return error;
6136}
6137
6138/* lkb's that are removed from the waiters list by revert are just left on the
6139   orphans list with the granted orphan locks, to be freed by purge */
6140
6141static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
6142{
6143	struct dlm_args args;
6144	int error;
6145
6146	hold_lkb(lkb); /* reference for the ls_orphans list */
6147	mutex_lock(&ls->ls_orphans_mutex);
6148	list_add_tail(&lkb->lkb_ownqueue, &ls->ls_orphans);
6149	mutex_unlock(&ls->ls_orphans_mutex);
6150
6151	set_unlock_args(0, lkb->lkb_ua, &args);
6152
6153	error = cancel_lock(ls, lkb, &args);
6154	if (error == -DLM_ECANCEL)
6155		error = 0;
6156	return error;
6157}
6158
6159/* The FORCEUNLOCK flag allows the unlock to go ahead even if the lkb isn't
6160   granted.  Regardless of what rsb queue the lock is on, it's removed and
6161   freed.  The IVVALBLK flag causes the lvb on the resource to be invalidated
6162   if our lock is PW/EX (it's ignored if our granted mode is smaller.) */
6163
6164static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
6165{
6166	struct dlm_args args;
6167	int error;
6168
6169	set_unlock_args(DLM_LKF_FORCEUNLOCK | DLM_LKF_IVVALBLK,
6170			lkb->lkb_ua, &args);
6171
6172	error = unlock_lock(ls, lkb, &args);
6173	if (error == -DLM_EUNLOCK)
6174		error = 0;
6175	return error;
6176}
6177
6178/* We have to release clear_proc_locks mutex before calling unlock_proc_lock()
6179   (which does lock_rsb) due to deadlock with receiving a message that does
6180   lock_rsb followed by dlm_user_add_cb() */
6181
6182static struct dlm_lkb *del_proc_lock(struct dlm_ls *ls,
6183				     struct dlm_user_proc *proc)
6184{
6185	struct dlm_lkb *lkb = NULL;
6186
6187	mutex_lock(&ls->ls_clear_proc_locks);
6188	if (list_empty(&proc->locks))
6189		goto out;
6190
6191	lkb = list_entry(proc->locks.next, struct dlm_lkb, lkb_ownqueue);
6192	list_del_init(&lkb->lkb_ownqueue);
6193
6194	if (lkb->lkb_exflags & DLM_LKF_PERSISTENT)
6195		lkb->lkb_flags |= DLM_IFL_ORPHAN;
6196	else
6197		lkb->lkb_flags |= DLM_IFL_DEAD;
6198 out:
6199	mutex_unlock(&ls->ls_clear_proc_locks);
6200	return lkb;
6201}
6202
6203/* The ls_clear_proc_locks mutex protects against dlm_user_add_cb() which
6204   1) references lkb->ua which we free here and 2) adds lkbs to proc->asts,
6205   which we clear here. */
6206
6207/* proc CLOSING flag is set so no more device_reads should look at proc->asts
6208   list, and no more device_writes should add lkb's to proc->locks list; so we
6209   shouldn't need to take asts_spin or locks_spin here.  this assumes that
6210   device reads/writes/closes are serialized -- FIXME: we may need to serialize
6211   them ourself. */
6212
6213void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
6214{
6215	struct dlm_lkb *lkb, *safe;
6216
6217	dlm_lock_recovery(ls);
6218
6219	while (1) {
6220		lkb = del_proc_lock(ls, proc);
6221		if (!lkb)
6222			break;
6223		del_timeout(lkb);
6224		if (lkb->lkb_exflags & DLM_LKF_PERSISTENT)
6225			orphan_proc_lock(ls, lkb);
6226		else
6227			unlock_proc_lock(ls, lkb);
6228
6229		/* this removes the reference for the proc->locks list
6230		   added by dlm_user_request, it may result in the lkb
6231		   being freed */
6232
6233		dlm_put_lkb(lkb);
6234	}
6235
6236	mutex_lock(&ls->ls_clear_proc_locks);
6237
6238	/* in-progress unlocks */
6239	list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) {
6240		list_del_init(&lkb->lkb_ownqueue);
6241		lkb->lkb_flags |= DLM_IFL_DEAD;
6242		dlm_put_lkb(lkb);
6243	}
6244
6245	list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) {
6246		memset(&lkb->lkb_callbacks, 0,
6247		       sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE);
6248		list_del_init(&lkb->lkb_cb_list);
6249		dlm_put_lkb(lkb);
6250	}
6251
6252	mutex_unlock(&ls->ls_clear_proc_locks);
6253	dlm_unlock_recovery(ls);
6254}
6255
6256static void purge_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
6257{
6258	struct dlm_lkb *lkb, *safe;
6259
6260	while (1) {
6261		lkb = NULL;
6262		spin_lock(&proc->locks_spin);
6263		if (!list_empty(&proc->locks)) {
6264			lkb = list_entry(proc->locks.next, struct dlm_lkb,
6265					 lkb_ownqueue);
6266			list_del_init(&lkb->lkb_ownqueue);
6267		}
6268		spin_unlock(&proc->locks_spin);
6269
6270		if (!lkb)
6271			break;
6272
6273		lkb->lkb_flags |= DLM_IFL_DEAD;
6274		unlock_proc_lock(ls, lkb);
6275		dlm_put_lkb(lkb); /* ref from proc->locks list */
6276	}
6277
6278	spin_lock(&proc->locks_spin);
6279	list_for_each_entry_safe(lkb, safe, &proc->unlocking, lkb_ownqueue) {
6280		list_del_init(&lkb->lkb_ownqueue);
6281		lkb->lkb_flags |= DLM_IFL_DEAD;
6282		dlm_put_lkb(lkb);
6283	}
6284	spin_unlock(&proc->locks_spin);
6285
6286	spin_lock(&proc->asts_spin);
6287	list_for_each_entry_safe(lkb, safe, &proc->asts, lkb_cb_list) {
6288		memset(&lkb->lkb_callbacks, 0,
6289		       sizeof(struct dlm_callback) * DLM_CALLBACKS_SIZE);
6290		list_del_init(&lkb->lkb_cb_list);
6291		dlm_put_lkb(lkb);
6292	}
6293	spin_unlock(&proc->asts_spin);
6294}
6295
6296/* pid of 0 means purge all orphans */
6297
6298static void do_purge(struct dlm_ls *ls, int nodeid, int pid)
6299{
6300	struct dlm_lkb *lkb, *safe;
6301
6302	mutex_lock(&ls->ls_orphans_mutex);
6303	list_for_each_entry_safe(lkb, safe, &ls->ls_orphans, lkb_ownqueue) {
6304		if (pid && lkb->lkb_ownpid != pid)
6305			continue;
6306		unlock_proc_lock(ls, lkb);
6307		list_del_init(&lkb->lkb_ownqueue);
6308		dlm_put_lkb(lkb);
6309	}
6310	mutex_unlock(&ls->ls_orphans_mutex);
6311}
6312
6313static int send_purge(struct dlm_ls *ls, int nodeid, int pid)
6314{
6315	struct dlm_message *ms;
6316	struct dlm_mhandle *mh;
6317	int error;
6318
6319	error = _create_message(ls, sizeof(struct dlm_message), nodeid,
6320				DLM_MSG_PURGE, &ms, &mh);
6321	if (error)
6322		return error;
6323	ms->m_nodeid = cpu_to_le32(nodeid);
6324	ms->m_pid = cpu_to_le32(pid);
6325
6326	return send_message(mh, ms);
6327}
6328
6329int dlm_user_purge(struct dlm_ls *ls, struct dlm_user_proc *proc,
6330		   int nodeid, int pid)
6331{
6332	int error = 0;
6333
6334	if (nodeid && (nodeid != dlm_our_nodeid())) {
6335		error = send_purge(ls, nodeid, pid);
6336	} else {
6337		dlm_lock_recovery(ls);
6338		if (pid == current->pid)
6339			purge_proc_locks(ls, proc);
6340		else
6341			do_purge(ls, nodeid, pid);
6342		dlm_unlock_recovery(ls);
6343	}
6344	return error;
6345}
6346
6347/* debug functionality */
6348int dlm_debug_add_lkb(struct dlm_ls *ls, uint32_t lkb_id, char *name, int len,
6349		      int lkb_nodeid, unsigned int lkb_flags, int lkb_status)
6350{
6351	struct dlm_lksb *lksb;
6352	struct dlm_lkb *lkb;
6353	struct dlm_rsb *r;
6354	int error;
6355
6356	/* we currently can't set a valid user lock */
6357	if (lkb_flags & DLM_IFL_USER)
6358		return -EOPNOTSUPP;
6359
6360	lksb = kzalloc(sizeof(*lksb), GFP_NOFS);
6361	if (!lksb)
6362		return -ENOMEM;
6363
6364	error = _create_lkb(ls, &lkb, lkb_id, lkb_id + 1);
6365	if (error) {
6366		kfree(lksb);
6367		return error;
6368	}
6369
6370	lkb->lkb_flags = lkb_flags;
6371	lkb->lkb_nodeid = lkb_nodeid;
6372	lkb->lkb_lksb = lksb;
6373	/* user specific pointer, just don't have it NULL for kernel locks */
6374	if (~lkb_flags & DLM_IFL_USER)
6375		lkb->lkb_astparam = (void *)0xDEADBEEF;
6376
6377	error = find_rsb(ls, name, len, 0, R_REQUEST, &r);
6378	if (error) {
6379		kfree(lksb);
6380		__put_lkb(ls, lkb);
6381		return error;
6382	}
6383
6384	lock_rsb(r);
6385	attach_lkb(r, lkb);
6386	add_lkb(r, lkb, lkb_status);
6387	unlock_rsb(r);
6388	put_rsb(r);
6389
6390	return 0;
6391}
6392
/*
 * Look up the lkb with id @lkb_id and put it on the waiters list for
 * @mstype / @to_nodeid.  Returns the find_lkb() error, or else the
 * add_to_waiters() result.
 */
int dlm_debug_add_lkb_to_waiters(struct dlm_ls *ls, uint32_t lkb_id,
				 int mstype, int to_nodeid)
{
	struct dlm_lkb *lkb;
	int rv;

	rv = find_lkb(ls, lkb_id, &lkb);
	if (!rv) {
		rv = add_to_waiters(lkb, mstype, to_nodeid);
		/* drop the reference obtained by the lookup */
		dlm_put_lkb(lkb);
	}

	return rv;
}
6407