Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
kernel os linux

[DLM] The core of the DLM for GFS2/CLVM

This is the core of the distributed lock manager which is required
to use GFS2 as a cluster filesystem. It is also used by CLVM and
can be used as a standalone lock manager independently of either
of these two projects.

It implements VAX-style locking modes.

Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Steve Whitehouse <swhiteho@redhat.com>

authored by

David Teigland and committed by
Steven Whitehouse
e7fd4179 e4731420

+12178
+1
fs/Kconfig
··· 1831 1831 endmenu 1832 1832 1833 1833 source "fs/nls/Kconfig" 1834 + source "fs/dlm/Kconfig" 1834 1835 1835 1836 endmenu 1836 1837
+1
fs/Makefile
··· 48 48 obj-y += devpts/ 49 49 50 50 obj-$(CONFIG_PROFILING) += dcookies.o 51 + obj-$(CONFIG_DLM) += dlm/ 51 52 52 53 # Do not add any filesystems before this line 53 54 obj-$(CONFIG_REISERFS_FS) += reiserfs/
+30
fs/dlm/Kconfig
··· 1 + menu "Distributed Lock Manager" 2 + depends on INET && EXPERIMENTAL 3 + 4 + config DLM 5 + tristate "Distributed Lock Manager (DLM)" 6 + depends on SYSFS 7 + depends on IPV6 || IPV6=n 8 + select IP_SCTP 9 + select CONFIGFS_FS 10 + help 11 + A general purpose distributed lock manager for kernel or userspace 12 + applications. 13 + 14 + config DLM_DEVICE 15 + tristate "DLM device for userspace access" 16 + depends on DLM 17 + help 18 + This module creates a misc device through which the dlm lockspace 19 + and locking functions become available to userspace applications 20 + (usually through the libdlm library). 21 + 22 + config DLM_DEBUG 23 + bool "DLM debugging" 24 + depends on DLM 25 + help 26 + Under the debugfs mount point, the name of each lockspace will 27 + appear as a file in the "dlm" directory. The output is the 28 + list of resource and locks the local node knows about. 29 + 30 + endmenu
+21
fs/dlm/Makefile
··· 1 + obj-$(CONFIG_DLM) += dlm.o 2 + obj-$(CONFIG_DLM_DEVICE) += dlm_device.o 3 + 4 + dlm-y := ast.o \ 5 + config.o \ 6 + dir.o \ 7 + lock.o \ 8 + lockspace.o \ 9 + lowcomms.o \ 10 + main.o \ 11 + member.o \ 12 + memory.o \ 13 + midcomms.o \ 14 + rcom.o \ 15 + recover.o \ 16 + recoverd.o \ 17 + requestqueue.o \ 18 + util.o 19 + dlm-$(CONFIG_DLM_DEBUG) += debug_fs.o 20 + 21 + dlm_device-y := device.o
+167
fs/dlm/ast.c
··· 1 + /****************************************************************************** 2 + ******************************************************************************* 3 + ** 4 + ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 5 + ** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 6 + ** 7 + ** This copyrighted material is made available to anyone wishing to use, 8 + ** modify, copy, or redistribute it subject to the terms and conditions 9 + ** of the GNU General Public License v.2. 10 + ** 11 + ******************************************************************************* 12 + ******************************************************************************/ 13 + 14 + #include "dlm_internal.h" 15 + #include "lock.h" 16 + #include "ast.h" 17 + 18 + #define WAKE_ASTS 0 19 + 20 + static struct list_head ast_queue; 21 + static spinlock_t ast_queue_lock; 22 + static struct task_struct * astd_task; 23 + static unsigned long astd_wakeflags; 24 + static struct semaphore astd_running; 25 + 26 + 27 + void dlm_del_ast(struct dlm_lkb *lkb) 28 + { 29 + spin_lock(&ast_queue_lock); 30 + if (lkb->lkb_ast_type & (AST_COMP | AST_BAST)) 31 + list_del(&lkb->lkb_astqueue); 32 + spin_unlock(&ast_queue_lock); 33 + } 34 + 35 + void dlm_add_ast(struct dlm_lkb *lkb, int type) 36 + { 37 + spin_lock(&ast_queue_lock); 38 + if (!(lkb->lkb_ast_type & (AST_COMP | AST_BAST))) { 39 + kref_get(&lkb->lkb_ref); 40 + list_add_tail(&lkb->lkb_astqueue, &ast_queue); 41 + } 42 + lkb->lkb_ast_type |= type; 43 + spin_unlock(&ast_queue_lock); 44 + 45 + set_bit(WAKE_ASTS, &astd_wakeflags); 46 + wake_up_process(astd_task); 47 + } 48 + 49 + static void process_asts(void) 50 + { 51 + struct dlm_ls *ls = NULL; 52 + struct dlm_rsb *r = NULL; 53 + struct dlm_lkb *lkb; 54 + void (*cast) (long param); 55 + void (*bast) (long param, int mode); 56 + int type = 0, found, bmode; 57 + 58 + for (;;) { 59 + found = FALSE; 60 + spin_lock(&ast_queue_lock); 61 + list_for_each_entry(lkb, 
&ast_queue, lkb_astqueue) { 62 + r = lkb->lkb_resource; 63 + ls = r->res_ls; 64 + 65 + if (dlm_locking_stopped(ls)) 66 + continue; 67 + 68 + list_del(&lkb->lkb_astqueue); 69 + type = lkb->lkb_ast_type; 70 + lkb->lkb_ast_type = 0; 71 + found = TRUE; 72 + break; 73 + } 74 + spin_unlock(&ast_queue_lock); 75 + 76 + if (!found) 77 + break; 78 + 79 + cast = lkb->lkb_astaddr; 80 + bast = lkb->lkb_bastaddr; 81 + bmode = lkb->lkb_bastmode; 82 + 83 + if ((type & AST_COMP) && cast) 84 + cast(lkb->lkb_astparam); 85 + 86 + /* FIXME: Is it safe to look at lkb_grmode here 87 + without doing a lock_rsb() ? 88 + Look at other checks in v1 to avoid basts. */ 89 + 90 + if ((type & AST_BAST) && bast) 91 + if (!dlm_modes_compat(lkb->lkb_grmode, bmode)) 92 + bast(lkb->lkb_astparam, bmode); 93 + 94 + /* this removes the reference added by dlm_add_ast 95 + and may result in the lkb being freed */ 96 + dlm_put_lkb(lkb); 97 + 98 + schedule(); 99 + } 100 + } 101 + 102 + static inline int no_asts(void) 103 + { 104 + int ret; 105 + 106 + spin_lock(&ast_queue_lock); 107 + ret = list_empty(&ast_queue); 108 + spin_unlock(&ast_queue_lock); 109 + return ret; 110 + } 111 + 112 + static int dlm_astd(void *data) 113 + { 114 + while (!kthread_should_stop()) { 115 + set_current_state(TASK_INTERRUPTIBLE); 116 + if (!test_bit(WAKE_ASTS, &astd_wakeflags)) 117 + schedule(); 118 + set_current_state(TASK_RUNNING); 119 + 120 + down(&astd_running); 121 + if (test_and_clear_bit(WAKE_ASTS, &astd_wakeflags)) 122 + process_asts(); 123 + up(&astd_running); 124 + } 125 + return 0; 126 + } 127 + 128 + void dlm_astd_wake(void) 129 + { 130 + if (!no_asts()) { 131 + set_bit(WAKE_ASTS, &astd_wakeflags); 132 + wake_up_process(astd_task); 133 + } 134 + } 135 + 136 + int dlm_astd_start(void) 137 + { 138 + struct task_struct *p; 139 + int error = 0; 140 + 141 + INIT_LIST_HEAD(&ast_queue); 142 + spin_lock_init(&ast_queue_lock); 143 + init_MUTEX(&astd_running); 144 + 145 + p = kthread_run(dlm_astd, NULL, "dlm_astd"); 146 + if 
(IS_ERR(p)) 147 + error = PTR_ERR(p); 148 + else 149 + astd_task = p; 150 + return error; 151 + } 152 + 153 + void dlm_astd_stop(void) 154 + { 155 + kthread_stop(astd_task); 156 + } 157 + 158 + void dlm_astd_suspend(void) 159 + { 160 + down(&astd_running); 161 + } 162 + 163 + void dlm_astd_resume(void) 164 + { 165 + up(&astd_running); 166 + } 167 +
+26
fs/dlm/ast.h
··· 1 + /****************************************************************************** 2 + ******************************************************************************* 3 + ** 4 + ** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 5 + ** 6 + ** This copyrighted material is made available to anyone wishing to use, 7 + ** modify, copy, or redistribute it subject to the terms and conditions 8 + ** of the GNU General Public License v.2. 9 + ** 10 + ******************************************************************************* 11 + ******************************************************************************/ 12 + 13 + #ifndef __ASTD_DOT_H__ 14 + #define __ASTD_DOT_H__ 15 + 16 + void dlm_add_ast(struct dlm_lkb *lkb, int type); 17 + void dlm_del_ast(struct dlm_lkb *lkb); 18 + 19 + void dlm_astd_wake(void); 20 + int dlm_astd_start(void); 21 + void dlm_astd_stop(void); 22 + void dlm_astd_suspend(void); 23 + void dlm_astd_resume(void); 24 + 25 + #endif 26 +
+787
fs/dlm/config.c
··· 1 + /****************************************************************************** 2 + ******************************************************************************* 3 + ** 4 + ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 5 + ** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 6 + ** 7 + ** This copyrighted material is made available to anyone wishing to use, 8 + ** modify, copy, or redistribute it subject to the terms and conditions 9 + ** of the GNU General Public License v.2. 10 + ** 11 + ******************************************************************************* 12 + ******************************************************************************/ 13 + 14 + #include <linux/kernel.h> 15 + #include <linux/module.h> 16 + #include <linux/configfs.h> 17 + #include <net/sock.h> 18 + 19 + #include "config.h" 20 + 21 + /* 22 + * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/nodeid 23 + * /config/dlm/<cluster>/spaces/<space>/nodes/<node>/weight 24 + * /config/dlm/<cluster>/comms/<comm>/nodeid 25 + * /config/dlm/<cluster>/comms/<comm>/local 26 + * /config/dlm/<cluster>/comms/<comm>/addr 27 + * The <cluster> level is useless, but I haven't figured out how to avoid it. 
28 + */ 29 + 30 + static struct config_group *space_list; 31 + static struct config_group *comm_list; 32 + static struct comm *local_comm; 33 + 34 + struct clusters; 35 + struct cluster; 36 + struct spaces; 37 + struct space; 38 + struct comms; 39 + struct comm; 40 + struct nodes; 41 + struct node; 42 + 43 + static struct config_group *make_cluster(struct config_group *, const char *); 44 + static void drop_cluster(struct config_group *, struct config_item *); 45 + static void release_cluster(struct config_item *); 46 + static struct config_group *make_space(struct config_group *, const char *); 47 + static void drop_space(struct config_group *, struct config_item *); 48 + static void release_space(struct config_item *); 49 + static struct config_item *make_comm(struct config_group *, const char *); 50 + static void drop_comm(struct config_group *, struct config_item *); 51 + static void release_comm(struct config_item *); 52 + static struct config_item *make_node(struct config_group *, const char *); 53 + static void drop_node(struct config_group *, struct config_item *); 54 + static void release_node(struct config_item *); 55 + 56 + static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, 57 + char *buf); 58 + static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a, 59 + const char *buf, size_t len); 60 + static ssize_t show_node(struct config_item *i, struct configfs_attribute *a, 61 + char *buf); 62 + static ssize_t store_node(struct config_item *i, struct configfs_attribute *a, 63 + const char *buf, size_t len); 64 + 65 + static ssize_t comm_nodeid_read(struct comm *cm, char *buf); 66 + static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len); 67 + static ssize_t comm_local_read(struct comm *cm, char *buf); 68 + static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len); 69 + static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len); 70 + static ssize_t 
node_nodeid_read(struct node *nd, char *buf); 71 + static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len); 72 + static ssize_t node_weight_read(struct node *nd, char *buf); 73 + static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len); 74 + 75 + enum { 76 + COMM_ATTR_NODEID = 0, 77 + COMM_ATTR_LOCAL, 78 + COMM_ATTR_ADDR, 79 + }; 80 + 81 + struct comm_attribute { 82 + struct configfs_attribute attr; 83 + ssize_t (*show)(struct comm *, char *); 84 + ssize_t (*store)(struct comm *, const char *, size_t); 85 + }; 86 + 87 + static struct comm_attribute comm_attr_nodeid = { 88 + .attr = { .ca_owner = THIS_MODULE, 89 + .ca_name = "nodeid", 90 + .ca_mode = S_IRUGO | S_IWUSR }, 91 + .show = comm_nodeid_read, 92 + .store = comm_nodeid_write, 93 + }; 94 + 95 + static struct comm_attribute comm_attr_local = { 96 + .attr = { .ca_owner = THIS_MODULE, 97 + .ca_name = "local", 98 + .ca_mode = S_IRUGO | S_IWUSR }, 99 + .show = comm_local_read, 100 + .store = comm_local_write, 101 + }; 102 + 103 + static struct comm_attribute comm_attr_addr = { 104 + .attr = { .ca_owner = THIS_MODULE, 105 + .ca_name = "addr", 106 + .ca_mode = S_IRUGO | S_IWUSR }, 107 + .store = comm_addr_write, 108 + }; 109 + 110 + static struct configfs_attribute *comm_attrs[] = { 111 + [COMM_ATTR_NODEID] = &comm_attr_nodeid.attr, 112 + [COMM_ATTR_LOCAL] = &comm_attr_local.attr, 113 + [COMM_ATTR_ADDR] = &comm_attr_addr.attr, 114 + NULL, 115 + }; 116 + 117 + enum { 118 + NODE_ATTR_NODEID = 0, 119 + NODE_ATTR_WEIGHT, 120 + }; 121 + 122 + struct node_attribute { 123 + struct configfs_attribute attr; 124 + ssize_t (*show)(struct node *, char *); 125 + ssize_t (*store)(struct node *, const char *, size_t); 126 + }; 127 + 128 + static struct node_attribute node_attr_nodeid = { 129 + .attr = { .ca_owner = THIS_MODULE, 130 + .ca_name = "nodeid", 131 + .ca_mode = S_IRUGO | S_IWUSR }, 132 + .show = node_nodeid_read, 133 + .store = node_nodeid_write, 134 + }; 135 + 136 + static 
struct node_attribute node_attr_weight = { 137 + .attr = { .ca_owner = THIS_MODULE, 138 + .ca_name = "weight", 139 + .ca_mode = S_IRUGO | S_IWUSR }, 140 + .show = node_weight_read, 141 + .store = node_weight_write, 142 + }; 143 + 144 + static struct configfs_attribute *node_attrs[] = { 145 + [NODE_ATTR_NODEID] = &node_attr_nodeid.attr, 146 + [NODE_ATTR_WEIGHT] = &node_attr_weight.attr, 147 + NULL, 148 + }; 149 + 150 + struct clusters { 151 + struct configfs_subsystem subsys; 152 + }; 153 + 154 + struct cluster { 155 + struct config_group group; 156 + }; 157 + 158 + struct spaces { 159 + struct config_group ss_group; 160 + }; 161 + 162 + struct space { 163 + struct config_group group; 164 + struct list_head members; 165 + struct semaphore members_lock; 166 + int members_count; 167 + }; 168 + 169 + struct comms { 170 + struct config_group cs_group; 171 + }; 172 + 173 + struct comm { 174 + struct config_item item; 175 + int nodeid; 176 + int local; 177 + int addr_count; 178 + struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; 179 + }; 180 + 181 + struct nodes { 182 + struct config_group ns_group; 183 + }; 184 + 185 + struct node { 186 + struct config_item item; 187 + struct list_head list; /* space->members */ 188 + int nodeid; 189 + int weight; 190 + }; 191 + 192 + static struct configfs_group_operations clusters_ops = { 193 + .make_group = make_cluster, 194 + .drop_item = drop_cluster, 195 + }; 196 + 197 + static struct configfs_item_operations cluster_ops = { 198 + .release = release_cluster, 199 + }; 200 + 201 + static struct configfs_group_operations spaces_ops = { 202 + .make_group = make_space, 203 + .drop_item = drop_space, 204 + }; 205 + 206 + static struct configfs_item_operations space_ops = { 207 + .release = release_space, 208 + }; 209 + 210 + static struct configfs_group_operations comms_ops = { 211 + .make_item = make_comm, 212 + .drop_item = drop_comm, 213 + }; 214 + 215 + static struct configfs_item_operations comm_ops = { 216 + .release = 
release_comm, 217 + .show_attribute = show_comm, 218 + .store_attribute = store_comm, 219 + }; 220 + 221 + static struct configfs_group_operations nodes_ops = { 222 + .make_item = make_node, 223 + .drop_item = drop_node, 224 + }; 225 + 226 + static struct configfs_item_operations node_ops = { 227 + .release = release_node, 228 + .show_attribute = show_node, 229 + .store_attribute = store_node, 230 + }; 231 + 232 + static struct config_item_type clusters_type = { 233 + .ct_group_ops = &clusters_ops, 234 + .ct_owner = THIS_MODULE, 235 + }; 236 + 237 + static struct config_item_type cluster_type = { 238 + .ct_item_ops = &cluster_ops, 239 + .ct_owner = THIS_MODULE, 240 + }; 241 + 242 + static struct config_item_type spaces_type = { 243 + .ct_group_ops = &spaces_ops, 244 + .ct_owner = THIS_MODULE, 245 + }; 246 + 247 + static struct config_item_type space_type = { 248 + .ct_item_ops = &space_ops, 249 + .ct_owner = THIS_MODULE, 250 + }; 251 + 252 + static struct config_item_type comms_type = { 253 + .ct_group_ops = &comms_ops, 254 + .ct_owner = THIS_MODULE, 255 + }; 256 + 257 + static struct config_item_type comm_type = { 258 + .ct_item_ops = &comm_ops, 259 + .ct_attrs = comm_attrs, 260 + .ct_owner = THIS_MODULE, 261 + }; 262 + 263 + static struct config_item_type nodes_type = { 264 + .ct_group_ops = &nodes_ops, 265 + .ct_owner = THIS_MODULE, 266 + }; 267 + 268 + static struct config_item_type node_type = { 269 + .ct_item_ops = &node_ops, 270 + .ct_attrs = node_attrs, 271 + .ct_owner = THIS_MODULE, 272 + }; 273 + 274 + static struct cluster *to_cluster(struct config_item *i) 275 + { 276 + return i ? container_of(to_config_group(i), struct cluster, group):NULL; 277 + } 278 + 279 + static struct space *to_space(struct config_item *i) 280 + { 281 + return i ? container_of(to_config_group(i), struct space, group) : NULL; 282 + } 283 + 284 + static struct comm *to_comm(struct config_item *i) 285 + { 286 + return i ? 
container_of(i, struct comm, item) : NULL; 287 + } 288 + 289 + static struct node *to_node(struct config_item *i) 290 + { 291 + return i ? container_of(i, struct node, item) : NULL; 292 + } 293 + 294 + static struct config_group *make_cluster(struct config_group *g, 295 + const char *name) 296 + { 297 + struct cluster *cl = NULL; 298 + struct spaces *sps = NULL; 299 + struct comms *cms = NULL; 300 + void *gps = NULL; 301 + 302 + cl = kzalloc(sizeof(struct cluster), GFP_KERNEL); 303 + gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); 304 + sps = kzalloc(sizeof(struct spaces), GFP_KERNEL); 305 + cms = kzalloc(sizeof(struct comms), GFP_KERNEL); 306 + 307 + if (!cl || !gps || !sps || !cms) 308 + goto fail; 309 + 310 + config_group_init_type_name(&cl->group, name, &cluster_type); 311 + config_group_init_type_name(&sps->ss_group, "spaces", &spaces_type); 312 + config_group_init_type_name(&cms->cs_group, "comms", &comms_type); 313 + 314 + cl->group.default_groups = gps; 315 + cl->group.default_groups[0] = &sps->ss_group; 316 + cl->group.default_groups[1] = &cms->cs_group; 317 + cl->group.default_groups[2] = NULL; 318 + 319 + space_list = &sps->ss_group; 320 + comm_list = &cms->cs_group; 321 + return &cl->group; 322 + 323 + fail: 324 + kfree(cl); 325 + kfree(gps); 326 + kfree(sps); 327 + kfree(cms); 328 + return NULL; 329 + } 330 + 331 + static void drop_cluster(struct config_group *g, struct config_item *i) 332 + { 333 + struct cluster *cl = to_cluster(i); 334 + struct config_item *tmp; 335 + int j; 336 + 337 + for (j = 0; cl->group.default_groups[j]; j++) { 338 + tmp = &cl->group.default_groups[j]->cg_item; 339 + cl->group.default_groups[j] = NULL; 340 + config_item_put(tmp); 341 + } 342 + 343 + space_list = NULL; 344 + comm_list = NULL; 345 + 346 + config_item_put(i); 347 + } 348 + 349 + static void release_cluster(struct config_item *i) 350 + { 351 + struct cluster *cl = to_cluster(i); 352 + kfree(cl->group.default_groups); 353 + kfree(cl); 354 + } 355 + 356 
+ static struct config_group *make_space(struct config_group *g, const char *name) 357 + { 358 + struct space *sp = NULL; 359 + struct nodes *nds = NULL; 360 + void *gps = NULL; 361 + 362 + sp = kzalloc(sizeof(struct space), GFP_KERNEL); 363 + gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL); 364 + nds = kzalloc(sizeof(struct nodes), GFP_KERNEL); 365 + 366 + if (!sp || !gps || !nds) 367 + goto fail; 368 + 369 + config_group_init_type_name(&sp->group, name, &space_type); 370 + config_group_init_type_name(&nds->ns_group, "nodes", &nodes_type); 371 + 372 + sp->group.default_groups = gps; 373 + sp->group.default_groups[0] = &nds->ns_group; 374 + sp->group.default_groups[1] = NULL; 375 + 376 + INIT_LIST_HEAD(&sp->members); 377 + init_MUTEX(&sp->members_lock); 378 + sp->members_count = 0; 379 + return &sp->group; 380 + 381 + fail: 382 + kfree(sp); 383 + kfree(gps); 384 + kfree(nds); 385 + return NULL; 386 + } 387 + 388 + static void drop_space(struct config_group *g, struct config_item *i) 389 + { 390 + struct space *sp = to_space(i); 391 + struct config_item *tmp; 392 + int j; 393 + 394 + /* assert list_empty(&sp->members) */ 395 + 396 + for (j = 0; sp->group.default_groups[j]; j++) { 397 + tmp = &sp->group.default_groups[j]->cg_item; 398 + sp->group.default_groups[j] = NULL; 399 + config_item_put(tmp); 400 + } 401 + 402 + config_item_put(i); 403 + } 404 + 405 + static void release_space(struct config_item *i) 406 + { 407 + struct space *sp = to_space(i); 408 + kfree(sp->group.default_groups); 409 + kfree(sp); 410 + } 411 + 412 + static struct config_item *make_comm(struct config_group *g, const char *name) 413 + { 414 + struct comm *cm; 415 + 416 + cm = kzalloc(sizeof(struct comm), GFP_KERNEL); 417 + if (!cm) 418 + return NULL; 419 + 420 + config_item_init_type_name(&cm->item, name, &comm_type); 421 + cm->nodeid = -1; 422 + cm->local = 0; 423 + cm->addr_count = 0; 424 + return &cm->item; 425 + } 426 + 427 + static void drop_comm(struct config_group *g, 
struct config_item *i) 428 + { 429 + struct comm *cm = to_comm(i); 430 + if (local_comm == cm) 431 + local_comm = NULL; 432 + while (cm->addr_count--) 433 + kfree(cm->addr[cm->addr_count]); 434 + config_item_put(i); 435 + } 436 + 437 + static void release_comm(struct config_item *i) 438 + { 439 + struct comm *cm = to_comm(i); 440 + kfree(cm); 441 + } 442 + 443 + static struct config_item *make_node(struct config_group *g, const char *name) 444 + { 445 + struct space *sp = to_space(g->cg_item.ci_parent); 446 + struct node *nd; 447 + 448 + nd = kzalloc(sizeof(struct node), GFP_KERNEL); 449 + if (!nd) 450 + return NULL; 451 + 452 + config_item_init_type_name(&nd->item, name, &node_type); 453 + nd->nodeid = -1; 454 + nd->weight = 1; /* default weight of 1 if none is set */ 455 + 456 + down(&sp->members_lock); 457 + list_add(&nd->list, &sp->members); 458 + sp->members_count++; 459 + up(&sp->members_lock); 460 + 461 + return &nd->item; 462 + } 463 + 464 + static void drop_node(struct config_group *g, struct config_item *i) 465 + { 466 + struct space *sp = to_space(g->cg_item.ci_parent); 467 + struct node *nd = to_node(i); 468 + 469 + down(&sp->members_lock); 470 + list_del(&nd->list); 471 + sp->members_count--; 472 + up(&sp->members_lock); 473 + 474 + config_item_put(i); 475 + } 476 + 477 + static void release_node(struct config_item *i) 478 + { 479 + struct node *nd = to_node(i); 480 + kfree(nd); 481 + } 482 + 483 + static struct clusters clusters_root = { 484 + .subsys = { 485 + .su_group = { 486 + .cg_item = { 487 + .ci_namebuf = "dlm", 488 + .ci_type = &clusters_type, 489 + }, 490 + }, 491 + }, 492 + }; 493 + 494 + int dlm_config_init(void) 495 + { 496 + config_group_init(&clusters_root.subsys.su_group); 497 + init_MUTEX(&clusters_root.subsys.su_sem); 498 + return configfs_register_subsystem(&clusters_root.subsys); 499 + } 500 + 501 + void dlm_config_exit(void) 502 + { 503 + configfs_unregister_subsystem(&clusters_root.subsys); 504 + } 505 + 506 + /* 507 + * 
Functions for user space to read/write attributes 508 + */ 509 + 510 + static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, 511 + char *buf) 512 + { 513 + struct comm *cm = to_comm(i); 514 + struct comm_attribute *cma = 515 + container_of(a, struct comm_attribute, attr); 516 + return cma->show ? cma->show(cm, buf) : 0; 517 + } 518 + 519 + static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a, 520 + const char *buf, size_t len) 521 + { 522 + struct comm *cm = to_comm(i); 523 + struct comm_attribute *cma = 524 + container_of(a, struct comm_attribute, attr); 525 + return cma->store ? cma->store(cm, buf, len) : -EINVAL; 526 + } 527 + 528 + static ssize_t comm_nodeid_read(struct comm *cm, char *buf) 529 + { 530 + return sprintf(buf, "%d\n", cm->nodeid); 531 + } 532 + 533 + static ssize_t comm_nodeid_write(struct comm *cm, const char *buf, size_t len) 534 + { 535 + cm->nodeid = simple_strtol(buf, NULL, 0); 536 + return len; 537 + } 538 + 539 + static ssize_t comm_local_read(struct comm *cm, char *buf) 540 + { 541 + return sprintf(buf, "%d\n", cm->local); 542 + } 543 + 544 + static ssize_t comm_local_write(struct comm *cm, const char *buf, size_t len) 545 + { 546 + cm->local= simple_strtol(buf, NULL, 0); 547 + if (cm->local && !local_comm) 548 + local_comm = cm; 549 + return len; 550 + } 551 + 552 + static ssize_t comm_addr_write(struct comm *cm, const char *buf, size_t len) 553 + { 554 + struct sockaddr_storage *addr; 555 + 556 + if (len != sizeof(struct sockaddr_storage)) 557 + return -EINVAL; 558 + 559 + if (cm->addr_count >= DLM_MAX_ADDR_COUNT) 560 + return -ENOSPC; 561 + 562 + addr = kzalloc(sizeof(*addr), GFP_KERNEL); 563 + if (!addr) 564 + return -ENOMEM; 565 + 566 + memcpy(addr, buf, len); 567 + cm->addr[cm->addr_count++] = addr; 568 + return len; 569 + } 570 + 571 + static ssize_t show_node(struct config_item *i, struct configfs_attribute *a, 572 + char *buf) 573 + { 574 + struct node *nd = to_node(i); 575 + struct 
node_attribute *nda = 576 + container_of(a, struct node_attribute, attr); 577 + return nda->show ? nda->show(nd, buf) : 0; 578 + } 579 + 580 + static ssize_t store_node(struct config_item *i, struct configfs_attribute *a, 581 + const char *buf, size_t len) 582 + { 583 + struct node *nd = to_node(i); 584 + struct node_attribute *nda = 585 + container_of(a, struct node_attribute, attr); 586 + return nda->store ? nda->store(nd, buf, len) : -EINVAL; 587 + } 588 + 589 + static ssize_t node_nodeid_read(struct node *nd, char *buf) 590 + { 591 + return sprintf(buf, "%d\n", nd->nodeid); 592 + } 593 + 594 + static ssize_t node_nodeid_write(struct node *nd, const char *buf, size_t len) 595 + { 596 + nd->nodeid = simple_strtol(buf, NULL, 0); 597 + return len; 598 + } 599 + 600 + static ssize_t node_weight_read(struct node *nd, char *buf) 601 + { 602 + return sprintf(buf, "%d\n", nd->weight); 603 + } 604 + 605 + static ssize_t node_weight_write(struct node *nd, const char *buf, size_t len) 606 + { 607 + nd->weight = simple_strtol(buf, NULL, 0); 608 + return len; 609 + } 610 + 611 + /* 612 + * Functions for the dlm to get the info that's been configured 613 + */ 614 + 615 + static struct space *get_space(char *name) 616 + { 617 + if (!space_list) 618 + return NULL; 619 + return to_space(config_group_find_obj(space_list, name)); 620 + } 621 + 622 + static void put_space(struct space *sp) 623 + { 624 + config_item_put(&sp->group.cg_item); 625 + } 626 + 627 + static struct comm *get_comm(int nodeid, struct sockaddr_storage *addr) 628 + { 629 + struct config_item *i; 630 + struct comm *cm = NULL; 631 + int found = 0; 632 + 633 + if (!comm_list) 634 + return NULL; 635 + 636 + down(&clusters_root.subsys.su_sem); 637 + 638 + list_for_each_entry(i, &comm_list->cg_children, ci_entry) { 639 + cm = to_comm(i); 640 + 641 + if (nodeid) { 642 + if (cm->nodeid != nodeid) 643 + continue; 644 + found = 1; 645 + break; 646 + } else { 647 + if (!cm->addr_count || 648 + memcmp(cm->addr[0], addr, 
sizeof(*addr))) 649 + continue; 650 + found = 1; 651 + break; 652 + } 653 + } 654 + up(&clusters_root.subsys.su_sem); 655 + 656 + if (found) 657 + config_item_get(i); 658 + else 659 + cm = NULL; 660 + return cm; 661 + } 662 + 663 + static void put_comm(struct comm *cm) 664 + { 665 + config_item_put(&cm->item); 666 + } 667 + 668 + /* caller must free mem */ 669 + int dlm_nodeid_list(char *lsname, int **ids_out) 670 + { 671 + struct space *sp; 672 + struct node *nd; 673 + int i = 0, rv = 0; 674 + int *ids; 675 + 676 + sp = get_space(lsname); 677 + if (!sp) 678 + return -EEXIST; 679 + 680 + down(&sp->members_lock); 681 + if (!sp->members_count) { 682 + rv = 0; 683 + goto out; 684 + } 685 + 686 + ids = kcalloc(sp->members_count, sizeof(int), GFP_KERNEL); 687 + if (!ids) { 688 + rv = -ENOMEM; 689 + goto out; 690 + } 691 + 692 + rv = sp->members_count; 693 + list_for_each_entry(nd, &sp->members, list) 694 + ids[i++] = nd->nodeid; 695 + 696 + if (rv != i) 697 + printk("bad nodeid count %d %d\n", rv, i); 698 + 699 + *ids_out = ids; 700 + out: 701 + up(&sp->members_lock); 702 + put_space(sp); 703 + return rv; 704 + } 705 + 706 + int dlm_node_weight(char *lsname, int nodeid) 707 + { 708 + struct space *sp; 709 + struct node *nd; 710 + int w = -EEXIST; 711 + 712 + sp = get_space(lsname); 713 + if (!sp) 714 + goto out; 715 + 716 + down(&sp->members_lock); 717 + list_for_each_entry(nd, &sp->members, list) { 718 + if (nd->nodeid != nodeid) 719 + continue; 720 + w = nd->weight; 721 + break; 722 + } 723 + up(&sp->members_lock); 724 + put_space(sp); 725 + out: 726 + return w; 727 + } 728 + 729 + int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr) 730 + { 731 + struct comm *cm = get_comm(nodeid, NULL); 732 + if (!cm) 733 + return -EEXIST; 734 + if (!cm->addr_count) 735 + return -ENOENT; 736 + memcpy(addr, cm->addr[0], sizeof(*addr)); 737 + put_comm(cm); 738 + return 0; 739 + } 740 + 741 + int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid) 742 + { 743 + 
struct comm *cm = get_comm(0, addr); 744 + if (!cm) 745 + return -EEXIST; 746 + *nodeid = cm->nodeid; 747 + put_comm(cm); 748 + return 0; 749 + } 750 + 751 + int dlm_our_nodeid(void) 752 + { 753 + return local_comm ? local_comm->nodeid : 0; 754 + } 755 + 756 + /* num 0 is first addr, num 1 is second addr */ 757 + int dlm_our_addr(struct sockaddr_storage *addr, int num) 758 + { 759 + if (!local_comm) 760 + return -1; 761 + if (num + 1 > local_comm->addr_count) 762 + return -1; 763 + memcpy(addr, local_comm->addr[num], sizeof(*addr)); 764 + return 0; 765 + } 766 + 767 + /* Config file defaults */ 768 + #define DEFAULT_TCP_PORT 21064 769 + #define DEFAULT_BUFFER_SIZE 4096 770 + #define DEFAULT_RSBTBL_SIZE 256 771 + #define DEFAULT_LKBTBL_SIZE 1024 772 + #define DEFAULT_DIRTBL_SIZE 512 773 + #define DEFAULT_RECOVER_TIMER 5 774 + #define DEFAULT_TOSS_SECS 10 775 + #define DEFAULT_SCAN_SECS 5 776 + 777 + struct dlm_config_info dlm_config = { 778 + .tcp_port = DEFAULT_TCP_PORT, 779 + .buffer_size = DEFAULT_BUFFER_SIZE, 780 + .rsbtbl_size = DEFAULT_RSBTBL_SIZE, 781 + .lkbtbl_size = DEFAULT_LKBTBL_SIZE, 782 + .dirtbl_size = DEFAULT_DIRTBL_SIZE, 783 + .recover_timer = DEFAULT_RECOVER_TIMER, 784 + .toss_secs = DEFAULT_TOSS_SECS, 785 + .scan_secs = DEFAULT_SCAN_SECS 786 + }; 787 +
+42
fs/dlm/config.h
··· 1 + /****************************************************************************** 2 + ******************************************************************************* 3 + ** 4 + ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 5 + ** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 6 + ** 7 + ** This copyrighted material is made available to anyone wishing to use, 8 + ** modify, copy, or redistribute it subject to the terms and conditions 9 + ** of the GNU General Public License v.2. 10 + ** 11 + ******************************************************************************* 12 + ******************************************************************************/ 13 + 14 + #ifndef __CONFIG_DOT_H__ 15 + #define __CONFIG_DOT_H__ 16 + 17 + #define DLM_MAX_ADDR_COUNT 3 18 + 19 + struct dlm_config_info { 20 + int tcp_port; 21 + int buffer_size; 22 + int rsbtbl_size; 23 + int lkbtbl_size; 24 + int dirtbl_size; 25 + int recover_timer; 26 + int toss_secs; 27 + int scan_secs; 28 + }; 29 + 30 + extern struct dlm_config_info dlm_config; 31 + 32 + int dlm_config_init(void); 33 + void dlm_config_exit(void); 34 + int dlm_node_weight(char *lsname, int nodeid); 35 + int dlm_nodeid_list(char *lsname, int **ids_out); 36 + int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr); 37 + int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid); 38 + int dlm_our_nodeid(void); 39 + int dlm_our_addr(struct sockaddr_storage *addr, int num); 40 + 41 + #endif /* __CONFIG_DOT_H__ */ 42 +
+310
fs/dlm/debug_fs.c
··· 1 + /****************************************************************************** 2 + ******************************************************************************* 3 + ** 4 + ** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 5 + ** 6 + ** This copyrighted material is made available to anyone wishing to use, 7 + ** modify, copy, or redistribute it subject to the terms and conditions 8 + ** of the GNU General Public License v.2. 9 + ** 10 + ******************************************************************************* 11 + ******************************************************************************/ 12 + 13 + #include <linux/pagemap.h> 14 + #include <linux/seq_file.h> 15 + #include <linux/module.h> 16 + #include <linux/ctype.h> 17 + #include <linux/debugfs.h> 18 + 19 + #include "dlm_internal.h" 20 + 21 + 22 + static struct dentry *dlm_root; 23 + 24 + struct rsb_iter { 25 + int entry; 26 + struct dlm_ls *ls; 27 + struct list_head *next; 28 + struct dlm_rsb *rsb; 29 + }; 30 + 31 + static char *print_lockmode(int mode) 32 + { 33 + switch (mode) { 34 + case DLM_LOCK_IV: 35 + return "--"; 36 + case DLM_LOCK_NL: 37 + return "NL"; 38 + case DLM_LOCK_CR: 39 + return "CR"; 40 + case DLM_LOCK_CW: 41 + return "CW"; 42 + case DLM_LOCK_PR: 43 + return "PR"; 44 + case DLM_LOCK_PW: 45 + return "PW"; 46 + case DLM_LOCK_EX: 47 + return "EX"; 48 + default: 49 + return "??"; 50 + } 51 + } 52 + 53 + static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, 54 + struct dlm_rsb *res) 55 + { 56 + seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode)); 57 + 58 + if (lkb->lkb_status == DLM_LKSTS_CONVERT 59 + || lkb->lkb_status == DLM_LKSTS_WAITING) 60 + seq_printf(s, " (%s)", print_lockmode(lkb->lkb_rqmode)); 61 + 62 + if (lkb->lkb_range) { 63 + /* FIXME: this warns on Alpha */ 64 + if (lkb->lkb_status == DLM_LKSTS_CONVERT 65 + || lkb->lkb_status == DLM_LKSTS_GRANTED) 66 + seq_printf(s, " %" PRIx64 "-%" PRIx64, 67 + lkb->lkb_range[GR_RANGE_START], 68 + 
lkb->lkb_range[GR_RANGE_END]); 69 + if (lkb->lkb_status == DLM_LKSTS_CONVERT 70 + || lkb->lkb_status == DLM_LKSTS_WAITING) 71 + seq_printf(s, " (%" PRIx64 "-%" PRIx64 ")", 72 + lkb->lkb_range[RQ_RANGE_START], 73 + lkb->lkb_range[RQ_RANGE_END]); 74 + } 75 + 76 + if (lkb->lkb_nodeid) { 77 + if (lkb->lkb_nodeid != res->res_nodeid) 78 + seq_printf(s, " Remote: %3d %08x", lkb->lkb_nodeid, 79 + lkb->lkb_remid); 80 + else 81 + seq_printf(s, " Master: %08x", lkb->lkb_remid); 82 + } 83 + 84 + if (lkb->lkb_wait_type) 85 + seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); 86 + 87 + seq_printf(s, "\n"); 88 + } 89 + 90 + static int print_resource(struct dlm_rsb *res, struct seq_file *s) 91 + { 92 + struct dlm_lkb *lkb; 93 + int i, lvblen = res->res_ls->ls_lvblen; 94 + 95 + seq_printf(s, "\nResource %p Name (len=%d) \"", res, res->res_length); 96 + for (i = 0; i < res->res_length; i++) { 97 + if (isprint(res->res_name[i])) 98 + seq_printf(s, "%c", res->res_name[i]); 99 + else 100 + seq_printf(s, "%c", '.'); 101 + } 102 + if (res->res_nodeid > 0) 103 + seq_printf(s, "\" \nLocal Copy, Master is node %d\n", 104 + res->res_nodeid); 105 + else if (res->res_nodeid == 0) 106 + seq_printf(s, "\" \nMaster Copy\n"); 107 + else if (res->res_nodeid == -1) 108 + seq_printf(s, "\" \nLooking up master (lkid %x)\n", 109 + res->res_first_lkid); 110 + else 111 + seq_printf(s, "\" \nInvalid master %d\n", res->res_nodeid); 112 + 113 + /* Print the LVB: */ 114 + if (res->res_lvbptr) { 115 + seq_printf(s, "LVB: "); 116 + for (i = 0; i < lvblen; i++) { 117 + if (i == lvblen / 2) 118 + seq_printf(s, "\n "); 119 + seq_printf(s, "%02x ", 120 + (unsigned char) res->res_lvbptr[i]); 121 + } 122 + if (rsb_flag(res, RSB_VALNOTVALID)) 123 + seq_printf(s, " (INVALID)"); 124 + seq_printf(s, "\n"); 125 + } 126 + 127 + /* Print the locks attached to this resource */ 128 + seq_printf(s, "Granted Queue\n"); 129 + list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) 130 + print_lock(s, lkb, res); 131 + 
132 + seq_printf(s, "Conversion Queue\n"); 133 + list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) 134 + print_lock(s, lkb, res); 135 + 136 + seq_printf(s, "Waiting Queue\n"); 137 + list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) 138 + print_lock(s, lkb, res); 139 + 140 + return 0; 141 + } 142 + 143 + static int rsb_iter_next(struct rsb_iter *ri) 144 + { 145 + struct dlm_ls *ls = ri->ls; 146 + int i; 147 + 148 + if (!ri->next) { 149 + top: 150 + /* Find the next non-empty hash bucket */ 151 + for (i = ri->entry; i < ls->ls_rsbtbl_size; i++) { 152 + read_lock(&ls->ls_rsbtbl[i].lock); 153 + if (!list_empty(&ls->ls_rsbtbl[i].list)) { 154 + ri->next = ls->ls_rsbtbl[i].list.next; 155 + read_unlock(&ls->ls_rsbtbl[i].lock); 156 + break; 157 + } 158 + read_unlock(&ls->ls_rsbtbl[i].lock); 159 + } 160 + ri->entry = i; 161 + 162 + if (ri->entry >= ls->ls_rsbtbl_size) 163 + return 1; 164 + } else { 165 + i = ri->entry; 166 + read_lock(&ls->ls_rsbtbl[i].lock); 167 + ri->next = ri->next->next; 168 + if (ri->next->next == ls->ls_rsbtbl[i].list.next) { 169 + /* End of list - move to next bucket */ 170 + ri->next = NULL; 171 + ri->entry++; 172 + read_unlock(&ls->ls_rsbtbl[i].lock); 173 + goto top; 174 + } 175 + read_unlock(&ls->ls_rsbtbl[i].lock); 176 + } 177 + ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain); 178 + 179 + return 0; 180 + } 181 + 182 + static void rsb_iter_free(struct rsb_iter *ri) 183 + { 184 + kfree(ri); 185 + } 186 + 187 + static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls) 188 + { 189 + struct rsb_iter *ri; 190 + 191 + ri = kmalloc(sizeof *ri, GFP_KERNEL); 192 + if (!ri) 193 + return NULL; 194 + 195 + ri->ls = ls; 196 + ri->entry = 0; 197 + ri->next = NULL; 198 + 199 + if (rsb_iter_next(ri)) { 200 + rsb_iter_free(ri); 201 + return NULL; 202 + } 203 + 204 + return ri; 205 + } 206 + 207 + static void *seq_start(struct seq_file *file, loff_t *pos) 208 + { 209 + struct rsb_iter *ri; 210 + loff_t n = *pos; 211 + 212 + ri 
= rsb_iter_init(file->private); 213 + if (!ri) 214 + return NULL; 215 + 216 + while (n--) { 217 + if (rsb_iter_next(ri)) { 218 + rsb_iter_free(ri); 219 + return NULL; 220 + } 221 + } 222 + 223 + return ri; 224 + } 225 + 226 + static void *seq_next(struct seq_file *file, void *iter_ptr, loff_t *pos) 227 + { 228 + struct rsb_iter *ri = iter_ptr; 229 + 230 + (*pos)++; 231 + 232 + if (rsb_iter_next(ri)) { 233 + rsb_iter_free(ri); 234 + return NULL; 235 + } 236 + 237 + return ri; 238 + } 239 + 240 + static void seq_stop(struct seq_file *file, void *iter_ptr) 241 + { 242 + /* nothing for now */ 243 + } 244 + 245 + static int seq_show(struct seq_file *file, void *iter_ptr) 246 + { 247 + struct rsb_iter *ri = iter_ptr; 248 + 249 + print_resource(ri->rsb, file); 250 + 251 + return 0; 252 + } 253 + 254 + static struct seq_operations dlm_seq_ops = { 255 + .start = seq_start, 256 + .next = seq_next, 257 + .stop = seq_stop, 258 + .show = seq_show, 259 + }; 260 + 261 + static int do_open(struct inode *inode, struct file *file) 262 + { 263 + struct seq_file *seq; 264 + int ret; 265 + 266 + ret = seq_open(file, &dlm_seq_ops); 267 + if (ret) 268 + return ret; 269 + 270 + seq = file->private_data; 271 + seq->private = inode->u.generic_ip; 272 + 273 + return 0; 274 + } 275 + 276 + static struct file_operations dlm_fops = { 277 + .owner = THIS_MODULE, 278 + .open = do_open, 279 + .read = seq_read, 280 + .llseek = seq_lseek, 281 + .release = seq_release 282 + }; 283 + 284 + int dlm_create_debug_file(struct dlm_ls *ls) 285 + { 286 + ls->ls_debug_dentry = debugfs_create_file(ls->ls_name, 287 + S_IFREG | S_IRUGO, 288 + dlm_root, 289 + ls, 290 + &dlm_fops); 291 + return ls->ls_debug_dentry ? 0 : -ENOMEM; 292 + } 293 + 294 + void dlm_delete_debug_file(struct dlm_ls *ls) 295 + { 296 + if (ls->ls_debug_dentry) 297 + debugfs_remove(ls->ls_debug_dentry); 298 + } 299 + 300 + int dlm_register_debugfs(void) 301 + { 302 + dlm_root = debugfs_create_dir("dlm", NULL); 303 + return dlm_root ? 
0 : -ENOMEM; 304 + } 305 + 306 + void dlm_unregister_debugfs(void) 307 + { 308 + debugfs_remove(dlm_root); 309 + } 310 +
+1084
fs/dlm/device.c
··· 1 + /****************************************************************************** 2 + ******************************************************************************* 3 + ** 4 + ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 5 + ** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 6 + ** 7 + ** This copyrighted material is made available to anyone wishing to use, 8 + ** modify, copy, or redistribute it subject to the terms and conditions 9 + ** of the GNU General Public License v.2. 10 + ** 11 + ******************************************************************************* 12 + ******************************************************************************/ 13 + 14 + /* 15 + * device.c 16 + * 17 + * This is the userland interface to the DLM. 18 + * 19 + * The locking is done via a misc char device (find the 20 + * registered minor number in /proc/misc). 21 + * 22 + * User code should not use this interface directly but 23 + * call the library routines in libdlm.a instead. 
24 + * 25 + */ 26 + 27 + #include <linux/miscdevice.h> 28 + #include <linux/init.h> 29 + #include <linux/wait.h> 30 + #include <linux/module.h> 31 + #include <linux/file.h> 32 + #include <linux/fs.h> 33 + #include <linux/poll.h> 34 + #include <linux/signal.h> 35 + #include <linux/spinlock.h> 36 + #include <linux/idr.h> 37 + 38 + #include <linux/dlm.h> 39 + #include <linux/dlm_device.h> 40 + 41 + #include "lvb_table.h" 42 + 43 + static struct file_operations _dlm_fops; 44 + static const char *name_prefix="dlm"; 45 + static struct list_head user_ls_list; 46 + static struct semaphore user_ls_lock; 47 + 48 + /* Lock infos are stored in here indexed by lock ID */ 49 + static DEFINE_IDR(lockinfo_idr); 50 + static rwlock_t lockinfo_lock; 51 + 52 + /* Flags in li_flags */ 53 + #define LI_FLAG_COMPLETE 1 54 + #define LI_FLAG_FIRSTLOCK 2 55 + #define LI_FLAG_PERSISTENT 3 56 + 57 + /* flags in ls_flags*/ 58 + #define LS_FLAG_DELETED 1 59 + #define LS_FLAG_AUTOFREE 2 60 + 61 + 62 + #define LOCKINFO_MAGIC 0x53595324 63 + 64 + struct lock_info { 65 + uint32_t li_magic; 66 + uint8_t li_cmd; 67 + int8_t li_grmode; 68 + int8_t li_rqmode; 69 + struct dlm_lksb li_lksb; 70 + wait_queue_head_t li_waitq; 71 + unsigned long li_flags; 72 + void __user *li_castparam; 73 + void __user *li_castaddr; 74 + void __user *li_bastparam; 75 + void __user *li_bastaddr; 76 + void __user *li_pend_bastparam; 77 + void __user *li_pend_bastaddr; 78 + struct list_head li_ownerqueue; 79 + struct file_info *li_file; 80 + struct dlm_lksb __user *li_user_lksb; 81 + struct semaphore li_firstlock; 82 + }; 83 + 84 + /* A queued AST no less */ 85 + struct ast_info { 86 + struct dlm_lock_result result; 87 + struct list_head list; 88 + uint32_t lvb_updated; 89 + uint32_t progress; /* How much has been read */ 90 + }; 91 + 92 + /* One of these per userland lockspace */ 93 + struct user_ls { 94 + void *ls_lockspace; 95 + atomic_t ls_refcnt; 96 + long ls_flags; 97 + 98 + /* Passed into misc_register() */ 99 + struct 
miscdevice ls_miscinfo; 100 + struct list_head ls_list; 101 + }; 102 + 103 + /* misc_device info for the control device */ 104 + static struct miscdevice ctl_device; 105 + 106 + /* 107 + * Stuff we hang off the file struct. 108 + * The first two are to cope with unlocking all the 109 + * locks help by a process when it dies. 110 + */ 111 + struct file_info { 112 + struct list_head fi_li_list; /* List of active lock_infos */ 113 + spinlock_t fi_li_lock; 114 + struct list_head fi_ast_list; /* Queue of ASTs to be delivered */ 115 + spinlock_t fi_ast_lock; 116 + wait_queue_head_t fi_wait; 117 + struct user_ls *fi_ls; 118 + atomic_t fi_refcnt; /* Number of users */ 119 + unsigned long fi_flags; /* Bit 1 means the device is open */ 120 + }; 121 + 122 + 123 + /* get and put ops for file_info. 124 + Actually I don't really like "get" and "put", but everyone 125 + else seems to use them and I can't think of anything 126 + nicer at the moment */ 127 + static void get_file_info(struct file_info *f) 128 + { 129 + atomic_inc(&f->fi_refcnt); 130 + } 131 + 132 + static void put_file_info(struct file_info *f) 133 + { 134 + if (atomic_dec_and_test(&f->fi_refcnt)) 135 + kfree(f); 136 + } 137 + 138 + static void release_lockinfo(struct lock_info *li) 139 + { 140 + put_file_info(li->li_file); 141 + 142 + write_lock(&lockinfo_lock); 143 + idr_remove(&lockinfo_idr, li->li_lksb.sb_lkid); 144 + write_unlock(&lockinfo_lock); 145 + 146 + if (li->li_lksb.sb_lvbptr) 147 + kfree(li->li_lksb.sb_lvbptr); 148 + kfree(li); 149 + 150 + module_put(THIS_MODULE); 151 + } 152 + 153 + static struct lock_info *get_lockinfo(uint32_t lockid) 154 + { 155 + struct lock_info *li; 156 + 157 + read_lock(&lockinfo_lock); 158 + li = idr_find(&lockinfo_idr, lockid); 159 + read_unlock(&lockinfo_lock); 160 + 161 + return li; 162 + } 163 + 164 + static int add_lockinfo(struct lock_info *li) 165 + { 166 + int n; 167 + int r; 168 + int ret = -EINVAL; 169 + 170 + write_lock(&lockinfo_lock); 171 + 172 + if 
(idr_find(&lockinfo_idr, li->li_lksb.sb_lkid)) 173 + goto out_up; 174 + 175 + ret = -ENOMEM; 176 + r = idr_pre_get(&lockinfo_idr, GFP_KERNEL); 177 + if (!r) 178 + goto out_up; 179 + 180 + r = idr_get_new_above(&lockinfo_idr, li, li->li_lksb.sb_lkid, &n); 181 + if (r) 182 + goto out_up; 183 + 184 + if (n != li->li_lksb.sb_lkid) { 185 + idr_remove(&lockinfo_idr, n); 186 + goto out_up; 187 + } 188 + 189 + ret = 0; 190 + 191 + out_up: 192 + write_unlock(&lockinfo_lock); 193 + 194 + return ret; 195 + } 196 + 197 + 198 + static struct user_ls *__find_lockspace(int minor) 199 + { 200 + struct user_ls *lsinfo; 201 + 202 + list_for_each_entry(lsinfo, &user_ls_list, ls_list) { 203 + if (lsinfo->ls_miscinfo.minor == minor) 204 + return lsinfo; 205 + } 206 + return NULL; 207 + } 208 + 209 + /* Find a lockspace struct given the device minor number */ 210 + static struct user_ls *find_lockspace(int minor) 211 + { 212 + struct user_ls *lsinfo; 213 + 214 + down(&user_ls_lock); 215 + lsinfo = __find_lockspace(minor); 216 + up(&user_ls_lock); 217 + 218 + return lsinfo; 219 + } 220 + 221 + static void add_lockspace_to_list(struct user_ls *lsinfo) 222 + { 223 + down(&user_ls_lock); 224 + list_add(&lsinfo->ls_list, &user_ls_list); 225 + up(&user_ls_lock); 226 + } 227 + 228 + /* Register a lockspace with the DLM and create a misc 229 + device for userland to access it */ 230 + static int register_lockspace(char *name, struct user_ls **ls, int flags) 231 + { 232 + struct user_ls *newls; 233 + int status; 234 + int namelen; 235 + 236 + namelen = strlen(name)+strlen(name_prefix)+2; 237 + 238 + newls = kmalloc(sizeof(struct user_ls), GFP_KERNEL); 239 + if (!newls) 240 + return -ENOMEM; 241 + memset(newls, 0, sizeof(struct user_ls)); 242 + 243 + newls->ls_miscinfo.name = kmalloc(namelen, GFP_KERNEL); 244 + if (!newls->ls_miscinfo.name) { 245 + kfree(newls); 246 + return -ENOMEM; 247 + } 248 + 249 + status = dlm_new_lockspace(name, strlen(name), &newls->ls_lockspace, 0, 250 + 
DLM_USER_LVB_LEN); 251 + if (status != 0) { 252 + kfree(newls->ls_miscinfo.name); 253 + kfree(newls); 254 + return status; 255 + } 256 + 257 + snprintf((char*)newls->ls_miscinfo.name, namelen, "%s_%s", 258 + name_prefix, name); 259 + 260 + newls->ls_miscinfo.fops = &_dlm_fops; 261 + newls->ls_miscinfo.minor = MISC_DYNAMIC_MINOR; 262 + 263 + status = misc_register(&newls->ls_miscinfo); 264 + if (status) { 265 + printk(KERN_ERR "dlm: misc register failed for %s\n", name); 266 + dlm_release_lockspace(newls->ls_lockspace, 0); 267 + kfree(newls->ls_miscinfo.name); 268 + kfree(newls); 269 + return status; 270 + } 271 + 272 + if (flags & DLM_USER_LSFLG_AUTOFREE) 273 + set_bit(LS_FLAG_AUTOFREE, &newls->ls_flags); 274 + 275 + add_lockspace_to_list(newls); 276 + *ls = newls; 277 + return 0; 278 + } 279 + 280 + /* Called with the user_ls_lock semaphore held */ 281 + static int unregister_lockspace(struct user_ls *lsinfo, int force) 282 + { 283 + int status; 284 + 285 + status = dlm_release_lockspace(lsinfo->ls_lockspace, force); 286 + if (status) 287 + return status; 288 + 289 + status = misc_deregister(&lsinfo->ls_miscinfo); 290 + if (status) 291 + return status; 292 + 293 + list_del(&lsinfo->ls_list); 294 + set_bit(LS_FLAG_DELETED, &lsinfo->ls_flags); 295 + lsinfo->ls_lockspace = NULL; 296 + if (atomic_read(&lsinfo->ls_refcnt) == 0) { 297 + kfree(lsinfo->ls_miscinfo.name); 298 + kfree(lsinfo); 299 + } 300 + 301 + return 0; 302 + } 303 + 304 + /* Add it to userland's AST queue */ 305 + static void add_to_astqueue(struct lock_info *li, void *astaddr, void *astparam, 306 + int lvb_updated) 307 + { 308 + struct ast_info *ast = kmalloc(sizeof(struct ast_info), GFP_KERNEL); 309 + if (!ast) 310 + return; 311 + 312 + memset(ast, 0, sizeof(*ast)); 313 + ast->result.user_astparam = astparam; 314 + ast->result.user_astaddr = astaddr; 315 + ast->result.user_lksb = li->li_user_lksb; 316 + memcpy(&ast->result.lksb, &li->li_lksb, sizeof(struct dlm_lksb)); 317 + ast->lvb_updated = 
lvb_updated; 318 + 319 + spin_lock(&li->li_file->fi_ast_lock); 320 + list_add_tail(&ast->list, &li->li_file->fi_ast_list); 321 + spin_unlock(&li->li_file->fi_ast_lock); 322 + wake_up_interruptible(&li->li_file->fi_wait); 323 + } 324 + 325 + static void bast_routine(void *param, int mode) 326 + { 327 + struct lock_info *li = param; 328 + 329 + if (li && li->li_bastaddr) 330 + add_to_astqueue(li, li->li_bastaddr, li->li_bastparam, 0); 331 + } 332 + 333 + /* 334 + * This is the kernel's AST routine. 335 + * All lock, unlock & query operations complete here. 336 + * The only syncronous ops are those done during device close. 337 + */ 338 + static void ast_routine(void *param) 339 + { 340 + struct lock_info *li = param; 341 + 342 + /* Param may be NULL if a persistent lock is unlocked by someone else */ 343 + if (!li) 344 + return; 345 + 346 + /* If this is a succesful conversion then activate the blocking ast 347 + * args from the conversion request */ 348 + if (!test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) && 349 + li->li_lksb.sb_status == 0) { 350 + 351 + li->li_bastparam = li->li_pend_bastparam; 352 + li->li_bastaddr = li->li_pend_bastaddr; 353 + li->li_pend_bastaddr = NULL; 354 + } 355 + 356 + /* If it's an async request then post data to the user's AST queue. */ 357 + if (li->li_castaddr) { 358 + int lvb_updated = 0; 359 + 360 + /* See if the lvb has been updated */ 361 + if (dlm_lvb_operations[li->li_grmode+1][li->li_rqmode+1] == 1) 362 + lvb_updated = 1; 363 + 364 + if (li->li_lksb.sb_status == 0) 365 + li->li_grmode = li->li_rqmode; 366 + 367 + /* Only queue AST if the device is still open */ 368 + if (test_bit(1, &li->li_file->fi_flags)) 369 + add_to_astqueue(li, li->li_castaddr, li->li_castparam, 370 + lvb_updated); 371 + 372 + /* If it's a new lock operation that failed, then 373 + * remove it from the owner queue and free the 374 + * lock_info. 
375 + */ 376 + if (test_and_clear_bit(LI_FLAG_FIRSTLOCK, &li->li_flags) && 377 + li->li_lksb.sb_status != 0) { 378 + 379 + /* Wait till dlm_lock() has finished */ 380 + down(&li->li_firstlock); 381 + up(&li->li_firstlock); 382 + 383 + spin_lock(&li->li_file->fi_li_lock); 384 + list_del(&li->li_ownerqueue); 385 + spin_unlock(&li->li_file->fi_li_lock); 386 + release_lockinfo(li); 387 + return; 388 + } 389 + /* Free unlocks & queries */ 390 + if (li->li_lksb.sb_status == -DLM_EUNLOCK || 391 + li->li_cmd == DLM_USER_QUERY) { 392 + release_lockinfo(li); 393 + } 394 + } else { 395 + /* Synchronous request, just wake up the caller */ 396 + set_bit(LI_FLAG_COMPLETE, &li->li_flags); 397 + wake_up_interruptible(&li->li_waitq); 398 + } 399 + } 400 + 401 + /* 402 + * Wait for the lock op to complete and return the status. 403 + */ 404 + static int wait_for_ast(struct lock_info *li) 405 + { 406 + /* Wait for the AST routine to complete */ 407 + set_task_state(current, TASK_INTERRUPTIBLE); 408 + while (!test_bit(LI_FLAG_COMPLETE, &li->li_flags)) 409 + schedule(); 410 + 411 + set_task_state(current, TASK_RUNNING); 412 + 413 + return li->li_lksb.sb_status; 414 + } 415 + 416 + 417 + /* Open on control device */ 418 + static int dlm_ctl_open(struct inode *inode, struct file *file) 419 + { 420 + file->private_data = NULL; 421 + return 0; 422 + } 423 + 424 + /* Close on control device */ 425 + static int dlm_ctl_close(struct inode *inode, struct file *file) 426 + { 427 + return 0; 428 + } 429 + 430 + /* Open on lockspace device */ 431 + static int dlm_open(struct inode *inode, struct file *file) 432 + { 433 + struct file_info *f; 434 + struct user_ls *lsinfo; 435 + 436 + lsinfo = find_lockspace(iminor(inode)); 437 + if (!lsinfo) 438 + return -ENOENT; 439 + 440 + f = kmalloc(sizeof(struct file_info), GFP_KERNEL); 441 + if (!f) 442 + return -ENOMEM; 443 + 444 + atomic_inc(&lsinfo->ls_refcnt); 445 + INIT_LIST_HEAD(&f->fi_li_list); 446 + INIT_LIST_HEAD(&f->fi_ast_list); 447 + 
spin_lock_init(&f->fi_li_lock); 448 + spin_lock_init(&f->fi_ast_lock); 449 + init_waitqueue_head(&f->fi_wait); 450 + f->fi_ls = lsinfo; 451 + f->fi_flags = 0; 452 + get_file_info(f); 453 + set_bit(1, &f->fi_flags); 454 + 455 + file->private_data = f; 456 + 457 + return 0; 458 + } 459 + 460 + /* Check the user's version matches ours */ 461 + static int check_version(struct dlm_write_request *req) 462 + { 463 + if (req->version[0] != DLM_DEVICE_VERSION_MAJOR || 464 + (req->version[0] == DLM_DEVICE_VERSION_MAJOR && 465 + req->version[1] > DLM_DEVICE_VERSION_MINOR)) { 466 + 467 + printk(KERN_DEBUG "dlm: process %s (%d) version mismatch " 468 + "user (%d.%d.%d) kernel (%d.%d.%d)\n", 469 + current->comm, 470 + current->pid, 471 + req->version[0], 472 + req->version[1], 473 + req->version[2], 474 + DLM_DEVICE_VERSION_MAJOR, 475 + DLM_DEVICE_VERSION_MINOR, 476 + DLM_DEVICE_VERSION_PATCH); 477 + return -EINVAL; 478 + } 479 + return 0; 480 + } 481 + 482 + /* Close on lockspace device */ 483 + static int dlm_close(struct inode *inode, struct file *file) 484 + { 485 + struct file_info *f = file->private_data; 486 + struct lock_info li; 487 + struct lock_info *old_li, *safe; 488 + sigset_t tmpsig; 489 + sigset_t allsigs; 490 + struct user_ls *lsinfo; 491 + DECLARE_WAITQUEUE(wq, current); 492 + 493 + lsinfo = find_lockspace(iminor(inode)); 494 + if (!lsinfo) 495 + return -ENOENT; 496 + 497 + /* Mark this closed so that ASTs will not be delivered any more */ 498 + clear_bit(1, &f->fi_flags); 499 + 500 + /* Block signals while we are doing this */ 501 + sigfillset(&allsigs); 502 + sigprocmask(SIG_BLOCK, &allsigs, &tmpsig); 503 + 504 + /* We use our own lock_info struct here, so that any 505 + * outstanding "real" ASTs will be delivered with the 506 + * corresponding "real" params, thus freeing the lock_info 507 + * that belongs the lock. 
This catches the corner case where 508 + * a lock is BUSY when we try to unlock it here 509 + */ 510 + memset(&li, 0, sizeof(li)); 511 + clear_bit(LI_FLAG_COMPLETE, &li.li_flags); 512 + init_waitqueue_head(&li.li_waitq); 513 + add_wait_queue(&li.li_waitq, &wq); 514 + 515 + /* 516 + * Free any outstanding locks, they are on the 517 + * list in LIFO order so there should be no problems 518 + * about unlocking parents before children. 519 + */ 520 + list_for_each_entry_safe(old_li, safe, &f->fi_li_list, li_ownerqueue) { 521 + int status; 522 + int flags = 0; 523 + 524 + /* Don't unlock persistent locks, just mark them orphaned */ 525 + if (test_bit(LI_FLAG_PERSISTENT, &old_li->li_flags)) { 526 + list_del(&old_li->li_ownerqueue); 527 + 528 + /* Update master copy */ 529 + /* TODO: Check locking core updates the local and 530 + remote ORPHAN flags */ 531 + li.li_lksb.sb_lkid = old_li->li_lksb.sb_lkid; 532 + status = dlm_lock(f->fi_ls->ls_lockspace, 533 + old_li->li_grmode, &li.li_lksb, 534 + DLM_LKF_CONVERT|DLM_LKF_ORPHAN, 535 + NULL, 0, 0, ast_routine, NULL, 536 + NULL, NULL); 537 + if (status != 0) 538 + printk("dlm: Error orphaning lock %x: %d\n", 539 + old_li->li_lksb.sb_lkid, status); 540 + 541 + /* But tidy our references in it */ 542 + release_lockinfo(old_li); 543 + continue; 544 + } 545 + 546 + clear_bit(LI_FLAG_COMPLETE, &li.li_flags); 547 + 548 + flags = DLM_LKF_FORCEUNLOCK; 549 + if (old_li->li_grmode >= DLM_LOCK_PW) 550 + flags |= DLM_LKF_IVVALBLK; 551 + 552 + status = dlm_unlock(f->fi_ls->ls_lockspace, 553 + old_li->li_lksb.sb_lkid, flags, 554 + &li.li_lksb, &li); 555 + 556 + /* Must wait for it to complete as the next lock could be its 557 + * parent */ 558 + if (status == 0) 559 + wait_for_ast(&li); 560 + 561 + /* Unlock suceeded, free the lock_info struct. 
*/ 562 + if (status == 0) 563 + release_lockinfo(old_li); 564 + } 565 + 566 + remove_wait_queue(&li.li_waitq, &wq); 567 + 568 + /* 569 + * If this is the last reference to the lockspace 570 + * then free the struct. If it's an AUTOFREE lockspace 571 + * then free the whole thing. 572 + */ 573 + down(&user_ls_lock); 574 + if (atomic_dec_and_test(&lsinfo->ls_refcnt)) { 575 + 576 + if (lsinfo->ls_lockspace) { 577 + if (test_bit(LS_FLAG_AUTOFREE, &lsinfo->ls_flags)) { 578 + unregister_lockspace(lsinfo, 1); 579 + } 580 + } else { 581 + kfree(lsinfo->ls_miscinfo.name); 582 + kfree(lsinfo); 583 + } 584 + } 585 + up(&user_ls_lock); 586 + put_file_info(f); 587 + 588 + /* Restore signals */ 589 + sigprocmask(SIG_SETMASK, &tmpsig, NULL); 590 + recalc_sigpending(); 591 + 592 + return 0; 593 + } 594 + 595 + static int do_user_create_lockspace(struct file_info *fi, uint8_t cmd, 596 + struct dlm_lspace_params *kparams) 597 + { 598 + int status; 599 + struct user_ls *lsinfo; 600 + 601 + if (!capable(CAP_SYS_ADMIN)) 602 + return -EPERM; 603 + 604 + status = register_lockspace(kparams->name, &lsinfo, kparams->flags); 605 + 606 + /* If it succeeded then return the minor number */ 607 + if (status == 0) 608 + status = lsinfo->ls_miscinfo.minor; 609 + 610 + return status; 611 + } 612 + 613 + static int do_user_remove_lockspace(struct file_info *fi, uint8_t cmd, 614 + struct dlm_lspace_params *kparams) 615 + { 616 + int status; 617 + int force = 1; 618 + struct user_ls *lsinfo; 619 + 620 + if (!capable(CAP_SYS_ADMIN)) 621 + return -EPERM; 622 + 623 + down(&user_ls_lock); 624 + lsinfo = __find_lockspace(kparams->minor); 625 + if (!lsinfo) { 626 + up(&user_ls_lock); 627 + return -EINVAL; 628 + } 629 + 630 + if (kparams->flags & DLM_USER_LSFLG_FORCEFREE) 631 + force = 2; 632 + 633 + status = unregister_lockspace(lsinfo, force); 634 + up(&user_ls_lock); 635 + 636 + return status; 637 + } 638 + 639 + /* Read call, might block if no ASTs are waiting. 
640 + * It will only ever return one message at a time, regardless 641 + * of how many are pending. 642 + */ 643 + static ssize_t dlm_read(struct file *file, char __user *buffer, size_t count, 644 + loff_t *ppos) 645 + { 646 + struct file_info *fi = file->private_data; 647 + struct ast_info *ast; 648 + int data_size; 649 + int offset; 650 + DECLARE_WAITQUEUE(wait, current); 651 + 652 + if (count < sizeof(struct dlm_lock_result)) 653 + return -EINVAL; 654 + 655 + spin_lock(&fi->fi_ast_lock); 656 + if (list_empty(&fi->fi_ast_list)) { 657 + 658 + /* No waiting ASTs. 659 + * Return EOF if the lockspace been deleted. 660 + */ 661 + if (test_bit(LS_FLAG_DELETED, &fi->fi_ls->ls_flags)) 662 + return 0; 663 + 664 + if (file->f_flags & O_NONBLOCK) { 665 + spin_unlock(&fi->fi_ast_lock); 666 + return -EAGAIN; 667 + } 668 + 669 + add_wait_queue(&fi->fi_wait, &wait); 670 + 671 + repeat: 672 + set_current_state(TASK_INTERRUPTIBLE); 673 + if (list_empty(&fi->fi_ast_list) && 674 + !signal_pending(current)) { 675 + 676 + spin_unlock(&fi->fi_ast_lock); 677 + schedule(); 678 + spin_lock(&fi->fi_ast_lock); 679 + goto repeat; 680 + } 681 + 682 + current->state = TASK_RUNNING; 683 + remove_wait_queue(&fi->fi_wait, &wait); 684 + 685 + if (signal_pending(current)) { 686 + spin_unlock(&fi->fi_ast_lock); 687 + return -ERESTARTSYS; 688 + } 689 + } 690 + 691 + ast = list_entry(fi->fi_ast_list.next, struct ast_info, list); 692 + list_del(&ast->list); 693 + spin_unlock(&fi->fi_ast_lock); 694 + 695 + /* Work out the size of the returned data */ 696 + data_size = sizeof(struct dlm_lock_result); 697 + if (ast->lvb_updated && ast->result.lksb.sb_lvbptr) 698 + data_size += DLM_USER_LVB_LEN; 699 + 700 + offset = sizeof(struct dlm_lock_result); 701 + 702 + /* Room for the extended data ? 
*/ 703 + if (count >= data_size) { 704 + 705 + if (ast->lvb_updated && ast->result.lksb.sb_lvbptr) { 706 + if (copy_to_user(buffer+offset, 707 + ast->result.lksb.sb_lvbptr, 708 + DLM_USER_LVB_LEN)) 709 + return -EFAULT; 710 + ast->result.lvb_offset = offset; 711 + offset += DLM_USER_LVB_LEN; 712 + } 713 + } 714 + 715 + ast->result.length = data_size; 716 + /* Copy the header now it has all the offsets in it */ 717 + if (copy_to_user(buffer, &ast->result, sizeof(struct dlm_lock_result))) 718 + offset = -EFAULT; 719 + 720 + /* If we only returned a header and there's more to come then put it 721 + back on the list */ 722 + if (count < data_size) { 723 + spin_lock(&fi->fi_ast_lock); 724 + list_add(&ast->list, &fi->fi_ast_list); 725 + spin_unlock(&fi->fi_ast_lock); 726 + } else 727 + kfree(ast); 728 + return offset; 729 + } 730 + 731 + static unsigned int dlm_poll(struct file *file, poll_table *wait) 732 + { 733 + struct file_info *fi = file->private_data; 734 + 735 + poll_wait(file, &fi->fi_wait, wait); 736 + 737 + spin_lock(&fi->fi_ast_lock); 738 + if (!list_empty(&fi->fi_ast_list)) { 739 + spin_unlock(&fi->fi_ast_lock); 740 + return POLLIN | POLLRDNORM; 741 + } 742 + 743 + spin_unlock(&fi->fi_ast_lock); 744 + return 0; 745 + } 746 + 747 + static struct lock_info *allocate_lockinfo(struct file_info *fi, uint8_t cmd, 748 + struct dlm_lock_params *kparams) 749 + { 750 + struct lock_info *li; 751 + 752 + if (!try_module_get(THIS_MODULE)) 753 + return NULL; 754 + 755 + li = kmalloc(sizeof(struct lock_info), GFP_KERNEL); 756 + if (li) { 757 + li->li_magic = LOCKINFO_MAGIC; 758 + li->li_file = fi; 759 + li->li_cmd = cmd; 760 + li->li_flags = 0; 761 + li->li_grmode = -1; 762 + li->li_rqmode = -1; 763 + li->li_pend_bastparam = NULL; 764 + li->li_pend_bastaddr = NULL; 765 + li->li_castaddr = NULL; 766 + li->li_castparam = NULL; 767 + li->li_lksb.sb_lvbptr = NULL; 768 + li->li_bastaddr = kparams->bastaddr; 769 + li->li_bastparam = kparams->bastparam; 770 + 771 + 
get_file_info(fi); 772 + } 773 + return li; 774 + } 775 + 776 + static int do_user_lock(struct file_info *fi, uint8_t cmd, 777 + struct dlm_lock_params *kparams) 778 + { 779 + struct lock_info *li; 780 + int status; 781 + 782 + /* 783 + * Validate things that we need to have correct. 784 + */ 785 + if (!kparams->castaddr) 786 + return -EINVAL; 787 + 788 + if (!kparams->lksb) 789 + return -EINVAL; 790 + 791 + /* Persistent child locks are not available yet */ 792 + if ((kparams->flags & DLM_LKF_PERSISTENT) && kparams->parent) 793 + return -EINVAL; 794 + 795 + /* For conversions, there should already be a lockinfo struct, 796 + unless we are adopting an orphaned persistent lock */ 797 + if (kparams->flags & DLM_LKF_CONVERT) { 798 + 799 + li = get_lockinfo(kparams->lkid); 800 + 801 + /* If this is a persistent lock we will have to create a 802 + lockinfo again */ 803 + if (!li && DLM_LKF_PERSISTENT) { 804 + li = allocate_lockinfo(fi, cmd, kparams); 805 + 806 + li->li_lksb.sb_lkid = kparams->lkid; 807 + li->li_castaddr = kparams->castaddr; 808 + li->li_castparam = kparams->castparam; 809 + 810 + /* OK, this isn;t exactly a FIRSTLOCK but it is the 811 + first time we've used this lockinfo, and if things 812 + fail we want rid of it */ 813 + init_MUTEX_LOCKED(&li->li_firstlock); 814 + set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags); 815 + add_lockinfo(li); 816 + 817 + /* TODO: do a query to get the current state ?? */ 818 + } 819 + if (!li) 820 + return -EINVAL; 821 + 822 + if (li->li_magic != LOCKINFO_MAGIC) 823 + return -EINVAL; 824 + 825 + /* For conversions don't overwrite the current blocking AST 826 + info so that: 827 + a) if a blocking AST fires before the conversion is queued 828 + it runs the current handler 829 + b) if the conversion is cancelled, the original blocking AST 830 + declaration is active 831 + The pend_ info is made active when the conversion 832 + completes. 
833 + */ 834 + li->li_pend_bastaddr = kparams->bastaddr; 835 + li->li_pend_bastparam = kparams->bastparam; 836 + } else { 837 + li = allocate_lockinfo(fi, cmd, kparams); 838 + if (!li) 839 + return -ENOMEM; 840 + 841 + /* semaphore to allow us to complete our work before 842 + the AST routine runs. In fact we only need (and use) this 843 + when the initial lock fails */ 844 + init_MUTEX_LOCKED(&li->li_firstlock); 845 + set_bit(LI_FLAG_FIRSTLOCK, &li->li_flags); 846 + } 847 + 848 + li->li_user_lksb = kparams->lksb; 849 + li->li_castaddr = kparams->castaddr; 850 + li->li_castparam = kparams->castparam; 851 + li->li_lksb.sb_lkid = kparams->lkid; 852 + li->li_rqmode = kparams->mode; 853 + if (kparams->flags & DLM_LKF_PERSISTENT) 854 + set_bit(LI_FLAG_PERSISTENT, &li->li_flags); 855 + 856 + /* Copy in the value block */ 857 + if (kparams->flags & DLM_LKF_VALBLK) { 858 + if (!li->li_lksb.sb_lvbptr) { 859 + li->li_lksb.sb_lvbptr = kmalloc(DLM_USER_LVB_LEN, 860 + GFP_KERNEL); 861 + if (!li->li_lksb.sb_lvbptr) { 862 + status = -ENOMEM; 863 + goto out_err; 864 + } 865 + } 866 + 867 + memcpy(li->li_lksb.sb_lvbptr, kparams->lvb, DLM_USER_LVB_LEN); 868 + } 869 + 870 + /* Lock it ... */ 871 + status = dlm_lock(fi->fi_ls->ls_lockspace, 872 + kparams->mode, &li->li_lksb, 873 + kparams->flags, 874 + kparams->name, kparams->namelen, 875 + kparams->parent, 876 + ast_routine, 877 + li, 878 + (li->li_pend_bastaddr || li->li_bastaddr) ? 879 + bast_routine : NULL, 880 + kparams->range.ra_end ? 
&kparams->range : NULL); 881 + if (status) 882 + goto out_err; 883 + 884 + /* If it succeeded (this far) with a new lock then keep track of 885 + it on the file's lockinfo list */ 886 + if (!status && test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags)) { 887 + 888 + spin_lock(&fi->fi_li_lock); 889 + list_add(&li->li_ownerqueue, &fi->fi_li_list); 890 + spin_unlock(&fi->fi_li_lock); 891 + if (add_lockinfo(li)) 892 + printk(KERN_WARNING "Add lockinfo failed\n"); 893 + 894 + up(&li->li_firstlock); 895 + } 896 + 897 + /* Return the lockid as the user needs it /now/ */ 898 + return li->li_lksb.sb_lkid; 899 + 900 + out_err: 901 + if (test_bit(LI_FLAG_FIRSTLOCK, &li->li_flags)) 902 + release_lockinfo(li); 903 + return status; 904 + 905 + } 906 + 907 + static int do_user_unlock(struct file_info *fi, uint8_t cmd, 908 + struct dlm_lock_params *kparams) 909 + { 910 + struct lock_info *li; 911 + int status; 912 + int convert_cancel = 0; 913 + 914 + li = get_lockinfo(kparams->lkid); 915 + if (!li) { 916 + li = allocate_lockinfo(fi, cmd, kparams); 917 + spin_lock(&fi->fi_li_lock); 918 + list_add(&li->li_ownerqueue, &fi->fi_li_list); 919 + spin_unlock(&fi->fi_li_lock); 920 + } 921 + if (!li) 922 + return -ENOMEM; 923 + 924 + if (li->li_magic != LOCKINFO_MAGIC) 925 + return -EINVAL; 926 + 927 + li->li_user_lksb = kparams->lksb; 928 + li->li_castparam = kparams->castparam; 929 + li->li_cmd = cmd; 930 + 931 + /* Cancelling a conversion doesn't remove the lock...*/ 932 + if (kparams->flags & DLM_LKF_CANCEL && li->li_grmode != -1) 933 + convert_cancel = 1; 934 + 935 + /* dlm_unlock() passes a 0 for castaddr which means don't overwrite 936 + the existing li_castaddr as that's the completion routine for 937 + unlocks. dlm_unlock_wait() specifies a new AST routine to be 938 + executed when the unlock completes. 
*/ 939 + if (kparams->castaddr) 940 + li->li_castaddr = kparams->castaddr; 941 + 942 + /* Use existing lksb & astparams */ 943 + status = dlm_unlock(fi->fi_ls->ls_lockspace, 944 + kparams->lkid, 945 + kparams->flags, &li->li_lksb, li); 946 + 947 + if (!status && !convert_cancel) { 948 + spin_lock(&fi->fi_li_lock); 949 + list_del(&li->li_ownerqueue); 950 + spin_unlock(&fi->fi_li_lock); 951 + } 952 + 953 + return status; 954 + } 955 + 956 + /* Write call, submit a locking request */ 957 + static ssize_t dlm_write(struct file *file, const char __user *buffer, 958 + size_t count, loff_t *ppos) 959 + { 960 + struct file_info *fi = file->private_data; 961 + struct dlm_write_request *kparams; 962 + sigset_t tmpsig; 963 + sigset_t allsigs; 964 + int status; 965 + 966 + /* -1 because lock name is optional */ 967 + if (count < sizeof(struct dlm_write_request)-1) 968 + return -EINVAL; 969 + 970 + /* Has the lockspace been deleted */ 971 + if (fi && test_bit(LS_FLAG_DELETED, &fi->fi_ls->ls_flags)) 972 + return -ENOENT; 973 + 974 + kparams = kmalloc(count, GFP_KERNEL); 975 + if (!kparams) 976 + return -ENOMEM; 977 + 978 + status = -EFAULT; 979 + /* Get the command info */ 980 + if (copy_from_user(kparams, buffer, count)) 981 + goto out_free; 982 + 983 + status = -EBADE; 984 + if (check_version(kparams)) 985 + goto out_free; 986 + 987 + /* Block signals while we are doing this */ 988 + sigfillset(&allsigs); 989 + sigprocmask(SIG_BLOCK, &allsigs, &tmpsig); 990 + 991 + status = -EINVAL; 992 + switch (kparams->cmd) 993 + { 994 + case DLM_USER_LOCK: 995 + if (!fi) goto out_sig; 996 + status = do_user_lock(fi, kparams->cmd, &kparams->i.lock); 997 + break; 998 + 999 + case DLM_USER_UNLOCK: 1000 + if (!fi) goto out_sig; 1001 + status = do_user_unlock(fi, kparams->cmd, &kparams->i.lock); 1002 + break; 1003 + 1004 + case DLM_USER_CREATE_LOCKSPACE: 1005 + if (fi) goto out_sig; 1006 + status = do_user_create_lockspace(fi, kparams->cmd, 1007 + &kparams->i.lspace); 1008 + break; 1009 + 1010 
+ case DLM_USER_REMOVE_LOCKSPACE: 1011 + if (fi) goto out_sig; 1012 + status = do_user_remove_lockspace(fi, kparams->cmd, 1013 + &kparams->i.lspace); 1014 + break; 1015 + default: 1016 + printk("Unknown command passed to DLM device : %d\n", 1017 + kparams->cmd); 1018 + break; 1019 + } 1020 + 1021 + out_sig: 1022 + /* Restore signals */ 1023 + sigprocmask(SIG_SETMASK, &tmpsig, NULL); 1024 + recalc_sigpending(); 1025 + 1026 + out_free: 1027 + kfree(kparams); 1028 + if (status == 0) 1029 + return count; 1030 + else 1031 + return status; 1032 + } 1033 + 1034 + static struct file_operations _dlm_fops = { 1035 + .open = dlm_open, 1036 + .release = dlm_close, 1037 + .read = dlm_read, 1038 + .write = dlm_write, 1039 + .poll = dlm_poll, 1040 + .owner = THIS_MODULE, 1041 + }; 1042 + 1043 + static struct file_operations _dlm_ctl_fops = { 1044 + .open = dlm_ctl_open, 1045 + .release = dlm_ctl_close, 1046 + .write = dlm_write, 1047 + .owner = THIS_MODULE, 1048 + }; 1049 + 1050 + /* 1051 + * Create control device 1052 + */ 1053 + static int __init dlm_device_init(void) 1054 + { 1055 + int r; 1056 + 1057 + INIT_LIST_HEAD(&user_ls_list); 1058 + init_MUTEX(&user_ls_lock); 1059 + rwlock_init(&lockinfo_lock); 1060 + 1061 + ctl_device.name = "dlm-control"; 1062 + ctl_device.fops = &_dlm_ctl_fops; 1063 + ctl_device.minor = MISC_DYNAMIC_MINOR; 1064 + 1065 + r = misc_register(&ctl_device); 1066 + if (r) { 1067 + printk(KERN_ERR "dlm: misc_register failed for control dev\n"); 1068 + return r; 1069 + } 1070 + 1071 + return 0; 1072 + } 1073 + 1074 + static void __exit dlm_device_exit(void) 1075 + { 1076 + misc_deregister(&ctl_device); 1077 + } 1078 + 1079 + MODULE_DESCRIPTION("Distributed Lock Manager device interface"); 1080 + MODULE_AUTHOR("Red Hat, Inc."); 1081 + MODULE_LICENSE("GPL"); 1082 + 1083 + module_init(dlm_device_init); 1084 + module_exit(dlm_device_exit);
+423
fs/dlm/dir.c
··· 1 + /****************************************************************************** 2 + ******************************************************************************* 3 + ** 4 + ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 5 + ** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 6 + ** 7 + ** This copyrighted material is made available to anyone wishing to use, 8 + ** modify, copy, or redistribute it subject to the terms and conditions 9 + ** of the GNU General Public License v.2. 10 + ** 11 + ******************************************************************************* 12 + ******************************************************************************/ 13 + 14 + #include "dlm_internal.h" 15 + #include "lockspace.h" 16 + #include "member.h" 17 + #include "lowcomms.h" 18 + #include "rcom.h" 19 + #include "config.h" 20 + #include "memory.h" 21 + #include "recover.h" 22 + #include "util.h" 23 + #include "lock.h" 24 + #include "dir.h" 25 + 26 + 27 + static void put_free_de(struct dlm_ls *ls, struct dlm_direntry *de) 28 + { 29 + spin_lock(&ls->ls_recover_list_lock); 30 + list_add(&de->list, &ls->ls_recover_list); 31 + spin_unlock(&ls->ls_recover_list_lock); 32 + } 33 + 34 + static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len) 35 + { 36 + int found = FALSE; 37 + struct dlm_direntry *de; 38 + 39 + spin_lock(&ls->ls_recover_list_lock); 40 + list_for_each_entry(de, &ls->ls_recover_list, list) { 41 + if (de->length == len) { 42 + list_del(&de->list); 43 + de->master_nodeid = 0; 44 + memset(de->name, 0, len); 45 + found = TRUE; 46 + break; 47 + } 48 + } 49 + spin_unlock(&ls->ls_recover_list_lock); 50 + 51 + if (!found) 52 + de = allocate_direntry(ls, len); 53 + return de; 54 + } 55 + 56 + void dlm_clear_free_entries(struct dlm_ls *ls) 57 + { 58 + struct dlm_direntry *de; 59 + 60 + spin_lock(&ls->ls_recover_list_lock); 61 + while (!list_empty(&ls->ls_recover_list)) { 62 + de = list_entry(ls->ls_recover_list.next, struct 
dlm_direntry, 63 + list); 64 + list_del(&de->list); 65 + free_direntry(de); 66 + } 67 + spin_unlock(&ls->ls_recover_list_lock); 68 + } 69 + 70 + /* 71 + * We use the upper 16 bits of the hash value to select the directory node. 72 + * Low bits are used for distribution of rsb's among hash buckets on each node. 73 + * 74 + * To give the exact range wanted (0 to num_nodes-1), we apply a modulus of 75 + * num_nodes to the hash value. This value in the desired range is used as an 76 + * offset into the sorted list of nodeid's to give the particular nodeid. 77 + */ 78 + 79 + int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash) 80 + { 81 + struct list_head *tmp; 82 + struct dlm_member *memb = NULL; 83 + uint32_t node, n = 0; 84 + int nodeid; 85 + 86 + if (ls->ls_num_nodes == 1) { 87 + nodeid = dlm_our_nodeid(); 88 + goto out; 89 + } 90 + 91 + if (ls->ls_node_array) { 92 + node = (hash >> 16) % ls->ls_total_weight; 93 + nodeid = ls->ls_node_array[node]; 94 + goto out; 95 + } 96 + 97 + /* make_member_array() failed to kmalloc ls_node_array... 
*/ 98 + 99 + node = (hash >> 16) % ls->ls_num_nodes; 100 + 101 + list_for_each(tmp, &ls->ls_nodes) { 102 + if (n++ != node) 103 + continue; 104 + memb = list_entry(tmp, struct dlm_member, list); 105 + break; 106 + } 107 + 108 + DLM_ASSERT(memb , printk("num_nodes=%u n=%u node=%u\n", 109 + ls->ls_num_nodes, n, node);); 110 + nodeid = memb->nodeid; 111 + out: 112 + return nodeid; 113 + } 114 + 115 + int dlm_dir_nodeid(struct dlm_rsb *r) 116 + { 117 + return dlm_hash2nodeid(r->res_ls, r->res_hash); 118 + } 119 + 120 + static inline uint32_t dir_hash(struct dlm_ls *ls, char *name, int len) 121 + { 122 + uint32_t val; 123 + 124 + val = jhash(name, len, 0); 125 + val &= (ls->ls_dirtbl_size - 1); 126 + 127 + return val; 128 + } 129 + 130 + static void add_entry_to_hash(struct dlm_ls *ls, struct dlm_direntry *de) 131 + { 132 + uint32_t bucket; 133 + 134 + bucket = dir_hash(ls, de->name, de->length); 135 + list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list); 136 + } 137 + 138 + static struct dlm_direntry *search_bucket(struct dlm_ls *ls, char *name, 139 + int namelen, uint32_t bucket) 140 + { 141 + struct dlm_direntry *de; 142 + 143 + list_for_each_entry(de, &ls->ls_dirtbl[bucket].list, list) { 144 + if (de->length == namelen && !memcmp(name, de->name, namelen)) 145 + goto out; 146 + } 147 + de = NULL; 148 + out: 149 + return de; 150 + } 151 + 152 + void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int namelen) 153 + { 154 + struct dlm_direntry *de; 155 + uint32_t bucket; 156 + 157 + bucket = dir_hash(ls, name, namelen); 158 + 159 + write_lock(&ls->ls_dirtbl[bucket].lock); 160 + 161 + de = search_bucket(ls, name, namelen, bucket); 162 + 163 + if (!de) { 164 + log_error(ls, "remove fr %u none", nodeid); 165 + goto out; 166 + } 167 + 168 + if (de->master_nodeid != nodeid) { 169 + log_error(ls, "remove fr %u ID %u", nodeid, de->master_nodeid); 170 + goto out; 171 + } 172 + 173 + list_del(&de->list); 174 + free_direntry(de); 175 + out: 176 + 
write_unlock(&ls->ls_dirtbl[bucket].lock); 177 + } 178 + 179 + void dlm_dir_clear(struct dlm_ls *ls) 180 + { 181 + struct list_head *head; 182 + struct dlm_direntry *de; 183 + int i; 184 + 185 + DLM_ASSERT(list_empty(&ls->ls_recover_list), ); 186 + 187 + for (i = 0; i < ls->ls_dirtbl_size; i++) { 188 + write_lock(&ls->ls_dirtbl[i].lock); 189 + head = &ls->ls_dirtbl[i].list; 190 + while (!list_empty(head)) { 191 + de = list_entry(head->next, struct dlm_direntry, list); 192 + list_del(&de->list); 193 + put_free_de(ls, de); 194 + } 195 + write_unlock(&ls->ls_dirtbl[i].lock); 196 + } 197 + } 198 + 199 + int dlm_recover_directory(struct dlm_ls *ls) 200 + { 201 + struct dlm_member *memb; 202 + struct dlm_direntry *de; 203 + char *b, *last_name = NULL; 204 + int error = -ENOMEM, last_len, count = 0; 205 + uint16_t namelen; 206 + 207 + log_debug(ls, "dlm_recover_directory"); 208 + 209 + if (dlm_no_directory(ls)) 210 + goto out_status; 211 + 212 + dlm_dir_clear(ls); 213 + 214 + last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_KERNEL); 215 + if (!last_name) 216 + goto out; 217 + 218 + list_for_each_entry(memb, &ls->ls_nodes, list) { 219 + memset(last_name, 0, DLM_RESNAME_MAXLEN); 220 + last_len = 0; 221 + 222 + for (;;) { 223 + error = dlm_recovery_stopped(ls); 224 + if (error) 225 + goto out_free; 226 + 227 + error = dlm_rcom_names(ls, memb->nodeid, 228 + last_name, last_len); 229 + if (error) 230 + goto out_free; 231 + 232 + schedule(); 233 + 234 + /* 235 + * pick namelen/name pairs out of received buffer 236 + */ 237 + 238 + b = ls->ls_recover_buf + sizeof(struct dlm_rcom); 239 + 240 + for (;;) { 241 + memcpy(&namelen, b, sizeof(uint16_t)); 242 + namelen = be16_to_cpu(namelen); 243 + b += sizeof(uint16_t); 244 + 245 + /* namelen of 0xFFFFF marks end of names for 246 + this node; namelen of 0 marks end of the 247 + buffer */ 248 + 249 + if (namelen == 0xFFFF) 250 + goto done; 251 + if (!namelen) 252 + break; 253 + 254 + error = -ENOMEM; 255 + de = get_free_de(ls, namelen); 256 
+ if (!de) 257 + goto out_free; 258 + 259 + de->master_nodeid = memb->nodeid; 260 + de->length = namelen; 261 + last_len = namelen; 262 + memcpy(de->name, b, namelen); 263 + memcpy(last_name, b, namelen); 264 + b += namelen; 265 + 266 + add_entry_to_hash(ls, de); 267 + count++; 268 + } 269 + } 270 + done: 271 + ; 272 + } 273 + 274 + out_status: 275 + error = 0; 276 + dlm_set_recover_status(ls, DLM_RS_DIR); 277 + log_debug(ls, "dlm_recover_directory %d entries", count); 278 + out_free: 279 + kfree(last_name); 280 + out: 281 + dlm_clear_free_entries(ls); 282 + return error; 283 + } 284 + 285 + static int get_entry(struct dlm_ls *ls, int nodeid, char *name, 286 + int namelen, int *r_nodeid) 287 + { 288 + struct dlm_direntry *de, *tmp; 289 + uint32_t bucket; 290 + 291 + bucket = dir_hash(ls, name, namelen); 292 + 293 + write_lock(&ls->ls_dirtbl[bucket].lock); 294 + de = search_bucket(ls, name, namelen, bucket); 295 + if (de) { 296 + *r_nodeid = de->master_nodeid; 297 + write_unlock(&ls->ls_dirtbl[bucket].lock); 298 + if (*r_nodeid == nodeid) 299 + return -EEXIST; 300 + return 0; 301 + } 302 + 303 + write_unlock(&ls->ls_dirtbl[bucket].lock); 304 + 305 + de = allocate_direntry(ls, namelen); 306 + if (!de) 307 + return -ENOMEM; 308 + 309 + de->master_nodeid = nodeid; 310 + de->length = namelen; 311 + memcpy(de->name, name, namelen); 312 + 313 + write_lock(&ls->ls_dirtbl[bucket].lock); 314 + tmp = search_bucket(ls, name, namelen, bucket); 315 + if (tmp) { 316 + free_direntry(de); 317 + de = tmp; 318 + } else { 319 + list_add_tail(&de->list, &ls->ls_dirtbl[bucket].list); 320 + } 321 + *r_nodeid = de->master_nodeid; 322 + write_unlock(&ls->ls_dirtbl[bucket].lock); 323 + return 0; 324 + } 325 + 326 + int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen, 327 + int *r_nodeid) 328 + { 329 + return get_entry(ls, nodeid, name, namelen, r_nodeid); 330 + } 331 + 332 + /* Copy the names of master rsb's into the buffer provided. 
333 + Only select names whose dir node is the given nodeid. */ 334 + 335 + void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, 336 + char *outbuf, int outlen, int nodeid) 337 + { 338 + struct list_head *list; 339 + struct dlm_rsb *start_r = NULL, *r = NULL; 340 + int offset = 0, start_namelen, error, dir_nodeid; 341 + char *start_name; 342 + uint16_t be_namelen; 343 + 344 + /* 345 + * Find the rsb where we left off (or start again) 346 + */ 347 + 348 + start_namelen = inlen; 349 + start_name = inbuf; 350 + 351 + if (start_namelen > 1) { 352 + /* 353 + * We could also use a find_rsb_root() function here that 354 + * searched the ls_root_list. 355 + */ 356 + error = dlm_find_rsb(ls, start_name, start_namelen, R_MASTER, 357 + &start_r); 358 + DLM_ASSERT(!error && start_r, 359 + printk("error %d\n", error);); 360 + DLM_ASSERT(!list_empty(&start_r->res_root_list), 361 + dlm_print_rsb(start_r);); 362 + dlm_put_rsb(start_r); 363 + } 364 + 365 + /* 366 + * Send rsb names for rsb's we're master of and whose directory node 367 + * matches the requesting node. 
368 + */ 369 + 370 + down_read(&ls->ls_root_sem); 371 + if (start_r) 372 + list = start_r->res_root_list.next; 373 + else 374 + list = ls->ls_root_list.next; 375 + 376 + for (offset = 0; list != &ls->ls_root_list; list = list->next) { 377 + r = list_entry(list, struct dlm_rsb, res_root_list); 378 + if (r->res_nodeid) 379 + continue; 380 + 381 + dir_nodeid = dlm_dir_nodeid(r); 382 + if (dir_nodeid != nodeid) 383 + continue; 384 + 385 + /* 386 + * The block ends when we can't fit the following in the 387 + * remaining buffer space: 388 + * namelen (uint16_t) + 389 + * name (r->res_length) + 390 + * end-of-block record 0x0000 (uint16_t) 391 + */ 392 + 393 + if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) { 394 + /* Write end-of-block record */ 395 + be_namelen = 0; 396 + memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t)); 397 + offset += sizeof(uint16_t); 398 + goto out; 399 + } 400 + 401 + be_namelen = cpu_to_be16(r->res_length); 402 + memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t)); 403 + offset += sizeof(uint16_t); 404 + memcpy(outbuf + offset, r->res_name, r->res_length); 405 + offset += r->res_length; 406 + } 407 + 408 + /* 409 + * If we've reached the end of the list (and there's room) write a 410 + * terminating record. 411 + */ 412 + 413 + if ((list == &ls->ls_root_list) && 414 + (offset + sizeof(uint16_t) <= outlen)) { 415 + be_namelen = 0xFFFF; 416 + memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t)); 417 + offset += sizeof(uint16_t); 418 + } 419 + 420 + out: 421 + up_read(&ls->ls_root_sem); 422 + } 423 +
+30
fs/dlm/dir.h
··· 1 + /****************************************************************************** 2 + ******************************************************************************* 3 + ** 4 + ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 5 + ** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 6 + ** 7 + ** This copyrighted material is made available to anyone wishing to use, 8 + ** modify, copy, or redistribute it subject to the terms and conditions 9 + ** of the GNU General Public License v.2. 10 + ** 11 + ******************************************************************************* 12 + ******************************************************************************/ 13 + 14 + #ifndef __DIR_DOT_H__ 15 + #define __DIR_DOT_H__ 16 + 17 + 18 + int dlm_dir_nodeid(struct dlm_rsb *rsb); 19 + int dlm_hash2nodeid(struct dlm_ls *ls, uint32_t hash); 20 + void dlm_dir_remove_entry(struct dlm_ls *ls, int nodeid, char *name, int len); 21 + void dlm_dir_clear(struct dlm_ls *ls); 22 + void dlm_clear_free_entries(struct dlm_ls *ls); 23 + int dlm_recover_directory(struct dlm_ls *ls); 24 + int dlm_dir_lookup(struct dlm_ls *ls, int nodeid, char *name, int namelen, 25 + int *r_nodeid); 26 + void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, 27 + char *outbuf, int outlen, int nodeid); 28 + 29 + #endif /* __DIR_DOT_H__ */ 30 +
+518
fs/dlm/dlm_internal.h
··· 1 + /****************************************************************************** 2 + ******************************************************************************* 3 + ** 4 + ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 5 + ** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 6 + ** 7 + ** This copyrighted material is made available to anyone wishing to use, 8 + ** modify, copy, or redistribute it subject to the terms and conditions 9 + ** of the GNU General Public License v.2. 10 + ** 11 + ******************************************************************************* 12 + ******************************************************************************/ 13 + 14 + #ifndef __DLM_INTERNAL_DOT_H__ 15 + #define __DLM_INTERNAL_DOT_H__ 16 + 17 + /* 18 + * This is the main header file to be included in each DLM source file. 19 + */ 20 + 21 + #include <linux/module.h> 22 + #include <linux/slab.h> 23 + #include <linux/sched.h> 24 + #include <linux/types.h> 25 + #include <linux/ctype.h> 26 + #include <linux/spinlock.h> 27 + #include <linux/vmalloc.h> 28 + #include <linux/list.h> 29 + #include <linux/errno.h> 30 + #include <linux/random.h> 31 + #include <linux/delay.h> 32 + #include <linux/socket.h> 33 + #include <linux/kthread.h> 34 + #include <linux/kobject.h> 35 + #include <linux/kref.h> 36 + #include <linux/kernel.h> 37 + #include <linux/jhash.h> 38 + #include <asm/semaphore.h> 39 + #include <asm/uaccess.h> 40 + 41 + #include <linux/dlm.h> 42 + 43 + #define DLM_LOCKSPACE_LEN 64 44 + 45 + #ifndef TRUE 46 + #define TRUE 1 47 + #endif 48 + 49 + #ifndef FALSE 50 + #define FALSE 0 51 + #endif 52 + 53 + #if (BITS_PER_LONG == 64) 54 + #define PRIx64 "lx" 55 + #else 56 + #define PRIx64 "Lx" 57 + #endif 58 + 59 + /* Size of the temp buffer midcomms allocates on the stack. 60 + We try to make this large enough so most messages fit. 61 + FIXME: should sctp make this unnecessary? 
*/ 62 + 63 + #define DLM_INBUF_LEN 148 64 + 65 + struct dlm_ls; 66 + struct dlm_lkb; 67 + struct dlm_rsb; 68 + struct dlm_member; 69 + struct dlm_lkbtable; 70 + struct dlm_rsbtable; 71 + struct dlm_dirtable; 72 + struct dlm_direntry; 73 + struct dlm_recover; 74 + struct dlm_header; 75 + struct dlm_message; 76 + struct dlm_rcom; 77 + struct dlm_mhandle; 78 + 79 + #define log_print(fmt, args...) \ 80 + printk(KERN_ERR "dlm: "fmt"\n" , ##args) 81 + #define log_error(ls, fmt, args...) \ 82 + printk(KERN_ERR "dlm: %s: " fmt "\n", (ls)->ls_name , ##args) 83 + 84 + #ifdef DLM_LOG_DEBUG 85 + #define log_debug(ls, fmt, args...) log_error(ls, fmt, ##args) 86 + #else 87 + #define log_debug(ls, fmt, args...) 88 + #endif 89 + 90 + #define DLM_ASSERT(x, do) \ 91 + { \ 92 + if (!(x)) \ 93 + { \ 94 + printk(KERN_ERR "\nDLM: Assertion failed on line %d of file %s\n" \ 95 + "DLM: assertion: \"%s\"\n" \ 96 + "DLM: time = %lu\n", \ 97 + __LINE__, __FILE__, #x, jiffies); \ 98 + {do} \ 99 + printk("\n"); \ 100 + BUG(); \ 101 + panic("DLM: Record message above and reboot.\n"); \ 102 + } \ 103 + } 104 + 105 + 106 + struct dlm_direntry { 107 + struct list_head list; 108 + uint32_t master_nodeid; 109 + uint16_t length; 110 + char name[1]; 111 + }; 112 + 113 + struct dlm_dirtable { 114 + struct list_head list; 115 + rwlock_t lock; 116 + }; 117 + 118 + struct dlm_rsbtable { 119 + struct list_head list; 120 + struct list_head toss; 121 + rwlock_t lock; 122 + }; 123 + 124 + struct dlm_lkbtable { 125 + struct list_head list; 126 + rwlock_t lock; 127 + uint16_t counter; 128 + }; 129 + 130 + /* 131 + * Lockspace member (per node in a ls) 132 + */ 133 + 134 + struct dlm_member { 135 + struct list_head list; 136 + int nodeid; 137 + int weight; 138 + }; 139 + 140 + /* 141 + * Save and manage recovery state for a lockspace. 
142 + */ 143 + 144 + struct dlm_recover { 145 + struct list_head list; 146 + int *nodeids; 147 + int node_count; 148 + uint64_t seq; 149 + }; 150 + 151 + /* 152 + * Pass input args to second stage locking function. 153 + */ 154 + 155 + struct dlm_args { 156 + uint32_t flags; 157 + void *astaddr; 158 + long astparam; 159 + void *bastaddr; 160 + int mode; 161 + struct dlm_lksb *lksb; 162 + struct dlm_range *range; 163 + }; 164 + 165 + 166 + /* 167 + * Lock block 168 + * 169 + * A lock can be one of three types: 170 + * 171 + * local copy lock is mastered locally 172 + * (lkb_nodeid is zero and DLM_LKF_MSTCPY is not set) 173 + * process copy lock is mastered on a remote node 174 + * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is not set) 175 + * master copy master node's copy of a lock owned by remote node 176 + * (lkb_nodeid is non-zero and DLM_LKF_MSTCPY is set) 177 + * 178 + * lkb_exflags: a copy of the most recent flags arg provided to dlm_lock or 179 + * dlm_unlock. The dlm does not modify these or use any private flags in 180 + * this field; it only contains DLM_LKF_ flags from dlm.h. These flags 181 + * are sent as-is to the remote master when the lock is remote. 182 + * 183 + * lkb_flags: internal dlm flags (DLM_IFL_ prefix) from dlm_internal.h. 184 + * Some internal flags are shared between the master and process nodes; 185 + * these shared flags are kept in the lower two bytes. One of these 186 + * flags set on the master copy will be propagated to the process copy 187 + * and v.v. Other internal flags are private to the master or process 188 + * node (e.g. DLM_IFL_MSTCPY). These are kept in the high two bytes. 189 + * 190 + * lkb_sbflags: status block flags. These flags are copied directly into 191 + * the caller's lksb.sb_flags prior to the dlm_lock/dlm_unlock completion 192 + * ast. All defined in dlm.h with DLM_SBF_ prefix. 193 + * 194 + * lkb_status: the lock status indicates which rsb queue the lock is 195 + * on, grant, convert, or wait. 
   DLM_LKSTS_ WAITING/GRANTED/CONVERT
 *
 * lkb_wait_type: the dlm message type (DLM_MSG_ prefix) for which a
 * reply is needed.  Only set when the lkb is on the lockspace waiters
 * list awaiting a reply from a remote node.
 *
 * lkb_nodeid: when the lkb is a local copy, nodeid is 0; when the lkb
 * is a master copy, nodeid specifies the remote lock holder, when the
 * lkb is a process copy, the nodeid specifies the lock master.
 */

/* lkb_ast_type: which callback(s) are pending delivery for the lkb */

#define AST_COMP		1
#define AST_BAST		2

/* lkb_range[]: indexes into the 4-entry granted/requested range array */

#define GR_RANGE_START		0
#define GR_RANGE_END		1
#define RQ_RANGE_START		2
#define RQ_RANGE_END		3

/* lkb_status: which rsb queue (grant/convert/wait) the lkb sits on */

#define DLM_LKSTS_WAITING	1
#define DLM_LKSTS_GRANTED	2
#define DLM_LKSTS_CONVERT	3

/* lkb_flags: internal flags, kept apart from the caller's lkb_exflags */

#define DLM_IFL_MSTCPY		0x00010000
#define DLM_IFL_RESEND		0x00020000
#define DLM_IFL_RANGE		0x00000001

struct dlm_lkb {
	struct dlm_rsb		*lkb_resource;	/* the rsb */
	struct kref		lkb_ref;
	int			lkb_nodeid;	/* copied from rsb */
	int			lkb_ownpid;	/* pid of lock owner */
	uint32_t		lkb_id;		/* our lock ID */
	uint32_t		lkb_remid;	/* lock ID on remote partner */
	uint32_t		lkb_exflags;	/* external flags from caller */
	uint32_t		lkb_sbflags;	/* lksb flags */
	uint32_t		lkb_flags;	/* internal flags */
	uint32_t		lkb_lvbseq;	/* lvb sequence number */

	int8_t			lkb_status;	/* granted, waiting, convert */
	int8_t			lkb_rqmode;	/* requested lock mode */
	int8_t			lkb_grmode;	/* granted lock mode */
	int8_t			lkb_bastmode;	/* requested mode */
	int8_t			lkb_highbast;	/* highest mode bast sent for */

	int8_t			lkb_wait_type;	/* type of reply waiting for */
	int8_t			lkb_ast_type;	/* type of ast queued for */

	struct list_head	lkb_idtbl_list;	/* lockspace lkbtbl */
	struct list_head	lkb_statequeue;	/* rsb g/c/w list */
	struct list_head	lkb_rsb_lookup;	/* waiting for rsb lookup */
	struct list_head	lkb_wait_reply;	/* waiting for remote reply */
	struct list_head	lkb_astqueue;	/* need ast to be sent */

	uint64_t		*lkb_range;	/* array of gr/rq ranges */
	char			*lkb_lvbptr;
	struct dlm_lksb		*lkb_lksb;	/* caller's status block */
	void			*lkb_astaddr;	/* caller's ast function */
	void			*lkb_bastaddr;	/* caller's bast function */
	long			lkb_astparam;	/* caller's ast arg */
};


struct dlm_rsb {
	struct dlm_ls		*res_ls;	/* the lockspace */
	struct kref		res_ref;
	struct semaphore	res_sem;
	unsigned long		res_flags;	/* bits from enum rsb_flags */
	int			res_length;	/* length of rsb name */
	int			res_nodeid;	/* -1 unknown, 0 us, >0 master */
	uint32_t		res_lvbseq;
	uint32_t		res_hash;
	uint32_t		res_bucket;	/* rsbtbl */
	unsigned long		res_toss_time;	/* jiffies when moved to toss */
	uint32_t		res_first_lkid;
	struct list_head	res_lookup;	/* lkbs waiting on first */
	struct list_head	res_hashchain;	/* rsbtbl */
	struct list_head	res_grantqueue;
	struct list_head	res_convertqueue;
	struct list_head	res_waitqueue;

	struct list_head	res_root_list;	/* used for recovery */
	struct list_head	res_recover_list; /* used for recovery */
	int			res_recover_locks_count;

	char			*res_lvbptr;
	char			res_name[1];	/* name storage allocated past
						   the struct (see create_rsb) */
};

/* find_rsb() flags */

#define R_MASTER		1	/* only return rsb if it's a master */
#define R_CREATE		2	/* create/add rsb if not found */

/* rsb_flags: bit numbers used with res_flags via the helpers below */

enum rsb_flags {
	RSB_MASTER_UNCERTAIN,
	RSB_VALNOTVALID,
	RSB_VALNOTVALID_PREV,
	RSB_NEW_MASTER,
	RSB_NEW_MASTER2,
	RSB_RECOVER_CONVERT,
};

/* non-atomic bitops: callers are expected to hold the appropriate
   rsbtbl lock -- NOTE(review): confirm at each call site */

static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
{
	__set_bit(flag, &r->res_flags);
}

static inline void rsb_clear_flag(struct dlm_rsb *r, enum rsb_flags flag)
{
	__clear_bit(flag, &r->res_flags);
}

static inline int rsb_flag(struct dlm_rsb *r, enum rsb_flags flag)
{
	return test_bit(flag, &r->res_flags);
}


/* dlm_header is first element of all structs sent between nodes */

#define DLM_HEADER_MAJOR	0x00020000
#define DLM_HEADER_MINOR	0x00000001

#define DLM_MSG			1
#define DLM_RCOM		2

struct dlm_header {
	uint32_t		h_version;
	uint32_t		h_lockspace;
	uint32_t		h_nodeid;	/* nodeid of sender */
	uint16_t		h_length;
	uint8_t			h_cmd;		/* DLM_MSG, DLM_RCOM */
	uint8_t			h_pad;
};


#define DLM_MSG_REQUEST		1
#define DLM_MSG_CONVERT		2
#define DLM_MSG_UNLOCK		3
#define DLM_MSG_CANCEL		4
#define DLM_MSG_REQUEST_REPLY	5
#define DLM_MSG_CONVERT_REPLY	6
#define DLM_MSG_UNLOCK_REPLY	7
#define DLM_MSG_CANCEL_REPLY	8
#define DLM_MSG_GRANT		9
#define DLM_MSG_BAST		10
#define DLM_MSG_LOOKUP		11
#define DLM_MSG_REMOVE		12
#define DLM_MSG_LOOKUP_REPLY	13

/* on-wire message; layout is part of the inter-node protocol, do not
   reorder fields */

struct dlm_message {
	struct dlm_header	m_header;
	uint32_t		m_type;		/* DLM_MSG_ */
	uint32_t		m_nodeid;
	uint32_t		m_pid;
	uint32_t		m_lkid;		/* lkid on sender */
	uint32_t		m_remid;	/* lkid on receiver */
	uint32_t		m_parent_lkid;
	uint32_t		m_parent_remid;
	uint32_t		m_exflags;
	uint32_t		m_sbflags;
	uint32_t		m_flags;
	uint32_t		m_lvbseq;
	uint32_t		m_hash;
	int			m_status;
	int			m_grmode;
	int			m_rqmode;
	int			m_bastmode;
	int			m_asts;
	int			m_result;	/* 0 or -EXXX */
	uint64_t		m_range[2];
	char			m_extra[0];	/* name or lvb */
};


/* ls_recover_status bits (DLM_RS_): recovery progress markers */

#define DLM_RS_NODES		0x00000001
#define DLM_RS_NODES_ALL	0x00000002
#define DLM_RS_DIR		0x00000004
#define DLM_RS_DIR_ALL		0x00000008
#define DLM_RS_LOCKS		0x00000010
#define DLM_RS_LOCKS_ALL	0x00000020
#define DLM_RS_DONE		0x00000040
#define DLM_RS_DONE_ALL		0x00000080

#define DLM_RCOM_STATUS		1
#define DLM_RCOM_NAMES		2
#define DLM_RCOM_LOOKUP		3
#define DLM_RCOM_LOCK		4
#define DLM_RCOM_STATUS_REPLY	5
#define DLM_RCOM_NAMES_REPLY	6
#define DLM_RCOM_LOOKUP_REPLY	7
#define DLM_RCOM_LOCK_REPLY	8

/* recovery message; also part of the inter-node protocol */

struct dlm_rcom {
	struct dlm_header	rc_header;
	uint32_t		rc_type;	/* DLM_RCOM_ */
	int			rc_result;	/* multi-purpose */
	uint64_t		rc_id;		/* match reply with request */
	char			rc_buf[0];
};

struct rcom_config {
	uint32_t		rf_lvblen;
	uint32_t		rf_lsflags;
	uint64_t		rf_unused;
};

struct rcom_lock {
	uint32_t		rl_ownpid;
	uint32_t		rl_lkid;
	uint32_t		rl_remid;
	uint32_t		rl_parent_lkid;
	uint32_t		rl_parent_remid;
	uint32_t		rl_exflags;
	uint32_t		rl_flags;
	uint32_t		rl_lvbseq;
	int			rl_result;
	int8_t			rl_rqmode;
	int8_t			rl_grmode;
	int8_t			rl_status;
	int8_t			rl_asts;
	uint16_t		rl_wait_type;
	uint16_t		rl_namelen;
	uint64_t		rl_range[4];
	char			rl_name[DLM_RESNAME_MAXLEN];
	char			rl_lvb[0];
};

/* one lockspace: the top-level container for rsbs/lkbs, membership
   and recovery state */

struct dlm_ls {
	struct list_head	ls_list;	/* list of lockspaces */
	uint32_t		ls_global_id;	/* global unique lockspace ID */
	uint32_t		ls_exflags;
	int			ls_lvblen;
	int			ls_count;	/* reference count */
	unsigned long		ls_flags;	/* LSFL_ */
	struct kobject		ls_kobj;

	struct dlm_rsbtable	*ls_rsbtbl;
	uint32_t		ls_rsbtbl_size;

	struct dlm_lkbtable	*ls_lkbtbl;
	uint32_t		ls_lkbtbl_size;

	struct dlm_dirtable	*ls_dirtbl;
	uint32_t		ls_dirtbl_size;

	struct semaphore	ls_waiters_sem;
	struct list_head	ls_waiters;	/* lkbs needing a reply */

	struct list_head	ls_nodes;	/* current nodes in ls */
	struct list_head	ls_nodes_gone;	/* dead node list, recovery */
	int			ls_num_nodes;	/* number of nodes in ls */
	int			ls_low_nodeid;
	int			ls_total_weight;
	int			*ls_node_array;

	struct dlm_rsb		ls_stub_rsb;	/* for returning errors */
	struct dlm_lkb		ls_stub_lkb;	/* for returning errors */
	struct dlm_message	ls_stub_ms;	/* for faking a reply */

	struct dentry		*ls_debug_dentry; /* debugfs */

	wait_queue_head_t	ls_uevent_wait;	/* user part of join/leave */
	int			ls_uevent_result;

	/* recovery related */

	struct timer_list	ls_timer;
	struct task_struct	*ls_recoverd_task;
	struct semaphore	ls_recoverd_active;
	spinlock_t		ls_recover_lock;
	uint32_t		ls_recover_status; /* DLM_RS_ */
	uint64_t		ls_recover_seq;
	struct dlm_recover	*ls_recover_args;
	struct rw_semaphore	ls_in_recovery;	/* block local requests */
	struct list_head	ls_requestqueue;/* queue remote requests */
	struct semaphore	ls_requestqueue_lock;
	char			*ls_recover_buf;
	struct list_head	ls_recover_list;
	spinlock_t		ls_recover_list_lock;
	int			ls_recover_list_count;
	wait_queue_head_t	ls_wait_general;

	struct list_head	ls_root_list;	/* root resources */
	struct rw_semaphore	ls_root_sem;	/* protect root_list */

	int			ls_namelen;
	char			ls_name[1];	/* name storage allocated
						   past the struct */
};

/* ls_flags bit numbers */

#define LSFL_WORK		0
#define LSFL_RUNNING		1
#define LSFL_RECOVERY_STOP	2
#define LSFL_RCOM_READY		3
#define LSFL_UEVENT_WAIT	4

/* true while the lockspace is not in normal operation (recovery) */

static inline int dlm_locking_stopped(struct dlm_ls *ls)
{
	return !test_bit(LSFL_RUNNING, &ls->ls_flags);
}

static inline int dlm_recovery_stopped(struct dlm_ls *ls)
{
	return test_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
}

/* true when the lockspace runs without a resource directory */

static inline int dlm_no_directory(struct dlm_ls *ls)
{
	return (ls->ls_exflags & DLM_LSFL_NODIR) ? 1 : 0;
}

#endif				/* __DLM_INTERNAL_DOT_H__ */
+3610
fs/dlm/lock.c
··· 1 + /****************************************************************************** 2 + ******************************************************************************* 3 + ** 4 + ** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 5 + ** 6 + ** This copyrighted material is made available to anyone wishing to use, 7 + ** modify, copy, or redistribute it subject to the terms and conditions 8 + ** of the GNU General Public License v.2. 9 + ** 10 + ******************************************************************************* 11 + ******************************************************************************/ 12 + 13 + /* Central locking logic has four stages: 14 + 15 + dlm_lock() 16 + dlm_unlock() 17 + 18 + request_lock(ls, lkb) 19 + convert_lock(ls, lkb) 20 + unlock_lock(ls, lkb) 21 + cancel_lock(ls, lkb) 22 + 23 + _request_lock(r, lkb) 24 + _convert_lock(r, lkb) 25 + _unlock_lock(r, lkb) 26 + _cancel_lock(r, lkb) 27 + 28 + do_request(r, lkb) 29 + do_convert(r, lkb) 30 + do_unlock(r, lkb) 31 + do_cancel(r, lkb) 32 + 33 + Stage 1 (lock, unlock) is mainly about checking input args and 34 + splitting into one of the four main operations: 35 + 36 + dlm_lock = request_lock 37 + dlm_lock+CONVERT = convert_lock 38 + dlm_unlock = unlock_lock 39 + dlm_unlock+CANCEL = cancel_lock 40 + 41 + Stage 2, xxxx_lock(), just finds and locks the relevant rsb which is 42 + provided to the next stage. 43 + 44 + Stage 3, _xxxx_lock(), determines if the operation is local or remote. 45 + When remote, it calls send_xxxx(), when local it calls do_xxxx(). 46 + 47 + Stage 4, do_xxxx(), is the guts of the operation. It manipulates the 48 + given rsb and lkb and queues callbacks. 49 + 50 + For remote operations, send_xxxx() results in the corresponding do_xxxx() 51 + function being executed on the remote node. 
The connecting send/receive 52 + calls on local (L) and remote (R) nodes: 53 + 54 + L: send_xxxx() -> R: receive_xxxx() 55 + R: do_xxxx() 56 + L: receive_xxxx_reply() <- R: send_xxxx_reply() 57 + */ 58 + 59 + #include "dlm_internal.h" 60 + #include "memory.h" 61 + #include "lowcomms.h" 62 + #include "requestqueue.h" 63 + #include "util.h" 64 + #include "dir.h" 65 + #include "member.h" 66 + #include "lockspace.h" 67 + #include "ast.h" 68 + #include "lock.h" 69 + #include "rcom.h" 70 + #include "recover.h" 71 + #include "lvb_table.h" 72 + #include "config.h" 73 + 74 + static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb); 75 + static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb); 76 + static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb); 77 + static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb); 78 + static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb); 79 + static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode); 80 + static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb); 81 + static int send_remove(struct dlm_rsb *r); 82 + static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); 83 + static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, 84 + struct dlm_message *ms); 85 + static int receive_extralen(struct dlm_message *ms); 86 + 87 + /* 88 + * Lock compatibilty matrix - thanks Steve 89 + * UN = Unlocked state. Not really a state, used as a flag 90 + * PD = Padding. Used to make the matrix a nice power of two in size 91 + * Other states are the same as the VMS DLM. 
92 + * Usage: matrix[grmode+1][rqmode+1] (although m[rq+1][gr+1] is the same) 93 + */ 94 + 95 + static const int __dlm_compat_matrix[8][8] = { 96 + /* UN NL CR CW PR PW EX PD */ 97 + {1, 1, 1, 1, 1, 1, 1, 0}, /* UN */ 98 + {1, 1, 1, 1, 1, 1, 1, 0}, /* NL */ 99 + {1, 1, 1, 1, 1, 1, 0, 0}, /* CR */ 100 + {1, 1, 1, 1, 0, 0, 0, 0}, /* CW */ 101 + {1, 1, 1, 0, 1, 0, 0, 0}, /* PR */ 102 + {1, 1, 1, 0, 0, 0, 0, 0}, /* PW */ 103 + {1, 1, 0, 0, 0, 0, 0, 0}, /* EX */ 104 + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */ 105 + }; 106 + 107 + /* 108 + * This defines the direction of transfer of LVB data. 109 + * Granted mode is the row; requested mode is the column. 110 + * Usage: matrix[grmode+1][rqmode+1] 111 + * 1 = LVB is returned to the caller 112 + * 0 = LVB is written to the resource 113 + * -1 = nothing happens to the LVB 114 + */ 115 + 116 + const int dlm_lvb_operations[8][8] = { 117 + /* UN NL CR CW PR PW EX PD*/ 118 + { -1, 1, 1, 1, 1, 1, 1, -1 }, /* UN */ 119 + { -1, 1, 1, 1, 1, 1, 1, 0 }, /* NL */ 120 + { -1, -1, 1, 1, 1, 1, 1, 0 }, /* CR */ 121 + { -1, -1, -1, 1, 1, 1, 1, 0 }, /* CW */ 122 + { -1, -1, -1, -1, 1, 1, 1, 0 }, /* PR */ 123 + { -1, 0, 0, 0, 0, 0, 1, 0 }, /* PW */ 124 + { -1, 0, 0, 0, 0, 0, 0, 0 }, /* EX */ 125 + { -1, 0, 0, 0, 0, 0, 0, 0 } /* PD */ 126 + }; 127 + EXPORT_SYMBOL_GPL(dlm_lvb_operations); 128 + 129 + #define modes_compat(gr, rq) \ 130 + __dlm_compat_matrix[(gr)->lkb_grmode + 1][(rq)->lkb_rqmode + 1] 131 + 132 + int dlm_modes_compat(int mode1, int mode2) 133 + { 134 + return __dlm_compat_matrix[mode1 + 1][mode2 + 1]; 135 + } 136 + 137 + /* 138 + * Compatibility matrix for conversions with QUECVT set. 139 + * Granted mode is the row; requested mode is the column. 
140 + * Usage: matrix[grmode+1][rqmode+1] 141 + */ 142 + 143 + static const int __quecvt_compat_matrix[8][8] = { 144 + /* UN NL CR CW PR PW EX PD */ 145 + {0, 0, 0, 0, 0, 0, 0, 0}, /* UN */ 146 + {0, 0, 1, 1, 1, 1, 1, 0}, /* NL */ 147 + {0, 0, 0, 1, 1, 1, 1, 0}, /* CR */ 148 + {0, 0, 0, 0, 1, 1, 1, 0}, /* CW */ 149 + {0, 0, 0, 1, 0, 1, 1, 0}, /* PR */ 150 + {0, 0, 0, 0, 0, 0, 1, 0}, /* PW */ 151 + {0, 0, 0, 0, 0, 0, 0, 0}, /* EX */ 152 + {0, 0, 0, 0, 0, 0, 0, 0} /* PD */ 153 + }; 154 + 155 + static void dlm_print_lkb(struct dlm_lkb *lkb) 156 + { 157 + printk(KERN_ERR "lkb: nodeid %d id %x remid %x exflags %x flags %x\n" 158 + " status %d rqmode %d grmode %d wait_type %d ast_type %d\n", 159 + lkb->lkb_nodeid, lkb->lkb_id, lkb->lkb_remid, lkb->lkb_exflags, 160 + lkb->lkb_flags, lkb->lkb_status, lkb->lkb_rqmode, 161 + lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type); 162 + } 163 + 164 + void dlm_print_rsb(struct dlm_rsb *r) 165 + { 166 + printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n", 167 + r->res_nodeid, r->res_flags, r->res_first_lkid, 168 + r->res_recover_locks_count, r->res_name); 169 + } 170 + 171 + /* Threads cannot use the lockspace while it's being recovered */ 172 + 173 + static inline void lock_recovery(struct dlm_ls *ls) 174 + { 175 + down_read(&ls->ls_in_recovery); 176 + } 177 + 178 + static inline void unlock_recovery(struct dlm_ls *ls) 179 + { 180 + up_read(&ls->ls_in_recovery); 181 + } 182 + 183 + static inline int lock_recovery_try(struct dlm_ls *ls) 184 + { 185 + return down_read_trylock(&ls->ls_in_recovery); 186 + } 187 + 188 + static inline int can_be_queued(struct dlm_lkb *lkb) 189 + { 190 + return !(lkb->lkb_exflags & DLM_LKF_NOQUEUE); 191 + } 192 + 193 + static inline int force_blocking_asts(struct dlm_lkb *lkb) 194 + { 195 + return (lkb->lkb_exflags & DLM_LKF_NOQUEUEBAST); 196 + } 197 + 198 + static inline int is_demoted(struct dlm_lkb *lkb) 199 + { 200 + return (lkb->lkb_sbflags & DLM_SBF_DEMOTED); 201 + } 202 + 203 
/* rsb has a known master on another node (res_nodeid > 0) */

static inline int is_remote(struct dlm_rsb *r)
{
	DLM_ASSERT(r->res_nodeid >= 0, dlm_print_rsb(r););
	return !!r->res_nodeid;
}

/* lkb is our local handle for a lock mastered elsewhere */

static inline int is_process_copy(struct dlm_lkb *lkb)
{
	return (lkb->lkb_nodeid && !(lkb->lkb_flags & DLM_IFL_MSTCPY));
}

/* lkb is the master-side copy of a lock held by another node */

static inline int is_master_copy(struct dlm_lkb *lkb)
{
	if (lkb->lkb_flags & DLM_IFL_MSTCPY)
		DLM_ASSERT(lkb->lkb_nodeid, dlm_print_lkb(lkb););
	return (lkb->lkb_flags & DLM_IFL_MSTCPY) ? TRUE : FALSE;
}

/* PR<->CW conversions are neither up- nor down-conversions because
   PR and CW are incompatible with each other */

static inline int middle_conversion(struct dlm_lkb *lkb)
{
	if ((lkb->lkb_grmode==DLM_LOCK_PR && lkb->lkb_rqmode==DLM_LOCK_CW) ||
	    (lkb->lkb_rqmode==DLM_LOCK_PR && lkb->lkb_grmode==DLM_LOCK_CW))
		return TRUE;
	return FALSE;
}

static inline int down_conversion(struct dlm_lkb *lkb)
{
	return (!middle_conversion(lkb) && lkb->lkb_rqmode < lkb->lkb_grmode);
}

/* queue a completion ast for the lock owner; a master copy has no
   local owner, so nothing is queued for it here */

static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
{
	if (is_master_copy(lkb))
		return;

	DLM_ASSERT(lkb->lkb_lksb, dlm_print_lkb(lkb););

	lkb->lkb_lksb->sb_status = rv;
	lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags;

	dlm_add_ast(lkb, AST_COMP);
}

/* queue a blocking ast; for a master copy the bast must be delivered
   on the node that owns the lock, so it is sent as a message */

static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode)
{
	if (is_master_copy(lkb))
		send_bast(r, lkb, rqmode);
	else {
		lkb->lkb_bastmode = rqmode;
		dlm_add_ast(lkb, AST_BAST);
	}
}

/*
 * Basic operations on rsb's and lkb's
 */

/* allocate and minimally initialize an rsb; the caller sets nodeid,
   hash/bucket and takes the initial kref */

static struct dlm_rsb *create_rsb(struct dlm_ls *ls, char *name, int len)
{
	struct dlm_rsb *r;

	r = allocate_rsb(ls, len);
	if (!r)
		return NULL;

	r->res_ls = ls;
	r->res_length = len;
	memcpy(r->res_name, name, len);
	init_MUTEX(&r->res_sem);

	INIT_LIST_HEAD(&r->res_lookup);
	INIT_LIST_HEAD(&r->res_grantqueue);
	INIT_LIST_HEAD(&r->res_convertqueue);
	INIT_LIST_HEAD(&r->res_waitqueue);
	INIT_LIST_HEAD(&r->res_root_list);
	INIT_LIST_HEAD(&r->res_recover_list);

	return r;
}

/* linear scan of one hash chain; returns 0, -ENOENT, or -ENOTBLK when
   found but not a master and R_MASTER was requested.  Caller holds the
   bucket lock. */

static int search_rsb_list(struct list_head *head, char *name, int len,
			   unsigned int flags, struct dlm_rsb **r_ret)
{
	struct dlm_rsb *r;
	int error = 0;

	list_for_each_entry(r, head, res_hashchain) {
		if (len == r->res_length && !memcmp(name, r->res_name, len))
			goto found;
	}
	return -ENOENT;

 found:
	if (r->res_nodeid && (flags & R_MASTER))
		error = -ENOTBLK;
	*r_ret = r;
	return error;
}

/* search the active list then the toss list of bucket b; a toss-list
   hit is revived onto the active list with its kref re-armed.  Caller
   holds the bucket write lock. */

static int _search_rsb(struct dlm_ls *ls, char *name, int len, int b,
		       unsigned int flags, struct dlm_rsb **r_ret)
{
	struct dlm_rsb *r;
	int error;

	error = search_rsb_list(&ls->ls_rsbtbl[b].list, name, len, flags, &r);
	if (!error) {
		kref_get(&r->res_ref);
		goto out;
	}
	error = search_rsb_list(&ls->ls_rsbtbl[b].toss, name, len, flags, &r);
	if (error)
		goto out;

	list_move(&r->res_hashchain, &ls->ls_rsbtbl[b].list);

	if (dlm_no_directory(ls))
		goto out;

	/* a revived rsb's cached master may be stale unless we are the
	   master ourselves (res_nodeid == 0) */
	if (r->res_nodeid == -1) {
		rsb_clear_flag(r, RSB_MASTER_UNCERTAIN);
		r->res_first_lkid = 0;
	} else if (r->res_nodeid > 0) {
		rsb_set_flag(r, RSB_MASTER_UNCERTAIN);
		r->res_first_lkid = 0;
	} else {
		DLM_ASSERT(r->res_nodeid == 0, dlm_print_rsb(r););
		DLM_ASSERT(!rsb_flag(r, RSB_MASTER_UNCERTAIN),);
	}
 out:
	*r_ret = r;
	return error;
}

static int search_rsb(struct dlm_ls *ls, char *name, int len, int b,
		      unsigned int flags, struct dlm_rsb **r_ret)
{
	int error;
	write_lock(&ls->ls_rsbtbl[b].lock);
	error = _search_rsb(ls, name, len, b, flags, r_ret);
	write_unlock(&ls->ls_rsbtbl[b].lock);
	return error;
}

/*
 * Find rsb in rsbtbl and potentially create/add one
 *
 * Delaying the release of rsb's has a similar benefit to applications keeping
 * NL locks on an rsb, but without the guarantee that the cached master value
 * will still be valid when the rsb is reused.  Apps aren't always smart enough
 * to keep NL locks on an rsb that they may lock again shortly; this can lead
 * to excessive master lookups and removals if we don't delay the release.
 *
 * Searching for an rsb means looking through both the normal list and toss
 * list.  When found on the toss list the rsb is moved to the normal list with
 * ref count of 1; when found on normal list the ref count is incremented.
 */

static int find_rsb(struct dlm_ls *ls, char *name, int namelen,
		    unsigned int flags, struct dlm_rsb **r_ret)
{
	struct dlm_rsb *r, *tmp;
	uint32_t hash, bucket;
	int error = 0;

	if (dlm_no_directory(ls))
		flags |= R_CREATE;

	hash = jhash(name, namelen, 0);
	bucket = hash & (ls->ls_rsbtbl_size - 1);

	error = search_rsb(ls, name, namelen, bucket, flags, &r);
	if (!error)
		goto out;

	if (error == -ENOENT && !(flags & R_CREATE))
		goto out;

	/* the rsb was found but wasn't a master copy */
	if (error == -ENOTBLK)
		goto out;

	error = -ENOMEM;
	r = create_rsb(ls, name, namelen);
	if (!r)
		goto out;

	r->res_hash = hash;
	r->res_bucket = bucket;
	r->res_nodeid = -1;
	kref_init(&r->res_ref);

	/* With no directory, the master can be set immediately */
	if (dlm_no_directory(ls)) {
		int nodeid = dlm_dir_nodeid(r);
		if (nodeid == dlm_our_nodeid())
			nodeid = 0;
		r->res_nodeid = nodeid;
	}

	/* re-search under the lock: another thread may have added the
	   same rsb while we allocated ours; if so use theirs */
	write_lock(&ls->ls_rsbtbl[bucket].lock);
	error = _search_rsb(ls, name, namelen, bucket, 0, &tmp);
	if (!error) {
		write_unlock(&ls->ls_rsbtbl[bucket].lock);
		free_rsb(r);
		r = tmp;
		goto out;
	}
	list_add(&r->res_hashchain, &ls->ls_rsbtbl[bucket].list);
	write_unlock(&ls->ls_rsbtbl[bucket].lock);
	error = 0;
 out:
	*r_ret = r;
	return error;
}

int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen,
		 unsigned int flags, struct dlm_rsb **r_ret)
{
	return find_rsb(ls, name, namelen, flags, r_ret);
}

/* This is only called to add a reference when the code already holds
   a valid reference to the rsb, so there's no need for locking. */

static inline void hold_rsb(struct dlm_rsb *r)
{
	kref_get(&r->res_ref);
}

void dlm_hold_rsb(struct dlm_rsb *r)
{
	hold_rsb(r);
}

/* kref release: last reference dropped; park the rsb on the toss list
   instead of freeing it (see find_rsb comment on delayed release).
   Caller of kref_put holds the bucket write lock. */

static void toss_rsb(struct kref *kref)
{
	struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);
	struct dlm_ls *ls = r->res_ls;

	DLM_ASSERT(list_empty(&r->res_root_list), dlm_print_rsb(r););
	kref_init(&r->res_ref);
	list_move(&r->res_hashchain, &ls->ls_rsbtbl[r->res_bucket].toss);
	r->res_toss_time = jiffies;
	if (r->res_lvbptr) {
		free_lvb(r->res_lvbptr);
		r->res_lvbptr = NULL;
	}
}

/* When all references to the rsb are gone it's transfered to
   the tossed list for later disposal.
 */

static void put_rsb(struct dlm_rsb *r)
{
	struct dlm_ls *ls = r->res_ls;
	uint32_t bucket = r->res_bucket;

	write_lock(&ls->ls_rsbtbl[bucket].lock);
	kref_put(&r->res_ref, toss_rsb);
	write_unlock(&ls->ls_rsbtbl[bucket].lock);
}

void dlm_put_rsb(struct dlm_rsb *r)
{
	put_rsb(r);
}

/* See comment for unhold_lkb */

static void unhold_rsb(struct dlm_rsb *r)
{
	int rv;
	rv = kref_put(&r->res_ref, toss_rsb);
	DLM_ASSERT(!rv, dlm_print_rsb(r););
}

/* kref release used by shrink_bucket: the rsb is being destroyed for
   real, so every queue must already be empty */

static void kill_rsb(struct kref *kref)
{
	struct dlm_rsb *r = container_of(kref, struct dlm_rsb, res_ref);

	/* All work is done after the return from kref_put() so we
	   can release the write_lock before the remove and free. */

	DLM_ASSERT(list_empty(&r->res_lookup),);
	DLM_ASSERT(list_empty(&r->res_grantqueue),);
	DLM_ASSERT(list_empty(&r->res_convertqueue),);
	DLM_ASSERT(list_empty(&r->res_waitqueue),);
	DLM_ASSERT(list_empty(&r->res_root_list),);
	DLM_ASSERT(list_empty(&r->res_recover_list),);
}

/* Attaching/detaching lkb's from rsb's is for rsb reference counting.
   The rsb must exist as long as any lkb's for it do. */

static void attach_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	hold_rsb(r);
	lkb->lkb_resource = r;
}

static void detach_lkb(struct dlm_lkb *lkb)
{
	if (lkb->lkb_resource) {
		put_rsb(lkb->lkb_resource);
		lkb->lkb_resource = NULL;
	}
}

/* allocate an lkb and assign it a lock id that is unique within the
   lockspace: low 16 bits are a random bucket, high 16 an incrementing
   per-bucket counter */

static int create_lkb(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
{
	struct dlm_lkb *lkb, *tmp;
	uint32_t lkid = 0;
	uint16_t bucket;

	lkb = allocate_lkb(ls);
	if (!lkb)
		return -ENOMEM;

	lkb->lkb_nodeid = -1;
	lkb->lkb_grmode = DLM_LOCK_IV;
	kref_init(&lkb->lkb_ref);

	get_random_bytes(&bucket, sizeof(bucket));
	bucket &= (ls->ls_lkbtbl_size - 1);

	write_lock(&ls->ls_lkbtbl[bucket].lock);

	/* counter can roll over so we must verify lkid is not in use */

	while (lkid == 0) {
		lkid = bucket | (ls->ls_lkbtbl[bucket].counter++ << 16);

		list_for_each_entry(tmp, &ls->ls_lkbtbl[bucket].list,
				    lkb_idtbl_list) {
			if (tmp->lkb_id != lkid)
				continue;
			lkid = 0;
			break;
		}
	}

	lkb->lkb_id = lkid;
	list_add(&lkb->lkb_idtbl_list, &ls->ls_lkbtbl[bucket].list);
	write_unlock(&ls->ls_lkbtbl[bucket].lock);

	*lkb_ret = lkb;
	return 0;
}

/* caller holds the bucket read lock for lkid's bucket */

static struct dlm_lkb *__find_lkb(struct dlm_ls *ls, uint32_t lkid)
{
	uint16_t bucket = lkid & 0xFFFF;
	struct dlm_lkb *lkb;

	list_for_each_entry(lkb, &ls->ls_lkbtbl[bucket].list, lkb_idtbl_list) {
		if (lkb->lkb_id == lkid)
			return lkb;
	}
	return NULL;
}

/* look up an lkb by id and take a reference; returns -EBADSLT for an
   out-of-range bucket, -ENOENT if not found */

static int find_lkb(struct dlm_ls *ls, uint32_t lkid, struct dlm_lkb **lkb_ret)
{
	struct dlm_lkb *lkb;
	uint16_t bucket = lkid & 0xFFFF;

	if (bucket >= ls->ls_lkbtbl_size)
		return -EBADSLT;

	read_lock(&ls->ls_lkbtbl[bucket].lock);
	lkb = __find_lkb(ls, lkid);
	if (lkb)
		kref_get(&lkb->lkb_ref);
	read_unlock(&ls->ls_lkbtbl[bucket].lock);

	*lkb_ret = lkb;
	return lkb ? 0 : -ENOENT;
}

static void kill_lkb(struct kref *kref)
{
	struct dlm_lkb *lkb = container_of(kref, struct dlm_lkb, lkb_ref);

	/* All work is done after the return from kref_put() so we
	   can release the write_lock before the detach_lkb */

	DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb););
}

/* drop a reference; on the last one, unhash and free the lkb (and the
   lvb/range buffers it owns).  Returns 1 when the lkb was freed. */

static int put_lkb(struct dlm_lkb *lkb)
{
	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
	uint16_t bucket = lkb->lkb_id & 0xFFFF;

	write_lock(&ls->ls_lkbtbl[bucket].lock);
	if (kref_put(&lkb->lkb_ref, kill_lkb)) {
		list_del(&lkb->lkb_idtbl_list);
		write_unlock(&ls->ls_lkbtbl[bucket].lock);

		detach_lkb(lkb);

		/* for local/process lkbs, lvbptr points to caller's lksb */
		if (lkb->lkb_lvbptr && is_master_copy(lkb))
			free_lvb(lkb->lkb_lvbptr);
		if (lkb->lkb_range)
			free_range(lkb->lkb_range);
		free_lkb(lkb);
		return 1;
	} else {
		write_unlock(&ls->ls_lkbtbl[bucket].lock);
		return 0;
	}
}

int dlm_put_lkb(struct dlm_lkb *lkb)
{
	return put_lkb(lkb);
}

/* This is only called to add a reference when the code already holds
   a valid reference to the lkb, so there's no need for locking. */

static inline void hold_lkb(struct dlm_lkb *lkb)
{
	kref_get(&lkb->lkb_ref);
}

/* This is called when we need to remove a reference and are certain
   it's not the last ref.  e.g. del_lkb is always called between a
   find_lkb/put_lkb and is always the inverse of a previous add_lkb.
635 + put_lkb would work fine, but would involve unnecessary locking */ 636 + 637 + static inline void unhold_lkb(struct dlm_lkb *lkb) 638 + { 639 + int rv; 640 + rv = kref_put(&lkb->lkb_ref, kill_lkb); 641 + DLM_ASSERT(!rv, dlm_print_lkb(lkb);); 642 + } 643 + 644 + static void lkb_add_ordered(struct list_head *new, struct list_head *head, 645 + int mode) 646 + { 647 + struct dlm_lkb *lkb = NULL; 648 + 649 + list_for_each_entry(lkb, head, lkb_statequeue) 650 + if (lkb->lkb_rqmode < mode) 651 + break; 652 + 653 + if (!lkb) 654 + list_add_tail(new, head); 655 + else 656 + __list_add(new, lkb->lkb_statequeue.prev, &lkb->lkb_statequeue); 657 + } 658 + 659 + /* add/remove lkb to rsb's grant/convert/wait queue */ 660 + 661 + static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status) 662 + { 663 + kref_get(&lkb->lkb_ref); 664 + 665 + DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb);); 666 + 667 + lkb->lkb_status = status; 668 + 669 + switch (status) { 670 + case DLM_LKSTS_WAITING: 671 + if (lkb->lkb_exflags & DLM_LKF_HEADQUE) 672 + list_add(&lkb->lkb_statequeue, &r->res_waitqueue); 673 + else 674 + list_add_tail(&lkb->lkb_statequeue, &r->res_waitqueue); 675 + break; 676 + case DLM_LKSTS_GRANTED: 677 + /* convention says granted locks kept in order of grmode */ 678 + lkb_add_ordered(&lkb->lkb_statequeue, &r->res_grantqueue, 679 + lkb->lkb_grmode); 680 + break; 681 + case DLM_LKSTS_CONVERT: 682 + if (lkb->lkb_exflags & DLM_LKF_HEADQUE) 683 + list_add(&lkb->lkb_statequeue, &r->res_convertqueue); 684 + else 685 + list_add_tail(&lkb->lkb_statequeue, 686 + &r->res_convertqueue); 687 + break; 688 + default: 689 + DLM_ASSERT(0, dlm_print_lkb(lkb); printk("sts=%d\n", status);); 690 + } 691 + } 692 + 693 + static void del_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb) 694 + { 695 + lkb->lkb_status = 0; 696 + list_del(&lkb->lkb_statequeue); 697 + unhold_lkb(lkb); 698 + } 699 + 700 + static void move_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int sts) 701 + { 702 + 
hold_lkb(lkb); 703 + del_lkb(r, lkb); 704 + add_lkb(r, lkb, sts); 705 + unhold_lkb(lkb); 706 + } 707 + 708 + /* add/remove lkb from global waiters list of lkb's waiting for 709 + a reply from a remote node */ 710 + 711 + static void add_to_waiters(struct dlm_lkb *lkb, int mstype) 712 + { 713 + struct dlm_ls *ls = lkb->lkb_resource->res_ls; 714 + 715 + down(&ls->ls_waiters_sem); 716 + if (lkb->lkb_wait_type) { 717 + log_print("add_to_waiters error %d", lkb->lkb_wait_type); 718 + goto out; 719 + } 720 + lkb->lkb_wait_type = mstype; 721 + kref_get(&lkb->lkb_ref); 722 + list_add(&lkb->lkb_wait_reply, &ls->ls_waiters); 723 + out: 724 + up(&ls->ls_waiters_sem); 725 + } 726 + 727 + static int _remove_from_waiters(struct dlm_lkb *lkb) 728 + { 729 + int error = 0; 730 + 731 + if (!lkb->lkb_wait_type) { 732 + log_print("remove_from_waiters error"); 733 + error = -EINVAL; 734 + goto out; 735 + } 736 + lkb->lkb_wait_type = 0; 737 + list_del(&lkb->lkb_wait_reply); 738 + unhold_lkb(lkb); 739 + out: 740 + return error; 741 + } 742 + 743 + static int remove_from_waiters(struct dlm_lkb *lkb) 744 + { 745 + struct dlm_ls *ls = lkb->lkb_resource->res_ls; 746 + int error; 747 + 748 + down(&ls->ls_waiters_sem); 749 + error = _remove_from_waiters(lkb); 750 + up(&ls->ls_waiters_sem); 751 + return error; 752 + } 753 + 754 + static void dir_remove(struct dlm_rsb *r) 755 + { 756 + int to_nodeid; 757 + 758 + if (dlm_no_directory(r->res_ls)) 759 + return; 760 + 761 + to_nodeid = dlm_dir_nodeid(r); 762 + if (to_nodeid != dlm_our_nodeid()) 763 + send_remove(r); 764 + else 765 + dlm_dir_remove_entry(r->res_ls, to_nodeid, 766 + r->res_name, r->res_length); 767 + } 768 + 769 + /* FIXME: shouldn't this be able to exit as soon as one non-due rsb is 770 + found since they are in order of newest to oldest? 
 */

/* free tossed rsbs in bucket b whose toss time has expired; returns
   the number freed.  The directory entry is removed only when we were
   the master. */

static int shrink_bucket(struct dlm_ls *ls, int b)
{
	struct dlm_rsb *r;
	int count = 0, found;

	for (;;) {
		found = FALSE;
		write_lock(&ls->ls_rsbtbl[b].lock);
		list_for_each_entry_reverse(r, &ls->ls_rsbtbl[b].toss,
					    res_hashchain) {
			if (!time_after_eq(jiffies, r->res_toss_time +
					   dlm_config.toss_secs * HZ))
				continue;
			found = TRUE;
			break;
		}

		if (!found) {
			write_unlock(&ls->ls_rsbtbl[b].lock);
			break;
		}

		if (kref_put(&r->res_ref, kill_rsb)) {
			list_del(&r->res_hashchain);
			write_unlock(&ls->ls_rsbtbl[b].lock);

			if (is_master(r))
				dir_remove(r);
			free_rsb(r);
			count++;
		} else {
			write_unlock(&ls->ls_rsbtbl[b].lock);
			log_error(ls, "tossed rsb in use %s", r->res_name);
		}
	}

	return count;
}

/* periodic scavenger entry point; a no-op during recovery */

void dlm_scan_rsbs(struct dlm_ls *ls)
{
	int i;

	if (dlm_locking_stopped(ls))
		return;

	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
		shrink_bucket(ls, i);
		cond_resched();
	}
}

/* lkb is master or local copy */

static void set_lvb_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	int b, len = r->res_ls->ls_lvblen;

	/* b=1 lvb returned to caller
	   b=0 lvb written to rsb or invalidated
	   b=-1 do nothing */

	b =  dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];

	if (b == 1) {
		if (!lkb->lkb_lvbptr)
			return;

		if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
			return;

		if (!r->res_lvbptr)
			return;

		memcpy(lkb->lkb_lvbptr, r->res_lvbptr, len);
		lkb->lkb_lvbseq = r->res_lvbseq;

	} else if (b == 0) {
		if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
			rsb_set_flag(r, RSB_VALNOTVALID);
			return;
		}

		if (!lkb->lkb_lvbptr)
			return;

		if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
			return;

		/* rsb lvb is allocated lazily on first write */
		if (!r->res_lvbptr)
			r->res_lvbptr = allocate_lvb(r->res_ls);

		if (!r->res_lvbptr)
			return;

		memcpy(r->res_lvbptr, lkb->lkb_lvbptr, len);
		r->res_lvbseq++;
		lkb->lkb_lvbseq = r->res_lvbseq;
		rsb_clear_flag(r, RSB_VALNOTVALID);
	}

	if (rsb_flag(r, RSB_VALNOTVALID))
		lkb->lkb_sbflags |= DLM_SBF_VALNOTVALID;
}

/* write the lkb's lvb back to the rsb on unlock; only PW/EX holders
   may have modified the lvb */

static void set_lvb_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	if (lkb->lkb_grmode < DLM_LOCK_PW)
		return;

	if (lkb->lkb_exflags & DLM_LKF_IVVALBLK) {
		rsb_set_flag(r, RSB_VALNOTVALID);
		return;
	}

	if (!lkb->lkb_lvbptr)
		return;

	if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
		return;

	if (!r->res_lvbptr)
		r->res_lvbptr = allocate_lvb(r->res_ls);

	if (!r->res_lvbptr)
		return;

	memcpy(r->res_lvbptr, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
	r->res_lvbseq++;
	rsb_clear_flag(r, RSB_VALNOTVALID);
}

/* lkb is process copy (pc) */

/* copy lvb data received in a reply message into the caller's lvb */

static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb,
			    struct dlm_message *ms)
{
	int b;

	if (!lkb->lkb_lvbptr)
		return;

	if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
		return;

	b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1];
	if (b == 1) {
		int len = receive_extralen(ms);
		memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
		lkb->lkb_lvbseq = ms->m_lvbseq;
	}
}

/* Manipulate lkb's on rsb's convert/granted/waiting queues
   remove_lock -- used for unlock, removes lkb from granted
   revert_lock -- used for cancel, moves lkb from convert to granted
   grant_lock  -- used for request and convert, adds lkb to granted or
                  moves lkb from convert or waiting to granted

   Each of these is used
for master or local copy lkb's. There is 932 + also a _pc() variation used to make the corresponding change on 933 + a process copy (pc) lkb. */ 934 + 935 + static void _remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) 936 + { 937 + del_lkb(r, lkb); 938 + lkb->lkb_grmode = DLM_LOCK_IV; 939 + /* this unhold undoes the original ref from create_lkb() 940 + so this leads to the lkb being freed */ 941 + unhold_lkb(lkb); 942 + } 943 + 944 + static void remove_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) 945 + { 946 + set_lvb_unlock(r, lkb); 947 + _remove_lock(r, lkb); 948 + } 949 + 950 + static void remove_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb) 951 + { 952 + _remove_lock(r, lkb); 953 + } 954 + 955 + static void revert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) 956 + { 957 + lkb->lkb_rqmode = DLM_LOCK_IV; 958 + 959 + switch (lkb->lkb_status) { 960 + case DLM_LKSTS_CONVERT: 961 + move_lkb(r, lkb, DLM_LKSTS_GRANTED); 962 + break; 963 + case DLM_LKSTS_WAITING: 964 + del_lkb(r, lkb); 965 + lkb->lkb_grmode = DLM_LOCK_IV; 966 + /* this unhold undoes the original ref from create_lkb() 967 + so this leads to the lkb being freed */ 968 + unhold_lkb(lkb); 969 + break; 970 + default: 971 + log_print("invalid status for revert %d", lkb->lkb_status); 972 + } 973 + } 974 + 975 + static void revert_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb) 976 + { 977 + revert_lock(r, lkb); 978 + } 979 + 980 + static void _grant_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) 981 + { 982 + if (lkb->lkb_grmode != lkb->lkb_rqmode) { 983 + lkb->lkb_grmode = lkb->lkb_rqmode; 984 + if (lkb->lkb_status) 985 + move_lkb(r, lkb, DLM_LKSTS_GRANTED); 986 + else 987 + add_lkb(r, lkb, DLM_LKSTS_GRANTED); 988 + } 989 + 990 + lkb->lkb_rqmode = DLM_LOCK_IV; 991 + 992 + if (lkb->lkb_range) { 993 + lkb->lkb_range[GR_RANGE_START] = lkb->lkb_range[RQ_RANGE_START]; 994 + lkb->lkb_range[GR_RANGE_END] = lkb->lkb_range[RQ_RANGE_END]; 995 + } 996 + } 997 + 998 + static void grant_lock(struct dlm_rsb *r, struct 
dlm_lkb *lkb) 999 + { 1000 + set_lvb_lock(r, lkb); 1001 + _grant_lock(r, lkb); 1002 + lkb->lkb_highbast = 0; 1003 + } 1004 + 1005 + static void grant_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb, 1006 + struct dlm_message *ms) 1007 + { 1008 + set_lvb_lock_pc(r, lkb, ms); 1009 + _grant_lock(r, lkb); 1010 + } 1011 + 1012 + /* called by grant_pending_locks() which means an async grant message must 1013 + be sent to the requesting node in addition to granting the lock if the 1014 + lkb belongs to a remote node. */ 1015 + 1016 + static void grant_lock_pending(struct dlm_rsb *r, struct dlm_lkb *lkb) 1017 + { 1018 + grant_lock(r, lkb); 1019 + if (is_master_copy(lkb)) 1020 + send_grant(r, lkb); 1021 + else 1022 + queue_cast(r, lkb, 0); 1023 + } 1024 + 1025 + static inline int first_in_list(struct dlm_lkb *lkb, struct list_head *head) 1026 + { 1027 + struct dlm_lkb *first = list_entry(head->next, struct dlm_lkb, 1028 + lkb_statequeue); 1029 + if (lkb->lkb_id == first->lkb_id) 1030 + return TRUE; 1031 + 1032 + return FALSE; 1033 + } 1034 + 1035 + /* Return 1 if the locks' ranges overlap. If the lkb has no range then it is 1036 + assumed to cover 0-ffffffff.ffffffff */ 1037 + 1038 + static inline int ranges_overlap(struct dlm_lkb *lkb1, struct dlm_lkb *lkb2) 1039 + { 1040 + if (!lkb1->lkb_range || !lkb2->lkb_range) 1041 + return TRUE; 1042 + 1043 + if (lkb1->lkb_range[RQ_RANGE_END] < lkb2->lkb_range[GR_RANGE_START] || 1044 + lkb1->lkb_range[RQ_RANGE_START] > lkb2->lkb_range[GR_RANGE_END]) 1045 + return FALSE; 1046 + 1047 + return TRUE; 1048 + } 1049 + 1050 + /* Check if the given lkb conflicts with another lkb on the queue. 
*/ 1051 + 1052 + static int queue_conflict(struct list_head *head, struct dlm_lkb *lkb) 1053 + { 1054 + struct dlm_lkb *this; 1055 + 1056 + list_for_each_entry(this, head, lkb_statequeue) { 1057 + if (this == lkb) 1058 + continue; 1059 + if (ranges_overlap(lkb, this) && !modes_compat(this, lkb)) 1060 + return TRUE; 1061 + } 1062 + return FALSE; 1063 + } 1064 + 1065 + /* 1066 + * "A conversion deadlock arises with a pair of lock requests in the converting 1067 + * queue for one resource. The granted mode of each lock blocks the requested 1068 + * mode of the other lock." 1069 + * 1070 + * Part 2: if the granted mode of lkb is preventing the first lkb in the 1071 + * convert queue from being granted, then demote lkb (set grmode to NL). 1072 + * This second form requires that we check for conv-deadlk even when 1073 + * now == 0 in _can_be_granted(). 1074 + * 1075 + * Example: 1076 + * Granted Queue: empty 1077 + * Convert Queue: NL->EX (first lock) 1078 + * PR->EX (second lock) 1079 + * 1080 + * The first lock can't be granted because of the granted mode of the second 1081 + * lock and the second lock can't be granted because it's not first in the 1082 + * list. We demote the granted mode of the second lock (the lkb passed to this 1083 + * function). 1084 + * 1085 + * After the resolution, the "grant pending" function needs to go back and try 1086 + * to grant locks on the convert queue again since the first lock can now be 1087 + * granted. 
1088 + */ 1089 + 1090 + static int conversion_deadlock_detect(struct dlm_rsb *rsb, struct dlm_lkb *lkb) 1091 + { 1092 + struct dlm_lkb *this, *first = NULL, *self = NULL; 1093 + 1094 + list_for_each_entry(this, &rsb->res_convertqueue, lkb_statequeue) { 1095 + if (!first) 1096 + first = this; 1097 + if (this == lkb) { 1098 + self = lkb; 1099 + continue; 1100 + } 1101 + 1102 + if (!ranges_overlap(lkb, this)) 1103 + continue; 1104 + 1105 + if (!modes_compat(this, lkb) && !modes_compat(lkb, this)) 1106 + return TRUE; 1107 + } 1108 + 1109 + /* if lkb is on the convert queue and is preventing the first 1110 + from being granted, then there's deadlock and we demote lkb. 1111 + multiple converting locks may need to do this before the first 1112 + converting lock can be granted. */ 1113 + 1114 + if (self && self != first) { 1115 + if (!modes_compat(lkb, first) && 1116 + !queue_conflict(&rsb->res_grantqueue, first)) 1117 + return TRUE; 1118 + } 1119 + 1120 + return FALSE; 1121 + } 1122 + 1123 + /* 1124 + * Return 1 if the lock can be granted, 0 otherwise. 1125 + * Also detect and resolve conversion deadlocks. 1126 + * 1127 + * lkb is the lock to be granted 1128 + * 1129 + * now is 1 if the function is being called in the context of the 1130 + * immediate request, it is 0 if called later, after the lock has been 1131 + * queued. 1132 + * 1133 + * References are from chapter 6 of "VAXcluster Principles" by Roy Davis 1134 + */ 1135 + 1136 + static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now) 1137 + { 1138 + int8_t conv = (lkb->lkb_grmode != DLM_LOCK_IV); 1139 + 1140 + /* 1141 + * 6-10: Version 5.4 introduced an option to address the phenomenon of 1142 + * a new request for a NL mode lock being blocked. 1143 + * 1144 + * 6-11: If the optional EXPEDITE flag is used with the new NL mode 1145 + * request, then it would be granted. 
In essence, the use of this flag 1146 + * tells the Lock Manager to expedite theis request by not considering 1147 + * what may be in the CONVERTING or WAITING queues... As of this 1148 + * writing, the EXPEDITE flag can be used only with new requests for NL 1149 + * mode locks. This flag is not valid for conversion requests. 1150 + * 1151 + * A shortcut. Earlier checks return an error if EXPEDITE is used in a 1152 + * conversion or used with a non-NL requested mode. We also know an 1153 + * EXPEDITE request is always granted immediately, so now must always 1154 + * be 1. The full condition to grant an expedite request: (now && 1155 + * !conv && lkb->rqmode == DLM_LOCK_NL && (flags & EXPEDITE)) can 1156 + * therefore be shortened to just checking the flag. 1157 + */ 1158 + 1159 + if (lkb->lkb_exflags & DLM_LKF_EXPEDITE) 1160 + return TRUE; 1161 + 1162 + /* 1163 + * A shortcut. Without this, !queue_conflict(grantqueue, lkb) would be 1164 + * added to the remaining conditions. 1165 + */ 1166 + 1167 + if (queue_conflict(&r->res_grantqueue, lkb)) 1168 + goto out; 1169 + 1170 + /* 1171 + * 6-3: By default, a conversion request is immediately granted if the 1172 + * requested mode is compatible with the modes of all other granted 1173 + * locks 1174 + */ 1175 + 1176 + if (queue_conflict(&r->res_convertqueue, lkb)) 1177 + goto out; 1178 + 1179 + /* 1180 + * 6-5: But the default algorithm for deciding whether to grant or 1181 + * queue conversion requests does not by itself guarantee that such 1182 + * requests are serviced on a "first come first serve" basis. This, in 1183 + * turn, can lead to a phenomenon known as "indefinate postponement". 1184 + * 1185 + * 6-7: This issue is dealt with by using the optional QUECVT flag with 1186 + * the system service employed to request a lock conversion. This flag 1187 + * forces certain conversion requests to be queued, even if they are 1188 + * compatible with the granted modes of other locks on the same 1189 + * resource. 
Thus, the use of this flag results in conversion requests 1190 + * being ordered on a "first come first servce" basis. 1191 + * 1192 + * DCT: This condition is all about new conversions being able to occur 1193 + * "in place" while the lock remains on the granted queue (assuming 1194 + * nothing else conflicts.) IOW if QUECVT isn't set, a conversion 1195 + * doesn't _have_ to go onto the convert queue where it's processed in 1196 + * order. The "now" variable is necessary to distinguish converts 1197 + * being received and processed for the first time now, because once a 1198 + * convert is moved to the conversion queue the condition below applies 1199 + * requiring fifo granting. 1200 + */ 1201 + 1202 + if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT)) 1203 + return TRUE; 1204 + 1205 + /* 1206 + * When using range locks the NOORDER flag is set to avoid the standard 1207 + * vms rules on grant order. 1208 + */ 1209 + 1210 + if (lkb->lkb_exflags & DLM_LKF_NOORDER) 1211 + return TRUE; 1212 + 1213 + /* 1214 + * 6-3: Once in that queue [CONVERTING], a conversion request cannot be 1215 + * granted until all other conversion requests ahead of it are granted 1216 + * and/or canceled. 1217 + */ 1218 + 1219 + if (!now && conv && first_in_list(lkb, &r->res_convertqueue)) 1220 + return TRUE; 1221 + 1222 + /* 1223 + * 6-4: By default, a new request is immediately granted only if all 1224 + * three of the following conditions are satisfied when the request is 1225 + * issued: 1226 + * - The queue of ungranted conversion requests for the resource is 1227 + * empty. 1228 + * - The queue of ungranted new requests for the resource is empty. 1229 + * - The mode of the new request is compatible with the most 1230 + * restrictive mode of all granted locks on the resource. 
1231 + */ 1232 + 1233 + if (now && !conv && list_empty(&r->res_convertqueue) && 1234 + list_empty(&r->res_waitqueue)) 1235 + return TRUE; 1236 + 1237 + /* 1238 + * 6-4: Once a lock request is in the queue of ungranted new requests, 1239 + * it cannot be granted until the queue of ungranted conversion 1240 + * requests is empty, all ungranted new requests ahead of it are 1241 + * granted and/or canceled, and it is compatible with the granted mode 1242 + * of the most restrictive lock granted on the resource. 1243 + */ 1244 + 1245 + if (!now && !conv && list_empty(&r->res_convertqueue) && 1246 + first_in_list(lkb, &r->res_waitqueue)) 1247 + return TRUE; 1248 + 1249 + out: 1250 + /* 1251 + * The following, enabled by CONVDEADLK, departs from VMS. 1252 + */ 1253 + 1254 + if (conv && (lkb->lkb_exflags & DLM_LKF_CONVDEADLK) && 1255 + conversion_deadlock_detect(r, lkb)) { 1256 + lkb->lkb_grmode = DLM_LOCK_NL; 1257 + lkb->lkb_sbflags |= DLM_SBF_DEMOTED; 1258 + } 1259 + 1260 + return FALSE; 1261 + } 1262 + 1263 + /* 1264 + * The ALTPR and ALTCW flags aren't traditional lock manager flags, but are a 1265 + * simple way to provide a big optimization to applications that can use them. 
1266 + */ 1267 + 1268 + static int can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now) 1269 + { 1270 + uint32_t flags = lkb->lkb_exflags; 1271 + int rv; 1272 + int8_t alt = 0, rqmode = lkb->lkb_rqmode; 1273 + 1274 + rv = _can_be_granted(r, lkb, now); 1275 + if (rv) 1276 + goto out; 1277 + 1278 + if (lkb->lkb_sbflags & DLM_SBF_DEMOTED) 1279 + goto out; 1280 + 1281 + if (rqmode != DLM_LOCK_PR && flags & DLM_LKF_ALTPR) 1282 + alt = DLM_LOCK_PR; 1283 + else if (rqmode != DLM_LOCK_CW && flags & DLM_LKF_ALTCW) 1284 + alt = DLM_LOCK_CW; 1285 + 1286 + if (alt) { 1287 + lkb->lkb_rqmode = alt; 1288 + rv = _can_be_granted(r, lkb, now); 1289 + if (rv) 1290 + lkb->lkb_sbflags |= DLM_SBF_ALTMODE; 1291 + else 1292 + lkb->lkb_rqmode = rqmode; 1293 + } 1294 + out: 1295 + return rv; 1296 + } 1297 + 1298 + static int grant_pending_convert(struct dlm_rsb *r, int high) 1299 + { 1300 + struct dlm_lkb *lkb, *s; 1301 + int hi, demoted, quit, grant_restart, demote_restart; 1302 + 1303 + quit = 0; 1304 + restart: 1305 + grant_restart = 0; 1306 + demote_restart = 0; 1307 + hi = DLM_LOCK_IV; 1308 + 1309 + list_for_each_entry_safe(lkb, s, &r->res_convertqueue, lkb_statequeue) { 1310 + demoted = is_demoted(lkb); 1311 + if (can_be_granted(r, lkb, FALSE)) { 1312 + grant_lock_pending(r, lkb); 1313 + grant_restart = 1; 1314 + } else { 1315 + hi = max_t(int, lkb->lkb_rqmode, hi); 1316 + if (!demoted && is_demoted(lkb)) 1317 + demote_restart = 1; 1318 + } 1319 + } 1320 + 1321 + if (grant_restart) 1322 + goto restart; 1323 + if (demote_restart && !quit) { 1324 + quit = 1; 1325 + goto restart; 1326 + } 1327 + 1328 + return max_t(int, high, hi); 1329 + } 1330 + 1331 + static int grant_pending_wait(struct dlm_rsb *r, int high) 1332 + { 1333 + struct dlm_lkb *lkb, *s; 1334 + 1335 + list_for_each_entry_safe(lkb, s, &r->res_waitqueue, lkb_statequeue) { 1336 + if (can_be_granted(r, lkb, FALSE)) 1337 + grant_lock_pending(r, lkb); 1338 + else 1339 + high = max_t(int, lkb->lkb_rqmode, high); 1340 + 
} 1341 + 1342 + return high; 1343 + } 1344 + 1345 + static void grant_pending_locks(struct dlm_rsb *r) 1346 + { 1347 + struct dlm_lkb *lkb, *s; 1348 + int high = DLM_LOCK_IV; 1349 + 1350 + DLM_ASSERT(is_master(r), dlm_print_rsb(r);); 1351 + 1352 + high = grant_pending_convert(r, high); 1353 + high = grant_pending_wait(r, high); 1354 + 1355 + if (high == DLM_LOCK_IV) 1356 + return; 1357 + 1358 + /* 1359 + * If there are locks left on the wait/convert queue then send blocking 1360 + * ASTs to granted locks based on the largest requested mode (high) 1361 + * found above. This can generate spurious blocking ASTs for range 1362 + * locks. FIXME: highbast < high comparison not valid for PR/CW. 1363 + */ 1364 + 1365 + list_for_each_entry_safe(lkb, s, &r->res_grantqueue, lkb_statequeue) { 1366 + if (lkb->lkb_bastaddr && (lkb->lkb_highbast < high) && 1367 + !__dlm_compat_matrix[lkb->lkb_grmode+1][high+1]) { 1368 + queue_bast(r, lkb, high); 1369 + lkb->lkb_highbast = high; 1370 + } 1371 + } 1372 + } 1373 + 1374 + static void send_bast_queue(struct dlm_rsb *r, struct list_head *head, 1375 + struct dlm_lkb *lkb) 1376 + { 1377 + struct dlm_lkb *gr; 1378 + 1379 + list_for_each_entry(gr, head, lkb_statequeue) { 1380 + if (gr->lkb_bastaddr && 1381 + gr->lkb_highbast < lkb->lkb_rqmode && 1382 + ranges_overlap(lkb, gr) && !modes_compat(gr, lkb)) { 1383 + queue_bast(r, gr, lkb->lkb_rqmode); 1384 + gr->lkb_highbast = lkb->lkb_rqmode; 1385 + } 1386 + } 1387 + } 1388 + 1389 + static void send_blocking_asts(struct dlm_rsb *r, struct dlm_lkb *lkb) 1390 + { 1391 + send_bast_queue(r, &r->res_grantqueue, lkb); 1392 + } 1393 + 1394 + static void send_blocking_asts_all(struct dlm_rsb *r, struct dlm_lkb *lkb) 1395 + { 1396 + send_bast_queue(r, &r->res_grantqueue, lkb); 1397 + send_bast_queue(r, &r->res_convertqueue, lkb); 1398 + } 1399 + 1400 + /* set_master(r, lkb) -- set the master nodeid of a resource 1401 + 1402 + The purpose of this function is to set the nodeid field in the given 1403 + 
lkb using the nodeid field in the given rsb. If the rsb's nodeid is 1404 + known, it can just be copied to the lkb and the function will return 1405 + 0. If the rsb's nodeid is _not_ known, it needs to be looked up 1406 + before it can be copied to the lkb. 1407 + 1408 + When the rsb nodeid is being looked up remotely, the initial lkb 1409 + causing the lookup is kept on the ls_waiters list waiting for the 1410 + lookup reply. Other lkb's waiting for the same rsb lookup are kept 1411 + on the rsb's res_lookup list until the master is verified. 1412 + 1413 + Return values: 1414 + 0: nodeid is set in rsb/lkb and the caller should go ahead and use it 1415 + 1: the rsb master is not available and the lkb has been placed on 1416 + a wait queue 1417 + */ 1418 + 1419 + static int set_master(struct dlm_rsb *r, struct dlm_lkb *lkb) 1420 + { 1421 + struct dlm_ls *ls = r->res_ls; 1422 + int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid(); 1423 + 1424 + if (rsb_flag(r, RSB_MASTER_UNCERTAIN)) { 1425 + rsb_clear_flag(r, RSB_MASTER_UNCERTAIN); 1426 + r->res_first_lkid = lkb->lkb_id; 1427 + lkb->lkb_nodeid = r->res_nodeid; 1428 + return 0; 1429 + } 1430 + 1431 + if (r->res_first_lkid && r->res_first_lkid != lkb->lkb_id) { 1432 + list_add_tail(&lkb->lkb_rsb_lookup, &r->res_lookup); 1433 + return 1; 1434 + } 1435 + 1436 + if (r->res_nodeid == 0) { 1437 + lkb->lkb_nodeid = 0; 1438 + return 0; 1439 + } 1440 + 1441 + if (r->res_nodeid > 0) { 1442 + lkb->lkb_nodeid = r->res_nodeid; 1443 + return 0; 1444 + } 1445 + 1446 + DLM_ASSERT(r->res_nodeid == -1, dlm_print_rsb(r);); 1447 + 1448 + dir_nodeid = dlm_dir_nodeid(r); 1449 + 1450 + if (dir_nodeid != our_nodeid) { 1451 + r->res_first_lkid = lkb->lkb_id; 1452 + send_lookup(r, lkb); 1453 + return 1; 1454 + } 1455 + 1456 + for (;;) { 1457 + /* It's possible for dlm_scand to remove an old rsb for 1458 + this same resource from the toss list, us to create 1459 + a new one, look up the master locally, and find it 1460 + already 
exists just before dlm_scand does the 1461 + dir_remove() on the previous rsb. */ 1462 + 1463 + error = dlm_dir_lookup(ls, our_nodeid, r->res_name, 1464 + r->res_length, &ret_nodeid); 1465 + if (!error) 1466 + break; 1467 + log_debug(ls, "dir_lookup error %d %s", error, r->res_name); 1468 + schedule(); 1469 + } 1470 + 1471 + if (ret_nodeid == our_nodeid) { 1472 + r->res_first_lkid = 0; 1473 + r->res_nodeid = 0; 1474 + lkb->lkb_nodeid = 0; 1475 + } else { 1476 + r->res_first_lkid = lkb->lkb_id; 1477 + r->res_nodeid = ret_nodeid; 1478 + lkb->lkb_nodeid = ret_nodeid; 1479 + } 1480 + return 0; 1481 + } 1482 + 1483 + static void process_lookup_list(struct dlm_rsb *r) 1484 + { 1485 + struct dlm_lkb *lkb, *safe; 1486 + 1487 + list_for_each_entry_safe(lkb, safe, &r->res_lookup, lkb_rsb_lookup) { 1488 + list_del(&lkb->lkb_rsb_lookup); 1489 + _request_lock(r, lkb); 1490 + schedule(); 1491 + } 1492 + } 1493 + 1494 + /* confirm_master -- confirm (or deny) an rsb's master nodeid */ 1495 + 1496 + static void confirm_master(struct dlm_rsb *r, int error) 1497 + { 1498 + struct dlm_lkb *lkb; 1499 + 1500 + if (!r->res_first_lkid) 1501 + return; 1502 + 1503 + switch (error) { 1504 + case 0: 1505 + case -EINPROGRESS: 1506 + r->res_first_lkid = 0; 1507 + process_lookup_list(r); 1508 + break; 1509 + 1510 + case -EAGAIN: 1511 + /* the remote master didn't queue our NOQUEUE request; 1512 + make a waiting lkb the first_lkid */ 1513 + 1514 + r->res_first_lkid = 0; 1515 + 1516 + if (!list_empty(&r->res_lookup)) { 1517 + lkb = list_entry(r->res_lookup.next, struct dlm_lkb, 1518 + lkb_rsb_lookup); 1519 + list_del(&lkb->lkb_rsb_lookup); 1520 + r->res_first_lkid = lkb->lkb_id; 1521 + _request_lock(r, lkb); 1522 + } else 1523 + r->res_nodeid = -1; 1524 + break; 1525 + 1526 + default: 1527 + log_error(r->res_ls, "confirm_master unknown error %d", error); 1528 + } 1529 + } 1530 + 1531 + static int set_lock_args(int mode, struct dlm_lksb *lksb, uint32_t flags, 1532 + int namelen, uint32_t 
parent_lkid, void *ast, 1533 + void *astarg, void *bast, struct dlm_range *range, 1534 + struct dlm_args *args) 1535 + { 1536 + int rv = -EINVAL; 1537 + 1538 + /* check for invalid arg usage */ 1539 + 1540 + if (mode < 0 || mode > DLM_LOCK_EX) 1541 + goto out; 1542 + 1543 + if (!(flags & DLM_LKF_CONVERT) && (namelen > DLM_RESNAME_MAXLEN)) 1544 + goto out; 1545 + 1546 + if (flags & DLM_LKF_CANCEL) 1547 + goto out; 1548 + 1549 + if (flags & DLM_LKF_QUECVT && !(flags & DLM_LKF_CONVERT)) 1550 + goto out; 1551 + 1552 + if (flags & DLM_LKF_CONVDEADLK && !(flags & DLM_LKF_CONVERT)) 1553 + goto out; 1554 + 1555 + if (flags & DLM_LKF_CONVDEADLK && flags & DLM_LKF_NOQUEUE) 1556 + goto out; 1557 + 1558 + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_CONVERT) 1559 + goto out; 1560 + 1561 + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_QUECVT) 1562 + goto out; 1563 + 1564 + if (flags & DLM_LKF_EXPEDITE && flags & DLM_LKF_NOQUEUE) 1565 + goto out; 1566 + 1567 + if (flags & DLM_LKF_EXPEDITE && mode != DLM_LOCK_NL) 1568 + goto out; 1569 + 1570 + if (!ast || !lksb) 1571 + goto out; 1572 + 1573 + if (flags & DLM_LKF_VALBLK && !lksb->sb_lvbptr) 1574 + goto out; 1575 + 1576 + /* parent/child locks not yet supported */ 1577 + if (parent_lkid) 1578 + goto out; 1579 + 1580 + if (flags & DLM_LKF_CONVERT && !lksb->sb_lkid) 1581 + goto out; 1582 + 1583 + /* these args will be copied to the lkb in validate_lock_args, 1584 + it cannot be done now because when converting locks, fields in 1585 + an active lkb cannot be modified before locking the rsb */ 1586 + 1587 + args->flags = flags; 1588 + args->astaddr = ast; 1589 + args->astparam = (long) astarg; 1590 + args->bastaddr = bast; 1591 + args->mode = mode; 1592 + args->lksb = lksb; 1593 + args->range = range; 1594 + rv = 0; 1595 + out: 1596 + return rv; 1597 + } 1598 + 1599 + static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args) 1600 + { 1601 + if (flags & ~(DLM_LKF_CANCEL | DLM_LKF_VALBLK | DLM_LKF_IVVALBLK | 
1602 + DLM_LKF_FORCEUNLOCK)) 1603 + return -EINVAL; 1604 + 1605 + args->flags = flags; 1606 + args->astparam = (long) astarg; 1607 + return 0; 1608 + } 1609 + 1610 + static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, 1611 + struct dlm_args *args) 1612 + { 1613 + int rv = -EINVAL; 1614 + 1615 + if (args->flags & DLM_LKF_CONVERT) { 1616 + if (lkb->lkb_flags & DLM_IFL_MSTCPY) 1617 + goto out; 1618 + 1619 + if (args->flags & DLM_LKF_QUECVT && 1620 + !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1]) 1621 + goto out; 1622 + 1623 + rv = -EBUSY; 1624 + if (lkb->lkb_status != DLM_LKSTS_GRANTED) 1625 + goto out; 1626 + 1627 + if (lkb->lkb_wait_type) 1628 + goto out; 1629 + } 1630 + 1631 + lkb->lkb_exflags = args->flags; 1632 + lkb->lkb_sbflags = 0; 1633 + lkb->lkb_astaddr = args->astaddr; 1634 + lkb->lkb_astparam = args->astparam; 1635 + lkb->lkb_bastaddr = args->bastaddr; 1636 + lkb->lkb_rqmode = args->mode; 1637 + lkb->lkb_lksb = args->lksb; 1638 + lkb->lkb_lvbptr = args->lksb->sb_lvbptr; 1639 + lkb->lkb_ownpid = (int) current->pid; 1640 + 1641 + rv = 0; 1642 + if (!args->range) 1643 + goto out; 1644 + 1645 + if (!lkb->lkb_range) { 1646 + rv = -ENOMEM; 1647 + lkb->lkb_range = allocate_range(ls); 1648 + if (!lkb->lkb_range) 1649 + goto out; 1650 + /* This is needed for conversions that contain ranges 1651 + where the original lock didn't but it's harmless for 1652 + new locks too. 
*/ 1653 + lkb->lkb_range[GR_RANGE_START] = 0LL; 1654 + lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL; 1655 + } 1656 + 1657 + lkb->lkb_range[RQ_RANGE_START] = args->range->ra_start; 1658 + lkb->lkb_range[RQ_RANGE_END] = args->range->ra_end; 1659 + lkb->lkb_flags |= DLM_IFL_RANGE; 1660 + rv = 0; 1661 + out: 1662 + return rv; 1663 + } 1664 + 1665 + static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args) 1666 + { 1667 + int rv = -EINVAL; 1668 + 1669 + if (lkb->lkb_flags & DLM_IFL_MSTCPY) 1670 + goto out; 1671 + 1672 + if (args->flags & DLM_LKF_FORCEUNLOCK) 1673 + goto out_ok; 1674 + 1675 + if (args->flags & DLM_LKF_CANCEL && 1676 + lkb->lkb_status == DLM_LKSTS_GRANTED) 1677 + goto out; 1678 + 1679 + if (!(args->flags & DLM_LKF_CANCEL) && 1680 + lkb->lkb_status != DLM_LKSTS_GRANTED) 1681 + goto out; 1682 + 1683 + rv = -EBUSY; 1684 + if (lkb->lkb_wait_type) 1685 + goto out; 1686 + 1687 + out_ok: 1688 + lkb->lkb_exflags = args->flags; 1689 + lkb->lkb_sbflags = 0; 1690 + lkb->lkb_astparam = args->astparam; 1691 + 1692 + rv = 0; 1693 + out: 1694 + return rv; 1695 + } 1696 + 1697 + /* 1698 + * Four stage 4 varieties: 1699 + * do_request(), do_convert(), do_unlock(), do_cancel() 1700 + * These are called on the master node for the given lock and 1701 + * from the central locking logic. 
1702 + */ 1703 + 1704 + static int do_request(struct dlm_rsb *r, struct dlm_lkb *lkb) 1705 + { 1706 + int error = 0; 1707 + 1708 + if (can_be_granted(r, lkb, TRUE)) { 1709 + grant_lock(r, lkb); 1710 + queue_cast(r, lkb, 0); 1711 + goto out; 1712 + } 1713 + 1714 + if (can_be_queued(lkb)) { 1715 + error = -EINPROGRESS; 1716 + add_lkb(r, lkb, DLM_LKSTS_WAITING); 1717 + send_blocking_asts(r, lkb); 1718 + goto out; 1719 + } 1720 + 1721 + error = -EAGAIN; 1722 + if (force_blocking_asts(lkb)) 1723 + send_blocking_asts_all(r, lkb); 1724 + queue_cast(r, lkb, -EAGAIN); 1725 + 1726 + out: 1727 + return error; 1728 + } 1729 + 1730 + static int do_convert(struct dlm_rsb *r, struct dlm_lkb *lkb) 1731 + { 1732 + int error = 0; 1733 + 1734 + /* changing an existing lock may allow others to be granted */ 1735 + 1736 + if (can_be_granted(r, lkb, TRUE)) { 1737 + grant_lock(r, lkb); 1738 + queue_cast(r, lkb, 0); 1739 + grant_pending_locks(r); 1740 + goto out; 1741 + } 1742 + 1743 + if (can_be_queued(lkb)) { 1744 + if (is_demoted(lkb)) 1745 + grant_pending_locks(r); 1746 + error = -EINPROGRESS; 1747 + del_lkb(r, lkb); 1748 + add_lkb(r, lkb, DLM_LKSTS_CONVERT); 1749 + send_blocking_asts(r, lkb); 1750 + goto out; 1751 + } 1752 + 1753 + error = -EAGAIN; 1754 + if (force_blocking_asts(lkb)) 1755 + send_blocking_asts_all(r, lkb); 1756 + queue_cast(r, lkb, -EAGAIN); 1757 + 1758 + out: 1759 + return error; 1760 + } 1761 + 1762 + static int do_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb) 1763 + { 1764 + remove_lock(r, lkb); 1765 + queue_cast(r, lkb, -DLM_EUNLOCK); 1766 + grant_pending_locks(r); 1767 + return -DLM_EUNLOCK; 1768 + } 1769 + 1770 + static int do_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb) 1771 + { 1772 + revert_lock(r, lkb); 1773 + queue_cast(r, lkb, -DLM_ECANCEL); 1774 + grant_pending_locks(r); 1775 + return -DLM_ECANCEL; 1776 + } 1777 + 1778 + /* 1779 + * Four stage 3 varieties: 1780 + * _request_lock(), _convert_lock(), _unlock_lock(), _cancel_lock() 1781 + */ 1782 + 1783 
+ /* add a new lkb to a possibly new rsb, called by requesting process */ 1784 + 1785 + static int _request_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) 1786 + { 1787 + int error; 1788 + 1789 + /* set_master: sets lkb nodeid from r */ 1790 + 1791 + error = set_master(r, lkb); 1792 + if (error < 0) 1793 + goto out; 1794 + if (error) { 1795 + error = 0; 1796 + goto out; 1797 + } 1798 + 1799 + if (is_remote(r)) 1800 + /* receive_request() calls do_request() on remote node */ 1801 + error = send_request(r, lkb); 1802 + else 1803 + error = do_request(r, lkb); 1804 + out: 1805 + return error; 1806 + } 1807 + 1808 + /* change some property of an existing lkb, e.g. mode, range */ 1809 + 1810 + static int _convert_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) 1811 + { 1812 + int error; 1813 + 1814 + if (is_remote(r)) 1815 + /* receive_convert() calls do_convert() on remote node */ 1816 + error = send_convert(r, lkb); 1817 + else 1818 + error = do_convert(r, lkb); 1819 + 1820 + return error; 1821 + } 1822 + 1823 + /* remove an existing lkb from the granted queue */ 1824 + 1825 + static int _unlock_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) 1826 + { 1827 + int error; 1828 + 1829 + if (is_remote(r)) 1830 + /* receive_unlock() calls do_unlock() on remote node */ 1831 + error = send_unlock(r, lkb); 1832 + else 1833 + error = do_unlock(r, lkb); 1834 + 1835 + return error; 1836 + } 1837 + 1838 + /* remove an existing lkb from the convert or wait queue */ 1839 + 1840 + static int _cancel_lock(struct dlm_rsb *r, struct dlm_lkb *lkb) 1841 + { 1842 + int error; 1843 + 1844 + if (is_remote(r)) 1845 + /* receive_cancel() calls do_cancel() on remote node */ 1846 + error = send_cancel(r, lkb); 1847 + else 1848 + error = do_cancel(r, lkb); 1849 + 1850 + return error; 1851 + } 1852 + 1853 + /* 1854 + * Four stage 2 varieties: 1855 + * request_lock(), convert_lock(), unlock_lock(), cancel_lock() 1856 + */ 1857 + 1858 + static int request_lock(struct dlm_ls *ls, struct dlm_lkb *lkb, char 
*name,
		       int len, struct dlm_args *args)
{
	struct dlm_rsb *r;
	int error;

	error = validate_lock_args(ls, lkb, args);
	if (error)
		goto out;

	/* R_CREATE: create the rsb if this node doesn't know it yet */
	error = find_rsb(ls, name, len, R_CREATE, &r);
	if (error)
		goto out;

	lock_rsb(r);

	attach_lkb(r, lkb);
	/* hand the lock id back to the caller through the lksb */
	lkb->lkb_lksb->sb_lkid = lkb->lkb_id;

	error = _request_lock(r, lkb);

	unlock_rsb(r);
	put_rsb(r);

 out:
	return error;
}

/* Stage 2 of a conversion: the rsb already hangs off the lkb, so just
   hold/lock it, validate the args and run the convert path. */

static int convert_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
			struct dlm_args *args)
{
	struct dlm_rsb *r;
	int error;

	r = lkb->lkb_resource;

	hold_rsb(r);
	lock_rsb(r);

	error = validate_lock_args(ls, lkb, args);
	if (error)
		goto out;

	error = _convert_lock(r, lkb);
 out:
	unlock_rsb(r);
	put_rsb(r);
	return error;
}

/* Stage 2 of an unlock: validate, then run _unlock_lock() under the
   rsb lock of the resource the lkb is attached to. */

static int unlock_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
		       struct dlm_args *args)
{
	struct dlm_rsb *r;
	int error;

	r = lkb->lkb_resource;

	hold_rsb(r);
	lock_rsb(r);

	error = validate_unlock_args(lkb, args);
	if (error)
		goto out;

	error = _unlock_lock(r, lkb);
 out:
	unlock_rsb(r);
	put_rsb(r);
	return error;
}

/* Stage 2 of a cancel: same shape as unlock_lock() but runs the
   cancel path (note it shares validate_unlock_args()). */

static int cancel_lock(struct dlm_ls *ls, struct dlm_lkb *lkb,
		       struct dlm_args *args)
{
	struct dlm_rsb *r;
	int error;

	r = lkb->lkb_resource;

	hold_rsb(r);
	lock_rsb(r);

	error = validate_unlock_args(lkb, args);
	if (error)
		goto out;

	error = _cancel_lock(r, lkb);
 out:
	unlock_rsb(r);
	put_rsb(r);
	return error;
}

/*
 * Two stage 1 varieties: dlm_lock() and dlm_unlock()
 */

/*
 * dlm_lock - request a new lock or convert an existing one.
 *
 * DLM_LKF_CONVERT in flags selects the convert path, in which case the
 * lkb is looked up from lksb->sb_lkid; otherwise a fresh lkb is
 * created.  -EINPROGRESS (operation queued) and -EAGAIN are both
 * reported to the caller as success (0); the final result is delivered
 * asynchronously.  Recovery is held off for the duration via
 * lock_recovery().
 */

int dlm_lock(dlm_lockspace_t *lockspace,
	     int mode,
	     struct dlm_lksb *lksb,
	     uint32_t flags,
	     void *name,
	     unsigned int namelen,
	     uint32_t parent_lkid,
	     void (*ast) (void *astarg),
	     void *astarg,
	     void (*bast) (void *astarg, int mode),
	     struct dlm_range *range)
{
	struct dlm_ls *ls;
	struct dlm_lkb *lkb;
	struct dlm_args args;
	int error, convert = flags & DLM_LKF_CONVERT;

	ls = dlm_find_lockspace_local(lockspace);
	if (!ls)
		return -EINVAL;

	lock_recovery(ls);

	if (convert)
		error = find_lkb(ls, lksb->sb_lkid, &lkb);
	else
		error = create_lkb(ls, &lkb);

	if (error)
		goto out;

	error = set_lock_args(mode, lksb, flags, namelen, parent_lkid, ast,
			      astarg, bast, range, &args);
	if (error)
		goto out_put;

	if (convert)
		error = convert_lock(ls, lkb, &args);
	else
		error = request_lock(ls, lkb, name, namelen, &args);

	if (error == -EINPROGRESS)
		error = 0;
 out_put:
	/* drop the ref taken by find_lkb() (convert), or undo
	   create_lkb() on a failed new request */
	if (convert || error)
		put_lkb(lkb);
	if (error == -EAGAIN)
		error = 0;
 out:
	unlock_recovery(ls);
	dlm_put_lockspace(ls);
	return error;
}

/*
 * dlm_unlock - release or cancel a lock identified by lkid.
 *
 * DLM_LKF_CANCEL selects the cancel path.  The "successful" internal
 * results -DLM_EUNLOCK / -DLM_ECANCEL are mapped to 0 for the caller.
 */

int dlm_unlock(dlm_lockspace_t *lockspace,
	       uint32_t lkid,
	       uint32_t flags,
	       struct dlm_lksb *lksb,
	       void *astarg)
{
	struct dlm_ls *ls;
	struct dlm_lkb *lkb;
	struct dlm_args args;
	int error;

	ls = dlm_find_lockspace_local(lockspace);
	if (!ls)
		return -EINVAL;

	lock_recovery(ls);

	error = find_lkb(ls, lkid, &lkb);
	if (error)
		goto out;

	error = set_unlock_args(flags, astarg, &args);
	if (error)
		goto out_put;

	if (flags & DLM_LKF_CANCEL)
		error = cancel_lock(ls, lkb, &args);
	else
		error = unlock_lock(ls, lkb, &args);

	if (error == -DLM_EUNLOCK || error == -DLM_ECANCEL)
		error = 0;
 out_put:
	put_lkb(lkb);
 out:
	unlock_recovery(ls);
	dlm_put_lockspace(ls);
	return error;
}

/*
 * send/receive routines for remote operations and replies
 *
 * send_args
 * send_common
 * send_request			receive_request
 * send_convert			receive_convert
 * send_unlock			receive_unlock
 * send_cancel			receive_cancel
 * send_grant			receive_grant
 * send_bast			receive_bast
 * send_lookup			receive_lookup
 * send_remove			receive_remove
 *
 * send_common_reply
 * receive_request_reply	send_request_reply
 * receive_convert_reply	send_convert_reply
 * receive_unlock_reply		send_unlock_reply
 * receive_cancel_reply		send_cancel_reply
 * receive_lookup_reply		send_lookup_reply
 */

/* Allocate and initialize an outgoing message header.  The trailing
   m_extra space is sized per message type: resource name for
   REQUEST/LOOKUP/REMOVE, the lockspace lvb length for the types that
   may carry an LVB. */

static int create_message(struct dlm_rsb *r, struct dlm_lkb *lkb,
			  int to_nodeid, int mstype,
			  struct dlm_message **ms_ret,
			  struct dlm_mhandle **mh_ret)
{
	struct dlm_message *ms;
	struct dlm_mhandle *mh;
	char *mb;
	int mb_len = sizeof(struct dlm_message);

	switch (mstype) {
	case DLM_MSG_REQUEST:
	case DLM_MSG_LOOKUP:
	case DLM_MSG_REMOVE:
		mb_len += r->res_length;
		break;
	case DLM_MSG_CONVERT:
	case DLM_MSG_UNLOCK:
	case DLM_MSG_REQUEST_REPLY:
	case DLM_MSG_CONVERT_REPLY:
	case DLM_MSG_GRANT:
		if (lkb && lkb->lkb_lvbptr)
			mb_len += r->res_ls->ls_lvblen;
		break;
	}

	/* get_buffer gives us a message handle (mh) that we need to
	   pass into lowcomms_commit and a message buffer (mb) that we
	   write our data into */

	mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb);
	if (!mh)
		return -ENOBUFS;

	memset(mb, 0, mb_len);

	ms = (struct dlm_message *) mb;

	ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
	ms->m_header.h_lockspace = r->res_ls->ls_global_id;
	ms->m_header.h_nodeid = dlm_our_nodeid();
	ms->m_header.h_length = mb_len;
	ms->m_header.h_cmd = DLM_MSG;

	ms->m_type = mstype;

	*mh_ret = mh;
	*ms_ret = ms;
	return 0;
}

/* further lowcomms enhancements or alternate implementations may make
   the return value from this function useful at some point */

static int send_message(struct dlm_mhandle *mh, struct dlm_message *ms)
{
	/* byte-swap to wire format, then hand off to lowcomms */
	dlm_message_out(ms);
	dlm_lowcomms_commit_buffer(mh);
	return 0;
}

/* Copy everything the remote end might need about this lkb into the
   outgoing message; the receive side picks out only the fields that
   are meaningful for the particular message type. */

static void send_args(struct dlm_rsb *r, struct dlm_lkb *lkb,
		      struct dlm_message *ms)
{
	ms->m_nodeid   = lkb->lkb_nodeid;
	ms->m_pid      = lkb->lkb_ownpid;
	ms->m_lkid     = lkb->lkb_id;
	ms->m_remid    = lkb->lkb_remid;
	ms->m_exflags  = lkb->lkb_exflags;
	ms->m_sbflags  = lkb->lkb_sbflags;
	ms->m_flags    = lkb->lkb_flags;
	ms->m_lvbseq   = lkb->lkb_lvbseq;
	ms->m_status   = lkb->lkb_status;
	ms->m_grmode   = lkb->lkb_grmode;
	ms->m_rqmode   = lkb->lkb_rqmode;
	ms->m_hash     = r->res_hash;

	/* m_result and m_bastmode are set from function args,
	   not from lkb fields */

	if (lkb->lkb_bastaddr)
		ms->m_asts |= AST_BAST;
	if (lkb->lkb_astaddr)
		ms->m_asts |= AST_COMP;

	if (lkb->lkb_range) {
		ms->m_range[0] = lkb->lkb_range[RQ_RANGE_START];
		ms->m_range[1] = lkb->lkb_range[RQ_RANGE_END];
	}

	if (ms->m_type == DLM_MSG_REQUEST || ms->m_type == DLM_MSG_LOOKUP)
		memcpy(ms->m_extra, r->res_name, r->res_length);

	else if (lkb->lkb_lvbptr)
		memcpy(ms->m_extra, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);

}

/* Common path for request/convert/unlock/cancel messages to the
   master.  The lkb is put on the waiters list *before* sending so the
   reply can always find it; on any failure it is taken off again. */

static int send_common(struct dlm_rsb *r, struct dlm_lkb *lkb, int mstype)
{
	struct dlm_message *ms;
	struct dlm_mhandle *mh;
	int to_nodeid, error;

	add_to_waiters(lkb, mstype);

	to_nodeid = r->res_nodeid;

	error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
	if (error)
		goto fail;

	send_args(r, lkb, ms);

	error = send_message(mh, ms);
	if (error)
		goto fail;
	return 0;

 fail:
	remove_from_waiters(lkb);
	return error;
}

static int send_request(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	return send_common(r, lkb, DLM_MSG_REQUEST);
}

static int send_convert(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	int error;

	error = send_common(r, lkb, DLM_MSG_CONVERT);

	/* down conversions go without a reply from the master */
	if (!error && down_conversion(lkb)) {
		/* fake the reply locally with the lockspace stub message */
		remove_from_waiters(lkb);
		r->res_ls->ls_stub_ms.m_result = 0;
		__receive_convert_reply(r, lkb, &r->res_ls->ls_stub_ms);
	}

	return error;
}

/* FIXME: if this lkb is the only lock we hold on the rsb, then set
   MASTER_UNCERTAIN to force the next request on the rsb to confirm
   that the master is still correct. */

static int send_unlock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	return send_common(r, lkb, DLM_MSG_UNLOCK);
}

static int send_cancel(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	return send_common(r, lkb, DLM_MSG_CANCEL);
}

/* Master -> holder: asynchronous grant notification (no reply
   expected, so no waiters bookkeeping). */

static int send_grant(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	struct dlm_message *ms;
	struct dlm_mhandle *mh;
	int to_nodeid, error;

	to_nodeid = lkb->lkb_nodeid;

	error = create_message(r, lkb, to_nodeid, DLM_MSG_GRANT, &ms, &mh);
	if (error)
		goto out;

	send_args(r, lkb, ms);

	ms->m_result = 0;

	error = send_message(mh, ms);
 out:
	return error;
}

/* Master -> holder: asynchronous blocking-ast notification for the
   given mode (again, no reply expected). */

static int send_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int mode)
{
	struct dlm_message *ms;
	struct dlm_mhandle *mh;
	int to_nodeid, error;

	to_nodeid = lkb->lkb_nodeid;

	error = create_message(r, NULL, to_nodeid, DLM_MSG_BAST, &ms, &mh);
	if (error)
		goto out;

	send_args(r, lkb, ms);

	ms->m_bastmode = mode;

	error = send_message(mh, ms);
 out:
	return error;
}

/* Ask the directory node who masters this resource; the lkb waits on
   the waiters list for the lookup reply. */

static int send_lookup(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	struct dlm_message *ms;
	struct dlm_mhandle *mh;
	int to_nodeid, error;

	add_to_waiters(lkb, DLM_MSG_LOOKUP);

	to_nodeid = dlm_dir_nodeid(r);

	error = create_message(r, NULL, to_nodeid, DLM_MSG_LOOKUP, &ms, &mh);
	if (error)
		goto fail;

	send_args(r, lkb, ms);

	error = send_message(mh, ms);
	if (error)
		goto fail;
	return 0;

 fail:
	remove_from_waiters(lkb);
	return error;
}

/* Tell the directory node to drop its entry for this resource
   (fire-and-forget: remove has no reply). */

static int send_remove(struct dlm_rsb *r)
{
	struct dlm_message *ms;
	struct dlm_mhandle *mh;
	int to_nodeid, error;

	to_nodeid = dlm_dir_nodeid(r);

	error = create_message(r, NULL, to_nodeid, DLM_MSG_REMOVE, &ms, &mh);
	if (error)
		goto out;

	memcpy(ms->m_extra, r->res_name, r->res_length);
	ms->m_hash = r->res_hash;

	error = send_message(mh, ms);
 out:
	return error;
}

/* Common path for the four lock-operation replies sent by the master
   back to the requesting node; rv carries the do_xxx() result. */

static int send_common_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
			     int mstype, int rv)
{
	struct dlm_message *ms;
	struct dlm_mhandle *mh;
	int to_nodeid, error;

	to_nodeid = lkb->lkb_nodeid;

	error = create_message(r, lkb, to_nodeid, mstype, &ms, &mh);
	if (error)
		goto out;

	send_args(r, lkb, ms);

	ms->m_result = rv;

	error = send_message(mh, ms);
 out:
	return error;
}

static int send_request_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
{
	return send_common_reply(r, lkb, DLM_MSG_REQUEST_REPLY, rv);
}

static int send_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
{
	return send_common_reply(r, lkb, DLM_MSG_CONVERT_REPLY, rv);
}

static int send_unlock_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
{
	return send_common_reply(r, lkb, DLM_MSG_UNLOCK_REPLY, rv);
}

static int send_cancel_reply(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv)
{
	return send_common_reply(r, lkb, DLM_MSG_CANCEL_REPLY, rv);
}

/* Reply to a directory lookup: echo the requester's lkid so it can
   find its waiting lkb, and return the master nodeid.  Uses the
   stub rsb since no real resource is involved on this node. */

static int send_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms_in,
			     int ret_nodeid, int rv)
{
	struct dlm_rsb *r = &ls->ls_stub_rsb;
	struct dlm_message *ms;
	struct dlm_mhandle *mh;
	int error, nodeid = ms_in->m_header.h_nodeid;

	error = create_message(r, NULL, nodeid, DLM_MSG_LOOKUP_REPLY, &ms, &mh);
	if (error)
		goto out;

	ms->m_lkid = ms_in->m_lkid;
	ms->m_result = rv;
	ms->m_nodeid = ret_nodeid;

	error = send_message(mh, ms);
 out:
	return error;
}

/* which args we save from a received message depends heavily on the type
   of message, unlike the send side where we can safely send everything about
   the lkb for any type of message */

static void receive_flags(struct dlm_lkb *lkb, struct dlm_message *ms)
{
	lkb->lkb_exflags = ms->m_exflags;
	/* only the low 16 flag bits travel between nodes; the high
	   bits are node-local */
	lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
			 (ms->m_flags & 0x0000FFFF);
}

static void receive_flags_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
{
	lkb->lkb_sbflags = ms->m_sbflags;
	lkb->lkb_flags = (lkb->lkb_flags & 0xFFFF0000) |
			 (ms->m_flags & 0x0000FFFF);
}

/* Length of the variable data (name or lvb) trailing the fixed
   message struct. */

static int receive_extralen(struct dlm_message *ms)
{
	return (ms->m_header.h_length - sizeof(struct dlm_message));
}

/* Copy the request range out of the message, allocating the lkb's
   range array on first use. */

static int receive_range(struct dlm_ls *ls, struct dlm_lkb *lkb,
			 struct dlm_message *ms)
{
	if (lkb->lkb_flags & DLM_IFL_RANGE) {
		if (!lkb->lkb_range)
			lkb->lkb_range = allocate_range(ls);
		if (!lkb->lkb_range)
			return -ENOMEM;
		lkb->lkb_range[RQ_RANGE_START] = ms->m_range[0];
		lkb->lkb_range[RQ_RANGE_END] = ms->m_range[1];
	}
	return 0;
}

/* Copy the lock value block out of the message, allocating the lkb's
   lvb buffer on first use. */

static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb,
		       struct dlm_message *ms)
{
	int len;

	if (lkb->lkb_exflags & DLM_LKF_VALBLK) {
		if (!lkb->lkb_lvbptr)
			lkb->lkb_lvbptr = allocate_lvb(ls);
		if (!lkb->lkb_lvbptr)
			return -ENOMEM;
		len = receive_extralen(ms);
		memcpy(lkb->lkb_lvbptr, ms->m_extra, len);
	}
	return 0;
}

/* Fill in a freshly created master-copy lkb from a remote request. */

static int receive_request_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
				struct dlm_message *ms)
{
	lkb->lkb_nodeid = ms->m_header.h_nodeid;
	lkb->lkb_ownpid = ms->m_pid;
	lkb->lkb_remid = ms->m_lkid;
	lkb->lkb_grmode = DLM_LOCK_IV;
	lkb->lkb_rqmode = ms->m_rqmode;
	/* remember only whether the remote side has ast/bast callbacks;
	   the actual function pointers stay on the owning node */
	lkb->lkb_bastaddr = (void *) (long) (ms->m_asts & AST_BAST);
	lkb->lkb_astaddr = (void *) (long) (ms->m_asts & AST_COMP);

	DLM_ASSERT(is_master_copy(lkb), dlm_print_lkb(lkb););

	if (receive_range(ls, lkb, ms))
		return -ENOMEM;

	if (receive_lvb(ls, lkb, ms))
		return -ENOMEM;

	return 0;
}

/* Update an existing master-copy lkb from a remote convert; rejects
   messages from the wrong node or for a lock not currently granted. */

static int receive_convert_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
				struct dlm_message *ms)
{
	if (lkb->lkb_nodeid != ms->m_header.h_nodeid) {
		log_error(ls, "convert_args nodeid %d %d lkid %x %x",
			  lkb->lkb_nodeid, ms->m_header.h_nodeid,
			  lkb->lkb_id, lkb->lkb_remid);
		return -EINVAL;
	}

	if (!is_master_copy(lkb))
		return -EINVAL;

	if (lkb->lkb_status != DLM_LKSTS_GRANTED)
		return -EBUSY;

	if (receive_range(ls, lkb, ms))
		return -ENOMEM;
	if (lkb->lkb_range) {
		/* granted range resets to the full range on convert */
		lkb->lkb_range[GR_RANGE_START] = 0LL;
		lkb->lkb_range[GR_RANGE_END] = 0xffffffffffffffffULL;
	}

	if (receive_lvb(ls, lkb, ms))
		return -ENOMEM;

	lkb->lkb_rqmode = ms->m_rqmode;
	lkb->lkb_lvbseq = ms->m_lvbseq;

	return 0;
}

static int receive_unlock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
			       struct dlm_message *ms)
{
	if (!is_master_copy(lkb))
		return -EINVAL;
	if (receive_lvb(ls, lkb, ms))
		return -ENOMEM;
	return 0;
}

/* We fill in the stub-lkb fields with the info that send_xxxx_reply()
   uses to send a reply and that the remote end uses to process the reply. */

static void setup_stub_lkb(struct dlm_ls *ls, struct dlm_message *ms)
{
	struct dlm_lkb *lkb = &ls->ls_stub_lkb;
	lkb->lkb_nodeid = ms->m_header.h_nodeid;
	lkb->lkb_remid = ms->m_lkid;
}

/* Master side: handle an incoming lock request.  Creates a
   master-copy lkb, runs do_request() under the rsb lock and always
   sends a reply -- via the stub lkb if setup failed. */

static void receive_request(struct dlm_ls *ls, struct dlm_message *ms)
{
	struct dlm_lkb *lkb;
	struct dlm_rsb *r;
	int error, namelen;

	error = create_lkb(ls, &lkb);
	if (error)
		goto fail;

	receive_flags(lkb, ms);
	lkb->lkb_flags |= DLM_IFL_MSTCPY;
	error = receive_request_args(ls, lkb, ms);
	if (error) {
		put_lkb(lkb);
		goto fail;
	}

	namelen = receive_extralen(ms);

	error = find_rsb(ls, ms->m_extra, namelen, R_MASTER, &r);
	if (error) {
		put_lkb(lkb);
		goto fail;
	}

	lock_rsb(r);

	attach_lkb(r, lkb);
	error = do_request(r, lkb);
	send_request_reply(r, lkb, error);

	unlock_rsb(r);
	put_rsb(r);

	if (error == -EINPROGRESS)
		error = 0;
	/* on a hard error the lkb wasn't kept; drop the create ref */
	if (error)
		put_lkb(lkb);
	return;

 fail:
	setup_stub_lkb(ls, ms);
	send_request_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
}

/* Master side: handle an incoming convert.  Down-conversions are
   completed without a reply (reply = FALSE), matching send_convert()
   which fakes the reply locally. */

static void receive_convert(struct dlm_ls *ls, struct dlm_message *ms)
{
	struct dlm_lkb *lkb;
	struct dlm_rsb *r;
	int error, reply = TRUE;

	error = find_lkb(ls, ms->m_remid, &lkb);
	if (error)
		goto fail;

	r = lkb->lkb_resource;

	hold_rsb(r);
	lock_rsb(r);

	receive_flags(lkb, ms);
	error = receive_convert_args(ls, lkb, ms);
	if (error)
		goto out;
	reply = !down_conversion(lkb);

	error = do_convert(r, lkb);
 out:
	if (reply)
		send_convert_reply(r, lkb, error);

	unlock_rsb(r);
	put_rsb(r);
	put_lkb(lkb);
	return;

 fail:
	setup_stub_lkb(ls, ms);
	send_convert_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
}

/* Master side: handle an incoming unlock and always reply. */

static void receive_unlock(struct dlm_ls *ls, struct dlm_message *ms)
{
	struct dlm_lkb *lkb;
	struct dlm_rsb *r;
	int error;

	error = find_lkb(ls, ms->m_remid, &lkb);
	if (error)
		goto fail;

	r = lkb->lkb_resource;

	hold_rsb(r);
	lock_rsb(r);

	receive_flags(lkb, ms);
	error = receive_unlock_args(ls, lkb, ms);
	if (error)
		goto out;

	error = do_unlock(r, lkb);
 out:
	send_unlock_reply(r, lkb, error);

	unlock_rsb(r);
	put_rsb(r);
	put_lkb(lkb);
	return;

 fail:
	setup_stub_lkb(ls, ms);
	send_unlock_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
}

/* Master side: handle an incoming cancel and always reply. */

static void receive_cancel(struct dlm_ls *ls, struct dlm_message *ms)
{
	struct dlm_lkb *lkb;
	struct dlm_rsb *r;
	int error;

	error = find_lkb(ls, ms->m_remid, &lkb);
	if (error)
		goto fail;

	receive_flags(lkb, ms);

	r = lkb->lkb_resource;

	hold_rsb(r);
	lock_rsb(r);

	error = do_cancel(r, lkb);
	send_cancel_reply(r, lkb, error);

	unlock_rsb(r);
	put_rsb(r);
	put_lkb(lkb);
	return;

 fail:
	setup_stub_lkb(ls, ms);
	send_cancel_reply(&ls->ls_stub_rsb, &ls->ls_stub_lkb, error);
}

/* Process side: the master granted our lock asynchronously; update
   local state and queue the completion ast. */

static void receive_grant(struct dlm_ls *ls, struct dlm_message *ms)
{
	struct dlm_lkb *lkb;
	struct dlm_rsb *r;
	int error;

	error = find_lkb(ls, ms->m_remid, &lkb);
	if (error) {
		log_error(ls, "receive_grant no lkb");
		return;
	}
	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););

	r = lkb->lkb_resource;

	hold_rsb(r);
	lock_rsb(r);

	receive_flags_reply(lkb, ms);
	grant_lock_pc(r, lkb, ms);
	queue_cast(r, lkb, 0);

	unlock_rsb(r);
	put_rsb(r);
	put_lkb(lkb);
}

/* Process side: the master asks us to queue a blocking ast. */

static void receive_bast(struct dlm_ls *ls, struct dlm_message *ms)
{
	struct dlm_lkb *lkb;
	struct dlm_rsb *r;
	int error;

	error = find_lkb(ls, ms->m_remid, &lkb);
	if (error) {
		log_error(ls, "receive_bast no lkb");
		return;
	}
	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););

	r = lkb->lkb_resource;

	hold_rsb(r);
	lock_rsb(r);

	queue_bast(r, lkb, ms->m_bastmode);

	unlock_rsb(r);
	put_rsb(r);
	put_lkb(lkb);
}

/* Directory side: answer "who masters this resource?". */

static void receive_lookup(struct dlm_ls *ls, struct dlm_message *ms)
{
	int len, error, ret_nodeid, dir_nodeid, from_nodeid, our_nodeid;

	from_nodeid = ms->m_header.h_nodeid;
	our_nodeid = dlm_our_nodeid();

	len = receive_extralen(ms);

	dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
	if (dir_nodeid != our_nodeid) {
		log_error(ls, "lookup dir_nodeid %d from %d",
			  dir_nodeid, from_nodeid);
		error = -EINVAL;
		ret_nodeid = -1;
		goto out;
	}

	/* NOTE(review): if dlm_dir_lookup() fails, ret_nodeid is passed
	   to send_lookup_reply() below without being assigned here --
	   presumably dlm_dir_lookup() always sets it; confirm. */
	error = dlm_dir_lookup(ls, from_nodeid, ms->m_extra, len, &ret_nodeid);

	/* Optimization: we're master so treat lookup as a request */
	if (!error && ret_nodeid == our_nodeid) {
		receive_request(ls, ms);
		return;
	}
 out:
	send_lookup_reply(ls, ms, ret_nodeid, error);
}

/* Directory side: drop the directory entry for a resource (no reply). */

static void receive_remove(struct dlm_ls *ls, struct dlm_message *ms)
{
	int len, dir_nodeid, from_nodeid;

	from_nodeid = ms->m_header.h_nodeid;

	len = receive_extralen(ms);

	dir_nodeid = dlm_hash2nodeid(ls, ms->m_hash);
	if (dir_nodeid != dlm_our_nodeid()) {
		log_error(ls, "remove dir entry dir_nodeid %d from %d",
			  dir_nodeid, from_nodeid);
		return;
	}

	dlm_dir_remove_entry(ls, from_nodeid, ms->m_extra, len);
}

/* Process side: the master's reply to our request.  m_result carries
   the do_request() outcome; -ENOENT/-ENOTBLK restart the whole
   request locally because the presumed master wasn't one. */

static void receive_request_reply(struct dlm_ls *ls, struct dlm_message *ms)
{
	struct dlm_lkb *lkb;
	struct dlm_rsb *r;
	int error, mstype;

	error = find_lkb(ls, ms->m_remid, &lkb);
	if (error) {
		log_error(ls, "receive_request_reply no lkb");
		return;
	}
	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););

	mstype = lkb->lkb_wait_type;
	error = remove_from_waiters(lkb);
	if (error) {
		log_error(ls, "receive_request_reply not on waiters");
		goto out;
	}

	/* this is the value returned from do_request() on the master */
	error = ms->m_result;

	r = lkb->lkb_resource;
	hold_rsb(r);
	lock_rsb(r);

	/* Optimization: the dir node was also the master, so it took our
	   lookup as a request and sent request reply instead of lookup reply */
	if (mstype == DLM_MSG_LOOKUP) {
		r->res_nodeid = ms->m_header.h_nodeid;
		lkb->lkb_nodeid = r->res_nodeid;
	}

	switch (error) {
	case -EAGAIN:
		/* request would block (be queued) on remote master;
		   the unhold undoes the original ref from create_lkb()
		   so it leads to the lkb being freed */
		queue_cast(r, lkb, -EAGAIN);
		confirm_master(r, -EAGAIN);
		unhold_lkb(lkb);
		break;

	case -EINPROGRESS:
	case 0:
		/* request was queued or granted on remote master */
		receive_flags_reply(lkb, ms);
		lkb->lkb_remid = ms->m_lkid;
		if (error)
			add_lkb(r, lkb, DLM_LKSTS_WAITING);
		else {
			grant_lock_pc(r, lkb, ms);
			queue_cast(r, lkb, 0);
		}
		confirm_master(r, error);
		break;

	case -ENOENT:
	case -ENOTBLK:
		/* find_rsb failed to find rsb or rsb wasn't master */
		r->res_nodeid = -1;
		lkb->lkb_nodeid = -1;
		_request_lock(r, lkb);
		break;

	default:
		log_error(ls, "receive_request_reply error %d", error);
	}

	unlock_rsb(r);
	put_rsb(r);
 out:
	put_lkb(lkb);
}

/* Apply a convert reply to the lkb; caller holds the rsb lock.  Also
   used with a stub message for local down-conversion completion and
   during waiter recovery. */

static void __receive_convert_reply(struct dlm_rsb *r, struct dlm_lkb *lkb,
				    struct dlm_message *ms)
{
	int error = ms->m_result;

	/* this is the value returned from do_convert() on the master */

	switch (error) {
	case -EAGAIN:
		/* convert would block (be queued) on remote master */
		queue_cast(r, lkb, -EAGAIN);
		break;

	case -EINPROGRESS:
		/* convert was queued on remote master */
		del_lkb(r, lkb);
		add_lkb(r, lkb, DLM_LKSTS_CONVERT);
		break;

	case 0:
		/* convert was granted on remote master */
		receive_flags_reply(lkb, ms);
		grant_lock_pc(r, lkb, ms);
		queue_cast(r, lkb, 0);
		break;

	default:
		log_error(r->res_ls, "receive_convert_reply error %d", error);
	}
}

static void _receive_convert_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
{
	struct dlm_rsb *r = lkb->lkb_resource;

	hold_rsb(r);
	lock_rsb(r);

	__receive_convert_reply(r, lkb, ms);

	unlock_rsb(r);
	put_rsb(r);
}

static void receive_convert_reply(struct dlm_ls *ls, struct dlm_message *ms)
{
	struct dlm_lkb *lkb;
	int error;

	error = find_lkb(ls, ms->m_remid, &lkb);
	if (error) {
		log_error(ls, "receive_convert_reply no lkb");
		return;
	}
	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););

	error = remove_from_waiters(lkb);
	if (error) {
		log_error(ls, "receive_convert_reply not on waiters");
		goto out;
	}

	_receive_convert_reply(lkb, ms);
 out:
	put_lkb(lkb);
}

/* Apply an unlock reply; also invoked with a stub message during
   waiter recovery (dlm_recover_waiters_pre). */

static void _receive_unlock_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
{
	struct dlm_rsb *r = lkb->lkb_resource;
	int error = ms->m_result;

	hold_rsb(r);
	lock_rsb(r);

	/* this is the value returned from do_unlock() on the master */

	switch (error) {
	case -DLM_EUNLOCK:
		receive_flags_reply(lkb, ms);
		remove_lock_pc(r, lkb);
		queue_cast(r, lkb, -DLM_EUNLOCK);
		break;
	default:
		log_error(r->res_ls, "receive_unlock_reply error %d", error);
	}

	unlock_rsb(r);
	put_rsb(r);
}

static void receive_unlock_reply(struct dlm_ls *ls, struct dlm_message *ms)
{
	struct dlm_lkb *lkb;
	int error;

	error = find_lkb(ls, ms->m_remid, &lkb);
	if (error) {
		log_error(ls, "receive_unlock_reply no lkb");
		return;
	}
	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););

	error = remove_from_waiters(lkb);
	if (error) {
		log_error(ls, "receive_unlock_reply not on waiters");
		goto out;
	}

	_receive_unlock_reply(lkb, ms);
 out:
	put_lkb(lkb);
}

/* Apply a cancel reply; also invoked with a stub message during
   waiter recovery (dlm_recover_waiters_pre). */

static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
{
	struct dlm_rsb *r = lkb->lkb_resource;
	int error = ms->m_result;

	hold_rsb(r);
	lock_rsb(r);

	/* this is the value returned from do_cancel() on the master */

	switch (error) {
	case -DLM_ECANCEL:
		receive_flags_reply(lkb, ms);
		revert_lock_pc(r, lkb);
		queue_cast(r, lkb, -DLM_ECANCEL);
		break;
	default:
		log_error(r->res_ls, "receive_cancel_reply error %d", error);
	}

	unlock_rsb(r);
	put_rsb(r);
}

static void receive_cancel_reply(struct dlm_ls *ls, struct dlm_message *ms)
{
	struct dlm_lkb *lkb;
	int error;

	error = find_lkb(ls, ms->m_remid, &lkb);
	if (error) {
		log_error(ls, "receive_cancel_reply no lkb");
		return;
	}
	DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb););

	error = remove_from_waiters(lkb);
	if (error) {
		log_error(ls, "receive_cancel_reply not on waiters");
		goto out;
	}

	_receive_cancel_reply(lkb, ms);
 out:
	put_lkb(lkb);
}

/* The directory told us who the master is; record it on the rsb and
   restart the deferred request (and any others parked on the lookup
   list if we turned out to be the master ourselves). */

static void receive_lookup_reply(struct dlm_ls *ls, struct dlm_message *ms)
{
	struct dlm_lkb *lkb;
	struct dlm_rsb *r;
	int error, ret_nodeid;

	/* the reply echoes our own lkid in m_lkid (see send_lookup_reply) */
	error = find_lkb(ls, ms->m_lkid, &lkb);
	if (error) {
		log_error(ls, "receive_lookup_reply no lkb");
		return;
	}

	error = remove_from_waiters(lkb);
	if (error) {
		log_error(ls, "receive_lookup_reply not on waiters");
		goto out;
	}

	/* this is the value returned by dlm_dir_lookup on dir node
	   FIXME: will a non-zero error ever be returned? */
	error = ms->m_result;

	r = lkb->lkb_resource;
	hold_rsb(r);
	lock_rsb(r);

	ret_nodeid = ms->m_nodeid;
	if (ret_nodeid == dlm_our_nodeid()) {
		r->res_nodeid = 0;
		ret_nodeid = 0;
		r->res_first_lkid = 0;
	} else {
		/* set_master() will copy res_nodeid to lkb_nodeid */
		r->res_nodeid = ret_nodeid;
	}

	_request_lock(r, lkb);

	if (!ret_nodeid)
		process_lookup_list(r);

	unlock_rsb(r);
	put_rsb(r);
 out:
	put_lkb(lkb);
}

/*
 * dlm_receive_message - entry point for all incoming DLM messages.
 *
 * Dispatches by m_type after finding the lockspace and synchronizing
 * with recovery: messages arriving while locking is stopped are parked
 * on the requestqueue (unless we ARE the recovery-time caller).
 */

int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery)
{
	struct dlm_message *ms = (struct dlm_message *) hd;
	struct dlm_ls *ls;
	int error;

	if (!recovery)
		dlm_message_in(ms);

	ls = dlm_find_lockspace_global(hd->h_lockspace);
	if (!ls) {
		log_print("drop message %d from %d for unknown lockspace %d",
			  ms->m_type, nodeid, hd->h_lockspace);
		return -EINVAL;
	}

	/* recovery may have just ended leaving a bunch of backed-up requests
	   in the requestqueue; wait while dlm_recoverd clears them */

	if (!recovery)
		dlm_wait_requestqueue(ls);

	/* recovery may have just started while there were a bunch of
	   in-flight requests -- save them in requestqueue to be processed
	   after recovery.  we can't let dlm_recvd block on the recovery
	   lock.  if dlm_recoverd is calling this function to clear the
	   requestqueue, it needs to be interrupted (-EINTR) if another
	   recovery operation is starting. */

	while (1) {
		if (dlm_locking_stopped(ls)) {
			if (!recovery)
				dlm_add_requestqueue(ls, nodeid, hd);
			/* NOTE(review): error is set to -EINTR here but the
			   out: path below returns 0 unconditionally, so the
			   -EINTR never reaches the caller -- verify against
			   the dlm_recoverd contract described above. */
			error = -EINTR;
			goto out;
		}

		if (lock_recovery_try(ls))
			break;
		schedule();
	}

	switch (ms->m_type) {

	/* messages sent to a master node */

	case DLM_MSG_REQUEST:
		receive_request(ls, ms);
		break;

	case DLM_MSG_CONVERT:
		receive_convert(ls, ms);
		break;

	case DLM_MSG_UNLOCK:
		receive_unlock(ls, ms);
		break;

	case DLM_MSG_CANCEL:
		receive_cancel(ls, ms);
		break;

	/* messages sent from a master node (replies to above) */

	case DLM_MSG_REQUEST_REPLY:
		receive_request_reply(ls, ms);
		break;

	case DLM_MSG_CONVERT_REPLY:
		receive_convert_reply(ls, ms);
		break;

	case DLM_MSG_UNLOCK_REPLY:
		receive_unlock_reply(ls, ms);
		break;

	case DLM_MSG_CANCEL_REPLY:
		receive_cancel_reply(ls, ms);
		break;

	/* messages sent from a master node (only two types of async msg) */

	case DLM_MSG_GRANT:
		receive_grant(ls, ms);
		break;

	case DLM_MSG_BAST:
		receive_bast(ls, ms);
		break;

	/* messages sent to a dir node */

	case DLM_MSG_LOOKUP:
		receive_lookup(ls, ms);
		break;

	case DLM_MSG_REMOVE:
		receive_remove(ls, ms);
		break;

	/* messages sent from a dir node (remove has no reply) */

	case DLM_MSG_LOOKUP_REPLY:
		receive_lookup_reply(ls, ms);
		break;

	default:
		log_error(ls, "unknown message type %d", ms->m_type);
	}

	unlock_recovery(ls);
 out:
	dlm_put_lockspace(ls);
	dlm_astd_wake();
	return 0;
}


/*
 * Recovery related
 */

/* Handle a convert that was waiting for a reply from a now-dead
   master: middle conversions are faked as queued (-EINPROGRESS) via
   the stub message; up-conversions are flagged for resend. */

static void recover_convert_waiter(struct dlm_ls *ls, struct dlm_lkb *lkb)
{
	if (middle_conversion(lkb)) {
		hold_lkb(lkb);
		ls->ls_stub_ms.m_result = -EINPROGRESS;
		_remove_from_waiters(lkb);
		_receive_convert_reply(lkb, &ls->ls_stub_ms);

		/* Same special case as in receive_rcom_lock_args() */
		lkb->lkb_grmode = DLM_LOCK_IV;
		rsb_set_flag(lkb->lkb_resource, RSB_RECOVER_CONVERT);
		unhold_lkb(lkb);

	} else if (lkb->lkb_rqmode >= lkb->lkb_grmode) {
		lkb->lkb_flags |= DLM_IFL_RESEND;
	}

	/* lkb->lkb_rqmode < lkb->lkb_grmode shouldn't happen since down
	   conversions are async; there's no reply from the remote master */
}

/* A waiting lkb needs recovery if the master node has failed, or
   the master node is changing (only when no directory is used) */

static int waiter_needs_recovery(struct dlm_ls *ls, struct dlm_lkb *lkb)
{
	if (dlm_is_removed(ls, lkb->lkb_nodeid))
		return 1;

	if (!dlm_no_directory(ls))
		return 0;

	if (dlm_dir_nodeid(lkb->lkb_resource) != lkb->lkb_nodeid)
		return 1;

	return 0;
}

/* Recovery for locks that are waiting for replies from nodes that are now
   gone.  We can just complete unlocks and cancels by faking a reply from the
   dead node.  Requests and up-conversions we flag to be resent after
   recovery.  Down-conversions can just be completed with a fake reply like
   unlocks.  Conversions between PR and CW need special attention. */

void dlm_recover_waiters_pre(struct dlm_ls *ls)
{
	struct dlm_lkb *lkb, *safe;

	down(&ls->ls_waiters_sem);

	list_for_each_entry_safe(lkb, safe, &ls->ls_waiters, lkb_wait_reply) {
		log_debug(ls, "pre recover waiter lkid %x type %d flags %x",
			  lkb->lkb_id, lkb->lkb_wait_type, lkb->lkb_flags);

		/* all outstanding lookups, regardless of destination  will be
		   resent after recovery is done */

		if (lkb->lkb_wait_type == DLM_MSG_LOOKUP) {
			lkb->lkb_flags |= DLM_IFL_RESEND;
			continue;
		}

		if (!waiter_needs_recovery(ls, lkb))
			continue;

		switch (lkb->lkb_wait_type) {

		case DLM_MSG_REQUEST:
			lkb->lkb_flags |= DLM_IFL_RESEND;
			break;

		case DLM_MSG_CONVERT:
			recover_convert_waiter(ls, lkb);
			break;

		case DLM_MSG_UNLOCK:
			/* fake a -DLM_EUNLOCK reply from the dead master */
			hold_lkb(lkb);
			ls->ls_stub_ms.m_result = -DLM_EUNLOCK;
			_remove_from_waiters(lkb);
			_receive_unlock_reply(lkb, &ls->ls_stub_ms);
			put_lkb(lkb);
			break;

		case DLM_MSG_CANCEL:
			/* fake a -DLM_ECANCEL reply from the dead master */
			hold_lkb(lkb);
			ls->ls_stub_ms.m_result = -DLM_ECANCEL;
			_remove_from_waiters(lkb);
			_receive_cancel_reply(lkb, &ls->ls_stub_ms);
			put_lkb(lkb);
			break;

		default:
			log_error(ls, "invalid lkb wait_type %d",
				  lkb->lkb_wait_type);
		}
	}
	up(&ls->ls_waiters_sem);
}

/* Pull the next RESEND-flagged lkb off the waiters list; returns its
   wait type, or 0 when none remain. */

static int remove_resend_waiter(struct dlm_ls *ls, struct dlm_lkb **lkb_ret)
{
	struct dlm_lkb *lkb;
	int rv = 0;

	down(&ls->ls_waiters_sem);
	list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
		if (lkb->lkb_flags & DLM_IFL_RESEND) {
			rv = lkb->lkb_wait_type;
			_remove_from_waiters(lkb);
			lkb->lkb_flags &= ~DLM_IFL_RESEND;
			break;
		}
	}
	up(&ls->ls_waiters_sem);

	if (!rv)
3276 + lkb = NULL; 3277 + *lkb_ret = lkb; 3278 + return rv; 3279 + } 3280 + 3281 + /* Deal with lookups and lkb's marked RESEND from _pre. We may now be the 3282 + master or dir-node for r. Processing the lkb may result in it being placed 3283 + back on waiters. */ 3284 + 3285 + int dlm_recover_waiters_post(struct dlm_ls *ls) 3286 + { 3287 + struct dlm_lkb *lkb; 3288 + struct dlm_rsb *r; 3289 + int error = 0, mstype; 3290 + 3291 + while (1) { 3292 + if (dlm_locking_stopped(ls)) { 3293 + log_debug(ls, "recover_waiters_post aborted"); 3294 + error = -EINTR; 3295 + break; 3296 + } 3297 + 3298 + mstype = remove_resend_waiter(ls, &lkb); 3299 + if (!mstype) 3300 + break; 3301 + 3302 + r = lkb->lkb_resource; 3303 + 3304 + log_debug(ls, "recover_waiters_post %x type %d flags %x %s", 3305 + lkb->lkb_id, mstype, lkb->lkb_flags, r->res_name); 3306 + 3307 + switch (mstype) { 3308 + 3309 + case DLM_MSG_LOOKUP: 3310 + hold_rsb(r); 3311 + lock_rsb(r); 3312 + _request_lock(r, lkb); 3313 + if (is_master(r)) 3314 + confirm_master(r, 0); 3315 + unlock_rsb(r); 3316 + put_rsb(r); 3317 + break; 3318 + 3319 + case DLM_MSG_REQUEST: 3320 + hold_rsb(r); 3321 + lock_rsb(r); 3322 + _request_lock(r, lkb); 3323 + unlock_rsb(r); 3324 + put_rsb(r); 3325 + break; 3326 + 3327 + case DLM_MSG_CONVERT: 3328 + hold_rsb(r); 3329 + lock_rsb(r); 3330 + _convert_lock(r, lkb); 3331 + unlock_rsb(r); 3332 + put_rsb(r); 3333 + break; 3334 + 3335 + default: 3336 + log_error(ls, "recover_waiters_post type %d", mstype); 3337 + } 3338 + } 3339 + 3340 + return error; 3341 + } 3342 + 3343 + static void purge_queue(struct dlm_rsb *r, struct list_head *queue, 3344 + int (*test)(struct dlm_ls *ls, struct dlm_lkb *lkb)) 3345 + { 3346 + struct dlm_ls *ls = r->res_ls; 3347 + struct dlm_lkb *lkb, *safe; 3348 + 3349 + list_for_each_entry_safe(lkb, safe, queue, lkb_statequeue) { 3350 + if (test(ls, lkb)) { 3351 + del_lkb(r, lkb); 3352 + /* this put should free the lkb */ 3353 + if (!put_lkb(lkb)) 3354 + log_error(ls, "purged 
lkb not released"); 3355 + } 3356 + } 3357 + } 3358 + 3359 + static int purge_dead_test(struct dlm_ls *ls, struct dlm_lkb *lkb) 3360 + { 3361 + return (is_master_copy(lkb) && dlm_is_removed(ls, lkb->lkb_nodeid)); 3362 + } 3363 + 3364 + static int purge_mstcpy_test(struct dlm_ls *ls, struct dlm_lkb *lkb) 3365 + { 3366 + return is_master_copy(lkb); 3367 + } 3368 + 3369 + static void purge_dead_locks(struct dlm_rsb *r) 3370 + { 3371 + purge_queue(r, &r->res_grantqueue, &purge_dead_test); 3372 + purge_queue(r, &r->res_convertqueue, &purge_dead_test); 3373 + purge_queue(r, &r->res_waitqueue, &purge_dead_test); 3374 + } 3375 + 3376 + void dlm_purge_mstcpy_locks(struct dlm_rsb *r) 3377 + { 3378 + purge_queue(r, &r->res_grantqueue, &purge_mstcpy_test); 3379 + purge_queue(r, &r->res_convertqueue, &purge_mstcpy_test); 3380 + purge_queue(r, &r->res_waitqueue, &purge_mstcpy_test); 3381 + } 3382 + 3383 + /* Get rid of locks held by nodes that are gone. */ 3384 + 3385 + int dlm_purge_locks(struct dlm_ls *ls) 3386 + { 3387 + struct dlm_rsb *r; 3388 + 3389 + log_debug(ls, "dlm_purge_locks"); 3390 + 3391 + down_write(&ls->ls_root_sem); 3392 + list_for_each_entry(r, &ls->ls_root_list, res_root_list) { 3393 + hold_rsb(r); 3394 + lock_rsb(r); 3395 + if (is_master(r)) 3396 + purge_dead_locks(r); 3397 + unlock_rsb(r); 3398 + unhold_rsb(r); 3399 + 3400 + schedule(); 3401 + } 3402 + up_write(&ls->ls_root_sem); 3403 + 3404 + return 0; 3405 + } 3406 + 3407 + int dlm_grant_after_purge(struct dlm_ls *ls) 3408 + { 3409 + struct dlm_rsb *r; 3410 + int i; 3411 + 3412 + for (i = 0; i < ls->ls_rsbtbl_size; i++) { 3413 + read_lock(&ls->ls_rsbtbl[i].lock); 3414 + list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) { 3415 + hold_rsb(r); 3416 + lock_rsb(r); 3417 + if (is_master(r)) { 3418 + grant_pending_locks(r); 3419 + confirm_master(r, 0); 3420 + } 3421 + unlock_rsb(r); 3422 + put_rsb(r); 3423 + } 3424 + read_unlock(&ls->ls_rsbtbl[i].lock); 3425 + } 3426 + 3427 + return 0; 3428 + } 3429 + 
3430 + static struct dlm_lkb *search_remid_list(struct list_head *head, int nodeid, 3431 + uint32_t remid) 3432 + { 3433 + struct dlm_lkb *lkb; 3434 + 3435 + list_for_each_entry(lkb, head, lkb_statequeue) { 3436 + if (lkb->lkb_nodeid == nodeid && lkb->lkb_remid == remid) 3437 + return lkb; 3438 + } 3439 + return NULL; 3440 + } 3441 + 3442 + static struct dlm_lkb *search_remid(struct dlm_rsb *r, int nodeid, 3443 + uint32_t remid) 3444 + { 3445 + struct dlm_lkb *lkb; 3446 + 3447 + lkb = search_remid_list(&r->res_grantqueue, nodeid, remid); 3448 + if (lkb) 3449 + return lkb; 3450 + lkb = search_remid_list(&r->res_convertqueue, nodeid, remid); 3451 + if (lkb) 3452 + return lkb; 3453 + lkb = search_remid_list(&r->res_waitqueue, nodeid, remid); 3454 + if (lkb) 3455 + return lkb; 3456 + return NULL; 3457 + } 3458 + 3459 + static int receive_rcom_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb, 3460 + struct dlm_rsb *r, struct dlm_rcom *rc) 3461 + { 3462 + struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; 3463 + int lvblen; 3464 + 3465 + lkb->lkb_nodeid = rc->rc_header.h_nodeid; 3466 + lkb->lkb_ownpid = rl->rl_ownpid; 3467 + lkb->lkb_remid = rl->rl_lkid; 3468 + lkb->lkb_exflags = rl->rl_exflags; 3469 + lkb->lkb_flags = rl->rl_flags & 0x0000FFFF; 3470 + lkb->lkb_flags |= DLM_IFL_MSTCPY; 3471 + lkb->lkb_lvbseq = rl->rl_lvbseq; 3472 + lkb->lkb_rqmode = rl->rl_rqmode; 3473 + lkb->lkb_grmode = rl->rl_grmode; 3474 + /* don't set lkb_status because add_lkb wants to itself */ 3475 + 3476 + lkb->lkb_bastaddr = (void *) (long) (rl->rl_asts & AST_BAST); 3477 + lkb->lkb_astaddr = (void *) (long) (rl->rl_asts & AST_COMP); 3478 + 3479 + if (lkb->lkb_flags & DLM_IFL_RANGE) { 3480 + lkb->lkb_range = allocate_range(ls); 3481 + if (!lkb->lkb_range) 3482 + return -ENOMEM; 3483 + memcpy(lkb->lkb_range, rl->rl_range, 4*sizeof(uint64_t)); 3484 + } 3485 + 3486 + if (lkb->lkb_exflags & DLM_LKF_VALBLK) { 3487 + lkb->lkb_lvbptr = allocate_lvb(ls); 3488 + if (!lkb->lkb_lvbptr) 3489 + return 
-ENOMEM; 3490 + lvblen = rc->rc_header.h_length - sizeof(struct dlm_rcom) - 3491 + sizeof(struct rcom_lock); 3492 + memcpy(lkb->lkb_lvbptr, rl->rl_lvb, lvblen); 3493 + } 3494 + 3495 + /* Conversions between PR and CW (middle modes) need special handling. 3496 + The real granted mode of these converting locks cannot be determined 3497 + until all locks have been rebuilt on the rsb (recover_conversion) */ 3498 + 3499 + if (rl->rl_wait_type == DLM_MSG_CONVERT && middle_conversion(lkb)) { 3500 + rl->rl_status = DLM_LKSTS_CONVERT; 3501 + lkb->lkb_grmode = DLM_LOCK_IV; 3502 + rsb_set_flag(r, RSB_RECOVER_CONVERT); 3503 + } 3504 + 3505 + return 0; 3506 + } 3507 + 3508 + /* This lkb may have been recovered in a previous aborted recovery so we need 3509 + to check if the rsb already has an lkb with the given remote nodeid/lkid. 3510 + If so we just send back a standard reply. If not, we create a new lkb with 3511 + the given values and send back our lkid. We send back our lkid by sending 3512 + back the rcom_lock struct we got but with the remid field filled in. 
*/ 3513 + 3514 + int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc) 3515 + { 3516 + struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; 3517 + struct dlm_rsb *r; 3518 + struct dlm_lkb *lkb; 3519 + int error; 3520 + 3521 + if (rl->rl_parent_lkid) { 3522 + error = -EOPNOTSUPP; 3523 + goto out; 3524 + } 3525 + 3526 + error = find_rsb(ls, rl->rl_name, rl->rl_namelen, R_MASTER, &r); 3527 + if (error) 3528 + goto out; 3529 + 3530 + lock_rsb(r); 3531 + 3532 + lkb = search_remid(r, rc->rc_header.h_nodeid, rl->rl_lkid); 3533 + if (lkb) { 3534 + error = -EEXIST; 3535 + goto out_remid; 3536 + } 3537 + 3538 + error = create_lkb(ls, &lkb); 3539 + if (error) 3540 + goto out_unlock; 3541 + 3542 + error = receive_rcom_lock_args(ls, lkb, r, rc); 3543 + if (error) { 3544 + put_lkb(lkb); 3545 + goto out_unlock; 3546 + } 3547 + 3548 + attach_lkb(r, lkb); 3549 + add_lkb(r, lkb, rl->rl_status); 3550 + error = 0; 3551 + 3552 + out_remid: 3553 + /* this is the new value returned to the lock holder for 3554 + saving in its process-copy lkb */ 3555 + rl->rl_remid = lkb->lkb_id; 3556 + 3557 + out_unlock: 3558 + unlock_rsb(r); 3559 + put_rsb(r); 3560 + out: 3561 + if (error) 3562 + log_print("recover_master_copy %d %x", error, rl->rl_lkid); 3563 + rl->rl_result = error; 3564 + return error; 3565 + } 3566 + 3567 + int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc) 3568 + { 3569 + struct rcom_lock *rl = (struct rcom_lock *) rc->rc_buf; 3570 + struct dlm_rsb *r; 3571 + struct dlm_lkb *lkb; 3572 + int error; 3573 + 3574 + error = find_lkb(ls, rl->rl_lkid, &lkb); 3575 + if (error) { 3576 + log_error(ls, "recover_process_copy no lkid %x", rl->rl_lkid); 3577 + return error; 3578 + } 3579 + 3580 + DLM_ASSERT(is_process_copy(lkb), dlm_print_lkb(lkb);); 3581 + 3582 + error = rl->rl_result; 3583 + 3584 + r = lkb->lkb_resource; 3585 + hold_rsb(r); 3586 + lock_rsb(r); 3587 + 3588 + switch (error) { 3589 + case -EEXIST: 3590 + log_debug(ls, "master copy exists %x", 
lkb->lkb_id); 3591 + /* fall through */ 3592 + case 0: 3593 + lkb->lkb_remid = rl->rl_remid; 3594 + break; 3595 + default: 3596 + log_error(ls, "dlm_recover_process_copy unknown error %d %x", 3597 + error, lkb->lkb_id); 3598 + } 3599 + 3600 + /* an ack for dlm_recover_locks() which waits for replies from 3601 + all the locks it sends to new masters */ 3602 + dlm_recovered_lock(r); 3603 + 3604 + unlock_rsb(r); 3605 + put_rsb(r); 3606 + put_lkb(lkb); 3607 + 3608 + return 0; 3609 + } 3610 +
+50
fs/dlm/lock.h
··· 1 + /****************************************************************************** 2 + ******************************************************************************* 3 + ** 4 + ** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 5 + ** 6 + ** This copyrighted material is made available to anyone wishing to use, 7 + ** modify, copy, or redistribute it subject to the terms and conditions 8 + ** of the GNU General Public License v.2. 9 + ** 10 + ******************************************************************************* 11 + ******************************************************************************/ 12 + 13 + #ifndef __LOCK_DOT_H__ 14 + #define __LOCK_DOT_H__ 15 + 16 + void dlm_print_rsb(struct dlm_rsb *r); 17 + int dlm_receive_message(struct dlm_header *hd, int nodeid, int recovery); 18 + int dlm_modes_compat(int mode1, int mode2); 19 + int dlm_find_rsb(struct dlm_ls *ls, char *name, int namelen, 20 + unsigned int flags, struct dlm_rsb **r_ret); 21 + void dlm_put_rsb(struct dlm_rsb *r); 22 + void dlm_hold_rsb(struct dlm_rsb *r); 23 + int dlm_put_lkb(struct dlm_lkb *lkb); 24 + void dlm_scan_rsbs(struct dlm_ls *ls); 25 + 26 + int dlm_purge_locks(struct dlm_ls *ls); 27 + void dlm_purge_mstcpy_locks(struct dlm_rsb *r); 28 + int dlm_grant_after_purge(struct dlm_ls *ls); 29 + int dlm_recover_waiters_post(struct dlm_ls *ls); 30 + void dlm_recover_waiters_pre(struct dlm_ls *ls); 31 + int dlm_recover_master_copy(struct dlm_ls *ls, struct dlm_rcom *rc); 32 + int dlm_recover_process_copy(struct dlm_ls *ls, struct dlm_rcom *rc); 33 + 34 + static inline int is_master(struct dlm_rsb *r) 35 + { 36 + return !r->res_nodeid; 37 + } 38 + 39 + static inline void lock_rsb(struct dlm_rsb *r) 40 + { 41 + down(&r->res_sem); 42 + } 43 + 44 + static inline void unlock_rsb(struct dlm_rsb *r) 45 + { 46 + up(&r->res_sem); 47 + } 48 + 49 + #endif 50 +
+666
fs/dlm/lockspace.c
··· 1 + /****************************************************************************** 2 + ******************************************************************************* 3 + ** 4 + ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 5 + ** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 6 + ** 7 + ** This copyrighted material is made available to anyone wishing to use, 8 + ** modify, copy, or redistribute it subject to the terms and conditions 9 + ** of the GNU General Public License v.2. 10 + ** 11 + ******************************************************************************* 12 + ******************************************************************************/ 13 + 14 + #include "dlm_internal.h" 15 + #include "lockspace.h" 16 + #include "member.h" 17 + #include "recoverd.h" 18 + #include "ast.h" 19 + #include "dir.h" 20 + #include "lowcomms.h" 21 + #include "config.h" 22 + #include "memory.h" 23 + #include "lock.h" 24 + 25 + #ifdef CONFIG_DLM_DEBUG 26 + int dlm_create_debug_file(struct dlm_ls *ls); 27 + void dlm_delete_debug_file(struct dlm_ls *ls); 28 + #else 29 + static inline int dlm_create_debug_file(struct dlm_ls *ls) { return 0; } 30 + static inline void dlm_delete_debug_file(struct dlm_ls *ls) { } 31 + #endif 32 + 33 + static int ls_count; 34 + static struct semaphore ls_lock; 35 + static struct list_head lslist; 36 + static spinlock_t lslist_lock; 37 + static struct task_struct * scand_task; 38 + 39 + 40 + static ssize_t dlm_control_store(struct dlm_ls *ls, const char *buf, size_t len) 41 + { 42 + ssize_t ret = len; 43 + int n = simple_strtol(buf, NULL, 0); 44 + 45 + switch (n) { 46 + case 0: 47 + dlm_ls_stop(ls); 48 + break; 49 + case 1: 50 + dlm_ls_start(ls); 51 + break; 52 + default: 53 + ret = -EINVAL; 54 + } 55 + return ret; 56 + } 57 + 58 + static ssize_t dlm_event_store(struct dlm_ls *ls, const char *buf, size_t len) 59 + { 60 + ls->ls_uevent_result = simple_strtol(buf, NULL, 0); 61 + set_bit(LSFL_UEVENT_WAIT, 
&ls->ls_flags); 62 + wake_up(&ls->ls_uevent_wait); 63 + return len; 64 + } 65 + 66 + static ssize_t dlm_id_show(struct dlm_ls *ls, char *buf) 67 + { 68 + return sprintf(buf, "%u\n", ls->ls_global_id); 69 + } 70 + 71 + static ssize_t dlm_id_store(struct dlm_ls *ls, const char *buf, size_t len) 72 + { 73 + ls->ls_global_id = simple_strtoul(buf, NULL, 0); 74 + return len; 75 + } 76 + 77 + struct dlm_attr { 78 + struct attribute attr; 79 + ssize_t (*show)(struct dlm_ls *, char *); 80 + ssize_t (*store)(struct dlm_ls *, const char *, size_t); 81 + }; 82 + 83 + static struct dlm_attr dlm_attr_control = { 84 + .attr = {.name = "control", .mode = S_IWUSR}, 85 + .store = dlm_control_store 86 + }; 87 + 88 + static struct dlm_attr dlm_attr_event = { 89 + .attr = {.name = "event_done", .mode = S_IWUSR}, 90 + .store = dlm_event_store 91 + }; 92 + 93 + static struct dlm_attr dlm_attr_id = { 94 + .attr = {.name = "id", .mode = S_IRUGO | S_IWUSR}, 95 + .show = dlm_id_show, 96 + .store = dlm_id_store 97 + }; 98 + 99 + static struct attribute *dlm_attrs[] = { 100 + &dlm_attr_control.attr, 101 + &dlm_attr_event.attr, 102 + &dlm_attr_id.attr, 103 + NULL, 104 + }; 105 + 106 + static ssize_t dlm_attr_show(struct kobject *kobj, struct attribute *attr, 107 + char *buf) 108 + { 109 + struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj); 110 + struct dlm_attr *a = container_of(attr, struct dlm_attr, attr); 111 + return a->show ? a->show(ls, buf) : 0; 112 + } 113 + 114 + static ssize_t dlm_attr_store(struct kobject *kobj, struct attribute *attr, 115 + const char *buf, size_t len) 116 + { 117 + struct dlm_ls *ls = container_of(kobj, struct dlm_ls, ls_kobj); 118 + struct dlm_attr *a = container_of(attr, struct dlm_attr, attr); 119 + return a->store ? 
a->store(ls, buf, len) : len; 120 + } 121 + 122 + static struct sysfs_ops dlm_attr_ops = { 123 + .show = dlm_attr_show, 124 + .store = dlm_attr_store, 125 + }; 126 + 127 + static struct kobj_type dlm_ktype = { 128 + .default_attrs = dlm_attrs, 129 + .sysfs_ops = &dlm_attr_ops, 130 + }; 131 + 132 + static struct kset dlm_kset = { 133 + .subsys = &kernel_subsys, 134 + .kobj = {.name = "dlm",}, 135 + .ktype = &dlm_ktype, 136 + }; 137 + 138 + static int kobject_setup(struct dlm_ls *ls) 139 + { 140 + char lsname[DLM_LOCKSPACE_LEN]; 141 + int error; 142 + 143 + memset(lsname, 0, DLM_LOCKSPACE_LEN); 144 + snprintf(lsname, DLM_LOCKSPACE_LEN, "%s", ls->ls_name); 145 + 146 + error = kobject_set_name(&ls->ls_kobj, "%s", lsname); 147 + if (error) 148 + return error; 149 + 150 + ls->ls_kobj.kset = &dlm_kset; 151 + ls->ls_kobj.ktype = &dlm_ktype; 152 + return 0; 153 + } 154 + 155 + static int do_uevent(struct dlm_ls *ls, int in) 156 + { 157 + int error; 158 + 159 + if (in) 160 + kobject_uevent(&ls->ls_kobj, KOBJ_ONLINE); 161 + else 162 + kobject_uevent(&ls->ls_kobj, KOBJ_OFFLINE); 163 + 164 + error = wait_event_interruptible(ls->ls_uevent_wait, 165 + test_and_clear_bit(LSFL_UEVENT_WAIT, &ls->ls_flags)); 166 + if (error) 167 + goto out; 168 + 169 + error = ls->ls_uevent_result; 170 + out: 171 + return error; 172 + } 173 + 174 + 175 + int dlm_lockspace_init(void) 176 + { 177 + int error; 178 + 179 + ls_count = 0; 180 + init_MUTEX(&ls_lock); 181 + INIT_LIST_HEAD(&lslist); 182 + spin_lock_init(&lslist_lock); 183 + 184 + error = kset_register(&dlm_kset); 185 + if (error) 186 + printk("dlm_lockspace_init: cannot register kset %d\n", error); 187 + return error; 188 + } 189 + 190 + void dlm_lockspace_exit(void) 191 + { 192 + kset_unregister(&dlm_kset); 193 + } 194 + 195 + static int dlm_scand(void *data) 196 + { 197 + struct dlm_ls *ls; 198 + 199 + while (!kthread_should_stop()) { 200 + list_for_each_entry(ls, &lslist, ls_list) 201 + dlm_scan_rsbs(ls); 202 + 
schedule_timeout_interruptible(dlm_config.scan_secs * HZ); 203 + } 204 + return 0; 205 + } 206 + 207 + static int dlm_scand_start(void) 208 + { 209 + struct task_struct *p; 210 + int error = 0; 211 + 212 + p = kthread_run(dlm_scand, NULL, "dlm_scand"); 213 + if (IS_ERR(p)) 214 + error = PTR_ERR(p); 215 + else 216 + scand_task = p; 217 + return error; 218 + } 219 + 220 + static void dlm_scand_stop(void) 221 + { 222 + kthread_stop(scand_task); 223 + } 224 + 225 + static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen) 226 + { 227 + struct dlm_ls *ls; 228 + 229 + spin_lock(&lslist_lock); 230 + 231 + list_for_each_entry(ls, &lslist, ls_list) { 232 + if (ls->ls_namelen == namelen && 233 + memcmp(ls->ls_name, name, namelen) == 0) 234 + goto out; 235 + } 236 + ls = NULL; 237 + out: 238 + spin_unlock(&lslist_lock); 239 + return ls; 240 + } 241 + 242 + struct dlm_ls *dlm_find_lockspace_global(uint32_t id) 243 + { 244 + struct dlm_ls *ls; 245 + 246 + spin_lock(&lslist_lock); 247 + 248 + list_for_each_entry(ls, &lslist, ls_list) { 249 + if (ls->ls_global_id == id) { 250 + ls->ls_count++; 251 + goto out; 252 + } 253 + } 254 + ls = NULL; 255 + out: 256 + spin_unlock(&lslist_lock); 257 + return ls; 258 + } 259 + 260 + struct dlm_ls *dlm_find_lockspace_local(void *id) 261 + { 262 + struct dlm_ls *ls = id; 263 + 264 + spin_lock(&lslist_lock); 265 + ls->ls_count++; 266 + spin_unlock(&lslist_lock); 267 + return ls; 268 + } 269 + 270 + void dlm_put_lockspace(struct dlm_ls *ls) 271 + { 272 + spin_lock(&lslist_lock); 273 + ls->ls_count--; 274 + spin_unlock(&lslist_lock); 275 + } 276 + 277 + static void remove_lockspace(struct dlm_ls *ls) 278 + { 279 + for (;;) { 280 + spin_lock(&lslist_lock); 281 + if (ls->ls_count == 0) { 282 + list_del(&ls->ls_list); 283 + spin_unlock(&lslist_lock); 284 + return; 285 + } 286 + spin_unlock(&lslist_lock); 287 + ssleep(1); 288 + } 289 + } 290 + 291 + static int threads_start(void) 292 + { 293 + int error; 294 + 295 + /* Thread which 
process lock requests for all lockspace's */ 296 + error = dlm_astd_start(); 297 + if (error) { 298 + log_print("cannot start dlm_astd thread %d", error); 299 + goto fail; 300 + } 301 + 302 + error = dlm_scand_start(); 303 + if (error) { 304 + log_print("cannot start dlm_scand thread %d", error); 305 + goto astd_fail; 306 + } 307 + 308 + /* Thread for sending/receiving messages for all lockspace's */ 309 + error = dlm_lowcomms_start(); 310 + if (error) { 311 + log_print("cannot start dlm lowcomms %d", error); 312 + goto scand_fail; 313 + } 314 + 315 + return 0; 316 + 317 + scand_fail: 318 + dlm_scand_stop(); 319 + astd_fail: 320 + dlm_astd_stop(); 321 + fail: 322 + return error; 323 + } 324 + 325 + static void threads_stop(void) 326 + { 327 + dlm_scand_stop(); 328 + dlm_lowcomms_stop(); 329 + dlm_astd_stop(); 330 + } 331 + 332 + static int new_lockspace(char *name, int namelen, void **lockspace, 333 + uint32_t flags, int lvblen) 334 + { 335 + struct dlm_ls *ls; 336 + int i, size, error = -ENOMEM; 337 + 338 + if (namelen > DLM_LOCKSPACE_LEN) 339 + return -EINVAL; 340 + 341 + if (!lvblen || (lvblen % 8)) 342 + return -EINVAL; 343 + 344 + if (!try_module_get(THIS_MODULE)) 345 + return -EINVAL; 346 + 347 + ls = dlm_find_lockspace_name(name, namelen); 348 + if (ls) { 349 + *lockspace = ls; 350 + module_put(THIS_MODULE); 351 + return -EEXIST; 352 + } 353 + 354 + ls = kmalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL); 355 + if (!ls) 356 + goto out; 357 + memset(ls, 0, sizeof(struct dlm_ls) + namelen); 358 + memcpy(ls->ls_name, name, namelen); 359 + ls->ls_namelen = namelen; 360 + ls->ls_exflags = flags; 361 + ls->ls_lvblen = lvblen; 362 + ls->ls_count = 0; 363 + ls->ls_flags = 0; 364 + 365 + size = dlm_config.rsbtbl_size; 366 + ls->ls_rsbtbl_size = size; 367 + 368 + ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL); 369 + if (!ls->ls_rsbtbl) 370 + goto out_lsfree; 371 + for (i = 0; i < size; i++) { 372 + INIT_LIST_HEAD(&ls->ls_rsbtbl[i].list); 373 
+ INIT_LIST_HEAD(&ls->ls_rsbtbl[i].toss); 374 + rwlock_init(&ls->ls_rsbtbl[i].lock); 375 + } 376 + 377 + size = dlm_config.lkbtbl_size; 378 + ls->ls_lkbtbl_size = size; 379 + 380 + ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL); 381 + if (!ls->ls_lkbtbl) 382 + goto out_rsbfree; 383 + for (i = 0; i < size; i++) { 384 + INIT_LIST_HEAD(&ls->ls_lkbtbl[i].list); 385 + rwlock_init(&ls->ls_lkbtbl[i].lock); 386 + ls->ls_lkbtbl[i].counter = 1; 387 + } 388 + 389 + size = dlm_config.dirtbl_size; 390 + ls->ls_dirtbl_size = size; 391 + 392 + ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL); 393 + if (!ls->ls_dirtbl) 394 + goto out_lkbfree; 395 + for (i = 0; i < size; i++) { 396 + INIT_LIST_HEAD(&ls->ls_dirtbl[i].list); 397 + rwlock_init(&ls->ls_dirtbl[i].lock); 398 + } 399 + 400 + INIT_LIST_HEAD(&ls->ls_waiters); 401 + init_MUTEX(&ls->ls_waiters_sem); 402 + 403 + INIT_LIST_HEAD(&ls->ls_nodes); 404 + INIT_LIST_HEAD(&ls->ls_nodes_gone); 405 + ls->ls_num_nodes = 0; 406 + ls->ls_low_nodeid = 0; 407 + ls->ls_total_weight = 0; 408 + ls->ls_node_array = NULL; 409 + 410 + memset(&ls->ls_stub_rsb, 0, sizeof(struct dlm_rsb)); 411 + ls->ls_stub_rsb.res_ls = ls; 412 + 413 + ls->ls_debug_dentry = NULL; 414 + 415 + init_waitqueue_head(&ls->ls_uevent_wait); 416 + ls->ls_uevent_result = 0; 417 + 418 + ls->ls_recoverd_task = NULL; 419 + init_MUTEX(&ls->ls_recoverd_active); 420 + spin_lock_init(&ls->ls_recover_lock); 421 + ls->ls_recover_status = 0; 422 + ls->ls_recover_seq = 0; 423 + ls->ls_recover_args = NULL; 424 + init_rwsem(&ls->ls_in_recovery); 425 + INIT_LIST_HEAD(&ls->ls_requestqueue); 426 + init_MUTEX(&ls->ls_requestqueue_lock); 427 + 428 + ls->ls_recover_buf = kmalloc(dlm_config.buffer_size, GFP_KERNEL); 429 + if (!ls->ls_recover_buf) 430 + goto out_dirfree; 431 + 432 + INIT_LIST_HEAD(&ls->ls_recover_list); 433 + spin_lock_init(&ls->ls_recover_list_lock); 434 + ls->ls_recover_list_count = 0; 435 + 
init_waitqueue_head(&ls->ls_wait_general); 436 + INIT_LIST_HEAD(&ls->ls_root_list); 437 + init_rwsem(&ls->ls_root_sem); 438 + 439 + down_write(&ls->ls_in_recovery); 440 + 441 + error = dlm_recoverd_start(ls); 442 + if (error) { 443 + log_error(ls, "can't start dlm_recoverd %d", error); 444 + goto out_rcomfree; 445 + } 446 + 447 + spin_lock(&lslist_lock); 448 + list_add(&ls->ls_list, &lslist); 449 + spin_unlock(&lslist_lock); 450 + 451 + dlm_create_debug_file(ls); 452 + 453 + error = kobject_setup(ls); 454 + if (error) 455 + goto out_del; 456 + 457 + error = kobject_register(&ls->ls_kobj); 458 + if (error) 459 + goto out_del; 460 + 461 + error = do_uevent(ls, 1); 462 + if (error) 463 + goto out_unreg; 464 + 465 + *lockspace = ls; 466 + return 0; 467 + 468 + out_unreg: 469 + kobject_unregister(&ls->ls_kobj); 470 + out_del: 471 + dlm_delete_debug_file(ls); 472 + spin_lock(&lslist_lock); 473 + list_del(&ls->ls_list); 474 + spin_unlock(&lslist_lock); 475 + dlm_recoverd_stop(ls); 476 + out_rcomfree: 477 + kfree(ls->ls_recover_buf); 478 + out_dirfree: 479 + kfree(ls->ls_dirtbl); 480 + out_lkbfree: 481 + kfree(ls->ls_lkbtbl); 482 + out_rsbfree: 483 + kfree(ls->ls_rsbtbl); 484 + out_lsfree: 485 + kfree(ls); 486 + out: 487 + module_put(THIS_MODULE); 488 + return error; 489 + } 490 + 491 + int dlm_new_lockspace(char *name, int namelen, void **lockspace, 492 + uint32_t flags, int lvblen) 493 + { 494 + int error = 0; 495 + 496 + down(&ls_lock); 497 + if (!ls_count) 498 + error = threads_start(); 499 + if (error) 500 + goto out; 501 + 502 + error = new_lockspace(name, namelen, lockspace, flags, lvblen); 503 + if (!error) 504 + ls_count++; 505 + out: 506 + up(&ls_lock); 507 + return error; 508 + } 509 + 510 + /* Return 1 if the lockspace still has active remote locks, 511 + * 2 if the lockspace still has active local locks. 
512 + */ 513 + static int lockspace_busy(struct dlm_ls *ls) 514 + { 515 + int i, lkb_found = 0; 516 + struct dlm_lkb *lkb; 517 + 518 + /* NOTE: We check the lockidtbl here rather than the resource table. 519 + This is because there may be LKBs queued as ASTs that have been 520 + unlinked from their RSBs and are pending deletion once the AST has 521 + been delivered */ 522 + 523 + for (i = 0; i < ls->ls_lkbtbl_size; i++) { 524 + read_lock(&ls->ls_lkbtbl[i].lock); 525 + if (!list_empty(&ls->ls_lkbtbl[i].list)) { 526 + lkb_found = 1; 527 + list_for_each_entry(lkb, &ls->ls_lkbtbl[i].list, 528 + lkb_idtbl_list) { 529 + if (!lkb->lkb_nodeid) { 530 + read_unlock(&ls->ls_lkbtbl[i].lock); 531 + return 2; 532 + } 533 + } 534 + } 535 + read_unlock(&ls->ls_lkbtbl[i].lock); 536 + } 537 + return lkb_found; 538 + } 539 + 540 + static int release_lockspace(struct dlm_ls *ls, int force) 541 + { 542 + struct dlm_lkb *lkb; 543 + struct dlm_rsb *rsb; 544 + struct list_head *head; 545 + int i; 546 + int busy = lockspace_busy(ls); 547 + 548 + if (busy > force) 549 + return -EBUSY; 550 + 551 + if (force < 3) 552 + do_uevent(ls, 0); 553 + 554 + dlm_recoverd_stop(ls); 555 + 556 + remove_lockspace(ls); 557 + 558 + dlm_delete_debug_file(ls); 559 + 560 + dlm_astd_suspend(); 561 + 562 + kfree(ls->ls_recover_buf); 563 + 564 + /* 565 + * Free direntry structs. 566 + */ 567 + 568 + dlm_dir_clear(ls); 569 + kfree(ls->ls_dirtbl); 570 + 571 + /* 572 + * Free all lkb's on lkbtbl[] lists. 
573 + */ 574 + 575 + for (i = 0; i < ls->ls_lkbtbl_size; i++) { 576 + head = &ls->ls_lkbtbl[i].list; 577 + while (!list_empty(head)) { 578 + lkb = list_entry(head->next, struct dlm_lkb, 579 + lkb_idtbl_list); 580 + 581 + list_del(&lkb->lkb_idtbl_list); 582 + 583 + dlm_del_ast(lkb); 584 + 585 + if (lkb->lkb_lvbptr && lkb->lkb_flags & DLM_IFL_MSTCPY) 586 + free_lvb(lkb->lkb_lvbptr); 587 + 588 + free_lkb(lkb); 589 + } 590 + } 591 + dlm_astd_resume(); 592 + 593 + kfree(ls->ls_lkbtbl); 594 + 595 + /* 596 + * Free all rsb's on rsbtbl[] lists 597 + */ 598 + 599 + for (i = 0; i < ls->ls_rsbtbl_size; i++) { 600 + head = &ls->ls_rsbtbl[i].list; 601 + while (!list_empty(head)) { 602 + rsb = list_entry(head->next, struct dlm_rsb, 603 + res_hashchain); 604 + 605 + list_del(&rsb->res_hashchain); 606 + free_rsb(rsb); 607 + } 608 + 609 + head = &ls->ls_rsbtbl[i].toss; 610 + while (!list_empty(head)) { 611 + rsb = list_entry(head->next, struct dlm_rsb, 612 + res_hashchain); 613 + list_del(&rsb->res_hashchain); 614 + free_rsb(rsb); 615 + } 616 + } 617 + 618 + kfree(ls->ls_rsbtbl); 619 + 620 + /* 621 + * Free structures on any other lists 622 + */ 623 + 624 + kfree(ls->ls_recover_args); 625 + dlm_clear_free_entries(ls); 626 + dlm_clear_members(ls); 627 + dlm_clear_members_gone(ls); 628 + kfree(ls->ls_node_array); 629 + kobject_unregister(&ls->ls_kobj); 630 + kfree(ls); 631 + 632 + down(&ls_lock); 633 + ls_count--; 634 + if (!ls_count) 635 + threads_stop(); 636 + up(&ls_lock); 637 + 638 + module_put(THIS_MODULE); 639 + return 0; 640 + } 641 + 642 + /* 643 + * Called when a system has released all its locks and is not going to use the 644 + * lockspace any longer. We free everything we're managing for this lockspace. 645 + * Remaining nodes will go through the recovery process as if we'd died. The 646 + * lockspace must continue to function as usual, participating in recoveries, 647 + * until this returns. 
648 + * 649 + * Force has 4 possible values: 650 + * 0 - don't destroy locksapce if it has any LKBs 651 + * 1 - destroy lockspace if it has remote LKBs but not if it has local LKBs 652 + * 2 - destroy lockspace regardless of LKBs 653 + * 3 - destroy lockspace as part of a forced shutdown 654 + */ 655 + 656 + int dlm_release_lockspace(void *lockspace, int force) 657 + { 658 + struct dlm_ls *ls; 659 + 660 + ls = dlm_find_lockspace_local(lockspace); 661 + if (!ls) 662 + return -EINVAL; 663 + dlm_put_lockspace(ls); 664 + return release_lockspace(ls, force); 665 + } 666 +
+24
fs/dlm/lockspace.h
··· 1 + /****************************************************************************** 2 + ******************************************************************************* 3 + ** 4 + ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 5 + ** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 6 + ** 7 + ** This copyrighted material is made available to anyone wishing to use, 8 + ** modify, copy, or redistribute it subject to the terms and conditions 9 + ** of the GNU General Public License v.2. 10 + ** 11 + ******************************************************************************* 12 + ******************************************************************************/ 13 + 14 + #ifndef __LOCKSPACE_DOT_H__ 15 + #define __LOCKSPACE_DOT_H__ 16 + 17 + int dlm_lockspace_init(void); 18 + void dlm_lockspace_exit(void); 19 + struct dlm_ls *dlm_find_lockspace_global(uint32_t id); 20 + struct dlm_ls *dlm_find_lockspace_local(void *id); 21 + void dlm_put_lockspace(struct dlm_ls *ls); 22 + 23 + #endif /* __LOCKSPACE_DOT_H__ */ 24 +
+1218
fs/dlm/lowcomms.c
··· 1 + /****************************************************************************** 2 + ******************************************************************************* 3 + ** 4 + ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 5 + ** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 6 + ** 7 + ** This copyrighted material is made available to anyone wishing to use, 8 + ** modify, copy, or redistribute it subject to the terms and conditions 9 + ** of the GNU General Public License v.2. 10 + ** 11 + ******************************************************************************* 12 + ******************************************************************************/ 13 + 14 + /* 15 + * lowcomms.c 16 + * 17 + * This is the "low-level" comms layer. 18 + * 19 + * It is responsible for sending/receiving messages 20 + * from other nodes in the cluster. 21 + * 22 + * Cluster nodes are referred to by their nodeids. nodeids are 23 + * simply 32 bit numbers to the locking module - if they need to 24 + * be expanded for the cluster infrastructure then that is it's 25 + * responsibility. It is this layer's 26 + * responsibility to resolve these into IP address or 27 + * whatever it needs for inter-node communication. 28 + * 29 + * The comms level is two kernel threads that deal mainly with 30 + * the receiving of messages from other nodes and passing them 31 + * up to the mid-level comms layer (which understands the 32 + * message format) for execution by the locking core, and 33 + * a send thread which does all the setting up of connections 34 + * to remote nodes and the sending of data. Threads are not allowed 35 + * to send their own data because it may cause them to wait in times 36 + * of high load. Also, this way, the sending thread can collect together 37 + * messages bound for one node and send them in one block. 
38 + * 39 + * I don't see any problem with the recv thread executing the locking 40 + * code on behalf of remote processes as the locking code is 41 + * short, efficient and never (well, hardly ever) waits. 42 + * 43 + */ 44 + 45 + #include <asm/ioctls.h> 46 + #include <net/sock.h> 47 + #include <net/tcp.h> 48 + #include <net/sctp/user.h> 49 + #include <linux/pagemap.h> 50 + #include <linux/socket.h> 51 + #include <linux/idr.h> 52 + 53 + #include "dlm_internal.h" 54 + #include "lowcomms.h" 55 + #include "config.h" 56 + #include "midcomms.h" 57 + 58 + static struct sockaddr_storage *local_addr[DLM_MAX_ADDR_COUNT]; 59 + static int local_count; 60 + static int local_nodeid; 61 + 62 + /* One of these per connected node */ 63 + 64 + #define NI_INIT_PENDING 1 65 + #define NI_WRITE_PENDING 2 66 + 67 + struct nodeinfo { 68 + spinlock_t lock; 69 + sctp_assoc_t assoc_id; 70 + unsigned long flags; 71 + struct list_head write_list; /* nodes with pending writes */ 72 + struct list_head writequeue; /* outgoing writequeue_entries */ 73 + spinlock_t writequeue_lock; 74 + int nodeid; 75 + }; 76 + 77 + static DEFINE_IDR(nodeinfo_idr); 78 + static struct rw_semaphore nodeinfo_lock; 79 + static int max_nodeid; 80 + 81 + struct cbuf { 82 + unsigned base; 83 + unsigned len; 84 + unsigned mask; 85 + }; 86 + 87 + /* Just the one of these, now. 
But this struct keeps 88 + the connection-specific variables together */ 89 + 90 + #define CF_READ_PENDING 1 91 + 92 + struct connection { 93 + struct socket *sock; 94 + unsigned long flags; 95 + struct page *rx_page; 96 + atomic_t waiting_requests; 97 + struct cbuf cb; 98 + int eagain_flag; 99 + }; 100 + 101 + /* An entry waiting to be sent */ 102 + 103 + struct writequeue_entry { 104 + struct list_head list; 105 + struct page *page; 106 + int offset; 107 + int len; 108 + int end; 109 + int users; 110 + struct nodeinfo *ni; 111 + }; 112 + 113 + #define CBUF_ADD(cb, n) do { (cb)->len += n; } while(0) 114 + #define CBUF_EMPTY(cb) ((cb)->len == 0) 115 + #define CBUF_MAY_ADD(cb, n) (((cb)->len + (n)) < ((cb)->mask + 1)) 116 + #define CBUF_DATA(cb) (((cb)->base + (cb)->len) & (cb)->mask) 117 + 118 + #define CBUF_INIT(cb, size) \ 119 + do { \ 120 + (cb)->base = (cb)->len = 0; \ 121 + (cb)->mask = ((size)-1); \ 122 + } while(0) 123 + 124 + #define CBUF_EAT(cb, n) \ 125 + do { \ 126 + (cb)->len -= (n); \ 127 + (cb)->base += (n); \ 128 + (cb)->base &= (cb)->mask; \ 129 + } while(0) 130 + 131 + 132 + /* List of nodes which have writes pending */ 133 + static struct list_head write_nodes; 134 + static spinlock_t write_nodes_lock; 135 + 136 + /* Maximum number of incoming messages to process before 137 + * doing a schedule() 138 + */ 139 + #define MAX_RX_MSG_COUNT 25 140 + 141 + /* Manage daemons */ 142 + static struct task_struct *recv_task; 143 + static struct task_struct *send_task; 144 + static wait_queue_head_t lowcomms_recv_wait; 145 + static atomic_t accepting; 146 + 147 + /* The SCTP connection */ 148 + static struct connection sctp_con; 149 + 150 + 151 + static int nodeid_to_addr(int nodeid, struct sockaddr *retaddr) 152 + { 153 + struct sockaddr_storage addr; 154 + int error; 155 + 156 + if (!local_count) 157 + return -1; 158 + 159 + error = dlm_nodeid_to_addr(nodeid, &addr); 160 + if (error) 161 + return error; 162 + 163 + if (local_addr[0]->ss_family == AF_INET) { 
164 + struct sockaddr_in *in4 = (struct sockaddr_in *) &addr; 165 + struct sockaddr_in *ret4 = (struct sockaddr_in *) retaddr; 166 + ret4->sin_addr.s_addr = in4->sin_addr.s_addr; 167 + } else { 168 + struct sockaddr_in6 *in6 = (struct sockaddr_in6 *) &addr; 169 + struct sockaddr_in6 *ret6 = (struct sockaddr_in6 *) retaddr; 170 + memcpy(&ret6->sin6_addr, &in6->sin6_addr, 171 + sizeof(in6->sin6_addr)); 172 + } 173 + 174 + return 0; 175 + } 176 + 177 + static struct nodeinfo *nodeid2nodeinfo(int nodeid, int alloc) 178 + { 179 + struct nodeinfo *ni; 180 + int r; 181 + int n; 182 + 183 + down_read(&nodeinfo_lock); 184 + ni = idr_find(&nodeinfo_idr, nodeid); 185 + up_read(&nodeinfo_lock); 186 + 187 + if (!ni && alloc) { 188 + down_write(&nodeinfo_lock); 189 + 190 + ni = idr_find(&nodeinfo_idr, nodeid); 191 + if (ni) 192 + goto out_up; 193 + 194 + r = idr_pre_get(&nodeinfo_idr, alloc); 195 + if (!r) 196 + goto out_up; 197 + 198 + ni = kmalloc(sizeof(struct nodeinfo), alloc); 199 + if (!ni) 200 + goto out_up; 201 + 202 + r = idr_get_new_above(&nodeinfo_idr, ni, nodeid, &n); 203 + if (r) { 204 + kfree(ni); 205 + ni = NULL; 206 + goto out_up; 207 + } 208 + if (n != nodeid) { 209 + idr_remove(&nodeinfo_idr, n); 210 + kfree(ni); 211 + ni = NULL; 212 + goto out_up; 213 + } 214 + memset(ni, 0, sizeof(struct nodeinfo)); 215 + spin_lock_init(&ni->lock); 216 + INIT_LIST_HEAD(&ni->writequeue); 217 + spin_lock_init(&ni->writequeue_lock); 218 + ni->nodeid = nodeid; 219 + 220 + if (nodeid > max_nodeid) 221 + max_nodeid = nodeid; 222 + out_up: 223 + up_write(&nodeinfo_lock); 224 + } 225 + 226 + return ni; 227 + } 228 + 229 + /* Don't call this too often... 
*/ 230 + static struct nodeinfo *assoc2nodeinfo(sctp_assoc_t assoc) 231 + { 232 + int i; 233 + struct nodeinfo *ni; 234 + 235 + for (i=1; i<=max_nodeid; i++) { 236 + ni = nodeid2nodeinfo(i, 0); 237 + if (ni && ni->assoc_id == assoc) 238 + return ni; 239 + } 240 + return NULL; 241 + } 242 + 243 + /* Data or notification available on socket */ 244 + static void lowcomms_data_ready(struct sock *sk, int count_unused) 245 + { 246 + atomic_inc(&sctp_con.waiting_requests); 247 + if (test_and_set_bit(CF_READ_PENDING, &sctp_con.flags)) 248 + return; 249 + 250 + wake_up_interruptible(&lowcomms_recv_wait); 251 + } 252 + 253 + 254 + /* Add the port number to an IP6 or 4 sockaddr and return the address length. 255 + Also padd out the struct with zeros to make comparisons meaningful */ 256 + 257 + static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port, 258 + int *addr_len) 259 + { 260 + struct sockaddr_in *local4_addr; 261 + struct sockaddr_in6 *local6_addr; 262 + 263 + if (!local_count) 264 + return; 265 + 266 + if (!port) { 267 + if (local_addr[0]->ss_family == AF_INET) { 268 + local4_addr = (struct sockaddr_in *)local_addr[0]; 269 + port = be16_to_cpu(local4_addr->sin_port); 270 + } else { 271 + local6_addr = (struct sockaddr_in6 *)local_addr[0]; 272 + port = be16_to_cpu(local6_addr->sin6_port); 273 + } 274 + } 275 + 276 + saddr->ss_family = local_addr[0]->ss_family; 277 + if (local_addr[0]->ss_family == AF_INET) { 278 + struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr; 279 + in4_addr->sin_port = cpu_to_be16(port); 280 + memset(&in4_addr->sin_zero, 0, sizeof(in4_addr->sin_zero)); 281 + memset(in4_addr+1, 0, sizeof(struct sockaddr_storage) - 282 + sizeof(struct sockaddr_in)); 283 + *addr_len = sizeof(struct sockaddr_in); 284 + } else { 285 + struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr; 286 + in6_addr->sin6_port = cpu_to_be16(port); 287 + memset(in6_addr+1, 0, sizeof(struct sockaddr_storage) - 288 + sizeof(struct sockaddr_in6)); 289 + 
*addr_len = sizeof(struct sockaddr_in6); 290 + } 291 + } 292 + 293 + /* Close the connection and tidy up */ 294 + static void close_connection(void) 295 + { 296 + if (sctp_con.sock) { 297 + sock_release(sctp_con.sock); 298 + sctp_con.sock = NULL; 299 + } 300 + 301 + if (sctp_con.rx_page) { 302 + __free_page(sctp_con.rx_page); 303 + sctp_con.rx_page = NULL; 304 + } 305 + } 306 + 307 + /* We only send shutdown messages to nodes that are not part of the cluster */ 308 + static void send_shutdown(sctp_assoc_t associd) 309 + { 310 + static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; 311 + struct msghdr outmessage; 312 + struct cmsghdr *cmsg; 313 + struct sctp_sndrcvinfo *sinfo; 314 + int ret; 315 + 316 + outmessage.msg_name = NULL; 317 + outmessage.msg_namelen = 0; 318 + outmessage.msg_control = outcmsg; 319 + outmessage.msg_controllen = sizeof(outcmsg); 320 + outmessage.msg_flags = MSG_EOR; 321 + 322 + cmsg = CMSG_FIRSTHDR(&outmessage); 323 + cmsg->cmsg_level = IPPROTO_SCTP; 324 + cmsg->cmsg_type = SCTP_SNDRCV; 325 + cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); 326 + outmessage.msg_controllen = cmsg->cmsg_len; 327 + sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); 328 + memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); 329 + 330 + sinfo->sinfo_flags |= MSG_EOF; 331 + sinfo->sinfo_assoc_id = associd; 332 + 333 + ret = kernel_sendmsg(sctp_con.sock, &outmessage, NULL, 0, 0); 334 + 335 + if (ret != 0) 336 + log_print("send EOF to node failed: %d", ret); 337 + } 338 + 339 + 340 + /* INIT failed but we don't know which node... 
341 + restart INIT on all pending nodes */ 342 + static void init_failed(void) 343 + { 344 + int i; 345 + struct nodeinfo *ni; 346 + 347 + for (i=1; i<=max_nodeid; i++) { 348 + ni = nodeid2nodeinfo(i, 0); 349 + if (!ni) 350 + continue; 351 + 352 + if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) { 353 + ni->assoc_id = 0; 354 + if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) { 355 + spin_lock_bh(&write_nodes_lock); 356 + list_add_tail(&ni->write_list, &write_nodes); 357 + spin_unlock_bh(&write_nodes_lock); 358 + } 359 + } 360 + } 361 + wake_up_process(send_task); 362 + } 363 + 364 + /* Something happened to an association */ 365 + static void process_sctp_notification(struct msghdr *msg, char *buf) 366 + { 367 + union sctp_notification *sn = (union sctp_notification *)buf; 368 + 369 + if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) { 370 + switch (sn->sn_assoc_change.sac_state) { 371 + 372 + case SCTP_COMM_UP: 373 + case SCTP_RESTART: 374 + { 375 + /* Check that the new node is in the lockspace */ 376 + struct sctp_prim prim; 377 + mm_segment_t fs; 378 + int nodeid; 379 + int prim_len, ret; 380 + int addr_len; 381 + struct nodeinfo *ni; 382 + 383 + /* This seems to happen when we received a connection 384 + * too early... or something... 
anyway, it happens but 385 + * we always seem to get a real message too, see 386 + * receive_from_sock */ 387 + 388 + if ((int)sn->sn_assoc_change.sac_assoc_id <= 0) { 389 + log_print("COMM_UP for invalid assoc ID %d", 390 + (int)sn->sn_assoc_change.sac_assoc_id); 391 + init_failed(); 392 + return; 393 + } 394 + memset(&prim, 0, sizeof(struct sctp_prim)); 395 + prim_len = sizeof(struct sctp_prim); 396 + prim.ssp_assoc_id = sn->sn_assoc_change.sac_assoc_id; 397 + 398 + fs = get_fs(); 399 + set_fs(get_ds()); 400 + ret = sctp_con.sock->ops->getsockopt(sctp_con.sock, 401 + IPPROTO_SCTP, SCTP_PRIMARY_ADDR, 402 + (char*)&prim, &prim_len); 403 + set_fs(fs); 404 + if (ret < 0) { 405 + struct nodeinfo *ni; 406 + 407 + log_print("getsockopt/sctp_primary_addr on " 408 + "new assoc %d failed : %d", 409 + (int)sn->sn_assoc_change.sac_assoc_id, ret); 410 + 411 + /* Retry INIT later */ 412 + ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id); 413 + if (ni) 414 + clear_bit(NI_INIT_PENDING, &ni->flags); 415 + return; 416 + } 417 + make_sockaddr(&prim.ssp_addr, 0, &addr_len); 418 + if (dlm_addr_to_nodeid(&prim.ssp_addr, &nodeid)) { 419 + log_print("reject connect from unknown addr"); 420 + send_shutdown(prim.ssp_assoc_id); 421 + return; 422 + } 423 + 424 + ni = nodeid2nodeinfo(nodeid, GFP_KERNEL); 425 + if (!ni) 426 + return; 427 + 428 + /* Save the assoc ID */ 429 + spin_lock(&ni->lock); 430 + ni->assoc_id = sn->sn_assoc_change.sac_assoc_id; 431 + spin_unlock(&ni->lock); 432 + 433 + log_print("got new/restarted association %d nodeid %d", 434 + (int)sn->sn_assoc_change.sac_assoc_id, nodeid); 435 + 436 + /* Send any pending writes */ 437 + clear_bit(NI_INIT_PENDING, &ni->flags); 438 + if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) { 439 + spin_lock_bh(&write_nodes_lock); 440 + list_add_tail(&ni->write_list, &write_nodes); 441 + spin_unlock_bh(&write_nodes_lock); 442 + } 443 + wake_up_process(send_task); 444 + } 445 + break; 446 + 447 + case SCTP_COMM_LOST: 448 + case 
SCTP_SHUTDOWN_COMP: 449 + { 450 + struct nodeinfo *ni; 451 + 452 + ni = assoc2nodeinfo(sn->sn_assoc_change.sac_assoc_id); 453 + if (ni) { 454 + spin_lock(&ni->lock); 455 + ni->assoc_id = 0; 456 + spin_unlock(&ni->lock); 457 + } 458 + } 459 + break; 460 + 461 + /* We don't know which INIT failed, so clear the PENDING flags 462 + * on them all. if assoc_id is zero then it will then try 463 + * again */ 464 + 465 + case SCTP_CANT_STR_ASSOC: 466 + { 467 + log_print("Can't start SCTP association - retrying"); 468 + init_failed(); 469 + } 470 + break; 471 + 472 + default: 473 + log_print("unexpected SCTP assoc change id=%d state=%d", 474 + (int)sn->sn_assoc_change.sac_assoc_id, 475 + sn->sn_assoc_change.sac_state); 476 + } 477 + } 478 + } 479 + 480 + /* Data received from remote end */ 481 + static int receive_from_sock(void) 482 + { 483 + int ret = 0; 484 + struct msghdr msg; 485 + struct kvec iov[2]; 486 + unsigned len; 487 + int r; 488 + struct sctp_sndrcvinfo *sinfo; 489 + struct cmsghdr *cmsg; 490 + struct nodeinfo *ni; 491 + 492 + /* These two are marginally too big for stack allocation, but this 493 + * function is (currently) only called by dlm_recvd so static should be 494 + * OK. 495 + */ 496 + static struct sockaddr_storage msgname; 497 + static char incmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; 498 + 499 + if (sctp_con.sock == NULL) 500 + goto out; 501 + 502 + if (sctp_con.rx_page == NULL) { 503 + /* 504 + * This doesn't need to be atomic, but I think it should 505 + * improve performance if it is. 
506 + */ 507 + sctp_con.rx_page = alloc_page(GFP_ATOMIC); 508 + if (sctp_con.rx_page == NULL) 509 + goto out_resched; 510 + CBUF_INIT(&sctp_con.cb, PAGE_CACHE_SIZE); 511 + } 512 + 513 + memset(&incmsg, 0, sizeof(incmsg)); 514 + memset(&msgname, 0, sizeof(msgname)); 515 + 516 + memset(incmsg, 0, sizeof(incmsg)); 517 + msg.msg_name = &msgname; 518 + msg.msg_namelen = sizeof(msgname); 519 + msg.msg_flags = 0; 520 + msg.msg_control = incmsg; 521 + msg.msg_controllen = sizeof(incmsg); 522 + 523 + /* I don't see why this circular buffer stuff is necessary for SCTP 524 + * which is a packet-based protocol, but the whole thing breaks under 525 + * load without it! The overhead is minimal (and is in the TCP lowcomms 526 + * anyway, of course) so I'll leave it in until I can figure out what's 527 + * really happening. 528 + */ 529 + 530 + /* 531 + * iov[0] is the bit of the circular buffer between the current end 532 + * point (cb.base + cb.len) and the end of the buffer. 533 + */ 534 + iov[0].iov_len = sctp_con.cb.base - CBUF_DATA(&sctp_con.cb); 535 + iov[0].iov_base = page_address(sctp_con.rx_page) + 536 + CBUF_DATA(&sctp_con.cb); 537 + iov[1].iov_len = 0; 538 + 539 + /* 540 + * iov[1] is the bit of the circular buffer between the start of the 541 + * buffer and the start of the currently used section (cb.base) 542 + */ 543 + if (CBUF_DATA(&sctp_con.cb) >= sctp_con.cb.base) { 544 + iov[0].iov_len = PAGE_CACHE_SIZE - CBUF_DATA(&sctp_con.cb); 545 + iov[1].iov_len = sctp_con.cb.base; 546 + iov[1].iov_base = page_address(sctp_con.rx_page); 547 + msg.msg_iovlen = 2; 548 + } 549 + len = iov[0].iov_len + iov[1].iov_len; 550 + 551 + r = ret = kernel_recvmsg(sctp_con.sock, &msg, iov, 1, len, 552 + MSG_NOSIGNAL | MSG_DONTWAIT); 553 + if (ret <= 0) 554 + goto out_close; 555 + 556 + msg.msg_control = incmsg; 557 + msg.msg_controllen = sizeof(incmsg); 558 + cmsg = CMSG_FIRSTHDR(&msg); 559 + sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); 560 + 561 + if (msg.msg_flags & 
MSG_NOTIFICATION) { 562 + process_sctp_notification(&msg, page_address(sctp_con.rx_page)); 563 + return 0; 564 + } 565 + 566 + /* Is this a new association ? */ 567 + ni = nodeid2nodeinfo(le32_to_cpu(sinfo->sinfo_ppid), GFP_KERNEL); 568 + if (ni) { 569 + ni->assoc_id = sinfo->sinfo_assoc_id; 570 + if (test_and_clear_bit(NI_INIT_PENDING, &ni->flags)) { 571 + 572 + if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) { 573 + spin_lock_bh(&write_nodes_lock); 574 + list_add_tail(&ni->write_list, &write_nodes); 575 + spin_unlock_bh(&write_nodes_lock); 576 + } 577 + wake_up_process(send_task); 578 + } 579 + } 580 + 581 + /* INIT sends a message with length of 1 - ignore it */ 582 + if (r == 1) 583 + return 0; 584 + 585 + CBUF_ADD(&sctp_con.cb, ret); 586 + ret = dlm_process_incoming_buffer(cpu_to_le32(sinfo->sinfo_ppid), 587 + page_address(sctp_con.rx_page), 588 + sctp_con.cb.base, sctp_con.cb.len, 589 + PAGE_CACHE_SIZE); 590 + if (ret < 0) 591 + goto out_close; 592 + CBUF_EAT(&sctp_con.cb, ret); 593 + 594 + out: 595 + ret = 0; 596 + goto out_ret; 597 + 598 + out_resched: 599 + lowcomms_data_ready(sctp_con.sock->sk, 0); 600 + ret = 0; 601 + schedule(); 602 + goto out_ret; 603 + 604 + out_close: 605 + if (ret != -EAGAIN) 606 + log_print("error reading from sctp socket: %d", ret); 607 + out_ret: 608 + return ret; 609 + } 610 + 611 + /* Bind to an IP address. 
SCTP allows multiple address so it can do multi-homing */ 612 + static int add_bind_addr(struct sockaddr_storage *addr, int addr_len, int num) 613 + { 614 + mm_segment_t fs; 615 + int result = 0; 616 + 617 + fs = get_fs(); 618 + set_fs(get_ds()); 619 + if (num == 1) 620 + result = sctp_con.sock->ops->bind(sctp_con.sock, 621 + (struct sockaddr *) addr, addr_len); 622 + else 623 + result = sctp_con.sock->ops->setsockopt(sctp_con.sock, SOL_SCTP, 624 + SCTP_SOCKOPT_BINDX_ADD, (char *)addr, addr_len); 625 + set_fs(fs); 626 + 627 + if (result < 0) 628 + log_print("Can't bind to port %d addr number %d", 629 + dlm_config.tcp_port, num); 630 + 631 + return result; 632 + } 633 + 634 + static void init_local(void) 635 + { 636 + struct sockaddr_storage sas, *addr; 637 + int i; 638 + 639 + local_nodeid = dlm_our_nodeid(); 640 + 641 + for (i = 0; i < DLM_MAX_ADDR_COUNT - 1; i++) { 642 + if (dlm_our_addr(&sas, i)) 643 + break; 644 + 645 + addr = kmalloc(sizeof(*addr), GFP_KERNEL); 646 + if (!addr) 647 + break; 648 + memcpy(addr, &sas, sizeof(*addr)); 649 + local_addr[local_count++] = addr; 650 + } 651 + } 652 + 653 + /* Initialise SCTP socket and bind to all interfaces */ 654 + static int init_sock(void) 655 + { 656 + mm_segment_t fs; 657 + struct socket *sock = NULL; 658 + struct sockaddr_storage localaddr; 659 + struct sctp_event_subscribe subscribe; 660 + int result = -EINVAL, num = 1, i, addr_len; 661 + 662 + if (!local_count) { 663 + init_local(); 664 + if (!local_count) { 665 + log_print("no local IP address has been set"); 666 + goto out; 667 + } 668 + } 669 + 670 + result = sock_create_kern(local_addr[0]->ss_family, SOCK_SEQPACKET, 671 + IPPROTO_SCTP, &sock); 672 + if (result < 0) { 673 + log_print("Can't create comms socket, check SCTP is loaded"); 674 + goto out; 675 + } 676 + 677 + /* Listen for events */ 678 + memset(&subscribe, 0, sizeof(subscribe)); 679 + subscribe.sctp_data_io_event = 1; 680 + subscribe.sctp_association_event = 1; 681 + 
subscribe.sctp_send_failure_event = 1; 682 + subscribe.sctp_shutdown_event = 1; 683 + subscribe.sctp_partial_delivery_event = 1; 684 + 685 + fs = get_fs(); 686 + set_fs(get_ds()); 687 + result = sock->ops->setsockopt(sock, SOL_SCTP, SCTP_EVENTS, 688 + (char *)&subscribe, sizeof(subscribe)); 689 + set_fs(fs); 690 + 691 + if (result < 0) { 692 + log_print("Failed to set SCTP_EVENTS on socket: result=%d", 693 + result); 694 + goto create_delsock; 695 + } 696 + 697 + /* Init con struct */ 698 + sock->sk->sk_user_data = &sctp_con; 699 + sctp_con.sock = sock; 700 + sctp_con.sock->sk->sk_data_ready = lowcomms_data_ready; 701 + 702 + /* Bind to all interfaces. */ 703 + for (i = 0; i < local_count; i++) { 704 + memcpy(&localaddr, local_addr[i], sizeof(localaddr)); 705 + make_sockaddr(&localaddr, dlm_config.tcp_port, &addr_len); 706 + 707 + result = add_bind_addr(&localaddr, addr_len, num); 708 + if (result) 709 + goto create_delsock; 710 + ++num; 711 + } 712 + 713 + result = sock->ops->listen(sock, 5); 714 + if (result < 0) { 715 + log_print("Can't set socket listening"); 716 + goto create_delsock; 717 + } 718 + 719 + return 0; 720 + 721 + create_delsock: 722 + sock_release(sock); 723 + sctp_con.sock = NULL; 724 + out: 725 + return result; 726 + } 727 + 728 + 729 + static struct writequeue_entry *new_writequeue_entry(int allocation) 730 + { 731 + struct writequeue_entry *entry; 732 + 733 + entry = kmalloc(sizeof(struct writequeue_entry), allocation); 734 + if (!entry) 735 + return NULL; 736 + 737 + entry->page = alloc_page(allocation); 738 + if (!entry->page) { 739 + kfree(entry); 740 + return NULL; 741 + } 742 + 743 + entry->offset = 0; 744 + entry->len = 0; 745 + entry->end = 0; 746 + entry->users = 0; 747 + 748 + return entry; 749 + } 750 + 751 + void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc) 752 + { 753 + struct writequeue_entry *e; 754 + int offset = 0; 755 + int users = 0; 756 + struct nodeinfo *ni; 757 + 758 + if 
(!atomic_read(&accepting)) 759 + return NULL; 760 + 761 + ni = nodeid2nodeinfo(nodeid, allocation); 762 + if (!ni) 763 + return NULL; 764 + 765 + spin_lock(&ni->writequeue_lock); 766 + e = list_entry(ni->writequeue.prev, struct writequeue_entry, list); 767 + if (((struct list_head *) e == &ni->writequeue) || 768 + (PAGE_CACHE_SIZE - e->end < len)) { 769 + e = NULL; 770 + } else { 771 + offset = e->end; 772 + e->end += len; 773 + users = e->users++; 774 + } 775 + spin_unlock(&ni->writequeue_lock); 776 + 777 + if (e) { 778 + got_one: 779 + if (users == 0) 780 + kmap(e->page); 781 + *ppc = page_address(e->page) + offset; 782 + return e; 783 + } 784 + 785 + e = new_writequeue_entry(allocation); 786 + if (e) { 787 + spin_lock(&ni->writequeue_lock); 788 + offset = e->end; 789 + e->end += len; 790 + e->ni = ni; 791 + users = e->users++; 792 + list_add_tail(&e->list, &ni->writequeue); 793 + spin_unlock(&ni->writequeue_lock); 794 + goto got_one; 795 + } 796 + return NULL; 797 + } 798 + 799 + void dlm_lowcomms_commit_buffer(void *arg) 800 + { 801 + struct writequeue_entry *e = (struct writequeue_entry *) arg; 802 + int users; 803 + struct nodeinfo *ni = e->ni; 804 + 805 + if (!atomic_read(&accepting)) 806 + return; 807 + 808 + spin_lock(&ni->writequeue_lock); 809 + users = --e->users; 810 + if (users) 811 + goto out; 812 + e->len = e->end - e->offset; 813 + kunmap(e->page); 814 + spin_unlock(&ni->writequeue_lock); 815 + 816 + if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) { 817 + spin_lock_bh(&write_nodes_lock); 818 + list_add_tail(&ni->write_list, &write_nodes); 819 + spin_unlock_bh(&write_nodes_lock); 820 + wake_up_process(send_task); 821 + } 822 + return; 823 + 824 + out: 825 + spin_unlock(&ni->writequeue_lock); 826 + return; 827 + } 828 + 829 + static void free_entry(struct writequeue_entry *e) 830 + { 831 + __free_page(e->page); 832 + kfree(e); 833 + } 834 + 835 + /* Initiate an SCTP association. 
In theory we could just use sendmsg() on 836 + the first IP address and it should work, but this allows us to set up the 837 + association before sending any valuable data that we can't afford to lose. 838 + It also keeps the send path clean as it can now always use the association ID */ 839 + static void initiate_association(int nodeid) 840 + { 841 + struct sockaddr_storage rem_addr; 842 + static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; 843 + struct msghdr outmessage; 844 + struct cmsghdr *cmsg; 845 + struct sctp_sndrcvinfo *sinfo; 846 + int ret; 847 + int addrlen; 848 + char buf[1]; 849 + struct kvec iov[1]; 850 + struct nodeinfo *ni; 851 + 852 + log_print("Initiating association with node %d", nodeid); 853 + 854 + ni = nodeid2nodeinfo(nodeid, GFP_KERNEL); 855 + if (!ni) 856 + return; 857 + 858 + if (nodeid_to_addr(nodeid, (struct sockaddr *)&rem_addr)) { 859 + log_print("no address for nodeid %d", nodeid); 860 + return; 861 + } 862 + 863 + make_sockaddr(&rem_addr, dlm_config.tcp_port, &addrlen); 864 + 865 + outmessage.msg_name = &rem_addr; 866 + outmessage.msg_namelen = addrlen; 867 + outmessage.msg_control = outcmsg; 868 + outmessage.msg_controllen = sizeof(outcmsg); 869 + outmessage.msg_flags = MSG_EOR; 870 + 871 + iov[0].iov_base = buf; 872 + iov[0].iov_len = 1; 873 + 874 + /* Real INIT messages seem to cause trouble. 
Just send a 1 byte message 875 + we can afford to lose */ 876 + cmsg = CMSG_FIRSTHDR(&outmessage); 877 + cmsg->cmsg_level = IPPROTO_SCTP; 878 + cmsg->cmsg_type = SCTP_SNDRCV; 879 + cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); 880 + sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); 881 + memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); 882 + sinfo->sinfo_ppid = cpu_to_le32(local_nodeid); 883 + 884 + outmessage.msg_controllen = cmsg->cmsg_len; 885 + ret = kernel_sendmsg(sctp_con.sock, &outmessage, iov, 1, 1); 886 + if (ret < 0) { 887 + log_print("send INIT to node failed: %d", ret); 888 + /* Try again later */ 889 + clear_bit(NI_INIT_PENDING, &ni->flags); 890 + } 891 + } 892 + 893 + /* Send a message */ 894 + static int send_to_sock(struct nodeinfo *ni) 895 + { 896 + int ret = 0; 897 + struct writequeue_entry *e; 898 + int len, offset; 899 + struct msghdr outmsg; 900 + static char outcmsg[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))]; 901 + struct cmsghdr *cmsg; 902 + struct sctp_sndrcvinfo *sinfo; 903 + struct kvec iov; 904 + 905 + /* See if we need to init an association before we start 906 + sending precious messages */ 907 + spin_lock(&ni->lock); 908 + if (!ni->assoc_id && !test_and_set_bit(NI_INIT_PENDING, &ni->flags)) { 909 + spin_unlock(&ni->lock); 910 + initiate_association(ni->nodeid); 911 + return 0; 912 + } 913 + spin_unlock(&ni->lock); 914 + 915 + outmsg.msg_name = NULL; /* We use assoc_id */ 916 + outmsg.msg_namelen = 0; 917 + outmsg.msg_control = outcmsg; 918 + outmsg.msg_controllen = sizeof(outcmsg); 919 + outmsg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL | MSG_EOR; 920 + 921 + cmsg = CMSG_FIRSTHDR(&outmsg); 922 + cmsg->cmsg_level = IPPROTO_SCTP; 923 + cmsg->cmsg_type = SCTP_SNDRCV; 924 + cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); 925 + sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); 926 + memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); 927 + sinfo->sinfo_ppid = cpu_to_le32(local_nodeid); 928 + 
sinfo->sinfo_assoc_id = ni->assoc_id; 929 + outmsg.msg_controllen = cmsg->cmsg_len; 930 + 931 + spin_lock(&ni->writequeue_lock); 932 + for (;;) { 933 + if (list_empty(&ni->writequeue)) 934 + break; 935 + e = list_entry(ni->writequeue.next, struct writequeue_entry, 936 + list); 937 + kmap(e->page); 938 + len = e->len; 939 + offset = e->offset; 940 + BUG_ON(len == 0 && e->users == 0); 941 + spin_unlock(&ni->writequeue_lock); 942 + 943 + ret = 0; 944 + if (len) { 945 + iov.iov_base = page_address(e->page)+offset; 946 + iov.iov_len = len; 947 + 948 + ret = kernel_sendmsg(sctp_con.sock, &outmsg, &iov, 1, 949 + len); 950 + if (ret == -EAGAIN) { 951 + sctp_con.eagain_flag = 1; 952 + goto out; 953 + } else if (ret < 0) 954 + goto send_error; 955 + } else { 956 + /* Don't starve people filling buffers */ 957 + schedule(); 958 + } 959 + 960 + spin_lock(&ni->writequeue_lock); 961 + e->offset += ret; 962 + e->len -= ret; 963 + 964 + if (e->len == 0 && e->users == 0) { 965 + list_del(&e->list); 966 + free_entry(e); 967 + continue; 968 + } 969 + } 970 + spin_unlock(&ni->writequeue_lock); 971 + out: 972 + return ret; 973 + 974 + send_error: 975 + log_print("Error sending to node %d %d", ni->nodeid, ret); 976 + spin_lock(&ni->lock); 977 + if (!test_and_set_bit(NI_INIT_PENDING, &ni->flags)) { 978 + ni->assoc_id = 0; 979 + spin_unlock(&ni->lock); 980 + initiate_association(ni->nodeid); 981 + } else 982 + spin_unlock(&ni->lock); 983 + 984 + return ret; 985 + } 986 + 987 + /* Try to send any messages that are pending */ 988 + static void process_output_queue(void) 989 + { 990 + struct list_head *list; 991 + struct list_head *temp; 992 + 993 + spin_lock_bh(&write_nodes_lock); 994 + list_for_each_safe(list, temp, &write_nodes) { 995 + struct nodeinfo *ni = 996 + list_entry(list, struct nodeinfo, write_list); 997 + clear_bit(NI_WRITE_PENDING, &ni->flags); 998 + list_del(&ni->write_list); 999 + 1000 + spin_unlock_bh(&write_nodes_lock); 1001 + 1002 + send_to_sock(ni); 1003 + 
spin_lock_bh(&write_nodes_lock); 1004 + } 1005 + spin_unlock_bh(&write_nodes_lock); 1006 + } 1007 + 1008 + /* Called after we've had -EAGAIN and been woken up */ 1009 + static void refill_write_queue(void) 1010 + { 1011 + int i; 1012 + 1013 + for (i=1; i<=max_nodeid; i++) { 1014 + struct nodeinfo *ni = nodeid2nodeinfo(i, 0); 1015 + 1016 + if (ni) { 1017 + if (!test_and_set_bit(NI_WRITE_PENDING, &ni->flags)) { 1018 + spin_lock_bh(&write_nodes_lock); 1019 + list_add_tail(&ni->write_list, &write_nodes); 1020 + spin_unlock_bh(&write_nodes_lock); 1021 + } 1022 + } 1023 + } 1024 + } 1025 + 1026 + static void clean_one_writequeue(struct nodeinfo *ni) 1027 + { 1028 + struct list_head *list; 1029 + struct list_head *temp; 1030 + 1031 + spin_lock(&ni->writequeue_lock); 1032 + list_for_each_safe(list, temp, &ni->writequeue) { 1033 + struct writequeue_entry *e = 1034 + list_entry(list, struct writequeue_entry, list); 1035 + list_del(&e->list); 1036 + free_entry(e); 1037 + } 1038 + spin_unlock(&ni->writequeue_lock); 1039 + } 1040 + 1041 + static void clean_writequeues(void) 1042 + { 1043 + int i; 1044 + 1045 + for (i=1; i<=max_nodeid; i++) { 1046 + struct nodeinfo *ni = nodeid2nodeinfo(i, 0); 1047 + if (ni) 1048 + clean_one_writequeue(ni); 1049 + } 1050 + } 1051 + 1052 + 1053 + static void dealloc_nodeinfo(void) 1054 + { 1055 + int i; 1056 + 1057 + for (i=1; i<=max_nodeid; i++) { 1058 + struct nodeinfo *ni = nodeid2nodeinfo(i, 0); 1059 + if (ni) { 1060 + idr_remove(&nodeinfo_idr, i); 1061 + kfree(ni); 1062 + } 1063 + } 1064 + } 1065 + 1066 + static int write_list_empty(void) 1067 + { 1068 + int status; 1069 + 1070 + spin_lock_bh(&write_nodes_lock); 1071 + status = list_empty(&write_nodes); 1072 + spin_unlock_bh(&write_nodes_lock); 1073 + 1074 + return status; 1075 + } 1076 + 1077 + static int dlm_recvd(void *data) 1078 + { 1079 + DECLARE_WAITQUEUE(wait, current); 1080 + 1081 + while (!kthread_should_stop()) { 1082 + int count = 0; 1083 + 1084 + 
set_current_state(TASK_INTERRUPTIBLE); 1085 + add_wait_queue(&lowcomms_recv_wait, &wait); 1086 + if (!test_bit(CF_READ_PENDING, &sctp_con.flags)) 1087 + schedule(); 1088 + remove_wait_queue(&lowcomms_recv_wait, &wait); 1089 + set_current_state(TASK_RUNNING); 1090 + 1091 + if (test_and_clear_bit(CF_READ_PENDING, &sctp_con.flags)) { 1092 + int ret; 1093 + 1094 + do { 1095 + ret = receive_from_sock(); 1096 + 1097 + /* Don't starve out everyone else */ 1098 + if (++count >= MAX_RX_MSG_COUNT) { 1099 + schedule(); 1100 + count = 0; 1101 + } 1102 + } while (!kthread_should_stop() && ret >=0); 1103 + } 1104 + schedule(); 1105 + } 1106 + 1107 + return 0; 1108 + } 1109 + 1110 + static int dlm_sendd(void *data) 1111 + { 1112 + DECLARE_WAITQUEUE(wait, current); 1113 + 1114 + add_wait_queue(sctp_con.sock->sk->sk_sleep, &wait); 1115 + 1116 + while (!kthread_should_stop()) { 1117 + set_current_state(TASK_INTERRUPTIBLE); 1118 + if (write_list_empty()) 1119 + schedule(); 1120 + set_current_state(TASK_RUNNING); 1121 + 1122 + if (sctp_con.eagain_flag) { 1123 + sctp_con.eagain_flag = 0; 1124 + refill_write_queue(); 1125 + } 1126 + process_output_queue(); 1127 + } 1128 + 1129 + remove_wait_queue(sctp_con.sock->sk->sk_sleep, &wait); 1130 + 1131 + return 0; 1132 + } 1133 + 1134 + static void daemons_stop(void) 1135 + { 1136 + kthread_stop(recv_task); 1137 + kthread_stop(send_task); 1138 + } 1139 + 1140 + static int daemons_start(void) 1141 + { 1142 + struct task_struct *p; 1143 + int error; 1144 + 1145 + p = kthread_run(dlm_recvd, NULL, "dlm_recvd"); 1146 + error = IS_ERR(p); 1147 + if (error) { 1148 + log_print("can't start dlm_recvd %d", error); 1149 + return error; 1150 + } 1151 + recv_task = p; 1152 + 1153 + p = kthread_run(dlm_sendd, NULL, "dlm_sendd"); 1154 + error = IS_ERR(p); 1155 + if (error) { 1156 + log_print("can't start dlm_sendd %d", error); 1157 + kthread_stop(recv_task); 1158 + return error; 1159 + } 1160 + send_task = p; 1161 + 1162 + return 0; 1163 + } 1164 + 1165 + /* 
1166 + * This is quite likely to sleep... 1167 + */ 1168 + int dlm_lowcomms_start(void) 1169 + { 1170 + int error; 1171 + 1172 + spin_lock_init(&write_nodes_lock); 1173 + INIT_LIST_HEAD(&write_nodes); 1174 + init_rwsem(&nodeinfo_lock); 1175 + 1176 + error = init_sock(); 1177 + if (error) 1178 + goto fail_sock; 1179 + error = daemons_start(); 1180 + if (error) 1181 + goto fail_sock; 1182 + atomic_set(&accepting, 1); 1183 + return 0; 1184 + 1185 + fail_sock: 1186 + close_connection(); 1187 + return error; 1188 + } 1189 + 1190 + /* Set all the activity flags to prevent any socket activity. */ 1191 + 1192 + void dlm_lowcomms_stop(void) 1193 + { 1194 + atomic_set(&accepting, 0); 1195 + sctp_con.flags = 0x7; 1196 + daemons_stop(); 1197 + clean_writequeues(); 1198 + close_connection(); 1199 + dealloc_nodeinfo(); 1200 + max_nodeid = 0; 1201 + } 1202 + 1203 + int dlm_lowcomms_init(void) 1204 + { 1205 + init_waitqueue_head(&lowcomms_recv_wait); 1206 + return 0; 1207 + } 1208 + 1209 + void dlm_lowcomms_exit(void) 1210 + { 1211 + int i; 1212 + 1213 + for (i = 0; i < local_count; i++) 1214 + kfree(local_addr[i]); 1215 + local_count = 0; 1216 + local_nodeid = 0; 1217 + } 1218 +
+25
fs/dlm/lowcomms.h
/******************************************************************************
*******************************************************************************
**
**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
**
**  This copyrighted material is made available to anyone wishing to use,
**  modify, copy, or redistribute it subject to the terms and conditions
**  of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/

#ifndef __LOWCOMMS_DOT_H__
#define __LOWCOMMS_DOT_H__

/* one-time module setup/teardown of the low-level comms layer */
int dlm_lowcomms_init(void);
void dlm_lowcomms_exit(void);

/* bring the sockets and daemon threads up/down */
int dlm_lowcomms_start(void);
void dlm_lowcomms_stop(void);

/* reserve outgoing buffer space for a message to nodeid, then commit the
   returned handle once the caller has filled in *ppc */
void *dlm_lowcomms_get_buffer(int nodeid, int len, int allocation, char **ppc);
void dlm_lowcomms_commit_buffer(void *mh);

#endif				/* __LOWCOMMS_DOT_H__ */
+18
fs/dlm/lvb_table.h
/******************************************************************************
*******************************************************************************
**
**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
**
**  This copyrighted material is made available to anyone wishing to use,
**  modify, copy, or redistribute it subject to the terms and conditions
**  of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/

#ifndef __LVB_TABLE_DOT_H__
#define __LVB_TABLE_DOT_H__

/* 8x8 table indexed by lock modes that decides how lock value blocks are
   handled; see the table's definition for the exact semantics */
extern const int dlm_lvb_operations[8][8];

#endif				/* __LVB_TABLE_DOT_H__ */
+89
fs/dlm/main.c
··· 1 + /****************************************************************************** 2 + ******************************************************************************* 3 + ** 4 + ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 5 + ** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 6 + ** 7 + ** This copyrighted material is made available to anyone wishing to use, 8 + ** modify, copy, or redistribute it subject to the terms and conditions 9 + ** of the GNU General Public License v.2. 10 + ** 11 + ******************************************************************************* 12 + ******************************************************************************/ 13 + 14 + #include "dlm_internal.h" 15 + #include "lockspace.h" 16 + #include "lock.h" 17 + #include "memory.h" 18 + #include "lowcomms.h" 19 + #include "config.h" 20 + 21 + #ifdef CONFIG_DLM_DEBUG 22 + int dlm_register_debugfs(void); 23 + void dlm_unregister_debugfs(void); 24 + #else 25 + static inline int dlm_register_debugfs(void) { return 0; } 26 + static inline void dlm_unregister_debugfs(void) { } 27 + #endif 28 + 29 + static int __init init_dlm(void) 30 + { 31 + int error; 32 + 33 + error = dlm_memory_init(); 34 + if (error) 35 + goto out; 36 + 37 + error = dlm_lockspace_init(); 38 + if (error) 39 + goto out_mem; 40 + 41 + error = dlm_config_init(); 42 + if (error) 43 + goto out_lockspace; 44 + 45 + error = dlm_register_debugfs(); 46 + if (error) 47 + goto out_config; 48 + 49 + error = dlm_lowcomms_init(); 50 + if (error) 51 + goto out_debug; 52 + 53 + printk("DLM (built %s %s) installed\n", __DATE__, __TIME__); 54 + 55 + return 0; 56 + 57 + out_debug: 58 + dlm_unregister_debugfs(); 59 + out_config: 60 + dlm_config_exit(); 61 + out_lockspace: 62 + dlm_lockspace_exit(); 63 + out_mem: 64 + dlm_memory_exit(); 65 + out: 66 + return error; 67 + } 68 + 69 + static void __exit exit_dlm(void) 70 + { 71 + dlm_lowcomms_exit(); 72 + dlm_config_exit(); 73 + dlm_memory_exit(); 74 
+ dlm_lockspace_exit(); 75 + dlm_unregister_debugfs(); 76 + } 77 + 78 + module_init(init_dlm); 79 + module_exit(exit_dlm); 80 + 81 + MODULE_DESCRIPTION("Distributed Lock Manager"); 82 + MODULE_AUTHOR("Red Hat, Inc."); 83 + MODULE_LICENSE("GPL"); 84 + 85 + EXPORT_SYMBOL_GPL(dlm_new_lockspace); 86 + EXPORT_SYMBOL_GPL(dlm_release_lockspace); 87 + EXPORT_SYMBOL_GPL(dlm_lock); 88 + EXPORT_SYMBOL_GPL(dlm_unlock); 89 +
+314
fs/dlm/member.c
··· 1 + /****************************************************************************** 2 + ******************************************************************************* 3 + ** 4 + ** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 5 + ** 6 + ** This copyrighted material is made available to anyone wishing to use, 7 + ** modify, copy, or redistribute it subject to the terms and conditions 8 + ** of the GNU General Public License v.2. 9 + ** 10 + ******************************************************************************* 11 + ******************************************************************************/ 12 + 13 + #include "dlm_internal.h" 14 + #include "lockspace.h" 15 + #include "member.h" 16 + #include "recoverd.h" 17 + #include "recover.h" 18 + #include "lowcomms.h" 19 + #include "rcom.h" 20 + #include "config.h" 21 + 22 + /* 23 + * Following called by dlm_recoverd thread 24 + */ 25 + 26 + static void add_ordered_member(struct dlm_ls *ls, struct dlm_member *new) 27 + { 28 + struct dlm_member *memb = NULL; 29 + struct list_head *tmp; 30 + struct list_head *newlist = &new->list; 31 + struct list_head *head = &ls->ls_nodes; 32 + 33 + list_for_each(tmp, head) { 34 + memb = list_entry(tmp, struct dlm_member, list); 35 + if (new->nodeid < memb->nodeid) 36 + break; 37 + } 38 + 39 + if (!memb) 40 + list_add_tail(newlist, head); 41 + else { 42 + /* FIXME: can use list macro here */ 43 + newlist->prev = tmp->prev; 44 + newlist->next = tmp; 45 + tmp->prev->next = newlist; 46 + tmp->prev = newlist; 47 + } 48 + } 49 + 50 + static int dlm_add_member(struct dlm_ls *ls, int nodeid) 51 + { 52 + struct dlm_member *memb; 53 + int w; 54 + 55 + memb = kmalloc(sizeof(struct dlm_member), GFP_KERNEL); 56 + if (!memb) 57 + return -ENOMEM; 58 + 59 + w = dlm_node_weight(ls->ls_name, nodeid); 60 + if (w < 0) 61 + return w; 62 + 63 + memb->nodeid = nodeid; 64 + memb->weight = w; 65 + add_ordered_member(ls, memb); 66 + ls->ls_num_nodes++; 67 + return 0; 68 + } 69 + 70 + static 
void dlm_remove_member(struct dlm_ls *ls, struct dlm_member *memb) 71 + { 72 + list_move(&memb->list, &ls->ls_nodes_gone); 73 + ls->ls_num_nodes--; 74 + } 75 + 76 + static int dlm_is_member(struct dlm_ls *ls, int nodeid) 77 + { 78 + struct dlm_member *memb; 79 + 80 + list_for_each_entry(memb, &ls->ls_nodes, list) { 81 + if (memb->nodeid == nodeid) 82 + return TRUE; 83 + } 84 + return FALSE; 85 + } 86 + 87 + int dlm_is_removed(struct dlm_ls *ls, int nodeid) 88 + { 89 + struct dlm_member *memb; 90 + 91 + list_for_each_entry(memb, &ls->ls_nodes_gone, list) { 92 + if (memb->nodeid == nodeid) 93 + return TRUE; 94 + } 95 + return FALSE; 96 + } 97 + 98 + static void clear_memb_list(struct list_head *head) 99 + { 100 + struct dlm_member *memb; 101 + 102 + while (!list_empty(head)) { 103 + memb = list_entry(head->next, struct dlm_member, list); 104 + list_del(&memb->list); 105 + kfree(memb); 106 + } 107 + } 108 + 109 + void dlm_clear_members(struct dlm_ls *ls) 110 + { 111 + clear_memb_list(&ls->ls_nodes); 112 + ls->ls_num_nodes = 0; 113 + } 114 + 115 + void dlm_clear_members_gone(struct dlm_ls *ls) 116 + { 117 + clear_memb_list(&ls->ls_nodes_gone); 118 + } 119 + 120 + static void make_member_array(struct dlm_ls *ls) 121 + { 122 + struct dlm_member *memb; 123 + int i, w, x = 0, total = 0, all_zero = 0, *array; 124 + 125 + kfree(ls->ls_node_array); 126 + ls->ls_node_array = NULL; 127 + 128 + list_for_each_entry(memb, &ls->ls_nodes, list) { 129 + if (memb->weight) 130 + total += memb->weight; 131 + } 132 + 133 + /* all nodes revert to weight of 1 if all have weight 0 */ 134 + 135 + if (!total) { 136 + total = ls->ls_num_nodes; 137 + all_zero = 1; 138 + } 139 + 140 + ls->ls_total_weight = total; 141 + 142 + array = kmalloc(sizeof(int) * total, GFP_KERNEL); 143 + if (!array) 144 + return; 145 + 146 + list_for_each_entry(memb, &ls->ls_nodes, list) { 147 + if (!all_zero && !memb->weight) 148 + continue; 149 + 150 + if (all_zero) 151 + w = 1; 152 + else 153 + w = memb->weight; 154 
+ 155 + DLM_ASSERT(x < total, printk("total %d x %d\n", total, x);); 156 + 157 + for (i = 0; i < w; i++) 158 + array[x++] = memb->nodeid; 159 + } 160 + 161 + ls->ls_node_array = array; 162 + } 163 + 164 + /* send a status request to all members just to establish comms connections */ 165 + 166 + static void ping_members(struct dlm_ls *ls) 167 + { 168 + struct dlm_member *memb; 169 + list_for_each_entry(memb, &ls->ls_nodes, list) 170 + dlm_rcom_status(ls, memb->nodeid); 171 + } 172 + 173 + int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out) 174 + { 175 + struct dlm_member *memb, *safe; 176 + int i, error, found, pos = 0, neg = 0, low = -1; 177 + 178 + /* move departed members from ls_nodes to ls_nodes_gone */ 179 + 180 + list_for_each_entry_safe(memb, safe, &ls->ls_nodes, list) { 181 + found = FALSE; 182 + for (i = 0; i < rv->node_count; i++) { 183 + if (memb->nodeid == rv->nodeids[i]) { 184 + found = TRUE; 185 + break; 186 + } 187 + } 188 + 189 + if (!found) { 190 + neg++; 191 + dlm_remove_member(ls, memb); 192 + log_debug(ls, "remove member %d", memb->nodeid); 193 + } 194 + } 195 + 196 + /* add new members to ls_nodes */ 197 + 198 + for (i = 0; i < rv->node_count; i++) { 199 + if (dlm_is_member(ls, rv->nodeids[i])) 200 + continue; 201 + dlm_add_member(ls, rv->nodeids[i]); 202 + pos++; 203 + log_debug(ls, "add member %d", rv->nodeids[i]); 204 + } 205 + 206 + list_for_each_entry(memb, &ls->ls_nodes, list) { 207 + if (low == -1 || memb->nodeid < low) 208 + low = memb->nodeid; 209 + } 210 + ls->ls_low_nodeid = low; 211 + 212 + make_member_array(ls); 213 + dlm_set_recover_status(ls, DLM_RS_NODES); 214 + *neg_out = neg; 215 + 216 + ping_members(ls); 217 + 218 + error = dlm_recover_members_wait(ls); 219 + log_debug(ls, "total members %d", ls->ls_num_nodes); 220 + return error; 221 + } 222 + 223 + /* 224 + * Following called from lockspace.c 225 + */ 226 + 227 + int dlm_ls_stop(struct dlm_ls *ls) 228 + { 229 + int new; 230 + 231 + /* 232 + * A 
stop cancels any recovery that's in progress (see RECOVERY_STOP, 233 + * dlm_recovery_stopped()) and prevents any new locks from being 234 + * processed (see RUNNING, dlm_locking_stopped()). 235 + */ 236 + 237 + spin_lock(&ls->ls_recover_lock); 238 + set_bit(LSFL_RECOVERY_STOP, &ls->ls_flags); 239 + new = test_and_clear_bit(LSFL_RUNNING, &ls->ls_flags); 240 + ls->ls_recover_seq++; 241 + spin_unlock(&ls->ls_recover_lock); 242 + 243 + /* 244 + * This in_recovery lock does two things: 245 + * 246 + * 1) Keeps this function from returning until all threads are out 247 + * of locking routines and locking is truely stopped. 248 + * 2) Keeps any new requests from being processed until it's unlocked 249 + * when recovery is complete. 250 + */ 251 + 252 + if (new) 253 + down_write(&ls->ls_in_recovery); 254 + 255 + /* 256 + * The recoverd suspend/resume makes sure that dlm_recoverd (if 257 + * running) has noticed the clearing of RUNNING above and quit 258 + * processing the previous recovery. This will be true for all nodes 259 + * before any nodes start the new recovery. 
260 + */ 261 + 262 + dlm_recoverd_suspend(ls); 263 + ls->ls_recover_status = 0; 264 + dlm_recoverd_resume(ls); 265 + return 0; 266 + } 267 + 268 + int dlm_ls_start(struct dlm_ls *ls) 269 + { 270 + struct dlm_recover *rv = NULL, *rv_old; 271 + int *ids = NULL; 272 + int error, count; 273 + 274 + rv = kmalloc(sizeof(struct dlm_recover), GFP_KERNEL); 275 + if (!rv) 276 + return -ENOMEM; 277 + memset(rv, 0, sizeof(struct dlm_recover)); 278 + 279 + error = count = dlm_nodeid_list(ls->ls_name, &ids); 280 + if (error <= 0) 281 + goto fail; 282 + 283 + spin_lock(&ls->ls_recover_lock); 284 + 285 + /* the lockspace needs to be stopped before it can be started */ 286 + 287 + if (!dlm_locking_stopped(ls)) { 288 + spin_unlock(&ls->ls_recover_lock); 289 + log_error(ls, "start ignored: lockspace running"); 290 + error = -EINVAL; 291 + goto fail; 292 + } 293 + 294 + rv->nodeids = ids; 295 + rv->node_count = count; 296 + rv->seq = ++ls->ls_recover_seq; 297 + rv_old = ls->ls_recover_args; 298 + ls->ls_recover_args = rv; 299 + spin_unlock(&ls->ls_recover_lock); 300 + 301 + if (rv_old) { 302 + kfree(rv_old->nodeids); 303 + kfree(rv_old); 304 + } 305 + 306 + dlm_recoverd_kick(ls); 307 + return 0; 308 + 309 + fail: 310 + kfree(rv); 311 + kfree(ids); 312 + return error; 313 + } 314 +
+24
fs/dlm/member.h
/******************************************************************************
*******************************************************************************
**
**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
**
**  This copyrighted material is made available to anyone wishing to use,
**  modify, copy, or redistribute it subject to the terms and conditions
**  of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/

#ifndef __MEMBER_DOT_H__
#define __MEMBER_DOT_H__

/* stop/start a lockspace around recovery */
int dlm_ls_stop(struct dlm_ls *ls);
int dlm_ls_start(struct dlm_ls *ls);

/* free the member lists */
void dlm_clear_members(struct dlm_ls *ls);
void dlm_clear_members_gone(struct dlm_ls *ls);

/* reconcile the member list with a new node list during recovery */
int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out);

/* is nodeid on the departed-members list? */
int dlm_is_removed(struct dlm_ls *ls, int nodeid);

#endif				/* __MEMBER_DOT_H__ */
+122
fs/dlm/memory.c
··· 1 + /****************************************************************************** 2 + ******************************************************************************* 3 + ** 4 + ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 5 + ** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 6 + ** 7 + ** This copyrighted material is made available to anyone wishing to use, 8 + ** modify, copy, or redistribute it subject to the terms and conditions 9 + ** of the GNU General Public License v.2. 10 + ** 11 + ******************************************************************************* 12 + ******************************************************************************/ 13 + 14 + #include "dlm_internal.h" 15 + #include "config.h" 16 + #include "memory.h" 17 + 18 + static kmem_cache_t *lkb_cache; 19 + 20 + 21 + int dlm_memory_init(void) 22 + { 23 + int ret = 0; 24 + 25 + lkb_cache = kmem_cache_create("dlm_lkb", sizeof(struct dlm_lkb), 26 + __alignof__(struct dlm_lkb), 0, NULL, NULL); 27 + if (!lkb_cache) 28 + ret = -ENOMEM; 29 + return ret; 30 + } 31 + 32 + void dlm_memory_exit(void) 33 + { 34 + if (lkb_cache) 35 + kmem_cache_destroy(lkb_cache); 36 + } 37 + 38 + char *allocate_lvb(struct dlm_ls *ls) 39 + { 40 + char *p; 41 + 42 + p = kmalloc(ls->ls_lvblen, GFP_KERNEL); 43 + if (p) 44 + memset(p, 0, ls->ls_lvblen); 45 + return p; 46 + } 47 + 48 + void free_lvb(char *p) 49 + { 50 + kfree(p); 51 + } 52 + 53 + uint64_t *allocate_range(struct dlm_ls *ls) 54 + { 55 + int ralen = 4*sizeof(uint64_t); 56 + uint64_t *p; 57 + 58 + p = kmalloc(ralen, GFP_KERNEL); 59 + if (p) 60 + memset(p, 0, ralen); 61 + return p; 62 + } 63 + 64 + void free_range(uint64_t *p) 65 + { 66 + kfree(p); 67 + } 68 + 69 + /* FIXME: have some minimal space built-in to rsb for the name and 70 + kmalloc a separate name if needed, like dentries are done */ 71 + 72 + struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen) 73 + { 74 + struct dlm_rsb *r; 75 + 76 + 
DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); 77 + 78 + r = kmalloc(sizeof(*r) + namelen, GFP_KERNEL); 79 + if (r) 80 + memset(r, 0, sizeof(*r) + namelen); 81 + return r; 82 + } 83 + 84 + void free_rsb(struct dlm_rsb *r) 85 + { 86 + if (r->res_lvbptr) 87 + free_lvb(r->res_lvbptr); 88 + kfree(r); 89 + } 90 + 91 + struct dlm_lkb *allocate_lkb(struct dlm_ls *ls) 92 + { 93 + struct dlm_lkb *lkb; 94 + 95 + lkb = kmem_cache_alloc(lkb_cache, GFP_KERNEL); 96 + if (lkb) 97 + memset(lkb, 0, sizeof(*lkb)); 98 + return lkb; 99 + } 100 + 101 + void free_lkb(struct dlm_lkb *lkb) 102 + { 103 + kmem_cache_free(lkb_cache, lkb); 104 + } 105 + 106 + struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen) 107 + { 108 + struct dlm_direntry *de; 109 + 110 + DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); 111 + 112 + de = kmalloc(sizeof(*de) + namelen, GFP_KERNEL); 113 + if (de) 114 + memset(de, 0, sizeof(*de) + namelen); 115 + return de; 116 + } 117 + 118 + void free_direntry(struct dlm_direntry *de) 119 + { 120 + kfree(de); 121 + } 122 +
+31
fs/dlm/memory.h
··· 1 + /****************************************************************************** 2 + ******************************************************************************* 3 + ** 4 + ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 5 + ** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 6 + ** 7 + ** This copyrighted material is made available to anyone wishing to use, 8 + ** modify, copy, or redistribute it subject to the terms and conditions 9 + ** of the GNU General Public License v.2. 10 + ** 11 + ******************************************************************************* 12 + ******************************************************************************/ 13 + 14 + #ifndef __MEMORY_DOT_H__ 15 + #define __MEMORY_DOT_H__ 16 + 17 + int dlm_memory_init(void); 18 + void dlm_memory_exit(void); 19 + struct dlm_rsb *allocate_rsb(struct dlm_ls *ls, int namelen); 20 + void free_rsb(struct dlm_rsb *r); 21 + struct dlm_lkb *allocate_lkb(struct dlm_ls *ls); 22 + void free_lkb(struct dlm_lkb *l); 23 + struct dlm_direntry *allocate_direntry(struct dlm_ls *ls, int namelen); 24 + void free_direntry(struct dlm_direntry *de); 25 + char *allocate_lvb(struct dlm_ls *ls); 26 + void free_lvb(char *l); 27 + uint64_t *allocate_range(struct dlm_ls *ls); 28 + void free_range(uint64_t *l); 29 + 30 + #endif /* __MEMORY_DOT_H__ */ 31 +
+140
fs/dlm/midcomms.c
··· 1 + /****************************************************************************** 2 + ******************************************************************************* 3 + ** 4 + ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 5 + ** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 6 + ** 7 + ** This copyrighted material is made available to anyone wishing to use, 8 + ** modify, copy, or redistribute it subject to the terms and conditions 9 + ** of the GNU General Public License v.2. 10 + ** 11 + ******************************************************************************* 12 + ******************************************************************************/ 13 + 14 + /* 15 + * midcomms.c 16 + * 17 + * This is the appallingly named "mid-level" comms layer. 18 + * 19 + * Its purpose is to take packets from the "real" comms layer, 20 + * split them up into packets and pass them to the interested 21 + * part of the locking mechanism. 22 + * 23 + * It also takes messages from the locking layer, formats them 24 + * into packets and sends them to the comms layer. 25 + */ 26 + 27 + #include "dlm_internal.h" 28 + #include "lowcomms.h" 29 + #include "config.h" 30 + #include "rcom.h" 31 + #include "lock.h" 32 + #include "midcomms.h" 33 + 34 + 35 + static void copy_from_cb(void *dst, const void *base, unsigned offset, 36 + unsigned len, unsigned limit) 37 + { 38 + unsigned copy = len; 39 + 40 + if ((copy + offset) > limit) 41 + copy = limit - offset; 42 + memcpy(dst, base + offset, copy); 43 + len -= copy; 44 + if (len) 45 + memcpy(dst + copy, base, len); 46 + } 47 + 48 + /* 49 + * Called from the low-level comms layer to process a buffer of 50 + * commands. 51 + * 52 + * Only complete messages are processed here, any "spare" bytes from 53 + * the end of a buffer are saved and tacked onto the front of the next 54 + * message that comes in. 
I doubt this will happen very often but we 55 + * need to be able to cope with it and I don't want the task to be waiting 56 + * for packets to come in when there is useful work to be done. 57 + */ 58 + 59 + int dlm_process_incoming_buffer(int nodeid, const void *base, 60 + unsigned offset, unsigned len, unsigned limit) 61 + { 62 + unsigned char __tmp[DLM_INBUF_LEN]; 63 + struct dlm_header *msg = (struct dlm_header *) __tmp; 64 + int ret = 0; 65 + int err = 0; 66 + uint16_t msglen; 67 + uint32_t lockspace; 68 + 69 + while (len > sizeof(struct dlm_header)) { 70 + 71 + /* Copy just the header to check the total length. The 72 + message may wrap around the end of the buffer back to the 73 + start, so we need to use a temp buffer and copy_from_cb. */ 74 + 75 + copy_from_cb(msg, base, offset, sizeof(struct dlm_header), 76 + limit); 77 + 78 + msglen = le16_to_cpu(msg->h_length); 79 + lockspace = msg->h_lockspace; 80 + 81 + err = -EINVAL; 82 + if (msglen < sizeof(struct dlm_header)) 83 + break; 84 + err = -E2BIG; 85 + if (msglen > dlm_config.buffer_size) { 86 + log_print("message size %d from %d too big, buf len %d", 87 + msglen, nodeid, len); 88 + break; 89 + } 90 + err = 0; 91 + 92 + /* If only part of the full message is contained in this 93 + buffer, then do nothing and wait for lowcomms to call 94 + us again later with more data. We return 0 meaning 95 + we've consumed none of the input buffer. */ 96 + 97 + if (msglen > len) 98 + break; 99 + 100 + /* Allocate a larger temp buffer if the full message won't fit 101 + in the buffer on the stack (which should work for most 102 + ordinary messages). 
*/ 103 + 104 + if (msglen > sizeof(__tmp) && 105 + msg == (struct dlm_header *) __tmp) { 106 + msg = kmalloc(dlm_config.buffer_size, GFP_KERNEL); 107 + if (msg == NULL) 108 + return ret; 109 + } 110 + 111 + copy_from_cb(msg, base, offset, msglen, limit); 112 + 113 + BUG_ON(lockspace != msg->h_lockspace); 114 + 115 + ret += msglen; 116 + offset += msglen; 117 + offset &= (limit - 1); 118 + len -= msglen; 119 + 120 + switch (msg->h_cmd) { 121 + case DLM_MSG: 122 + dlm_receive_message(msg, nodeid, FALSE); 123 + break; 124 + 125 + case DLM_RCOM: 126 + dlm_receive_rcom(msg, nodeid); 127 + break; 128 + 129 + default: 130 + log_print("unknown msg type %x from %u: %u %u %u %u", 131 + msg->h_cmd, nodeid, msglen, len, offset, ret); 132 + } 133 + } 134 + 135 + if (msg != (struct dlm_header *) __tmp) 136 + kfree(msg); 137 + 138 + return err ? err : ret; 139 + } 140 +
+21
fs/dlm/midcomms.h
/******************************************************************************
*******************************************************************************
**
**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
**
**  This copyrighted material is made available to anyone wishing to use,
**  modify, copy, or redistribute it subject to the terms and conditions
**  of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/

#ifndef __MIDCOMMS_DOT_H__
#define __MIDCOMMS_DOT_H__

/* carve complete messages out of a circular receive buffer and dispatch
   them; returns bytes consumed or a negative error */
int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset,
				unsigned len, unsigned limit);

#endif				/* __MIDCOMMS_DOT_H__ */
+460
fs/dlm/rcom.c
··· 1 + /****************************************************************************** 2 + ******************************************************************************* 3 + ** 4 + ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 5 + ** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 6 + ** 7 + ** This copyrighted material is made available to anyone wishing to use, 8 + ** modify, copy, or redistribute it subject to the terms and conditions 9 + ** of the GNU General Public License v.2. 10 + ** 11 + ******************************************************************************* 12 + ******************************************************************************/ 13 + 14 + #include "dlm_internal.h" 15 + #include "lockspace.h" 16 + #include "member.h" 17 + #include "lowcomms.h" 18 + #include "midcomms.h" 19 + #include "rcom.h" 20 + #include "recover.h" 21 + #include "dir.h" 22 + #include "config.h" 23 + #include "memory.h" 24 + #include "lock.h" 25 + #include "util.h" 26 + 27 + 28 + static int rcom_response(struct dlm_ls *ls) 29 + { 30 + return test_bit(LSFL_RCOM_READY, &ls->ls_flags); 31 + } 32 + 33 + static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, 34 + struct dlm_rcom **rc_ret, struct dlm_mhandle **mh_ret) 35 + { 36 + struct dlm_rcom *rc; 37 + struct dlm_mhandle *mh; 38 + char *mb; 39 + int mb_len = sizeof(struct dlm_rcom) + len; 40 + 41 + mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_KERNEL, &mb); 42 + if (!mh) { 43 + log_print("create_rcom to %d type %d len %d ENOBUFS", 44 + to_nodeid, type, len); 45 + return -ENOBUFS; 46 + } 47 + memset(mb, 0, mb_len); 48 + 49 + rc = (struct dlm_rcom *) mb; 50 + 51 + rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); 52 + rc->rc_header.h_lockspace = ls->ls_global_id; 53 + rc->rc_header.h_nodeid = dlm_our_nodeid(); 54 + rc->rc_header.h_length = mb_len; 55 + rc->rc_header.h_cmd = DLM_RCOM; 56 + 57 + rc->rc_type = type; 58 + 59 + *mh_ret = mh; 60 + *rc_ret 
= rc; 61 + return 0; 62 + } 63 + 64 + static void send_rcom(struct dlm_ls *ls, struct dlm_mhandle *mh, 65 + struct dlm_rcom *rc) 66 + { 67 + dlm_rcom_out(rc); 68 + dlm_lowcomms_commit_buffer(mh); 69 + } 70 + 71 + /* When replying to a status request, a node also sends back its 72 + configuration values. The requesting node then checks that the remote 73 + node is configured the same way as itself. */ 74 + 75 + static void make_config(struct dlm_ls *ls, struct rcom_config *rf) 76 + { 77 + rf->rf_lvblen = ls->ls_lvblen; 78 + rf->rf_lsflags = ls->ls_exflags; 79 + } 80 + 81 + static int check_config(struct dlm_ls *ls, struct rcom_config *rf, int nodeid) 82 + { 83 + if (rf->rf_lvblen != ls->ls_lvblen || 84 + rf->rf_lsflags != ls->ls_exflags) { 85 + log_error(ls, "config mismatch: %d,%x nodeid %d: %d,%x", 86 + ls->ls_lvblen, ls->ls_exflags, 87 + nodeid, rf->rf_lvblen, rf->rf_lsflags); 88 + return -EINVAL; 89 + } 90 + return 0; 91 + } 92 + 93 + int dlm_rcom_status(struct dlm_ls *ls, int nodeid) 94 + { 95 + struct dlm_rcom *rc; 96 + struct dlm_mhandle *mh; 97 + int error = 0; 98 + 99 + memset(ls->ls_recover_buf, 0, dlm_config.buffer_size); 100 + 101 + if (nodeid == dlm_our_nodeid()) { 102 + rc = (struct dlm_rcom *) ls->ls_recover_buf; 103 + rc->rc_result = dlm_recover_status(ls); 104 + goto out; 105 + } 106 + 107 + error = create_rcom(ls, nodeid, DLM_RCOM_STATUS, 0, &rc, &mh); 108 + if (error) 109 + goto out; 110 + 111 + send_rcom(ls, mh, rc); 112 + 113 + error = dlm_wait_function(ls, &rcom_response); 114 + clear_bit(LSFL_RCOM_READY, &ls->ls_flags); 115 + if (error) 116 + goto out; 117 + 118 + rc = (struct dlm_rcom *) ls->ls_recover_buf; 119 + 120 + if (rc->rc_result == -ESRCH) { 121 + /* we pretend the remote lockspace exists with 0 status */ 122 + log_debug(ls, "remote node %d not ready", nodeid); 123 + rc->rc_result = 0; 124 + } else 125 + error = check_config(ls, (struct rcom_config *) rc->rc_buf, 126 + nodeid); 127 + /* the caller looks at rc_result for the remote 
recovery status */ 128 + out: 129 + return error; 130 + } 131 + 132 + static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) 133 + { 134 + struct dlm_rcom *rc; 135 + struct dlm_mhandle *mh; 136 + int error, nodeid = rc_in->rc_header.h_nodeid; 137 + 138 + error = create_rcom(ls, nodeid, DLM_RCOM_STATUS_REPLY, 139 + sizeof(struct rcom_config), &rc, &mh); 140 + if (error) 141 + return; 142 + rc->rc_result = dlm_recover_status(ls); 143 + make_config(ls, (struct rcom_config *) rc->rc_buf); 144 + 145 + send_rcom(ls, mh, rc); 146 + } 147 + 148 + static void receive_rcom_status_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in) 149 + { 150 + memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length); 151 + set_bit(LSFL_RCOM_READY, &ls->ls_flags); 152 + wake_up(&ls->ls_wait_general); 153 + } 154 + 155 + int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name, int last_len) 156 + { 157 + struct dlm_rcom *rc; 158 + struct dlm_mhandle *mh; 159 + int error = 0, len = sizeof(struct dlm_rcom); 160 + 161 + memset(ls->ls_recover_buf, 0, dlm_config.buffer_size); 162 + 163 + if (nodeid == dlm_our_nodeid()) { 164 + dlm_copy_master_names(ls, last_name, last_len, 165 + ls->ls_recover_buf + len, 166 + dlm_config.buffer_size - len, nodeid); 167 + goto out; 168 + } 169 + 170 + error = create_rcom(ls, nodeid, DLM_RCOM_NAMES, last_len, &rc, &mh); 171 + if (error) 172 + goto out; 173 + memcpy(rc->rc_buf, last_name, last_len); 174 + 175 + send_rcom(ls, mh, rc); 176 + 177 + error = dlm_wait_function(ls, &rcom_response); 178 + clear_bit(LSFL_RCOM_READY, &ls->ls_flags); 179 + out: 180 + return error; 181 + } 182 + 183 + static void receive_rcom_names(struct dlm_ls *ls, struct dlm_rcom *rc_in) 184 + { 185 + struct dlm_rcom *rc; 186 + struct dlm_mhandle *mh; 187 + int error, inlen, outlen; 188 + int nodeid = rc_in->rc_header.h_nodeid; 189 + uint32_t status = dlm_recover_status(ls); 190 + 191 + /* 192 + * We can't run dlm_dir_rebuild_send (which uses ls_nodes) while 193 
 * dlm_recoverd is running ls_nodes_reconfig (which changes ls_nodes).
 * It could only happen in rare cases where we get a late NAMES
 * message from a previous instance of recovery.
 */

	if (!(status & DLM_RS_NODES)) {
		log_debug(ls, "ignoring RCOM_NAMES from %u", nodeid);
		return;
	}

	nodeid = rc_in->rc_header.h_nodeid;
	inlen = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);
	outlen = dlm_config.buffer_size - sizeof(struct dlm_rcom);

	error = create_rcom(ls, nodeid, DLM_RCOM_NAMES_REPLY, outlen, &rc, &mh);
	if (error)
		return;

	/* fill the reply buffer with as many master names hashing to the
	   requesting node as fit, then send it back */
	dlm_copy_master_names(ls, rc_in->rc_buf, inlen, rc->rc_buf, outlen,
			      nodeid);
	send_rcom(ls, mh, rc);
}

/* Stash a NAMES reply in the lockspace recovery buffer, mark it ready and
   wake anyone sleeping on ls_wait_general for it. */

static void receive_rcom_names_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
	memcpy(ls->ls_recover_buf, rc_in, rc_in->rc_header.h_length);
	set_bit(LSFL_RCOM_READY, &ls->ls_flags);
	wake_up(&ls->ls_wait_general);
}

/* Ask directory node dir_nodeid which node masters resource r.  rc_id
   carries the rsb address so the asynchronous reply can be matched back
   to this rsb. */

int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid)
{
	struct dlm_rcom *rc;
	struct dlm_mhandle *mh;
	struct dlm_ls *ls = r->res_ls;
	int error;

	error = create_rcom(ls, dir_nodeid, DLM_RCOM_LOOKUP, r->res_length,
			    &rc, &mh);
	if (error)
		goto out;
	memcpy(rc->rc_buf, r->res_name, r->res_length);
	rc->rc_id = (unsigned long) r;

	send_rcom(ls, mh, rc);
 out:
	return error;
}

/* We are the directory node for the name in rc_in: look up (or assign) the
   master nodeid and send it back in a LOOKUP_REPLY.  A lookup error is
   returned in rc_result instead of a nodeid. */

static void receive_rcom_lookup(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
	struct dlm_rcom *rc;
	struct dlm_mhandle *mh;
	int error, ret_nodeid, nodeid = rc_in->rc_header.h_nodeid;
	int len = rc_in->rc_header.h_length - sizeof(struct dlm_rcom);

	error = create_rcom(ls, nodeid, DLM_RCOM_LOOKUP_REPLY, 0, &rc, &mh);
	if (error)
		return;

	error = dlm_dir_lookup(ls, nodeid, rc_in->rc_buf, len, &ret_nodeid);
	if (error)
		ret_nodeid = error;
	rc->rc_result = ret_nodeid;
	rc->rc_id = rc_in->rc_id;

	send_rcom(ls, mh, rc);
}

static void receive_rcom_lookup_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
	dlm_recover_master_reply(ls, rc_in);
}

/* Serialize one lkb (plus its resource name and lvb) into the wire-format
   rcom_lock struct for sending to the new master. */

static void pack_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb,
			   struct rcom_lock *rl)
{
	memset(rl, 0, sizeof(*rl));

	rl->rl_ownpid = lkb->lkb_ownpid;
	rl->rl_lkid = lkb->lkb_id;
	rl->rl_exflags = lkb->lkb_exflags;
	rl->rl_flags = lkb->lkb_flags;
	rl->rl_lvbseq = lkb->lkb_lvbseq;
	rl->rl_rqmode = lkb->lkb_rqmode;
	rl->rl_grmode = lkb->lkb_grmode;
	rl->rl_status = lkb->lkb_status;
	rl->rl_wait_type = lkb->lkb_wait_type;

	/* callbacks can't cross nodes; only flag which asts were requested */
	if (lkb->lkb_bastaddr)
		rl->rl_asts |= AST_BAST;
	if (lkb->lkb_astaddr)
		rl->rl_asts |= AST_COMP;

	if (lkb->lkb_range)
		memcpy(rl->rl_range, lkb->lkb_range, 4*sizeof(uint64_t));

	rl->rl_namelen = r->res_length;
	memcpy(rl->rl_name, r->res_name, r->res_length);

	/* FIXME: might we have an lvb without DLM_LKF_VALBLK set ?
	   If so, receive_rcom_lock_args() won't take this copy. */

	if (lkb->lkb_lvbptr)
		memcpy(rl->rl_lvb, lkb->lkb_lvbptr, r->res_ls->ls_lvblen);
}

/* Send one of our locks on a remastered rsb to the new master
   (res_nodeid).  rc_id again carries the rsb address for reply matching. */

int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb)
{
	struct dlm_ls *ls = r->res_ls;
	struct dlm_rcom *rc;
	struct dlm_mhandle *mh;
	struct rcom_lock *rl;
	int error, len = sizeof(struct rcom_lock);

	if (lkb->lkb_lvbptr)
		len += ls->ls_lvblen;

	error = create_rcom(ls, r->res_nodeid, DLM_RCOM_LOCK, len, &rc, &mh);
	if (error)
		goto out;

	rl = (struct rcom_lock *) rc->rc_buf;
	pack_rcom_lock(r, lkb, rl);
	rc->rc_id = (unsigned long) r;

	send_rcom(ls, mh, rc);
 out:
	return error;
}

/* We are the new master: absorb the sent lock as a master copy and echo
   the rcom_lock back so the sender learns the remote lkid/result. */

static void receive_rcom_lock(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
	struct dlm_rcom *rc;
	struct dlm_mhandle *mh;
	int error, nodeid = rc_in->rc_header.h_nodeid;

	dlm_recover_master_copy(ls, rc_in);

	error = create_rcom(ls, nodeid, DLM_RCOM_LOCK_REPLY,
			    sizeof(struct rcom_lock), &rc, &mh);
	if (error)
		return;

	/* We send back the same rcom_lock struct we received, but
	   dlm_recover_master_copy() has filled in rl_remid and rl_result */

	memcpy(rc->rc_buf, rc_in->rc_buf, sizeof(struct rcom_lock));
	rc->rc_id = rc_in->rc_id;

	send_rcom(ls, mh, rc);
}

/* Ignore LOCK replies that arrive before this recovery's directory phase
   has completed; they belong to a previous recovery instance. */

static void receive_rcom_lock_reply(struct dlm_ls *ls, struct dlm_rcom *rc_in)
{
	uint32_t status = dlm_recover_status(ls);

	if (!(status & DLM_RS_DIR)) {
		log_debug(ls, "ignoring RCOM_LOCK_REPLY from %u",
			  rc_in->rc_header.h_nodeid);
		return;
	}

	dlm_recover_process_copy(ls, rc_in);
}

/* Hand-build a STATUS_REPLY carrying -ESRCH when an rcom arrives for a
   lockspace we don't have; built directly on a lowcomms buffer since
   there is no struct dlm_ls for create_rcom() to use. */

static int send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in)
{
	struct dlm_rcom *rc;
	struct dlm_mhandle *mh;
	char *mb;
	int mb_len = sizeof(struct dlm_rcom);

	mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_KERNEL, &mb);
	if (!mh)
		return -ENOBUFS;
	memset(mb, 0, mb_len);

	rc = (struct dlm_rcom *) mb;

	rc->rc_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR);
	rc->rc_header.h_lockspace = rc_in->rc_header.h_lockspace;
	rc->rc_header.h_nodeid = dlm_our_nodeid();
	rc->rc_header.h_length = mb_len;
	rc->rc_header.h_cmd = DLM_RCOM;

	rc->rc_type = DLM_RCOM_STATUS_REPLY;
	rc->rc_result = -ESRCH;

	dlm_rcom_out(rc);
	dlm_lowcomms_commit_buffer(mh);

	return 0;
}

/* Called by dlm_recvd; corresponds to dlm_receive_message() but special
   recovery-only comms are sent through here. */

void dlm_receive_rcom(struct dlm_header *hd, int nodeid)
{
	struct dlm_rcom *rc = (struct dlm_rcom *) hd;
	struct dlm_ls *ls;

	dlm_rcom_in(rc);

	/* If the lockspace doesn't exist then still send a status message
	   back; it's possible that it just doesn't have its global_id yet. */

	ls = dlm_find_lockspace_global(hd->h_lockspace);
	if (!ls) {
		log_print("lockspace %x from %d not found",
			  hd->h_lockspace, nodeid);
		send_ls_not_ready(nodeid, rc);
		return;
	}

	/* only STATUS queries are answered while recovery is stopped */
	if (dlm_recovery_stopped(ls) && (rc->rc_type != DLM_RCOM_STATUS)) {
		log_error(ls, "ignoring recovery message %x from %d",
			  rc->rc_type, nodeid);
		goto out;
	}

	/* sanity check: transport-level sender must match the header */
	if (nodeid != rc->rc_header.h_nodeid) {
		log_error(ls, "bad rcom nodeid %d from %d",
			  rc->rc_header.h_nodeid, nodeid);
		goto out;
	}

	switch (rc->rc_type) {
	case DLM_RCOM_STATUS:
		receive_rcom_status(ls, rc);
		break;

	case DLM_RCOM_NAMES:
		receive_rcom_names(ls, rc);
		break;

	case DLM_RCOM_LOOKUP:
		receive_rcom_lookup(ls, rc);
		break;

	case DLM_RCOM_LOCK:
		receive_rcom_lock(ls, rc);
		break;

	case DLM_RCOM_STATUS_REPLY:
		receive_rcom_status_reply(ls, rc);
		break;

	case DLM_RCOM_NAMES_REPLY:
		receive_rcom_names_reply(ls, rc);
		break;

	case DLM_RCOM_LOOKUP_REPLY:
		receive_rcom_lookup_reply(ls, rc);
		break;

	case DLM_RCOM_LOCK_REPLY:
		receive_rcom_lock_reply(ls, rc);
		break;

	default:
		DLM_ASSERT(0, printk("rc_type=%x\n", rc->rc_type););
	}
 out:
	dlm_put_lockspace(ls);
}
+24
fs/dlm/rcom.h
··· 1 + /****************************************************************************** 2 + ******************************************************************************* 3 + ** 4 + ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 5 + ** Copyright (C) 2005 Red Hat, Inc. All rights reserved. 6 + ** 7 + ** This copyrighted material is made available to anyone wishing to use, 8 + ** modify, copy, or redistribute it subject to the terms and conditions 9 + ** of the GNU General Public License v.2. 10 + ** 11 + ******************************************************************************* 12 + ******************************************************************************/ 13 + 14 + #ifndef __RCOM_DOT_H__ 15 + #define __RCOM_DOT_H__ 16 + 17 + int dlm_rcom_status(struct dlm_ls *ls, int nodeid); 18 + int dlm_rcom_names(struct dlm_ls *ls, int nodeid, char *last_name,int last_len); 19 + int dlm_send_rcom_lookup(struct dlm_rsb *r, int dir_nodeid); 20 + int dlm_send_rcom_lock(struct dlm_rsb *r, struct dlm_lkb *lkb); 21 + void dlm_receive_rcom(struct dlm_header *hd, int nodeid); 22 + 23 + #endif 24 +
+762
fs/dlm/recover.c
/******************************************************************************
*******************************************************************************
**
**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
**
**  This copyrighted material is made available to anyone wishing to use,
**  modify, copy, or redistribute it subject to the terms and conditions
**  of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/

#include "dlm_internal.h"
#include "lockspace.h"
#include "dir.h"
#include "config.h"
#include "ast.h"
#include "memory.h"
#include "rcom.h"
#include "lock.h"
#include "lowcomms.h"
#include "member.h"
#include "recover.h"


/*
 * Recovery waiting routines: these functions wait for a particular reply from
 * a remote node, or for the remote node to report a certain status.  They need
 * to abort if the lockspace is stopped indicating a node has failed (perhaps
 * the one being waited for).
 */

/*
 * Wait until given function returns non-zero or lockspace is stopped
 * (LS_RECOVERY_STOP set due to failure of a node in ls_nodes).  When another
 * function thinks it could have completed the waited-on task, they should wake
 * up ls_wait_general to get an immediate response rather than waiting for the
 * timer to detect the result.  A timer wakes us up periodically while waiting
 * to see if we should abort due to a node failure.  This should only be called
 * by the dlm_recoverd thread.
 */

/* Periodic kick: re-arm ourselves and wake the waiter so it can re-check
   its test function and the recovery-stopped condition. */

static void dlm_wait_timer_fn(unsigned long data)
{
	struct dlm_ls *ls = (struct dlm_ls *) data;
	mod_timer(&ls->ls_timer, jiffies + (dlm_config.recover_timer * HZ));
	wake_up(&ls->ls_wait_general);
}

int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls))
{
	int error = 0;

	init_timer(&ls->ls_timer);
	ls->ls_timer.function = dlm_wait_timer_fn;
	ls->ls_timer.data = (long) ls;
	ls->ls_timer.expires = jiffies + (dlm_config.recover_timer * HZ);
	add_timer(&ls->ls_timer);

	wait_event(ls->ls_wait_general, testfn(ls) || dlm_recovery_stopped(ls));
	del_timer_sync(&ls->ls_timer);

	if (dlm_recovery_stopped(ls)) {
		log_debug(ls, "dlm_wait_function aborted");
		error = -EINTR;
	}
	return error;
}

/*
 * An efficient way for all nodes to wait for all others to have a certain
 * status.  The node with the lowest nodeid polls all the others for their
 * status (wait_status_all) and all the others poll the node with the low id
 * for its accumulated result (wait_status_low).  When all nodes have set
 * status flag X, then status flag X_ALL will be set on the low nodeid.
 */

/* Snapshot ls_recover_status under the recover lock. */

uint32_t dlm_recover_status(struct dlm_ls *ls)
{
	uint32_t status;
	spin_lock(&ls->ls_recover_lock);
	status = ls->ls_recover_status;
	spin_unlock(&ls->ls_recover_lock);
	return status;
}

/* OR a new status flag into ls_recover_status under the recover lock. */

void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status)
{
	spin_lock(&ls->ls_recover_lock);
	ls->ls_recover_status |= status;
	spin_unlock(&ls->ls_recover_lock);
}

/* Low-nodeid side: poll every member (with growing backoff, capped at 1s)
   until each reports wait_status, or recovery is stopped.  The polled reply
   lands in ls_recover_buf via the STATUS_REPLY path. */

static int wait_status_all(struct dlm_ls *ls, uint32_t wait_status)
{
	struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
	struct dlm_member *memb;
	int error = 0, delay;

	list_for_each_entry(memb, &ls->ls_nodes, list) {
		delay = 0;
		for (;;) {
			if (dlm_recovery_stopped(ls)) {
				error = -EINTR;
				goto out;
			}

			error = dlm_rcom_status(ls, memb->nodeid);
			if (error)
				goto out;

			if (rc->rc_result & wait_status)
				break;
			if (delay < 1000)
				delay += 20;
			msleep(delay);
		}
	}
 out:
	return error;
}

/* Other nodes: poll only the low nodeid for its accumulated X_ALL status. */

static int wait_status_low(struct dlm_ls *ls, uint32_t wait_status)
{
	struct dlm_rcom *rc = (struct dlm_rcom *) ls->ls_recover_buf;
	int error = 0, delay = 0, nodeid = ls->ls_low_nodeid;

	for (;;) {
		if (dlm_recovery_stopped(ls)) {
			error = -EINTR;
			goto out;
		}

		error = dlm_rcom_status(ls, nodeid);
		if (error)
			break;

		if (rc->rc_result & wait_status)
			break;
		if (delay < 1000)
			delay += 20;
		msleep(delay);
	}
 out:
	return error;
}

/* DLM_RS_* flags come in (X, X_ALL) pairs with X_ALL == X << 1. */

static int wait_status(struct dlm_ls *ls, uint32_t status)
{
	uint32_t status_all = status << 1;
	int error;

	if (ls->ls_low_nodeid == dlm_our_nodeid()) {
		error = wait_status_all(ls, status);
		if (!error)
			dlm_set_recover_status(ls, status_all);
	} else
		error = wait_status_low(ls, status_all);

	return error;
}

int dlm_recover_members_wait(struct dlm_ls *ls)
{
	return wait_status(ls, DLM_RS_NODES);
}

int dlm_recover_directory_wait(struct dlm_ls *ls)
{
	return wait_status(ls, DLM_RS_DIR);
}

int dlm_recover_locks_wait(struct dlm_ls *ls)
{
	return wait_status(ls, DLM_RS_LOCKS);
}

int dlm_recover_done_wait(struct dlm_ls *ls)
{
	return wait_status(ls, DLM_RS_DONE);
}

/*
 * The recover_list contains all the rsb's for which we've requested the new
 * master nodeid.  As replies are returned from the resource directories the
 * rsb's are removed from the list.  When the list is empty we're done.
 *
 * The recover_list is later similarly used for all rsb's for which we've sent
 * new lkb's and need to receive new corresponding lkid's.
 *
 * We use the address of the rsb struct as a simple local identifier for the
 * rsb so we can match an rcom reply with the rsb it was sent for.
 */

static int recover_list_empty(struct dlm_ls *ls)
{
	int empty;

	spin_lock(&ls->ls_recover_list_lock);
	empty = list_empty(&ls->ls_recover_list);
	spin_unlock(&ls->ls_recover_list_lock);

	return empty;
}

/* Add r (holding a reference) unless it is already on the list. */

static void recover_list_add(struct dlm_rsb *r)
{
	struct dlm_ls *ls = r->res_ls;

	spin_lock(&ls->ls_recover_list_lock);
	if (list_empty(&r->res_recover_list)) {
		list_add_tail(&r->res_recover_list, &ls->ls_recover_list);
		ls->ls_recover_list_count++;
		dlm_hold_rsb(r);
	}
	spin_unlock(&ls->ls_recover_list_lock);
}

static void recover_list_del(struct dlm_rsb *r)
{
	struct dlm_ls *ls = r->res_ls;

	spin_lock(&ls->ls_recover_list_lock);
	list_del_init(&r->res_recover_list);
	ls->ls_recover_list_count--;
	spin_unlock(&ls->ls_recover_list_lock);

	dlm_put_rsb(r);
}

/* Match an rcom reply id back to the rsb it was sent for.
   NOTE(review): id is 64-bit but is compared via an (unsigned long) cast of
   the rsb address; this mirrors how rc_id is assigned in rcom.c, but on a
   32-bit kernel only the low 32 bits take part — confirm intended. */

static struct dlm_rsb *recover_list_find(struct dlm_ls *ls, uint64_t id)
{
	struct dlm_rsb *r = NULL;

	spin_lock(&ls->ls_recover_list_lock);

	list_for_each_entry(r, &ls->ls_recover_list, res_recover_list) {
		if (id == (unsigned long) r)
			goto out;
	}
	r = NULL;
 out:
	spin_unlock(&ls->ls_recover_list_lock);
	return r;
}

/* Drop everything left on the list (aborted recovery) and reset the count. */

static void recover_list_clear(struct dlm_ls *ls)
{
	struct dlm_rsb *r, *s;

	spin_lock(&ls->ls_recover_list_lock);
	list_for_each_entry_safe(r, s, &ls->ls_recover_list, res_recover_list) {
		list_del_init(&r->res_recover_list);
		dlm_put_rsb(r);
		ls->ls_recover_list_count--;
	}

	if (ls->ls_recover_list_count != 0) {
		log_error(ls, "warning: recover_list_count %d",
			  ls->ls_recover_list_count);
		ls->ls_recover_list_count = 0;
	}
	spin_unlock(&ls->ls_recover_list_lock);
}


/* Master recovery: find new master node for rsb's that were
   mastered on nodes that have been removed.

   dlm_recover_masters
   recover_master
   dlm_send_rcom_lookup            ->  receive_rcom_lookup
                                       dlm_dir_lookup
   receive_rcom_lookup_reply       <-
   dlm_recover_master_reply
   set_new_master
   set_master_lkbs
   set_lock_master
*/

/*
 * Set the lock master for all LKBs in a lock queue
 * If we are the new master of the rsb, we may have received new
 * MSTCPY locks from other nodes already which we need to ignore
 * when setting the new nodeid.
 */

static void set_lock_master(struct list_head *queue, int nodeid)
{
	struct dlm_lkb *lkb;

	list_for_each_entry(lkb, queue, lkb_statequeue)
		if (!(lkb->lkb_flags & DLM_IFL_MSTCPY))
			lkb->lkb_nodeid = nodeid;
}

static void set_master_lkbs(struct dlm_rsb *r)
{
	set_lock_master(&r->res_grantqueue, r->res_nodeid);
	set_lock_master(&r->res_convertqueue, r->res_nodeid);
	set_lock_master(&r->res_waitqueue, r->res_nodeid);
}

/*
 * Propagate the new master nodeid to locks
 * The NEW_MASTER flag tells dlm_recover_locks() which rsb's to consider.
 * The NEW_MASTER2 flag tells recover_lvb() which rsb's to consider.
 */

static void set_new_master(struct dlm_rsb *r, int nodeid)
{
	lock_rsb(r);
	r->res_nodeid = nodeid;
	set_master_lkbs(r);
	rsb_set_flag(r, RSB_NEW_MASTER);
	rsb_set_flag(r, RSB_NEW_MASTER2);
	unlock_rsb(r);
}

/*
 * We do async lookups on rsb's that need new masters.  The rsb's
 * waiting for a lookup reply are kept on the recover_list.
 */

static int recover_master(struct dlm_rsb *r)
{
	struct dlm_ls *ls = r->res_ls;
	int error, dir_nodeid, ret_nodeid, our_nodeid = dlm_our_nodeid();

	dir_nodeid = dlm_dir_nodeid(r);

	if (dir_nodeid == our_nodeid) {
		/* we are the directory node: resolve synchronously */
		error = dlm_dir_lookup(ls, our_nodeid, r->res_name,
				       r->res_length, &ret_nodeid);
		if (error)
			log_error(ls, "recover dir lookup error %d", error);

		/* nodeid 0 means "mastered locally" */
		if (ret_nodeid == our_nodeid)
			ret_nodeid = 0;
		set_new_master(r, ret_nodeid);
	} else {
		/* remote directory node: queue r and send an async lookup */
		recover_list_add(r);
		error = dlm_send_rcom_lookup(r, dir_nodeid);
	}

	return error;
}

/*
 * When not using a directory, most resource names will hash to a new static
 * master nodeid and the resource will need to be remastered.
 */

static int recover_master_static(struct dlm_rsb *r)
{
	int master = dlm_dir_nodeid(r);

	if (master == dlm_our_nodeid())
		master = 0;

	if (r->res_nodeid != master) {
		if (is_master(r))
			dlm_purge_mstcpy_locks(r);
		set_new_master(r, master);
		return 1;
	}
	return 0;
}

/*
 * Go through local root resources and for each rsb which has a master which
 * has departed, get the new master nodeid from the directory.  The dir will
 * assign mastery to the first node to look up the new master.  That means
 * we'll discover in this lookup if we're the new master of any rsb's.
 *
 * We fire off all the dir lookup requests individually and asynchronously to
 * the correct dir node.
 */

int dlm_recover_masters(struct dlm_ls *ls)
{
	struct dlm_rsb *r;
	int error = 0, count = 0;

	log_debug(ls, "dlm_recover_masters");

	down_read(&ls->ls_root_sem);
	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
		if (dlm_recovery_stopped(ls)) {
			up_read(&ls->ls_root_sem);
			error = -EINTR;
			goto out;
		}

		if (dlm_no_directory(ls))
			count += recover_master_static(r);
		else if (!is_master(r) && dlm_is_removed(ls, r->res_nodeid)) {
			/* NOTE(review): recover_master()'s return value is
			   discarded here; a failed send is only caught later
			   by the wait below timing out/aborting — confirm
			   intended. */
			recover_master(r);
			count++;
		}

		schedule();
	}
	up_read(&ls->ls_root_sem);

	log_debug(ls, "dlm_recover_masters %d resources", count);

	/* block until every queued lookup has been answered */
	error = dlm_wait_function(ls, &recover_list_empty);
 out:
	if (error)
		recover_list_clear(ls);
	return error;
}

int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc)
{
	struct dlm_rsb *r;
	int nodeid;

	r = recover_list_find(ls, rc->rc_id);
	if (!r) {
		log_error(ls, "dlm_recover_master_reply no id %"PRIx64"",
			  rc->rc_id);
		goto out;
	}

	nodeid = rc->rc_result;
	if (nodeid == dlm_our_nodeid())
		nodeid = 0;

	set_new_master(r, nodeid);
	recover_list_del(r);

	if (recover_list_empty(ls))
		wake_up(&ls->ls_wait_general);
 out:
	return 0;
}


/* Lock recovery: rebuild the process-copy locks we hold on a
   remastered rsb on the new rsb master.

   dlm_recover_locks
   recover_locks
   recover_locks_queue
   dlm_send_rcom_lock              ->  receive_rcom_lock
                                       dlm_recover_master_copy
   receive_rcom_lock_reply         <-
   dlm_recover_process_copy
*/


/*
 * keep a count of the number of lkb's we send to the new master; when we get
 * an equal number of replies then recovery for the rsb is done
 */

static int recover_locks_queue(struct dlm_rsb *r, struct list_head *head)
{
	struct dlm_lkb *lkb;
	int error = 0;

	list_for_each_entry(lkb, head, lkb_statequeue) {
		error = dlm_send_rcom_lock(r, lkb);
		if (error)
			break;
		r->res_recover_locks_count++;
	}

	return error;
}

static int all_queues_empty(struct dlm_rsb *r)
{
	if (!list_empty(&r->res_grantqueue) ||
	    !list_empty(&r->res_convertqueue) ||
	    !list_empty(&r->res_waitqueue))
		return FALSE;
	return TRUE;
}

/* Send every lock we hold on r to its new master; if anything was sent
   put r on the recover_list to wait for the matching replies. */

static int recover_locks(struct dlm_rsb *r)
{
	int error = 0;

	lock_rsb(r);
	if (all_queues_empty(r))
		goto out;

	DLM_ASSERT(!r->res_recover_locks_count, dlm_print_rsb(r););

	error = recover_locks_queue(r, &r->res_grantqueue);
	if (error)
		goto out;
	error = recover_locks_queue(r, &r->res_convertqueue);
	if (error)
		goto out;
	error = recover_locks_queue(r, &r->res_waitqueue);
	if (error)
		goto out;

	if (r->res_recover_locks_count)
		recover_list_add(r);
	else
		rsb_clear_flag(r, RSB_NEW_MASTER);
 out:
	unlock_rsb(r);
	return error;
}

int dlm_recover_locks(struct dlm_ls *ls)
{
	struct dlm_rsb *r;
	int error, count = 0;

	log_debug(ls, "dlm_recover_locks");

	down_read(&ls->ls_root_sem);
	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
		if (is_master(r)) {
			/* we master it now; nothing to send anywhere */
			rsb_clear_flag(r, RSB_NEW_MASTER);
			continue;
		}

		if (!rsb_flag(r, RSB_NEW_MASTER))
			continue;

		if (dlm_recovery_stopped(ls)) {
			error = -EINTR;
			up_read(&ls->ls_root_sem);
			goto out;
		}

		error = recover_locks(r);
		if (error) {
			up_read(&ls->ls_root_sem);
			goto out;
		}

		count += r->res_recover_locks_count;
	}
	up_read(&ls->ls_root_sem);

	log_debug(ls, "dlm_recover_locks %d locks", count);

	error = dlm_wait_function(ls, &recover_list_empty);
 out:
	if (error)
		recover_list_clear(ls);
	else
		dlm_set_recover_status(ls, DLM_RS_LOCKS);
	return error;
}

/* One reply for one sent lock has arrived; when the count for r reaches
   zero the rsb is fully rebuilt on the new master. */

void dlm_recovered_lock(struct dlm_rsb *r)
{
	DLM_ASSERT(rsb_flag(r, RSB_NEW_MASTER), dlm_print_rsb(r););

	r->res_recover_locks_count--;
	if (!r->res_recover_locks_count) {
		rsb_clear_flag(r, RSB_NEW_MASTER);
		recover_list_del(r);
	}

	if (recover_list_empty(r->res_ls))
		wake_up(&r->res_ls->ls_wait_general);
}

/*
 * The lvb needs to be recovered on all master rsb's.  This includes setting
 * the VALNOTVALID flag if necessary, and determining the correct lvb contents
 * based on the lvb's of the locks held on the rsb.
 *
 * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb.  If it
 * was already set prior to recovery, it's not cleared, regardless of locks.
 *
 * The LVB contents are only considered for changing when this is a new master
 * of the rsb (NEW_MASTER2).  Then, the rsb's lvb is taken from any lkb with
 * mode > CR.  If no lkb's exist with mode above CR, the lvb contents are taken
 * from the lkb with the largest lvb sequence number.
 */

static void recover_lvb(struct dlm_rsb *r)
{
	struct dlm_lkb *lkb, *high_lkb = NULL;
	uint32_t high_seq = 0;
	int lock_lvb_exists = FALSE;
	int big_lock_exists = FALSE;
	int lvblen = r->res_ls->ls_lvblen;

	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
		if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
			continue;

		lock_lvb_exists = TRUE;

		if (lkb->lkb_grmode > DLM_LOCK_CR) {
			big_lock_exists = TRUE;
			goto setflag;
		}

		/* sequence-number comparison tolerant of wraparound */
		if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
			high_lkb = lkb;
			high_seq = lkb->lkb_lvbseq;
		}
	}

	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
		if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
			continue;

		lock_lvb_exists = TRUE;

		if (lkb->lkb_grmode > DLM_LOCK_CR) {
			big_lock_exists = TRUE;
			goto setflag;
		}

		if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
			high_lkb = lkb;
			high_seq = lkb->lkb_lvbseq;
		}
	}

 setflag:
	if (!lock_lvb_exists)
		goto out;

	if (!big_lock_exists)
		rsb_set_flag(r, RSB_VALNOTVALID);

	/* don't mess with the lvb unless we're the new master */
	if (!rsb_flag(r, RSB_NEW_MASTER2))
		goto out;

	if (!r->res_lvbptr) {
		r->res_lvbptr = allocate_lvb(r->res_ls);
		if (!r->res_lvbptr)
			goto out;
	}

	if (big_lock_exists) {
		/* lkb here is the >CR lock that jumped to setflag above */
		r->res_lvbseq = lkb->lkb_lvbseq;
		memcpy(r->res_lvbptr, lkb->lkb_lvbptr, lvblen);
	} else if (high_lkb) {
		r->res_lvbseq = high_lkb->lkb_lvbseq;
		memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen);
	} else {
		r->res_lvbseq = 0;
		memset(r->res_lvbptr, 0, lvblen);
	}
 out:
	return;
}

/* All master rsb's flagged RECOVER_CONVERT need to be looked at.  The locks
   converting PR->CW or CW->PR need to have their lkb_grmode set. */

static void recover_conversion(struct dlm_rsb *r)
{
	struct dlm_lkb *lkb;
	int grmode = -1;

	list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
		if (lkb->lkb_grmode == DLM_LOCK_PR ||
		    lkb->lkb_grmode == DLM_LOCK_CW) {
			grmode = lkb->lkb_grmode;
			break;
		}
	}

	list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
		if (lkb->lkb_grmode != DLM_LOCK_IV)
			continue;
		if (grmode == -1)
			lkb->lkb_grmode = lkb->lkb_rqmode;
		else
			lkb->lkb_grmode = grmode;
	}
}

/* Final per-rsb fixups once all locks have been gathered: conversion
   resolution and lvb recovery on rsb's we now master. */

void dlm_recover_rsbs(struct dlm_ls *ls)
{
	struct dlm_rsb *r;
	int count = 0;

	log_debug(ls, "dlm_recover_rsbs");

	down_read(&ls->ls_root_sem);
	list_for_each_entry(r, &ls->ls_root_list, res_root_list) {
		lock_rsb(r);
		if (is_master(r)) {
			if (rsb_flag(r, RSB_RECOVER_CONVERT))
				recover_conversion(r);
			recover_lvb(r);
			count++;
		}
		rsb_clear_flag(r, RSB_RECOVER_CONVERT);
		unlock_rsb(r);
	}
	up_read(&ls->ls_root_sem);

	log_debug(ls, "dlm_recover_rsbs %d rsbs", count);
}

/* Create a single list of all root rsb's to be used during recovery */

int dlm_create_root_list(struct dlm_ls *ls)
{
	struct dlm_rsb *r;
	int i, error = 0;

	down_write(&ls->ls_root_sem);
	if (!list_empty(&ls->ls_root_list)) {
		log_error(ls, "root list not empty");
		error = -EINVAL;
		goto out;
	}

	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
		read_lock(&ls->ls_rsbtbl[i].lock);
		list_for_each_entry(r, &ls->ls_rsbtbl[i].list, res_hashchain) {
			list_add(&r->res_root_list, &ls->ls_root_list);
			dlm_hold_rsb(r);
		}
		read_unlock(&ls->ls_rsbtbl[i].lock);
	}
 out:
	up_write(&ls->ls_root_sem);
	return error;
}

void dlm_release_root_list(struct dlm_ls *ls)
{
	struct dlm_rsb *r, *safe;

	down_write(&ls->ls_root_sem);
	list_for_each_entry_safe(r, safe, &ls->ls_root_list, res_root_list) {
		list_del_init(&r->res_root_list);
		dlm_put_rsb(r);
	}
	up_write(&ls->ls_root_sem);
}

/* Free every rsb parked on the toss lists so they need not be recovered. */

void dlm_clear_toss_list(struct dlm_ls *ls)
{
	struct dlm_rsb *r, *safe;
	int i;

	for (i = 0; i < ls->ls_rsbtbl_size; i++) {
		write_lock(&ls->ls_rsbtbl[i].lock);
		list_for_each_entry_safe(r, safe, &ls->ls_rsbtbl[i].toss,
					 res_hashchain) {
			list_del(&r->res_hashchain);
			free_rsb(r);
		}
		write_unlock(&ls->ls_rsbtbl[i].lock);
	}
}
+34
fs/dlm/recover.h
··· 1 + /****************************************************************************** 2 + ******************************************************************************* 3 + ** 4 + ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 5 + ** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. 6 + ** 7 + ** This copyrighted material is made available to anyone wishing to use, 8 + ** modify, copy, or redistribute it subject to the terms and conditions 9 + ** of the GNU General Public License v.2. 10 + ** 11 + ******************************************************************************* 12 + ******************************************************************************/ 13 + 14 + #ifndef __RECOVER_DOT_H__ 15 + #define __RECOVER_DOT_H__ 16 + 17 + int dlm_wait_function(struct dlm_ls *ls, int (*testfn) (struct dlm_ls *ls)); 18 + uint32_t dlm_recover_status(struct dlm_ls *ls); 19 + void dlm_set_recover_status(struct dlm_ls *ls, uint32_t status); 20 + int dlm_recover_members_wait(struct dlm_ls *ls); 21 + int dlm_recover_directory_wait(struct dlm_ls *ls); 22 + int dlm_recover_locks_wait(struct dlm_ls *ls); 23 + int dlm_recover_done_wait(struct dlm_ls *ls); 24 + int dlm_recover_masters(struct dlm_ls *ls); 25 + int dlm_recover_master_reply(struct dlm_ls *ls, struct dlm_rcom *rc); 26 + int dlm_recover_locks(struct dlm_ls *ls); 27 + void dlm_recovered_lock(struct dlm_rsb *r); 28 + int dlm_create_root_list(struct dlm_ls *ls); 29 + void dlm_release_root_list(struct dlm_ls *ls); 30 + void dlm_clear_toss_list(struct dlm_ls *ls); 31 + void dlm_recover_rsbs(struct dlm_ls *ls); 32 + 33 + #endif /* __RECOVER_DOT_H__ */ 34 +
+285
fs/dlm/recoverd.c
/******************************************************************************
*******************************************************************************
**
**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
**
**  This copyrighted material is made available to anyone wishing to use,
**  modify, copy, or redistribute it subject to the terms and conditions
**  of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/

/* dlm_recoverd: per-lockspace kernel thread that runs the lockspace
   recovery sequence whenever a membership change has been staged
   (ls_recover_args set and LSFL_WORK raised by dlm_recoverd_kick). */

#include "dlm_internal.h"
#include "lockspace.h"
#include "member.h"
#include "dir.h"
#include "ast.h"
#include "recover.h"
#include "lowcomms.h"
#include "lock.h"
#include "requestqueue.h"
#include "recoverd.h"


/* If the start for which we're re-enabling locking (seq) has been superseded
   by a newer stop (ls_recover_seq), we need to leave locking disabled. */

static int enable_locking(struct dlm_ls *ls, uint64_t seq)
{
	int error = -EINTR;

	spin_lock(&ls->ls_recover_lock);
	if (ls->ls_recover_seq == seq) {
		set_bit(LSFL_RUNNING, &ls->ls_flags);
		/* ls_in_recovery was write-locked when recovery started;
		   releasing it lets normal locking proceed again */
		up_write(&ls->ls_in_recovery);
		error = 0;
	}
	spin_unlock(&ls->ls_recover_lock);
	return error;
}

/* Run one complete recovery pass for membership event rv.  Serialized by
   ls_recoverd_active (also taken by dlm_recoverd_suspend).  Returns 0 on
   success; any phase returning an error (commonly -EINTR when a newer
   stop supersedes this start) aborts the pass at the fail label. */

static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
{
	unsigned long start;
	int error, neg = 0;

	log_debug(ls, "recover %"PRIx64"", rv->seq);

	down(&ls->ls_recoverd_active);

	/*
	 * Suspending and resuming dlm_astd ensures that no lkb's from this ls
	 * will be processed by dlm_astd during recovery.
	 */

	dlm_astd_suspend();
	dlm_astd_resume();

	/*
	 * This list of root rsb's will be the basis of most of the recovery
	 * routines.
	 */

	dlm_create_root_list(ls);

	/*
	 * Free all the tossed rsb's so we don't have to recover them.
	 */

	dlm_clear_toss_list(ls);

	/*
	 * Add or remove nodes from the lockspace's ls_nodes list.
	 * Also waits for all nodes to complete dlm_recover_members.
	 */

	error = dlm_recover_members(ls, rv, &neg);
	if (error) {
		log_error(ls, "recover_members failed %d", error);
		goto fail;
	}
	/* start the elapsed-time clock after the members barrier */
	start = jiffies;

	/*
	 * Rebuild our own share of the directory by collecting from all other
	 * nodes their master rsb names that hash to us.
	 */

	error = dlm_recover_directory(ls);
	if (error) {
		log_error(ls, "recover_directory failed %d", error);
		goto fail;
	}

	/*
	 * Purge directory-related requests that are saved in requestqueue.
	 * All dir requests from before recovery are invalid now due to the dir
	 * rebuild and will be resent by the requesting nodes.
	 */

	dlm_purge_requestqueue(ls);

	/*
	 * Wait for all nodes to complete directory rebuild.
	 */

	error = dlm_recover_directory_wait(ls);
	if (error) {
		log_error(ls, "recover_directory_wait failed %d", error);
		goto fail;
	}

	/*
	 * We may have outstanding operations that are waiting for a reply from
	 * a failed node.  Mark these to be resent after recovery.  Unlock and
	 * cancel ops can just be completed.
	 */

	dlm_recover_waiters_pre(ls);

	/* bail out early if a newer stop has already been requested */
	error = dlm_recovery_stopped(ls);
	if (error)
		goto fail;

	/* remastering is only needed when nodes departed (neg) or when the
	   lockspace runs without a directory (masters change on recovery) */
	if (neg || dlm_no_directory(ls)) {
		/*
		 * Clear lkb's for departed nodes.
		 */

		dlm_purge_locks(ls);

		/*
		 * Get new master nodeid's for rsb's that were mastered on
		 * departed nodes.
		 */

		error = dlm_recover_masters(ls);
		if (error) {
			log_error(ls, "recover_masters failed %d", error);
			goto fail;
		}

		/*
		 * Send our locks on remastered rsb's to the new masters.
		 */

		error = dlm_recover_locks(ls);
		if (error) {
			log_error(ls, "recover_locks failed %d", error);
			goto fail;
		}

		error = dlm_recover_locks_wait(ls);
		if (error) {
			log_error(ls, "recover_locks_wait failed %d", error);
			goto fail;
		}

		/*
		 * Finalize state in master rsb's now that all locks can be
		 * checked.  This includes conversion resolution and lvb
		 * settings.
		 */

		dlm_recover_rsbs(ls);
	}

	dlm_release_root_list(ls);

	dlm_set_recover_status(ls, DLM_RS_DONE);
	error = dlm_recover_done_wait(ls);
	if (error) {
		log_error(ls, "recover_done_wait failed %d", error);
		goto fail;
	}

	dlm_clear_members_gone(ls);

	error = enable_locking(ls, rv->seq);
	if (error) {
		log_error(ls, "enable_locking failed %d", error);
		goto fail;
	}

	/* replay requests that arrived while locking was stopped */
	error = dlm_process_requestqueue(ls);
	if (error) {
		log_error(ls, "process_requestqueue failed %d", error);
		goto fail;
	}

	/* resend operations that were waiting on failed nodes */
	error = dlm_recover_waiters_post(ls);
	if (error) {
		log_error(ls, "recover_waiters_post failed %d", error);
		goto fail;
	}

	dlm_grant_after_purge(ls);

	/* let dlm_astd deliver the ast's queued during recovery */
	dlm_astd_wake();

	log_debug(ls, "recover %"PRIx64" done: %u ms", rv->seq,
		  jiffies_to_msecs(jiffies - start));
	up(&ls->ls_recoverd_active);

	return 0;

 fail:
	dlm_release_root_list(ls);
	log_debug(ls, "recover %"PRIx64" error %d", rv->seq, error);
	up(&ls->ls_recoverd_active);
	return error;
}

/* Take the staged recovery event (if any) and run it.  Clearing
   LSFL_RECOVERY_STOP under ls_recover_lock marks locking as no longer
   stopped for this event.  NOTE(review): ls_recover's return value is
   intentionally ignored here — a failed pass is retried when the next
   event is kicked; confirm against member.c callers. */

static void do_ls_recovery(struct dlm_ls *ls)
{
	struct dlm_recover *rv = NULL;

	spin_lock(&ls->ls_recover_lock);
	rv = ls->ls_recover_args;
	ls->ls_recover_args = NULL;
	clear_bit(LSFL_RECOVERY_STOP, &ls->ls_flags);
	spin_unlock(&ls->ls_recover_lock);

	if (rv) {
		ls_recover(ls, rv);
		kfree(rv->nodeids);
		kfree(rv);
	}
}

/* Thread body.  The set_current_state/test_bit/schedule ordering pairs
   with dlm_recoverd_kick's set_bit + wake_up_process so a kick between
   the test and schedule() is not lost. */

static int dlm_recoverd(void *arg)
{
	struct dlm_ls *ls;

	ls = dlm_find_lockspace_local(arg);

	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!test_bit(LSFL_WORK, &ls->ls_flags))
			schedule();
		set_current_state(TASK_RUNNING);

		if (test_and_clear_bit(LSFL_WORK, &ls->ls_flags))
			do_ls_recovery(ls);
	}

	dlm_put_lockspace(ls);
	return 0;
}

/* Signal the thread that a new recovery event has been staged. */

void dlm_recoverd_kick(struct dlm_ls *ls)
{
	set_bit(LSFL_WORK, &ls->ls_flags);
	wake_up_process(ls->ls_recoverd_task);
}

/* Create the per-lockspace recovery thread; returns 0 or -errno. */

int dlm_recoverd_start(struct dlm_ls *ls)
{
	struct task_struct *p;
	int error = 0;

	p = kthread_run(dlm_recoverd, ls, "dlm_recoverd");
	if (IS_ERR(p))
		error = PTR_ERR(p);
	else
		ls->ls_recoverd_task = p;
	return error;
}

void dlm_recoverd_stop(struct dlm_ls *ls)
{
	kthread_stop(ls->ls_recoverd_task);
}

/* Block any in-progress/future recovery pass (holds ls_recoverd_active,
   the same semaphore ls_recover takes for the whole pass). */

void dlm_recoverd_suspend(struct dlm_ls *ls)
{
	down(&ls->ls_recoverd_active);
}

void dlm_recoverd_resume(struct dlm_ls *ls)
{
	up(&ls->ls_recoverd_active);
}
+24
fs/dlm/recoverd.h
/******************************************************************************
*******************************************************************************
**
**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
**
**  This copyrighted material is made available to anyone wishing to use,
**  modify, copy, or redistribute it subject to the terms and conditions
**  of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/

#ifndef __RECOVERD_DOT_H__
#define __RECOVERD_DOT_H__

/* Interface to the per-lockspace recovery thread (recoverd.c):
   kick schedules a staged recovery event; start/stop manage the kthread
   lifetime; suspend/resume serialize against a running recovery pass. */
void dlm_recoverd_kick(struct dlm_ls *ls);
void dlm_recoverd_stop(struct dlm_ls *ls);
int dlm_recoverd_start(struct dlm_ls *ls);
void dlm_recoverd_suspend(struct dlm_ls *ls);
void dlm_recoverd_resume(struct dlm_ls *ls);

#endif				/* __RECOVERD_DOT_H__ */
+184
fs/dlm/requestqueue.c
/******************************************************************************
*******************************************************************************
**
**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
**
**  This copyrighted material is made available to anyone wishing to use,
**  modify, copy, or redistribute it subject to the terms and conditions
**  of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/

#include "dlm_internal.h"
#include "member.h"
#include "lock.h"
#include "dir.h"
#include "config.h"
#include "requestqueue.h"

/* One saved incoming message; the full dlm_header-prefixed message is
   copied into the trailing request[] array ([1] is the pre-C99 trailing
   array idiom; actual size is sizeof(struct rq_entry) + h_length). */
struct rq_entry {
	struct list_head list;
	int nodeid;
	char request[1];
};

/*
 * Requests received while the lockspace is in recovery get added to the
 * request queue and processed when recovery is complete.  This happens when
 * the lockspace is suspended on some nodes before it is on others, or the
 * lockspace is enabled on some while still suspended on others.
 */

/* Save message hd from nodeid for replay after recovery.  Messages from
   nodes already removed from the lockspace are dropped.  NOTE(review):
   on kmalloc failure the message is silently dropped after logging —
   presumably the sender resends after recovery; confirm against the
   message protocol in lock.c. */
void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd)
{
	struct rq_entry *e;
	int length = hd->h_length;

	if (dlm_is_removed(ls, nodeid))
		return;

	e = kmalloc(sizeof(struct rq_entry) + length, GFP_KERNEL);
	if (!e) {
		log_print("dlm_add_requestqueue: out of memory\n");
		return;
	}

	e->nodeid = nodeid;
	memcpy(e->request, hd, length);

	down(&ls->ls_requestqueue_lock);
	list_add_tail(&e->list, &ls->ls_requestqueue);
	up(&ls->ls_requestqueue_lock);
}

/* Replay every saved message in order.  Returns 0 when the queue is
   drained, or -EINTR if locking was stopped again mid-replay (the pass
   is aborted and remaining entries stay queued).  The lock is dropped
   while the head entry is processed; the entry is still on the list at
   that point — assumes dlm_recoverd is the only dequeuer (TODO confirm). */
int dlm_process_requestqueue(struct dlm_ls *ls)
{
	struct rq_entry *e;
	struct dlm_header *hd;
	int error = 0;

	down(&ls->ls_requestqueue_lock);

	for (;;) {
		if (list_empty(&ls->ls_requestqueue)) {
			up(&ls->ls_requestqueue_lock);
			error = 0;
			break;
		}
		e = list_entry(ls->ls_requestqueue.next, struct rq_entry, list);
		up(&ls->ls_requestqueue_lock);

		hd = (struct dlm_header *) e->request;
		error = dlm_receive_message(hd, e->nodeid, TRUE);

		if (error == -EINTR) {
			/* entry is left on requestqueue */
			log_debug(ls, "process_requestqueue abort eintr");
			break;
		}

		down(&ls->ls_requestqueue_lock);
		list_del(&e->list);
		kfree(e);

		if (dlm_locking_stopped(ls)) {
			log_debug(ls, "process_requestqueue abort running");
			up(&ls->ls_requestqueue_lock);
			error = -EINTR;
			break;
		}
		/* yield between messages so a long queue doesn't hog the cpu */
		schedule();
	}

	return error;
}

/*
 * After recovery is done, locking is resumed and dlm_recoverd takes all the
 * saved requests and processes them as they would have been by dlm_recvd.  At
 * the same time, dlm_recvd will start receiving new requests from remote
 * nodes.  We want to delay dlm_recvd processing new requests until
 * dlm_recoverd has finished processing the old saved requests.
 */

/* Busy-wait (with schedule) until the requestqueue is empty or locking
   has been stopped again.  Both exit paths break out of the loop still
   holding ls_requestqueue_lock; the final up() releases it. */
void dlm_wait_requestqueue(struct dlm_ls *ls)
{
	for (;;) {
		down(&ls->ls_requestqueue_lock);
		if (list_empty(&ls->ls_requestqueue))
			break;
		if (dlm_locking_stopped(ls))
			break;
		up(&ls->ls_requestqueue_lock);
		schedule();
	}
	up(&ls->ls_requestqueue_lock);
}

/* Decide whether a saved message is invalidated by recovery.  Returns
   nonzero if it should be discarded: messages from departed nodes,
   all directory ops (the directory is rebuilt and lookups resent), and
   — in no-directory mode — requests/replies tied to a defunct master. */
static int purge_request(struct dlm_ls *ls, struct dlm_message *ms, int nodeid)
{
	uint32_t type = ms->m_type;

	if (dlm_is_removed(ls, nodeid))
		return 1;

	/* directory operations are always purged because the directory is
	   always rebuilt during recovery and the lookups resent */

	if (type == DLM_MSG_REMOVE ||
	    type == DLM_MSG_LOOKUP ||
	    type == DLM_MSG_LOOKUP_REPLY)
		return 1;

	if (!dlm_no_directory(ls))
		return 0;

	/* with no directory, the master is likely to change as a part of
	   recovery; requests to/from the defunct master need to be purged */

	switch (type) {
	case DLM_MSG_REQUEST:
	case DLM_MSG_CONVERT:
	case DLM_MSG_UNLOCK:
	case DLM_MSG_CANCEL:
		/* we're no longer the master of this resource, the sender
		   will resend to the new master (see waiter_needs_recovery) */

		if (dlm_hash2nodeid(ls, ms->m_hash) != dlm_our_nodeid())
			return 1;
		break;

	case DLM_MSG_REQUEST_REPLY:
	case DLM_MSG_CONVERT_REPLY:
	case DLM_MSG_UNLOCK_REPLY:
	case DLM_MSG_CANCEL_REPLY:
	case DLM_MSG_GRANT:
		/* this reply is from the former master of the resource,
		   we'll resend to the new master if needed */

		if (dlm_hash2nodeid(ls, ms->m_hash) != nodeid)
			return 1;
		break;
	}

	return 0;
}

/* Drop every queued message that recovery has made invalid (see
   purge_request); called from ls_recover after the directory rebuild. */
void dlm_purge_requestqueue(struct dlm_ls *ls)
{
	struct dlm_message *ms;
	struct rq_entry *e, *safe;

	down(&ls->ls_requestqueue_lock);
	list_for_each_entry_safe(e, safe, &ls->ls_requestqueue, list) {
		ms = (struct dlm_message *) e->request;

		if (purge_request(ls, ms, e->nodeid)) {
			list_del(&e->list);
			kfree(e);
		}
	}
	up(&ls->ls_requestqueue_lock);
}
+22
fs/dlm/requestqueue.h
/******************************************************************************
*******************************************************************************
**
**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
**
**  This copyrighted material is made available to anyone wishing to use,
**  modify, copy, or redistribute it subject to the terms and conditions
**  of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/

#ifndef __REQUESTQUEUE_DOT_H__
#define __REQUESTQUEUE_DOT_H__

/* Queue of remote messages received while the lockspace was in recovery;
   add saves a message, process replays them afterwards, wait blocks until
   the queue drains, purge drops entries invalidated by recovery. */
void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_header *hd);
int dlm_process_requestqueue(struct dlm_ls *ls);
void dlm_wait_requestqueue(struct dlm_ls *ls);
void dlm_purge_requestqueue(struct dlm_ls *ls);

#endif
+173
fs/dlm/util.c
/******************************************************************************
*******************************************************************************
**
**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
**
**  This copyrighted material is made available to anyone wishing to use,
**  modify, copy, or redistribute it subject to the terms and conditions
**  of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/

/* Wire-format byte-order conversion for dlm messages and recovery
   (rcom) messages.  The on-wire format is little-endian; the *_out
   functions convert cpu -> le in place before sending, the *_in
   functions convert le -> cpu in place after receiving. */

#include "dlm_internal.h"
#include "rcom.h"
#include "util.h"

/* Convert the common header (shared by dlm_message and dlm_rcom) to
   little-endian in place. */
static void header_out(struct dlm_header *hd)
{
	hd->h_version = cpu_to_le32(hd->h_version);
	hd->h_lockspace = cpu_to_le32(hd->h_lockspace);
	hd->h_nodeid = cpu_to_le32(hd->h_nodeid);
	hd->h_length = cpu_to_le16(hd->h_length);
}

/* Inverse of header_out. */
static void header_in(struct dlm_header *hd)
{
	hd->h_version = le32_to_cpu(hd->h_version);
	hd->h_lockspace = le32_to_cpu(hd->h_lockspace);
	hd->h_nodeid = le32_to_cpu(hd->h_nodeid);
	hd->h_length = le16_to_cpu(hd->h_length);
}

/* Convert an outgoing lock-operation message (header + every field) to
   little-endian in place. */
void dlm_message_out(struct dlm_message *ms)
{
	struct dlm_header *hd = (struct dlm_header *) ms;

	header_out(hd);

	ms->m_type = cpu_to_le32(ms->m_type);
	ms->m_nodeid = cpu_to_le32(ms->m_nodeid);
	ms->m_pid = cpu_to_le32(ms->m_pid);
	ms->m_lkid = cpu_to_le32(ms->m_lkid);
	ms->m_remid = cpu_to_le32(ms->m_remid);
	ms->m_parent_lkid = cpu_to_le32(ms->m_parent_lkid);
	ms->m_parent_remid = cpu_to_le32(ms->m_parent_remid);
	ms->m_exflags = cpu_to_le32(ms->m_exflags);
	ms->m_sbflags = cpu_to_le32(ms->m_sbflags);
	ms->m_flags = cpu_to_le32(ms->m_flags);
	ms->m_lvbseq = cpu_to_le32(ms->m_lvbseq);
	ms->m_hash = cpu_to_le32(ms->m_hash);
	ms->m_status = cpu_to_le32(ms->m_status);
	ms->m_grmode = cpu_to_le32(ms->m_grmode);
	ms->m_rqmode = cpu_to_le32(ms->m_rqmode);
	ms->m_bastmode = cpu_to_le32(ms->m_bastmode);
	ms->m_asts = cpu_to_le32(ms->m_asts);
	ms->m_result = cpu_to_le32(ms->m_result);
	ms->m_range[0] = cpu_to_le64(ms->m_range[0]);
	ms->m_range[1] = cpu_to_le64(ms->m_range[1]);
}

/* Inverse of dlm_message_out for an incoming message. */
void dlm_message_in(struct dlm_message *ms)
{
	struct dlm_header *hd = (struct dlm_header *) ms;

	header_in(hd);

	ms->m_type = le32_to_cpu(ms->m_type);
	ms->m_nodeid = le32_to_cpu(ms->m_nodeid);
	ms->m_pid = le32_to_cpu(ms->m_pid);
	ms->m_lkid = le32_to_cpu(ms->m_lkid);
	ms->m_remid = le32_to_cpu(ms->m_remid);
	ms->m_parent_lkid = le32_to_cpu(ms->m_parent_lkid);
	ms->m_parent_remid = le32_to_cpu(ms->m_parent_remid);
	ms->m_exflags = le32_to_cpu(ms->m_exflags);
	ms->m_sbflags = le32_to_cpu(ms->m_sbflags);
	ms->m_flags = le32_to_cpu(ms->m_flags);
	ms->m_lvbseq = le32_to_cpu(ms->m_lvbseq);
	ms->m_hash = le32_to_cpu(ms->m_hash);
	ms->m_status = le32_to_cpu(ms->m_status);
	ms->m_grmode = le32_to_cpu(ms->m_grmode);
	ms->m_rqmode = le32_to_cpu(ms->m_rqmode);
	ms->m_bastmode = le32_to_cpu(ms->m_bastmode);
	ms->m_asts = le32_to_cpu(ms->m_asts);
	ms->m_result = le32_to_cpu(ms->m_result);
	ms->m_range[0] = le64_to_cpu(ms->m_range[0]);
	ms->m_range[1] = le64_to_cpu(ms->m_range[1]);
}

/* Convert the rcom_lock payload carried inside a DLM_RCOM_LOCK message. */
static void rcom_lock_out(struct rcom_lock *rl)
{
	rl->rl_ownpid = cpu_to_le32(rl->rl_ownpid);
	rl->rl_lkid = cpu_to_le32(rl->rl_lkid);
	rl->rl_remid = cpu_to_le32(rl->rl_remid);
	rl->rl_parent_lkid = cpu_to_le32(rl->rl_parent_lkid);
	rl->rl_parent_remid = cpu_to_le32(rl->rl_parent_remid);
	rl->rl_exflags = cpu_to_le32(rl->rl_exflags);
	rl->rl_flags = cpu_to_le32(rl->rl_flags);
	rl->rl_lvbseq = cpu_to_le32(rl->rl_lvbseq);
	rl->rl_result = cpu_to_le32(rl->rl_result);
	rl->rl_wait_type = cpu_to_le16(rl->rl_wait_type);
	rl->rl_namelen = cpu_to_le16(rl->rl_namelen);
	rl->rl_range[0] = cpu_to_le64(rl->rl_range[0]);
	rl->rl_range[1] = cpu_to_le64(rl->rl_range[1]);
	rl->rl_range[2] = cpu_to_le64(rl->rl_range[2]);
	rl->rl_range[3] = cpu_to_le64(rl->rl_range[3]);
}

/* Inverse of rcom_lock_out. */
static void rcom_lock_in(struct rcom_lock *rl)
{
	rl->rl_ownpid = le32_to_cpu(rl->rl_ownpid);
	rl->rl_lkid = le32_to_cpu(rl->rl_lkid);
	rl->rl_remid = le32_to_cpu(rl->rl_remid);
	rl->rl_parent_lkid = le32_to_cpu(rl->rl_parent_lkid);
	rl->rl_parent_remid = le32_to_cpu(rl->rl_parent_remid);
	rl->rl_exflags = le32_to_cpu(rl->rl_exflags);
	rl->rl_flags = le32_to_cpu(rl->rl_flags);
	rl->rl_lvbseq = le32_to_cpu(rl->rl_lvbseq);
	rl->rl_result = le32_to_cpu(rl->rl_result);
	rl->rl_wait_type = le16_to_cpu(rl->rl_wait_type);
	rl->rl_namelen = le16_to_cpu(rl->rl_namelen);
	rl->rl_range[0] = le64_to_cpu(rl->rl_range[0]);
	rl->rl_range[1] = le64_to_cpu(rl->rl_range[1]);
	rl->rl_range[2] = le64_to_cpu(rl->rl_range[2]);
	rl->rl_range[3] = le64_to_cpu(rl->rl_range[3]);
}

/* Convert the rcom_config payload carried in a STATUS_REPLY message. */
static void rcom_config_out(struct rcom_config *rf)
{
	rf->rf_lvblen = cpu_to_le32(rf->rf_lvblen);
	rf->rf_lsflags = cpu_to_le32(rf->rf_lsflags);
}

/* Inverse of rcom_config_out. */
static void rcom_config_in(struct rcom_config *rf)
{
	rf->rf_lvblen = le32_to_cpu(rf->rf_lvblen);
	rf->rf_lsflags = le32_to_cpu(rf->rf_lsflags);
}

/* Convert an outgoing recovery message; rc_type is read before it is
   byte-swapped so the right payload converter can be chosen. */
void dlm_rcom_out(struct dlm_rcom *rc)
{
	struct dlm_header *hd = (struct dlm_header *) rc;
	int type = rc->rc_type;

	header_out(hd);

	rc->rc_type = cpu_to_le32(rc->rc_type);
	rc->rc_result = cpu_to_le32(rc->rc_result);
	rc->rc_id = cpu_to_le64(rc->rc_id);

	if (type == DLM_RCOM_LOCK)
		rcom_lock_out((struct rcom_lock *) rc->rc_buf);

	else if (type == DLM_RCOM_STATUS_REPLY)
		rcom_config_out((struct rcom_config *) rc->rc_buf);
}

/* Inverse of dlm_rcom_out; here rc_type can be read after conversion. */
void dlm_rcom_in(struct dlm_rcom *rc)
{
	struct dlm_header *hd = (struct dlm_header *) rc;

	header_in(hd);

	rc->rc_type = le32_to_cpu(rc->rc_type);
	rc->rc_result = le32_to_cpu(rc->rc_result);
	rc->rc_id = le64_to_cpu(rc->rc_id);

	if (rc->rc_type == DLM_RCOM_LOCK)
		rcom_lock_in((struct rcom_lock *) rc->rc_buf);

	else if (rc->rc_type == DLM_RCOM_STATUS_REPLY)
		rcom_config_in((struct rcom_config *) rc->rc_buf);
}
+22
fs/dlm/util.h
/******************************************************************************
*******************************************************************************
**
**  Copyright (C) 2005 Red Hat, Inc.  All rights reserved.
**
**  This copyrighted material is made available to anyone wishing to use,
**  modify, copy, or redistribute it subject to the terms and conditions
**  of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/

#ifndef __UTIL_DOT_H__
#define __UTIL_DOT_H__

/* In-place byte-order conversion between cpu and on-wire (little-endian)
   format; *_out before sending, *_in after receiving (see util.c). */
void dlm_message_out(struct dlm_message *ms);
void dlm_message_in(struct dlm_message *ms);
void dlm_rcom_out(struct dlm_rcom *rc);
void dlm_rcom_in(struct dlm_rcom *rc);

#endif
+312
include/linux/dlm.h
/******************************************************************************
*******************************************************************************
**
**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
**
**  This copyrighted material is made available to anyone wishing to use,
**  modify, copy, or redistribute it subject to the terms and conditions
**  of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/

#ifndef __DLM_DOT_H__
#define __DLM_DOT_H__

/*
 * Interface to Distributed Lock Manager (DLM)
 * routines and structures to use DLM lockspaces
 */

/*
 * Lock Modes
 */

#define DLM_LOCK_IV		-1	/* invalid */
#define DLM_LOCK_NL		0	/* null */
#define DLM_LOCK_CR		1	/* concurrent read */
#define DLM_LOCK_CW		2	/* concurrent write */
#define DLM_LOCK_PR		3	/* protected read */
#define DLM_LOCK_PW		4	/* protected write */
#define DLM_LOCK_EX		5	/* exclusive */

/*
 * Maximum size in bytes of a dlm_lock name
 */

#define DLM_RESNAME_MAXLEN	64

/*
 * Flags to dlm_lock
 *
 * DLM_LKF_NOQUEUE
 *
 * Do not queue the lock request on the wait queue if it cannot be granted
 * immediately.  If the lock cannot be granted because of this flag, DLM will
 * either return -EAGAIN from the dlm_lock call or will return 0 from
 * dlm_lock and -EAGAIN in the lock status block when the AST is executed.
 *
 * DLM_LKF_CANCEL
 *
 * Used to cancel a pending lock request or conversion.  A converting lock is
 * returned to its previously granted mode.
 *
 * DLM_LKF_CONVERT
 *
 * Indicates a lock conversion request.  For conversions the name and namelen
 * are ignored and the lock ID in the LKSB is used to identify the lock.
 *
 * DLM_LKF_VALBLK
 *
 * Requests DLM to return the current contents of the lock value block in the
 * lock status block.  When this flag is set in a lock conversion from PW or EX
 * modes, DLM assigns the value specified in the lock status block to the lock
 * value block of the lock resource.  The LVB is a DLM_LVB_LEN size array
 * containing application-specific information.
 *
 * DLM_LKF_QUECVT
 *
 * Force a conversion request to be queued, even if it is compatible with
 * the granted modes of other locks on the same resource.
 *
 * DLM_LKF_IVVALBLK
 *
 * Invalidate the lock value block.
 *
 * DLM_LKF_CONVDEADLK
 *
 * Allows the dlm to resolve conversion deadlocks internally by demoting the
 * granted mode of a converting lock to NL.  The DLM_SBF_DEMOTED flag is
 * returned for a conversion that's been effected by this.
 *
 * DLM_LKF_PERSISTENT
 *
 * Only relevant to locks originating in userspace.  A persistent lock will not
 * be removed if the process holding the lock exits.
 *
 * DLM_LKF_NODLCKWT
 * DLM_LKF_NODLCKBLK
 *
 * not yet implemented
 *
 * DLM_LKF_EXPEDITE
 *
 * Used only with new requests for NL mode locks.  Tells the lock manager
 * to grant the lock, ignoring other locks in convert and wait queues.
 *
 * DLM_LKF_NOQUEUEBAST
 *
 * Send blocking AST's before returning -EAGAIN to the caller.  It is only
 * used along with the NOQUEUE flag.  Blocking AST's are not sent for failed
 * NOQUEUE requests otherwise.
 *
 * DLM_LKF_HEADQUE
 *
 * Add a lock to the head of the convert or wait queue rather than the tail.
 *
 * DLM_LKF_NOORDER
 *
 * Disregard the standard grant order rules and grant a lock as soon as it
 * is compatible with other granted locks.
 *
 * DLM_LKF_ORPHAN
 *
 * not yet implemented
 *
 * DLM_LKF_ALTPR
 *
 * If the requested mode cannot be granted immediately, try to grant the lock
 * in PR mode instead.  If this alternate mode is granted instead of the
 * requested mode, DLM_SBF_ALTMODE is returned in the lksb.
 *
 * DLM_LKF_ALTCW
 *
 * The same as ALTPR, but the alternate mode is CW.
 *
 * DLM_LKF_FORCEUNLOCK
 *
 * Unlock the lock even if it is converting or waiting or has sublocks.
 * Only really for use by the userland device.c code.
 *
 */

#define DLM_LKF_NOQUEUE		0x00000001
#define DLM_LKF_CANCEL		0x00000002
#define DLM_LKF_CONVERT		0x00000004
#define DLM_LKF_VALBLK		0x00000008
#define DLM_LKF_QUECVT		0x00000010
#define DLM_LKF_IVVALBLK	0x00000020
#define DLM_LKF_CONVDEADLK	0x00000040
#define DLM_LKF_PERSISTENT	0x00000080
#define DLM_LKF_NODLCKWT	0x00000100
#define DLM_LKF_NODLCKBLK	0x00000200
#define DLM_LKF_EXPEDITE	0x00000400
#define DLM_LKF_NOQUEUEBAST	0x00000800
#define DLM_LKF_HEADQUE		0x00001000
#define DLM_LKF_NOORDER		0x00002000
#define DLM_LKF_ORPHAN		0x00004000
#define DLM_LKF_ALTPR		0x00008000
#define DLM_LKF_ALTCW		0x00010000
#define DLM_LKF_FORCEUNLOCK	0x00020000

/*
 * Some return codes that are not in errno.h
 */

#define DLM_ECANCEL		0x10001
#define DLM_EUNLOCK		0x10002

typedef void dlm_lockspace_t;

/*
 * Lock range structure
 */

struct dlm_range {
	uint64_t ra_start;
	uint64_t ra_end;
};

/*
 * Lock status block
 *
 * Use this structure to specify the contents of the lock value block.  For a
 * conversion request, this structure is used to specify the lock ID of the
 * lock.  DLM writes the status of the lock request and the lock ID assigned
 * to the request in the lock status block.
 *
 * sb_lkid: the returned lock ID.  It is set on new (non-conversion) requests.
 * It is available when dlm_lock returns.
 *
 * sb_lvbptr: saves or returns the contents of the lock's LVB according to rules
 * shown for the DLM_LKF_VALBLK flag.
 *
 * sb_flags: DLM_SBF_DEMOTED is returned if in the process of promoting a lock,
 * it was first demoted to NL to avoid conversion deadlock.
 * DLM_SBF_VALNOTVALID is returned if the resource's LVB is marked invalid.
 *
 * sb_status: the returned status of the lock request set prior to AST
 * execution.  Possible return values:
 *
 * 0 if lock request was successful
 * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
 * -ENOMEM if there is no memory to process request
 * -EINVAL if there are invalid parameters
 * -DLM_EUNLOCK if unlock request was successful
 * -DLM_ECANCEL if a cancel completed successfully
 */

#define DLM_SBF_DEMOTED		0x01
#define DLM_SBF_VALNOTVALID	0x02
#define DLM_SBF_ALTMODE		0x04

struct dlm_lksb {
	int 	 sb_status;
	uint32_t sb_lkid;
	char 	 sb_flags;
	char *	 sb_lvbptr;
};


#ifdef __KERNEL__

#define DLM_LSFL_NODIR		0x00000001

/*
 * dlm_new_lockspace
 *
 * Starts a lockspace with the given name.  If the named lockspace exists in
 * the cluster, the calling node joins it.
 */

int dlm_new_lockspace(char *name, int namelen, dlm_lockspace_t **lockspace,
		      uint32_t flags, int lvblen);

/*
 * dlm_release_lockspace
 *
 * Stop a lockspace.
 */

int dlm_release_lockspace(dlm_lockspace_t *lockspace, int force);

/*
 * dlm_lock
 *
 * Make an asynchronous request to acquire or convert a lock on a named
 * resource.
 *
 * lockspace: context for the request
 * mode: the requested mode of the lock (DLM_LOCK_)
 * lksb: lock status block for input and async return values
 * flags: input flags (DLM_LKF_)
 * name: name of the resource to lock, can be binary
 * namelen: the length in bytes of the resource name (DLM_RESNAME_MAXLEN)
 * parent: the lock ID of a parent lock or 0 if none
 * lockast: function DLM executes when it completes processing the request
 * astarg: argument passed to lockast and bast functions
 * bast: function DLM executes when this lock later blocks another request
 *
 * Returns:
 * 0 if request is successfully queued for processing
 * -EINVAL if any input parameters are invalid
 * -EAGAIN if request would block and is flagged DLM_LKF_NOQUEUE
 * -ENOMEM if there is no memory to process request
 * -ENOTCONN if there is a communication error
 *
 * If the call to dlm_lock returns an error then the operation has failed and
 * the AST routine will not be called.  If dlm_lock returns 0 it is still
 * possible that the lock operation will fail. The AST routine will be called
 * when the locking is complete and the status is returned in the lksb.
 *
 * If the AST routines or parameter are passed to a conversion operation then
 * they will overwrite those values that were passed to a previous dlm_lock
 * call.
 *
 * AST routines should not block (at least not for long), but may make
 * any locking calls they please.
 */

int dlm_lock(dlm_lockspace_t *lockspace,
	     int mode,
	     struct dlm_lksb *lksb,
	     uint32_t flags,
	     void *name,
	     unsigned int namelen,
	     uint32_t parent_lkid,
	     void (*lockast) (void *astarg),
	     void *astarg,
	     void (*bast) (void *astarg, int mode),
	     struct dlm_range *range);

/*
 * dlm_unlock
 *
 * Asynchronously release a lock on a resource.  The AST routine is called
 * when the resource is successfully unlocked.
 *
 * lockspace: context for the request
 * lkid: the lock ID as returned in the lksb
 * flags: input flags (DLM_LKF_)
 * lksb: if NULL the lksb parameter passed to last lock request is used
 * astarg: the arg used with the completion ast for the unlock
 *
 * Returns:
 * 0 if request is successfully queued for processing
 * -EINVAL if any input parameters are invalid
 * -ENOTEMPTY if the lock still has sublocks
 * -EBUSY if the lock is waiting for a remote lock operation
 * -ENOTCONN if there is a communication error
 */

int dlm_unlock(dlm_lockspace_t *lockspace,
	       uint32_t lkid,
	       uint32_t flags,
	       struct dlm_lksb *lksb,
	       void *astarg);

#endif				/* __KERNEL__ */

#endif				/* __DLM_DOT_H__ */
+84
include/linux/dlm_device.h
/******************************************************************************
*******************************************************************************
**
**  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
**  Copyright (C) 2004-2005 Red Hat, Inc.  All rights reserved.
**
**  This copyrighted material is made available to anyone wishing to use,
**  modify, copy, or redistribute it subject to the terms and conditions
**  of the GNU General Public License v.2.
**
*******************************************************************************
******************************************************************************/

/* This is the device interface for dlm, most users will use a library
 * interface.
 */

#define DLM_USER_LVB_LEN	32

/* Version of the device interface */
#define DLM_DEVICE_VERSION_MAJOR 3
#define DLM_DEVICE_VERSION_MINOR 0
#define DLM_DEVICE_VERSION_PATCH 0

/* struct passed to the lock write */
/* NOTE(review): these structs form the kernel/userspace ABI shared with
   libdlm — field layout must not change.  name[1] is the pre-C99
   trailing-array idiom; the actual name follows the struct. */
struct dlm_lock_params {
	__u8 mode;
	__u16 flags;
	__u32 lkid;
	__u32 parent;
	struct dlm_range range;
	__u8 namelen;
	/* userspace pointers passed back to the library in dlm_lock_result */
	void __user *castparam;
	void __user *castaddr;
	void __user *bastparam;
	void __user *bastaddr;
	struct dlm_lksb __user *lksb;
	char lvb[DLM_USER_LVB_LEN];
	char name[1];
};

/* struct passed for lockspace create/remove commands */
struct dlm_lspace_params {
	__u32 flags;
	__u32 minor;
	char name[1];
};

/* every write to the device starts with this: interface version, a
   DLM_USER_* command byte, then the command-specific parameters */
struct dlm_write_request {
	__u32 version[3];
	__u8 cmd;

	union  {
		struct dlm_lock_params   lock;
		struct dlm_lspace_params lspace;
	} i;
};

/* struct read from the "device" fd,
   consists mainly of userspace pointers for the library to use */
struct dlm_lock_result {
	__u32 length;		/* total length of this result record */
	void __user * user_astaddr;
	void __user * user_astparam;
	struct dlm_lksb __user * user_lksb;
	struct dlm_lksb lksb;
	__u8 bast_mode;
	/* Offsets may be zero if no data is present */
	__u32 lvb_offset;
};

/* Commands passed to the device */
#define DLM_USER_LOCK         1
#define DLM_USER_UNLOCK       2
#define DLM_USER_QUERY        3
#define DLM_USER_CREATE_LOCKSPACE  4
#define DLM_USER_REMOVE_LOCKSPACE  5

/* Arbitrary length restriction */
#define MAX_LS_NAME_LEN 64

/* Lockspace flags */
#define DLM_USER_LSFLG_AUTOFREE   1
#define DLM_USER_LSFLG_FORCEFREE  2