Linux kernel mirror (for testing) git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v3.8-rc6
/*
 * In-kernel transcendent memory (generic implementation)
 *
 * Copyright (c) 2009-2012, Dan Magenheimer, Oracle Corp.
 *
 * The primary purpose of Transcendent Memory ("tmem") is to map object-oriented
 * "handles" (triples containing a pool id, an object id, and an index) to
 * pages in a page-accessible memory (PAM).  Tmem references the PAM pages via
 * an abstract "pampd" (PAM page-descriptor), which can be operated on by a
 * set of functions (pamops).  Each pampd contains some representation of
 * PAGE_SIZE bytes worth of data.  For those familiar with key-value stores,
 * the tmem handle is a three-level hierarchical key, and the value is always
 * reconstituted (but not necessarily stored) as PAGE_SIZE bytes and is
 * referenced in the datastore by the pampd.  The hierarchy is required
 * to ensure that certain invalidation functions can be performed efficiently
 * (i.e. flush all indexes associated with this object_id, or
 * flush all objects associated with this pool).
 *
 * Tmem must support potentially millions of pages and must be able to insert,
 * find, and delete these pages at a potential frequency of thousands per
 * second concurrently across many CPUs (and, if used with KVM, across many
 * vcpus across many guests).  Tmem is tracked with a hierarchy of data
 * structures, organized by the elements in the handle-tuple: pool_id,
 * object_id, and page index.  One or more "clients" (e.g. guests) each
 * provide one or more tmem_pools.  Each pool contains a hash table of
 * rb_trees of tmem_objs.  Each tmem_obj contains a radix-tree-like tree
 * of pointers, with intermediate nodes called tmem_objnodes.  Each leaf
 * pointer in this tree points to a pampd, which is accessible only through
 * a small set of callbacks registered by the PAM implementation (see
 * tmem_register_pamops).  Tmem only needs memory allocation for objs
 * and objnodes, and this is done via a set of callbacks that must be
 * registered by the tmem host implementation (e.g. see tmem_register_hostops).
 */

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#ifdef CONFIG_RAMSTER
#include <linux/delay.h>
#endif

#include "tmem.h"

/* data structure sentinels used for debugging... see tmem.h */
#define POOL_SENTINEL 0x87658765
#define OBJ_SENTINEL 0x12345678
#define OBJNODE_SENTINEL 0xfedcba09

/*
 * A tmem host implementation must use this function to register callbacks
 * for memory allocation.
 */
static struct tmem_hostops tmem_hostops;

static void tmem_objnode_tree_init(void);

void tmem_register_hostops(struct tmem_hostops *m)
{
	tmem_objnode_tree_init();
	tmem_hostops = *m;
}

/*
 * A tmem host implementation must use this function to register
 * callbacks for a page-accessible memory (PAM) implementation.
 */
static struct tmem_pamops tmem_pamops;

void tmem_register_pamops(struct tmem_pamops *m)
{
	tmem_pamops = *m;
}
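/*
 * Illustrative sketch (not part of the original source): a minimal host
 * backend would register its allocators roughly like this.  The helper
 * names (my_obj_alloc etc.) are hypothetical; the tmem_hostops fields
 * are the ones this file invokes (obj_alloc, obj_free, objnode_alloc,
 * objnode_free):
 *
 *	static struct tmem_hostops my_hostops = {
 *		.obj_alloc = my_obj_alloc,
 *		.obj_free = my_obj_free,
 *		.objnode_alloc = my_objnode_alloc,
 *		.objnode_free = my_objnode_free,
 *	};
 *
 *	tmem_register_hostops(&my_hostops);
 *	tmem_register_pamops(&my_pamops);	// filled in similarly
 */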
/*
 * Oids are potentially very sparse and tmem_objs may have an indeterminately
 * short life, being added and deleted at a relatively high frequency.
 * So an rb_tree is an ideal data structure to manage tmem_objs.  But because
 * of the potentially huge number of tmem_objs, each pool manages a hashtable
 * of rb_trees to reduce search, insert, delete, and rebalancing time.
 * Each hashbucket also has a lock to manage concurrent access and no
 * searches, inserts, or deletions can be performed unless the lock is held.
 * As a result, care must be taken to ensure tmem routines are not called
 * recursively; the vast majority of the time, a recursive call may work
 * but a deadlock will occur a small fraction of the time due to the
 * hashbucket lock.
 *
 * The following routines manage tmem_objs.  In all of these routines,
 * the hashbucket lock is already held.
 */

/* Search for object==oid in pool, returns object if found. */
static struct tmem_obj *__tmem_obj_find(struct tmem_hashbucket *hb,
					struct tmem_oid *oidp,
					struct rb_node **parent,
					struct rb_node ***link)
{
	struct rb_node *_parent = NULL, **rbnode;
	struct tmem_obj *obj = NULL;

	rbnode = &hb->obj_rb_root.rb_node;
	while (*rbnode) {
		BUG_ON(RB_EMPTY_NODE(*rbnode));
		_parent = *rbnode;
		obj = rb_entry(*rbnode, struct tmem_obj, rb_tree_node);
		switch (tmem_oid_compare(oidp, &obj->oid)) {
		case 0: /* equal */
			goto out;
		case -1:
			rbnode = &(*rbnode)->rb_left;
			break;
		case 1:
			rbnode = &(*rbnode)->rb_right;
			break;
		}
	}

	if (parent)
		*parent = _parent;
	if (link)
		*link = rbnode;
	obj = NULL;
out:
	return obj;
}

static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb,
					struct tmem_oid *oidp)
{
	return __tmem_obj_find(hb, oidp, NULL, NULL);
}

static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *, bool);

/* Free an object that has no more pampds in it. */
static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
{
	struct tmem_pool *pool;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pampd_count > 0);
	pool = obj->pool;
	BUG_ON(pool == NULL);
	if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */
		tmem_pampd_destroy_all_in_obj(obj, false);
	BUG_ON(obj->objnode_tree_root != NULL);
	BUG_ON((long)obj->objnode_count != 0);
	atomic_dec(&pool->obj_count);
	BUG_ON(atomic_read(&pool->obj_count) < 0);
	INVERT_SENTINEL(obj, OBJ);
	obj->pool = NULL;
	tmem_oid_set_invalid(&obj->oid);
	rb_erase(&obj->rb_tree_node, &hb->obj_rb_root);
}

/*
 * Initialize and insert a tmem_object_root (called only if find failed).
 */
static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
			  struct tmem_pool *pool,
			  struct tmem_oid *oidp)
{
	struct rb_root *root = &hb->obj_rb_root;
	struct rb_node **new = NULL, *parent = NULL;

	BUG_ON(pool == NULL);
	atomic_inc(&pool->obj_count);
	obj->objnode_tree_height = 0;
	obj->objnode_tree_root = NULL;
	obj->pool = pool;
	obj->oid = *oidp;
	obj->objnode_count = 0;
	obj->pampd_count = 0;
#ifdef CONFIG_RAMSTER
	if (tmem_pamops.new_obj != NULL)
		(*tmem_pamops.new_obj)(obj);
#endif
	SET_SENTINEL(obj, OBJ);

	if (__tmem_obj_find(hb, oidp, &parent, &new))
		BUG();

	rb_link_node(&obj->rb_tree_node, parent, new);
	rb_insert_color(&obj->rb_tree_node, root);
}
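/*
 * Illustrative sketch (not part of the original source): every caller of
 * the object routines above follows the same lock discipline, which the
 * core operations later in this file implement:
 *
 *	struct tmem_hashbucket *hb;
 *
 *	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
 *	spin_lock(&hb->lock);
 *	obj = tmem_obj_find(hb, oidp);
 *	... search/insert/delete while holding hb->lock ...
 *	spin_unlock(&hb->lock);
 *
 * Because hb->lock is held across the whole operation, calling back into
 * tmem from inside one of these critical sections can self-deadlock, as
 * the comment above warns.
 */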
/*
 * Tmem is managed as a set of tmem_pools with certain attributes, such as
 * "ephemeral" vs "persistent".  These attributes apply to all tmem_objs
 * and all pampds that belong to a tmem_pool.  A tmem_pool is created
 * or deleted relatively rarely (for example, when a filesystem is
 * mounted or unmounted).
 */

/* flush all data from a pool and, optionally, free it */
static void tmem_pool_flush(struct tmem_pool *pool, bool destroy)
{
	struct rb_node *rbnode;
	struct tmem_obj *obj;
	struct tmem_hashbucket *hb = &pool->hashbucket[0];
	int i;

	BUG_ON(pool == NULL);
	for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
		spin_lock(&hb->lock);
		rbnode = rb_first(&hb->obj_rb_root);
		while (rbnode != NULL) {
			obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
			rbnode = rb_next(rbnode);
			tmem_pampd_destroy_all_in_obj(obj, true);
			tmem_obj_free(obj, hb);
			(*tmem_hostops.obj_free)(obj, pool);
		}
		spin_unlock(&hb->lock);
	}
	if (destroy)
		list_del(&pool->pool_list);
}

/*
 * A tmem_obj contains a radix-tree-like tree in which the intermediate
 * nodes are called tmem_objnodes.  (The kernel lib/radix-tree.c implementation
 * is very specialized and tuned for specific uses and is not particularly
 * suited for use from this code, though some code from the core algorithms has
 * been reused, thus the copyright notices below.)  Each tmem_objnode contains
 * a set of pointers which point to either a set of intermediate tmem_objnodes
 * or a set of pampds.
 *
 * Portions Copyright (C) 2001 Momchil Velikov
 * Portions Copyright (C) 2001 Christoph Hellwig
 * Portions Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
 */

struct tmem_objnode_tree_path {
	struct tmem_objnode *objnode;
	int offset;
};

/* objnode height_to_maxindex translation */
static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1];

static void tmem_objnode_tree_init(void)
{
	unsigned int ht, tmp;

	for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) {
		tmp = ht * OBJNODE_TREE_MAP_SHIFT;
		if (tmp >= OBJNODE_TREE_INDEX_BITS)
			tmem_objnode_tree_h2max[ht] = ~0UL;
		else
			tmem_objnode_tree_h2max[ht] =
			    (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1;
	}
}
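/*
 * Worked example (not in the original source), assuming
 * OBJNODE_TREE_MAP_SHIFT == 6 (a plausible value; the actual constant
 * comes from tmem.h):
 *
 *	h2max[0] == 0      a height-0 "tree" is one pampd at index 0
 *	h2max[1] == 63     one objnode, 64 slots, 6 index bits
 *	h2max[2] == 4095   two levels, 12 index bits
 *
 * i.e. each additional level of objnodes widens the indexable range by
 * OBJNODE_TREE_MAP_SHIFT bits, and h2max[ht] is the largest index a
 * tree of height ht can hold.
 */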
static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj)
{
	struct tmem_objnode *objnode;

	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);
	objnode = (*tmem_hostops.objnode_alloc)(obj->pool);
	if (unlikely(objnode == NULL))
		goto out;
	objnode->obj = obj;
	SET_SENTINEL(objnode, OBJNODE);
	memset(&objnode->slots, 0, sizeof(objnode->slots));
	objnode->slots_in_use = 0;
	obj->objnode_count++;
out:
	return objnode;
}

static void tmem_objnode_free(struct tmem_objnode *objnode)
{
	struct tmem_pool *pool;
	int i;

	BUG_ON(objnode == NULL);
	for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++)
		BUG_ON(objnode->slots[i] != NULL);
	ASSERT_SENTINEL(objnode, OBJNODE);
	INVERT_SENTINEL(objnode, OBJNODE);
	BUG_ON(objnode->obj == NULL);
	ASSERT_SENTINEL(objnode->obj, OBJ);
	pool = objnode->obj->pool;
	BUG_ON(pool == NULL);
	ASSERT_SENTINEL(pool, POOL);
	objnode->obj->objnode_count--;
	objnode->obj = NULL;
	(*tmem_hostops.objnode_free)(objnode, pool);
}

/*
 * Lookup index in object and return associated pampd (or NULL if not found).
 */
static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
{
	unsigned int height, shift;
	struct tmem_objnode **slot = NULL;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);

	height = obj->objnode_tree_height;
	if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height])
		goto out;
	if (height == 0 && obj->objnode_tree_root) {
		slot = &obj->objnode_tree_root;
		goto out;
	}
	shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
	slot = &obj->objnode_tree_root;
	while (height > 0) {
		if (*slot == NULL)
			goto out;
		slot = (struct tmem_objnode **)
			((*slot)->slots +
			 ((index >> shift) & OBJNODE_TREE_MAP_MASK));
		shift -= OBJNODE_TREE_MAP_SHIFT;
		height--;
	}
out:
	return slot != NULL ? (void **)slot : NULL;
}

static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
{
	struct tmem_objnode **slot;

	slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
	return slot != NULL ? *slot : NULL;
}
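/*
 * Worked example (not in the original source): again assuming
 * OBJNODE_TREE_MAP_SHIFT == 6 (so OBJNODE_TREE_MAP_MASK == 0x3f), a
 * lookup of index 0x89 in a height-2 tree walks:
 *
 *	height 2: slot = root->slots[(0x89 >> 6) & 0x3f]  == slots[2]
 *	height 1: slot = node->slots[(0x89 >> 0) & 0x3f]  == slots[9]
 *
 * i.e. the index is consumed OBJNODE_TREE_MAP_SHIFT bits at a time,
 * most-significant bits first, in the manner of lib/radix-tree.c.
 */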
#ifdef CONFIG_RAMSTER
static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
					void *new_pampd, bool no_free)
{
	struct tmem_objnode **slot;
	void *ret = NULL;

	slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
	if ((slot != NULL) && (*slot != NULL)) {
		void *old_pampd = *(void **)slot;
		*(void **)slot = new_pampd;
		if (!no_free)
			(*tmem_pamops.free)(old_pampd, obj->pool,
						NULL, 0, false);
		ret = new_pampd;
	}
	return ret;
}
#endif

static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index,
				 void *pampd)
{
	int ret = 0;
	struct tmem_objnode *objnode = NULL, *newnode, *slot;
	unsigned int height, shift;
	int offset = 0;

	/* if necessary, extend the tree to be higher */
	if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) {
		height = obj->objnode_tree_height + 1;
		while (index > tmem_objnode_tree_h2max[height])
			height++;
		if (obj->objnode_tree_root == NULL) {
			obj->objnode_tree_height = height;
			goto insert;
		}
		do {
			newnode = tmem_objnode_alloc(obj);
			if (!newnode) {
				ret = -ENOMEM;
				goto out;
			}
			newnode->slots[0] = obj->objnode_tree_root;
			newnode->slots_in_use = 1;
			obj->objnode_tree_root = newnode;
			obj->objnode_tree_height++;
		} while (height > obj->objnode_tree_height);
	}
insert:
	slot = obj->objnode_tree_root;
	height = obj->objnode_tree_height;
	shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
	while (height > 0) {
		if (slot == NULL) {
			/* add a child objnode */
			slot = tmem_objnode_alloc(obj);
			if (!slot) {
				ret = -ENOMEM;
				goto out;
			}
			if (objnode) {
				objnode->slots[offset] = slot;
				objnode->slots_in_use++;
			} else
				obj->objnode_tree_root = slot;
		}
		/* go down a level */
		offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
		objnode = slot;
		slot = objnode->slots[offset];
		shift -= OBJNODE_TREE_MAP_SHIFT;
		height--;
	}
	BUG_ON(slot != NULL);
	if (objnode) {
		objnode->slots_in_use++;
		objnode->slots[offset] = pampd;
	} else
		obj->objnode_tree_root = pampd;
	obj->pampd_count++;
out:
	return ret;
}

static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index)
{
	struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1];
	struct tmem_objnode_tree_path *pathp = path;
	struct tmem_objnode *slot = NULL;
	unsigned int height, shift;
	int offset;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);
	height = obj->objnode_tree_height;
	if (index > tmem_objnode_tree_h2max[height])
		goto out;
	slot = obj->objnode_tree_root;
	if (height == 0 && obj->objnode_tree_root) {
		obj->objnode_tree_root = NULL;
		goto out;
	}
	shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT;
	pathp->objnode = NULL;
	do {
		if (slot == NULL)
			goto out;
		pathp++;
		offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
		pathp->offset = offset;
		pathp->objnode = slot;
		slot = slot->slots[offset];
		shift -= OBJNODE_TREE_MAP_SHIFT;
		height--;
	} while (height > 0);
	if (slot == NULL)
		goto out;
	while (pathp->objnode) {
		pathp->objnode->slots[pathp->offset] = NULL;
		pathp->objnode->slots_in_use--;
		if (pathp->objnode->slots_in_use) {
			if (pathp->objnode == obj->objnode_tree_root) {
				while (obj->objnode_tree_height > 0 &&
					obj->objnode_tree_root->slots_in_use == 1 &&
					obj->objnode_tree_root->slots[0]) {
					struct tmem_objnode *to_free =
						obj->objnode_tree_root;

					obj->objnode_tree_root =
						to_free->slots[0];
					obj->objnode_tree_height--;
					to_free->slots[0] = NULL;
					to_free->slots_in_use = 0;
					tmem_objnode_free(to_free);
				}
			}
			goto out;
		}
		tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */
		pathp--;
	}
	obj->objnode_tree_height = 0;
	obj->objnode_tree_root = NULL;

out:
	if (slot != NULL)
		obj->pampd_count--;
	BUG_ON(obj->pampd_count < 0);
	return slot;
}
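/*
 * Illustrative note (not in the original source): the tree grows lazily.
 * Adding index 0 to an empty obj stores the pampd directly in
 * objnode_tree_root at height 0.  A later add whose index exceeds
 * h2max[height] (e.g. index 64 into a height-1 tree, since h2max[1] is
 * 63 for a 6-bit map shift) pushes the existing root down into slot 0
 * of a newly allocated objnode, repeating until h2max[height] covers
 * the new index; if the root is still NULL, only the height is bumped.
 */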
/* Recursively walk the objnode_tree destroying pampds and objnodes. */
static void tmem_objnode_node_destroy(struct tmem_obj *obj,
				      struct tmem_objnode *objnode,
				      unsigned int ht)
{
	int i;

	if (ht == 0)
		return;
	for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) {
		if (objnode->slots[i]) {
			if (ht == 1) {
				obj->pampd_count--;
				(*tmem_pamops.free)(objnode->slots[i],
						obj->pool, NULL, 0, true);
				objnode->slots[i] = NULL;
				continue;
			}
			tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1);
			tmem_objnode_free(objnode->slots[i]);
			objnode->slots[i] = NULL;
		}
	}
}

static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj,
					  bool pool_destroy)
{
	if (obj->objnode_tree_root == NULL)
		return;
	if (obj->objnode_tree_height == 0) {
		obj->pampd_count--;
		(*tmem_pamops.free)(obj->objnode_tree_root,
				    obj->pool, NULL, 0, true);
	} else {
		tmem_objnode_node_destroy(obj, obj->objnode_tree_root,
					  obj->objnode_tree_height);
		tmem_objnode_free(obj->objnode_tree_root);
		obj->objnode_tree_height = 0;
	}
	obj->objnode_tree_root = NULL;
#ifdef CONFIG_RAMSTER
	if (tmem_pamops.free_obj != NULL)
		(*tmem_pamops.free_obj)(obj->pool, obj, pool_destroy);
#endif
}

/*
 * Tmem is operated on by a set of well-defined actions:
 * "put", "get", "flush", "flush_object", "new pool" and "destroy pool".
 * (The tmem ABI allows for subpages and exchanges but these operations
 * are not included in this implementation.)
 *
 * These "tmem core" operations are implemented in the following functions.
 */
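/*
 * Illustrative summary (not in the original source): the mapping from
 * tmem actions to the functions below is:
 *
 *	"put"          -> tmem_put()
 *	"get"          -> tmem_get()
 *	"flush"        -> tmem_flush_page()
 *	"flush_object" -> tmem_flush_object()
 *	"new pool"     -> tmem_new_pool()
 *	"destroy pool" -> tmem_destroy_pool()
 */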
/*
 * "Put" a page, i.e. associate the passed pampd with the passed handle.
 * Tmem_put is complicated by a corner case: What if a page with matching
 * handle already exists in tmem?  To guarantee coherency, one of two
 * actions is necessary: Either the data for the page must be overwritten,
 * or the page must be "flushed" so that the data is not accessible to a
 * subsequent "get".  Since these "duplicate puts" are relatively rare,
 * this implementation always flushes for simplicity.
 */
int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
		bool raw, void *pampd_to_use)
{
	struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
	void *pampd = NULL, *pampd_del = NULL;
	int ret = -ENOMEM;
	struct tmem_hashbucket *hb;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = objfound = tmem_obj_find(hb, oidp);
	if (obj != NULL) {
		pampd = tmem_pampd_lookup_in_obj(objfound, index);
		if (pampd != NULL) {
			/* if found, is a dup put, flush the old one */
			pampd_del = tmem_pampd_delete_from_obj(obj, index);
			BUG_ON(pampd_del != pampd);
			(*tmem_pamops.free)(pampd, pool, oidp, index, true);
			if (obj->pampd_count == 0) {
				objnew = obj;
				objfound = NULL;
			}
			pampd = NULL;
		}
	} else {
		obj = objnew = (*tmem_hostops.obj_alloc)(pool);
		if (unlikely(obj == NULL)) {
			ret = -ENOMEM;
			goto out;
		}
		tmem_obj_init(obj, hb, pool, oidp);
	}
	BUG_ON(obj == NULL);
	BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound));
	pampd = pampd_to_use;
	BUG_ON(pampd_to_use == NULL);
	ret = tmem_pampd_add_to_obj(obj, index, pampd);
	if (unlikely(ret == -ENOMEM))
		/* may have partially built objnode tree ("stump") */
		goto delete_and_free;
	(*tmem_pamops.create_finish)(pampd, is_ephemeral(pool));
	goto out;

delete_and_free:
	(void)tmem_pampd_delete_from_obj(obj, index);
	if (pampd)
		(*tmem_pamops.free)(pampd, pool, NULL, 0, true);
	if (objnew) {
		tmem_obj_free(objnew, hb);
		(*tmem_hostops.obj_free)(objnew, pool);
	}
out:
	spin_unlock(&hb->lock);
	return ret;
}
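/*
 * Illustrative sketch (not part of the original source): a host such as
 * zcache builds the pampd first (via its own pamops-backed compression
 * path) and only then calls into the core; my_create_pampd() here is a
 * hypothetical helper standing in for that step:
 *
 *	pampd = my_create_pampd(page, pool, oidp, index);
 *	if (pampd != NULL)
 *		ret = tmem_put(pool, oidp, index, false, pampd);
 *
 * Note the delete_and_free path above: if growing the objnode tree
 * fails, tmem_put frees the passed pampd itself via tmem_pamops.free.
 */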
#ifdef CONFIG_RAMSTER
/*
 * For ramster only: The following routines provide a two-step sequence
 * to allow the caller to replace a pampd in the tmem data structures with
 * another pampd.  Here, we lookup the passed handle and, if found, return the
 * associated pampd and object, leaving the hashbucket locked and returning
 * a reference to it.  The caller is expected to immediately call the
 * matching tmem_localify_finish routine, which handles the replacement
 * and unlocks the hashbucket.
 */
void *tmem_localify_get_pampd(struct tmem_pool *pool, struct tmem_oid *oidp,
				uint32_t index, struct tmem_obj **ret_obj,
				void **saved_hb)
{
	struct tmem_hashbucket *hb;
	struct tmem_obj *obj = NULL;
	void *pampd = NULL;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (likely(obj != NULL))
		pampd = tmem_pampd_lookup_in_obj(obj, index);
	*ret_obj = obj;
	*saved_hb = (void *)hb;
	/* note, hashbucket remains locked */
	return pampd;
}

void tmem_localify_finish(struct tmem_obj *obj, uint32_t index,
			  void *pampd, void *saved_hb, bool delete)
{
	struct tmem_hashbucket *hb = (struct tmem_hashbucket *)saved_hb;

	BUG_ON(!spin_is_locked(&hb->lock));
	if (pampd != NULL) {
		BUG_ON(obj == NULL);
		(void)tmem_pampd_replace_in_obj(obj, index, pampd, 1);
		(*tmem_pamops.create_finish)(pampd, is_ephemeral(obj->pool));
	} else if (delete) {
		BUG_ON(obj == NULL);
		(void)tmem_pampd_delete_from_obj(obj, index);
	}
	spin_unlock(&hb->lock);
}

/*
 * For ramster only.  Helper function to support asynchronous tmem_get.
 */
static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb,
			   struct tmem_pool *pool, struct tmem_oid *oidp,
			   uint32_t index, bool free, char *data)
{
	void *old_pampd = *ppampd, *new_pampd = NULL;
	bool intransit = false;
	int ret = 0;

	if (!is_ephemeral(pool))
		new_pampd = (*tmem_pamops.repatriate_preload)(
				old_pampd, pool, oidp, index, &intransit);
	if (intransit)
		ret = -EAGAIN;
	else if (new_pampd != NULL)
		*ppampd = new_pampd;
	/* must release the hb->lock else repatriate can't sleep */
	spin_unlock(&hb->lock);
	if (!intransit)
		ret = (*tmem_pamops.repatriate)(old_pampd, new_pampd, pool,
						oidp, index, free, data);
	if (ret == -EAGAIN) {
		/* rare I think, but should cond_resched()??? */
		usleep_range(10, 1000);
	} else if (ret == -ENOTCONN || ret == -EHOSTDOWN) {
		ret = -1;
	} else if (ret != 0 && ret != -ENOENT) {
		ret = -1;
	}
	/* note hb->lock has now been unlocked */
	return ret;
}

/*
 * For ramster only.  If a page in tmem matches the handle, replace the
 * page so that any subsequent "get" gets the new page.  Returns 0 if
 * there was a page to replace, else returns -1.
 */
int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
			uint32_t index, void *new_pampd)
{
	struct tmem_obj *obj;
	int ret = -1;
	struct tmem_hashbucket *hb;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd, 0);
	/* if we bug here, pamops wasn't properly set up for ramster */
	BUG_ON(tmem_pamops.replace_in_obj == NULL);
	ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);
out:
	spin_unlock(&hb->lock);
	return ret;
}
#endif
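/*
 * Illustrative note (not in the original source): these ramster hooks
 * cooperate with tmem_get() below -- when pamops.is_remote reports that
 * a looked-up pampd lives on a remote node, tmem_get() calls
 * tmem_repatriate() to pull the data back (dropping hb->lock so the
 * transfer may sleep) and retries while it returns -EAGAIN.
 */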
/*
 * "Get" a page, i.e. if a pampd can be found matching the passed handle,
 * use a pamops callback to recreate the page from the pampd with the
 * matching handle.  By tmem definition, when a "get" is successful on
 * an ephemeral page, the page is "flushed", and when a "get" is successful
 * on a persistent page, the page is retained in tmem.  Note that to preserve
 * coherency, "get" can never be skipped if tmem contains the data.
 * That is, if a get is done with a certain handle and fails, any
 * subsequent "get" must also fail (unless of course there is a
 * "put" done with the same handle).
 */
int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
		char *data, size_t *sizep, bool raw, int get_and_free)
{
	struct tmem_obj *obj;
	void *pampd = NULL;
	bool ephemeral = is_ephemeral(pool);
	int ret = -1;
	struct tmem_hashbucket *hb;
	bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral);
	bool lock_held = false;
	void **ppampd;

	do {
		hb = &pool->hashbucket[tmem_oid_hash(oidp)];
		spin_lock(&hb->lock);
		lock_held = true;
		obj = tmem_obj_find(hb, oidp);
		if (obj == NULL)
			goto out;
		ppampd = __tmem_pampd_lookup_in_obj(obj, index);
		if (ppampd == NULL)
			goto out;
#ifdef CONFIG_RAMSTER
		if ((tmem_pamops.is_remote != NULL) &&
		    tmem_pamops.is_remote(*ppampd)) {
			ret = tmem_repatriate(ppampd, hb, pool, oidp,
						index, free, data);
			/* tmem_repatriate releases hb->lock */
			lock_held = false;
			*sizep = PAGE_SIZE;
			if (ret != -EAGAIN)
				goto out;
		}
#endif
	} while (ret == -EAGAIN);
	if (free)
		pampd = tmem_pampd_delete_from_obj(obj, index);
	else
		pampd = tmem_pampd_lookup_in_obj(obj, index);
	if (pampd == NULL)
		goto out;
	if (free) {
		if (obj->pampd_count == 0) {
			tmem_obj_free(obj, hb);
			(*tmem_hostops.obj_free)(obj, pool);
			obj = NULL;
		}
	}
	if (free)
		ret = (*tmem_pamops.get_data_and_free)(
				data, sizep, raw, pampd, pool, oidp, index);
	else
		ret = (*tmem_pamops.get_data)(
				data, sizep, raw, pampd, pool, oidp, index);
	if (ret < 0)
		goto out;
	ret = 0;
out:
	if (lock_held)
		spin_unlock(&hb->lock);
	return ret;
}

/*
 * If a page in tmem matches the handle, "flush" this page from tmem such
 * that any subsequent "get" does not succeed (unless, of course, there
 * was another "put" with the same handle).
 */
int tmem_flush_page(struct tmem_pool *pool,
		struct tmem_oid *oidp, uint32_t index)
{
	struct tmem_obj *obj;
	void *pampd;
	int ret = -1;
	struct tmem_hashbucket *hb;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	pampd = tmem_pampd_delete_from_obj(obj, index);
	if (pampd == NULL)
		goto out;
	(*tmem_pamops.free)(pampd, pool, oidp, index, true);
	if (obj->pampd_count == 0) {
		tmem_obj_free(obj, hb);
		(*tmem_hostops.obj_free)(obj, pool);
	}
	ret = 0;

out:
	spin_unlock(&hb->lock);
	return ret;
}

/*
 * "Flush" all pages in tmem matching this oid.
 */
int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp)
{
	struct tmem_obj *obj;
	struct tmem_hashbucket *hb;
	int ret = -1;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	tmem_pampd_destroy_all_in_obj(obj, false);
	tmem_obj_free(obj, hb);
	(*tmem_hostops.obj_free)(obj, pool);
	ret = 0;

out:
	spin_unlock(&hb->lock);
	return ret;
}
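/*
 * Illustrative note (not in the original source): the get_and_free
 * argument of tmem_get() above encodes the "exclusive get" policy, per
 * the expression computing the local variable free:
 *
 *	get_and_free == 0: free only if the pool is ephemeral
 *	get_and_free == 1: always free (exclusive get)
 *	any other value:   never free
 *
 * so a successful get on an ephemeral pool is destructive by default,
 * flushing the page as a side effect, exactly as the coherency comment
 * above tmem_get() requires.
 */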
/*
 * "Flush" all pages (and tmem_objs) from this tmem_pool and disable
 * all subsequent access to this tmem_pool.
 */
int tmem_destroy_pool(struct tmem_pool *pool)
{
	int ret = -1;

	if (pool == NULL)
		goto out;
	tmem_pool_flush(pool, 1);
	ret = 0;
out:
	return ret;
}

static LIST_HEAD(tmem_global_pool_list);

/*
 * Create a new tmem_pool with the provided flags; the pool id is
 * assigned by the tmem host implementation.
 */
void tmem_new_pool(struct tmem_pool *pool, uint32_t flags)
{
	int persistent = flags & TMEM_POOL_PERSIST;
	int shared = flags & TMEM_POOL_SHARED;
	struct tmem_hashbucket *hb = &pool->hashbucket[0];
	int i;

	for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
		hb->obj_rb_root = RB_ROOT;
		spin_lock_init(&hb->lock);
	}
	INIT_LIST_HEAD(&pool->pool_list);
	atomic_set(&pool->obj_count, 0);
	SET_SENTINEL(pool, POOL);
	list_add_tail(&pool->pool_list, &tmem_global_pool_list);
	pool->persistent = persistent;
	pool->shared = shared;
}
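/*
 * Illustrative sketch (not part of the original source): the host
 * allocates the pool structure itself and lets the core initialize it;
 * my_alloc_pool() is a hypothetical host allocator, while
 * TMEM_POOL_PERSIST and TMEM_POOL_SHARED are the flag bits used above:
 *
 *	struct tmem_pool *pool = my_alloc_pool();
 *
 *	tmem_new_pool(pool, TMEM_POOL_PERSIST);
 *	...
 *	tmem_destroy_pool(pool);
 */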