Linux kernel mirror (for testing): git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git
at v3.5-rc6 · 851 lines · 24 kB
/*
 * In-kernel transcendent memory (generic implementation)
 *
 * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
 *
 * The primary purpose of Transcendent Memory ("tmem") is to map object-oriented
 * "handles" (triples containing a pool id, an object id, and an index), to
 * pages in a page-accessible memory (PAM).  Tmem references the PAM pages via
 * an abstract "pampd" (PAM page-descriptor), which can be operated on by a
 * set of functions (pamops).  Each pampd contains some representation of
 * PAGE_SIZE bytes worth of data.  Tmem must support potentially millions of
 * pages and must be able to insert, find, and delete these pages at a
 * potential frequency of thousands per second concurrently across many CPUs,
 * (and, if used with KVM, across many vcpus across many guests).
 * Tmem is tracked with a hierarchy of data structures, organized by
 * the elements in a handle-tuple: pool_id, object_id, and page index.
 * One or more "clients" (e.g. guests) each provide one or more tmem_pools.
 * Each pool contains a hash table of rb_trees of tmem_objs.  Each
 * tmem_obj contains a radix-tree-like tree of pointers, with intermediate
 * nodes called tmem_objnodes.  Each leaf pointer in this tree points to
 * a pampd, which is accessible only through a small set of callbacks
 * registered by the PAM implementation (see tmem_register_pamops).  Tmem
 * does all memory allocation via a set of callbacks registered by the tmem
 * host implementation (e.g. see tmem_register_hostops).
 */

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/delay.h>

#include "tmem.h"

/* data structure sentinels used for debugging... see tmem.h */
#define POOL_SENTINEL 0x87658765
#define OBJ_SENTINEL 0x12345678
#define OBJNODE_SENTINEL 0xfedcba09

/*
 * A tmem host implementation must use this function to register callbacks
 * for memory allocation.
 */
static struct tmem_hostops tmem_hostops;

static void tmem_objnode_tree_init(void);

void tmem_register_hostops(struct tmem_hostops *m)
{
	tmem_objnode_tree_init();
	tmem_hostops = *m;
}

/*
 * A tmem host implementation must use this function to register
 * callbacks for a page-accessible memory (PAM) implementation.
 */
static struct tmem_pamops tmem_pamops;

void tmem_register_pamops(struct tmem_pamops *m)
{
	tmem_pamops = *m;
}
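/*
 * Illustrative only (not part of the original file): a minimal sketch of
 * how a hypothetical host backend might wire itself up at init time,
 * showing the registration API described above.  The hostops field names
 * mirror the callbacks actually invoked later in this file
 * (obj_alloc/obj_free/objnode_alloc/objnode_free); all "my_" symbols are
 * assumptions for the example.
 */
#if 0	/* example sketch, not compiled */
static struct tmem_hostops my_hostops = {
	.obj_alloc = my_obj_alloc,		/* allocate a tmem_obj */
	.obj_free = my_obj_free,		/* release a tmem_obj */
	.objnode_alloc = my_objnode_alloc,	/* allocate a tmem_objnode */
	.objnode_free = my_objnode_free,	/* release a tmem_objnode */
};

static int __init my_tmem_backend_init(void)
{
	tmem_register_hostops(&my_hostops);	/* must precede any tmem op */
	tmem_register_pamops(&my_pamops);	/* PAM page-descriptor ops */
	return 0;
}
#endif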
/*
 * Oid's are potentially very sparse and tmem_objs may have an indeterminately
 * short life, being added and deleted at a relatively high frequency.
 * So an rb_tree is an ideal data structure to manage tmem_objs.  But because
 * of the potentially huge number of tmem_objs, each pool manages a hashtable
 * of rb_trees to reduce search, insert, delete, and rebalancing time.
 * Each hashbucket also has a lock to manage concurrent access.
 *
 * The following routines manage tmem_objs.  When any tmem_obj is accessed,
 * the hashbucket lock must be held.
 */

/* searches for object==oid in pool, returns locked object if found */
static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb,
					struct tmem_oid *oidp)
{
	struct rb_node *rbnode;
	struct tmem_obj *obj;

	rbnode = hb->obj_rb_root.rb_node;
	while (rbnode) {
		BUG_ON(RB_EMPTY_NODE(rbnode));
		obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
		switch (tmem_oid_compare(oidp, &obj->oid)) {
		case 0: /* equal */
			goto out;
		case -1:
			rbnode = rbnode->rb_left;
			break;
		case 1:
			rbnode = rbnode->rb_right;
			break;
		}
	}
	obj = NULL;
out:
	return obj;
}

static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *);

/* free an object that has no more pampds in it */
static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
{
	struct tmem_pool *pool;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pampd_count > 0);
	pool = obj->pool;
	BUG_ON(pool == NULL);
	if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */
		tmem_pampd_destroy_all_in_obj(obj);
	BUG_ON(obj->objnode_tree_root != NULL);
	BUG_ON((long)obj->objnode_count != 0);
	atomic_dec(&pool->obj_count);
	BUG_ON(atomic_read(&pool->obj_count) < 0);
	INVERT_SENTINEL(obj, OBJ);
	obj->pool = NULL;
	tmem_oid_set_invalid(&obj->oid);
	rb_erase(&obj->rb_tree_node, &hb->obj_rb_root);
}

/*
 * initialize, and insert a tmem_object_root (called only if find failed)
 */
static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
					struct tmem_pool *pool,
					struct tmem_oid *oidp)
{
	struct rb_root *root = &hb->obj_rb_root;
	struct rb_node **new = &(root->rb_node), *parent = NULL;
	struct tmem_obj *this;

	BUG_ON(pool == NULL);
	atomic_inc(&pool->obj_count);
	obj->objnode_tree_height = 0;
	obj->objnode_tree_root = NULL;
	obj->pool = pool;
	obj->oid = *oidp;
	obj->objnode_count = 0;
	obj->pampd_count = 0;
	(*tmem_pamops.new_obj)(obj);
	SET_SENTINEL(obj, OBJ);
	while (*new) {
		BUG_ON(RB_EMPTY_NODE(*new));
		this = rb_entry(*new, struct tmem_obj, rb_tree_node);
		parent = *new;
		switch (tmem_oid_compare(oidp, &this->oid)) {
		case 0:
			BUG(); /* already present; should never happen! */
			break;
		case -1:
			new = &(*new)->rb_left;
			break;
		case 1:
			new = &(*new)->rb_right;
			break;
		}
	}
	rb_link_node(&obj->rb_tree_node, parent, new);
	rb_insert_color(&obj->rb_tree_node, root);
}
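/*
 * Illustrative only (not part of the original file): a minimal sketch of
 * the hashbucket locking rule stated above -- hash the oid to pick a
 * bucket, take that bucket's lock, and only then touch the rb_tree.
 * "example_oid_present" is a hypothetical name for the sketch.
 */
#if 0	/* example sketch, not compiled */
static bool example_oid_present(struct tmem_pool *pool, struct tmem_oid *oidp)
{
	struct tmem_hashbucket *hb;
	bool found;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);		/* required before tmem_obj_find */
	found = tmem_obj_find(hb, oidp) != NULL;
	spin_unlock(&hb->lock);		/* obj pointers are stale after this */
	return found;
}
#endif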
/*
 * Tmem is managed as a set of tmem_pools with certain attributes, such as
 * "ephemeral" vs "persistent".  These attributes apply to all tmem_objs
 * and all pampds that belong to a tmem_pool.  A tmem_pool is created
 * or deleted relatively rarely (for example, when a filesystem is
 * mounted or unmounted).
 */

/* flush all data from a pool and, optionally, free it */
static void tmem_pool_flush(struct tmem_pool *pool, bool destroy)
{
	struct rb_node *rbnode;
	struct tmem_obj *obj;
	struct tmem_hashbucket *hb = &pool->hashbucket[0];
	int i;

	BUG_ON(pool == NULL);
	for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
		spin_lock(&hb->lock);
		rbnode = rb_first(&hb->obj_rb_root);
		while (rbnode != NULL) {
			obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
			rbnode = rb_next(rbnode);
			tmem_pampd_destroy_all_in_obj(obj);
			tmem_obj_free(obj, hb);
			(*tmem_hostops.obj_free)(obj, pool);
		}
		spin_unlock(&hb->lock);
	}
	if (destroy)
		list_del(&pool->pool_list);
}

/*
 * A tmem_obj contains a radix-tree-like tree in which the intermediate
 * nodes are called tmem_objnodes.  (The kernel lib/radix-tree.c implementation
 * is very specialized and tuned for specific uses and is not particularly
 * suited for use from this code, though some code from the core algorithms has
 * been reused, thus the copyright notices below).  Each tmem_objnode contains
 * a set of pointers which point to either a set of intermediate tmem_objnodes
 * or a set of pampds.
 *
 * Portions Copyright (C) 2001 Momchil Velikov
 * Portions Copyright (C) 2001 Christoph Hellwig
 * Portions Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
 */

struct tmem_objnode_tree_path {
	struct tmem_objnode *objnode;
	int offset;
};

/* objnode height_to_maxindex translation */
static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1];

static void tmem_objnode_tree_init(void)
{
	unsigned int ht, tmp;

	for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) {
		tmp = ht * OBJNODE_TREE_MAP_SHIFT;
		if (tmp >= OBJNODE_TREE_INDEX_BITS)
			tmem_objnode_tree_h2max[ht] = ~0UL;
		else
			tmem_objnode_tree_h2max[ht] =
			    (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1;
	}
}
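/*
 * Illustrative only (not part of the original file): a worked example of
 * the h2max table built by the loop above.  Assuming, for the sake of the
 * example, OBJNODE_TREE_MAP_SHIFT == 6 and OBJNODE_TREE_INDEX_BITS == 64
 * (the real values live in tmem.h), the loop yields:
 *
 *	ht = 0: tmp =  0, h2max[0] = (~0UL >> 63) >> 1 = 0
 *	ht = 1: tmp =  6, h2max[1] = (~0UL >> 57) >> 1 = 63
 *	ht = 2: tmp = 12, h2max[2] = (~0UL >> 51) >> 1 = 4095
 *	ht = 3: tmp = 18, h2max[3] = (~0UL >> 45) >> 1 = 262143
 *
 * i.e. a tree of height h can address indices 0 .. 64^h - 1, which is
 * exactly the test used below to decide when the tree must grow taller.
 */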
static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj)
{
	struct tmem_objnode *objnode;

	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);
	objnode = (*tmem_hostops.objnode_alloc)(obj->pool);
	if (unlikely(objnode == NULL))
		goto out;
	objnode->obj = obj;
	SET_SENTINEL(objnode, OBJNODE);
	memset(&objnode->slots, 0, sizeof(objnode->slots));
	objnode->slots_in_use = 0;
	obj->objnode_count++;
out:
	return objnode;
}

static void tmem_objnode_free(struct tmem_objnode *objnode)
{
	struct tmem_pool *pool;
	int i;

	BUG_ON(objnode == NULL);
	for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++)
		BUG_ON(objnode->slots[i] != NULL);
	ASSERT_SENTINEL(objnode, OBJNODE);
	INVERT_SENTINEL(objnode, OBJNODE);
	BUG_ON(objnode->obj == NULL);
	ASSERT_SENTINEL(objnode->obj, OBJ);
	pool = objnode->obj->pool;
	BUG_ON(pool == NULL);
	ASSERT_SENTINEL(pool, POOL);
	objnode->obj->objnode_count--;
	objnode->obj = NULL;
	(*tmem_hostops.objnode_free)(objnode, pool);
}

/*
 * lookup index in object and return associated pampd (or NULL if not found)
 */
static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
{
	unsigned int height, shift;
	struct tmem_objnode **slot = NULL;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);

	height = obj->objnode_tree_height;
	if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height])
		goto out;
	if (height == 0 && obj->objnode_tree_root) {
		slot = &obj->objnode_tree_root;
		goto out;
	}
	shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
	slot = &obj->objnode_tree_root;
	while (height > 0) {
		if (*slot == NULL)
			goto out;
		slot = (struct tmem_objnode **)
			((*slot)->slots +
			 ((index >> shift) & OBJNODE_TREE_MAP_MASK));
		shift -= OBJNODE_TREE_MAP_SHIFT;
		height--;
	}
out:
	return slot != NULL ? (void **)slot : NULL;
}

static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
{
	struct tmem_objnode **slot;

	slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
	return slot != NULL ? *slot : NULL;
}

static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
					void *new_pampd, bool no_free)
{
	struct tmem_objnode **slot;
	void *ret = NULL;

	slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
	if ((slot != NULL) && (*slot != NULL)) {
		void *old_pampd = *(void **)slot;
		*(void **)slot = new_pampd;
		if (!no_free)
			(*tmem_pamops.free)(old_pampd, obj->pool,
						NULL, 0, false);
		ret = new_pampd;
	}
	return ret;
}
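/*
 * Illustrative only (not part of the original file): how the walk in
 * __tmem_pampd_lookup_in_obj above slices an index into per-level slot
 * offsets.  Again assuming a map shift of 6 (so OBJNODE_TREE_MAP_MASK
 * would be 0x3f), looking up index 4181 in a height-2 tree proceeds as:
 *
 *	level 1 (shift = 6): (4181 >> 6) & 0x3f = 65 & 0x3f = 1
 *	level 2 (shift = 0): (4181 >> 0) & 0x3f = 21
 *
 * so the pampd lives at root->slots[1]->slots[21].
 */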
static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index,
					void *pampd)
{
	int ret = 0;
	struct tmem_objnode *objnode = NULL, *newnode, *slot;
	unsigned int height, shift;
	int offset = 0;

	/* if necessary, extend the tree to be higher */
	if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) {
		height = obj->objnode_tree_height + 1;
		if (index > tmem_objnode_tree_h2max[height])
			while (index > tmem_objnode_tree_h2max[height])
				height++;
		if (obj->objnode_tree_root == NULL) {
			obj->objnode_tree_height = height;
			goto insert;
		}
		do {
			newnode = tmem_objnode_alloc(obj);
			if (!newnode) {
				ret = -ENOMEM;
				goto out;
			}
			newnode->slots[0] = obj->objnode_tree_root;
			newnode->slots_in_use = 1;
			obj->objnode_tree_root = newnode;
			obj->objnode_tree_height++;
		} while (height > obj->objnode_tree_height);
	}
insert:
	slot = obj->objnode_tree_root;
	height = obj->objnode_tree_height;
	shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
	while (height > 0) {
		if (slot == NULL) {
			/* add a child objnode */
			slot = tmem_objnode_alloc(obj);
			if (!slot) {
				ret = -ENOMEM;
				goto out;
			}
			if (objnode) {
				objnode->slots[offset] = slot;
				objnode->slots_in_use++;
			} else
				obj->objnode_tree_root = slot;
		}
		/* go down a level */
		offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
		objnode = slot;
		slot = objnode->slots[offset];
		shift -= OBJNODE_TREE_MAP_SHIFT;
		height--;
	}
	BUG_ON(slot != NULL);
	if (objnode) {
		objnode->slots_in_use++;
		objnode->slots[offset] = pampd;
	} else
		obj->objnode_tree_root = pampd;
	obj->pampd_count++;
out:
	return ret;
}

static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index)
{
	struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1];
	struct tmem_objnode_tree_path *pathp = path;
	struct tmem_objnode *slot = NULL;
	unsigned int height, shift;
	int offset;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);
	height = obj->objnode_tree_height;
	if (index > tmem_objnode_tree_h2max[height])
		goto out;
	slot = obj->objnode_tree_root;
	if (height == 0 && obj->objnode_tree_root) {
		obj->objnode_tree_root = NULL;
		goto out;
	}
	shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT;
	pathp->objnode = NULL;
	do {
		if (slot == NULL)
			goto out;
		pathp++;
		offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
		pathp->offset = offset;
		pathp->objnode = slot;
		slot = slot->slots[offset];
		shift -= OBJNODE_TREE_MAP_SHIFT;
		height--;
	} while (height > 0);
	if (slot == NULL)
		goto out;
	while (pathp->objnode) {
		pathp->objnode->slots[pathp->offset] = NULL;
		pathp->objnode->slots_in_use--;
		if (pathp->objnode->slots_in_use) {
			if (pathp->objnode == obj->objnode_tree_root) {
				while (obj->objnode_tree_height > 0 &&
					obj->objnode_tree_root->slots_in_use == 1 &&
					obj->objnode_tree_root->slots[0]) {
					struct tmem_objnode *to_free =
						obj->objnode_tree_root;

					obj->objnode_tree_root =
							to_free->slots[0];
					obj->objnode_tree_height--;
					to_free->slots[0] = NULL;
					to_free->slots_in_use = 0;
					tmem_objnode_free(to_free);
				}
			}
			goto out;
		}
		tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */
		pathp--;
	}
	obj->objnode_tree_height = 0;
	obj->objnode_tree_root = NULL;

out:
	if (slot != NULL)
		obj->pampd_count--;
	BUG_ON(obj->pampd_count < 0);
	return slot;
}

/* recursively walk the objnode_tree destroying pampds and objnodes */
static void tmem_objnode_node_destroy(struct tmem_obj *obj,
					struct tmem_objnode *objnode,
					unsigned int ht)
{
	int i;

	if (ht == 0)
		return;
	for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) {
		if (objnode->slots[i]) {
			if (ht == 1) {
				obj->pampd_count--;
				(*tmem_pamops.free)(objnode->slots[i],
						obj->pool, NULL, 0, true);
				objnode->slots[i] = NULL;
				continue;
			}
			tmem_objnode_node_destroy(obj, objnode->slots[i], ht-1);
			tmem_objnode_free(objnode->slots[i]);
			objnode->slots[i] = NULL;
		}
	}
}

static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj)
{
	if (obj->objnode_tree_root == NULL)
		return;
	if (obj->objnode_tree_height == 0) {
		obj->pampd_count--;
		(*tmem_pamops.free)(obj->objnode_tree_root,
					obj->pool, NULL, 0, true);
	} else {
		tmem_objnode_node_destroy(obj, obj->objnode_tree_root,
					obj->objnode_tree_height);
		tmem_objnode_free(obj->objnode_tree_root);
		obj->objnode_tree_height = 0;
	}
	obj->objnode_tree_root = NULL;
	(*tmem_pamops.free_obj)(obj->pool, obj);
}
/*
 * Tmem is operated on by a set of well-defined actions:
 * "put", "get", "flush", "flush_object", "new pool" and "destroy pool".
 * (The tmem ABI allows for subpages and exchanges but these operations
 * are not included in this implementation.)
 *
 * These "tmem core" operations are implemented in the following functions.
 */

/*
 * "Put" a page, e.g. copy a page from the kernel into newly allocated
 * PAM space (if such space is available).  Tmem_put is complicated by
 * a corner case: What if a page with matching handle already exists in
 * tmem?  To guarantee coherency, one of two actions is necessary: Either
 * the data for the page must be overwritten, or the page must be
 * "flushed" so that the data is not accessible to a subsequent "get".
 * Since these "duplicate puts" are relatively rare, this implementation
 * always flushes for simplicity.
 */
int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
		char *data, size_t size, bool raw, int ephemeral)
{
	struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
	void *pampd = NULL, *pampd_del = NULL;
	int ret = -ENOMEM;
	struct tmem_hashbucket *hb;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = objfound = tmem_obj_find(hb, oidp);
	if (obj != NULL) {
		pampd = tmem_pampd_lookup_in_obj(objfound, index);
		if (pampd != NULL) {
			/* if found, is a dup put, flush the old one */
			pampd_del = tmem_pampd_delete_from_obj(obj, index);
			BUG_ON(pampd_del != pampd);
			(*tmem_pamops.free)(pampd, pool, oidp, index, true);
			if (obj->pampd_count == 0) {
				objnew = obj;
				objfound = NULL;
			}
			pampd = NULL;
		}
	} else {
		obj = objnew = (*tmem_hostops.obj_alloc)(pool);
		if (unlikely(obj == NULL)) {
			ret = -ENOMEM;
			goto out;
		}
		tmem_obj_init(obj, hb, pool, oidp);
	}
	BUG_ON(obj == NULL);
	BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound));
	pampd = (*tmem_pamops.create)(data, size, raw, ephemeral,
					obj->pool, &obj->oid, index);
	if (unlikely(pampd == NULL))
		goto free;
	ret = tmem_pampd_add_to_obj(obj, index, pampd);
	if (unlikely(ret == -ENOMEM))
		/* may have partially built objnode tree ("stump") */
		goto delete_and_free;
	goto out;

delete_and_free:
	(void)tmem_pampd_delete_from_obj(obj, index);
free:
	if (pampd)
		(*tmem_pamops.free)(pampd, pool, NULL, 0, true);
	if (objnew) {
		tmem_obj_free(objnew, hb);
		(*tmem_hostops.obj_free)(objnew, pool);
	}
out:
	spin_unlock(&hb->lock);
	return ret;
}
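/*
 * Illustrative only (not part of the original file): the duplicate-put
 * corner case described above.  Putting twice with the same handle stays
 * coherent because the first copy is flushed before the second is
 * created; "pool", "oid", "page_a" and "page_b" are assumed caller-side
 * names for the sketch.
 */
#if 0	/* example sketch, not compiled */
	/* first put stores page_a under handle (pool, oid, 0) */
	ret = tmem_put(pool, &oid, 0, page_a, PAGE_SIZE, false, true);
	/* second put with the same handle flushes page_a, stores page_b */
	ret = tmem_put(pool, &oid, 0, page_b, PAGE_SIZE, false, true);
	/* a subsequent get can only observe page_b (or fail), never page_a */
#endif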
void *tmem_localify_get_pampd(struct tmem_pool *pool, struct tmem_oid *oidp,
				uint32_t index, struct tmem_obj **ret_obj,
				void **saved_hb)
{
	struct tmem_hashbucket *hb;
	struct tmem_obj *obj = NULL;
	void *pampd = NULL;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (likely(obj != NULL))
		pampd = tmem_pampd_lookup_in_obj(obj, index);
	*ret_obj = obj;
	*saved_hb = (void *)hb;
	/* note, hashbucket remains locked */
	return pampd;
}

void tmem_localify_finish(struct tmem_obj *obj, uint32_t index,
			  void *pampd, void *saved_hb, bool delete)
{
	struct tmem_hashbucket *hb = (struct tmem_hashbucket *)saved_hb;

	BUG_ON(!spin_is_locked(&hb->lock));
	if (pampd != NULL) {
		BUG_ON(obj == NULL);
		(void)tmem_pampd_replace_in_obj(obj, index, pampd, 1);
	} else if (delete) {
		BUG_ON(obj == NULL);
		(void)tmem_pampd_delete_from_obj(obj, index);
	}
	spin_unlock(&hb->lock);
}

static int tmem_repatriate(void **ppampd, struct tmem_hashbucket *hb,
			   struct tmem_pool *pool, struct tmem_oid *oidp,
			   uint32_t index, bool free, char *data)
{
	void *old_pampd = *ppampd, *new_pampd = NULL;
	bool intransit = false;
	int ret = 0;

	if (!is_ephemeral(pool))
		new_pampd = (*tmem_pamops.repatriate_preload)(
				old_pampd, pool, oidp, index, &intransit);
	if (intransit)
		ret = -EAGAIN;
	else if (new_pampd != NULL)
		*ppampd = new_pampd;
	/* must release the hb->lock else repatriate can't sleep */
	spin_unlock(&hb->lock);
	if (!intransit)
		ret = (*tmem_pamops.repatriate)(old_pampd, new_pampd, pool,
						oidp, index, free, data);
	return ret;
}

/*
 * "Get" a page, e.g. if one can be found, copy the tmem page with the
 * matching handle from PAM space to the kernel.  By tmem definition,
 * when a "get" is successful on an ephemeral page, the page is "flushed",
 * and when a "get" is successful on a persistent page, the page is retained
 * in tmem.  Note that to preserve coherency, "get" can never be skipped
 * if tmem contains the data.  That is, if a get is done with a certain
 * handle and fails, any subsequent "get" must also fail (unless of course
 * there is a "put" done with the same handle).
 */
int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
		char *data, size_t *size, bool raw, int get_and_free)
{
	struct tmem_obj *obj;
	void *pampd;
	bool ephemeral = is_ephemeral(pool);
	int ret = -1;
	struct tmem_hashbucket *hb;
	bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral);
	bool lock_held = 0;
	void **ppampd;

again:
	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	lock_held = 1;
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	ppampd = __tmem_pampd_lookup_in_obj(obj, index);
	if (ppampd == NULL)
		goto out;
	if (tmem_pamops.is_remote(*ppampd)) {
		ret = tmem_repatriate(ppampd, hb, pool, oidp,
					index, free, data);
		lock_held = 0; /* note hb->lock has been unlocked */
		if (ret == -EAGAIN) {
			/* rare I think, but should cond_resched()??? */
			usleep_range(10, 1000);
			goto again;
		} else if (ret != 0) {
			if (ret != -ENOENT)
				pr_err("UNTESTED case in tmem_get, ret=%d\n",
					ret);
			ret = -1;
			goto out;
		}
		goto out;
	}
	if (free)
		pampd = tmem_pampd_delete_from_obj(obj, index);
	else
		pampd = tmem_pampd_lookup_in_obj(obj, index);
	if (pampd == NULL)
		goto out;
	if (free) {
		if (obj->pampd_count == 0) {
			tmem_obj_free(obj, hb);
			(*tmem_hostops.obj_free)(obj, pool);
			obj = NULL;
		}
	}
	if (free)
		ret = (*tmem_pamops.get_data_and_free)(
				data, size, raw, pampd, pool, oidp, index);
	else
		ret = (*tmem_pamops.get_data)(
				data, size, raw, pampd, pool, oidp, index);
	if (ret < 0)
		goto out;
	ret = 0;
out:
	if (lock_held)
		spin_unlock(&hb->lock);
	return ret;
}
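/*
 * Illustrative only (not part of the original file): the ephemeral
 * get-side semantics described above.  With get_and_free == 0 on an
 * ephemeral pool, a successful get also flushes the page, so the same
 * handle cannot be fetched twice; "pool", "oid", "page" and "buf" are
 * assumed caller-side names for the sketch.
 */
#if 0	/* example sketch, not compiled */
	size_t sz = PAGE_SIZE;

	if (tmem_put(pool, &oid, 0, page, PAGE_SIZE, false, true) == 0) {
		/* first get succeeds and implicitly flushes (ephemeral) */
		ret = tmem_get(pool, &oid, 0, buf, &sz, false, 0);
		/* second get with the same handle now fails (returns -1) */
		ret = tmem_get(pool, &oid, 0, buf, &sz, false, 0);
	}
#endif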
/*
 * If a page in tmem matches the handle, "flush" this page from tmem such
 * that any subsequent "get" does not succeed (unless, of course, there
 * was another "put" with the same handle).
 */
int tmem_flush_page(struct tmem_pool *pool,
			struct tmem_oid *oidp, uint32_t index)
{
	struct tmem_obj *obj;
	void *pampd;
	int ret = -1;
	struct tmem_hashbucket *hb;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	pampd = tmem_pampd_delete_from_obj(obj, index);
	if (pampd == NULL)
		goto out;
	(*tmem_pamops.free)(pampd, pool, oidp, index, true);
	if (obj->pampd_count == 0) {
		tmem_obj_free(obj, hb);
		(*tmem_hostops.obj_free)(obj, pool);
	}
	ret = 0;

out:
	spin_unlock(&hb->lock);
	return ret;
}

/*
 * If a page in tmem matches the handle, replace the page so that any
 * subsequent "get" gets the new page.  Returns the result of the pamops
 * replace_in_obj callback if a matching object was found, else -1.
 */
int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
			uint32_t index, void *new_pampd)
{
	struct tmem_obj *obj;
	int ret = -1;
	struct tmem_hashbucket *hb;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd, 0);
	ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);
out:
	spin_unlock(&hb->lock);
	return ret;
}

/*
 * "Flush" all pages in tmem matching this oid.
 */
int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp)
{
	struct tmem_obj *obj;
	struct tmem_hashbucket *hb;
	int ret = -1;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	tmem_pampd_destroy_all_in_obj(obj);
	tmem_obj_free(obj, hb);
	(*tmem_hostops.obj_free)(obj, pool);
	ret = 0;

out:
	spin_unlock(&hb->lock);
	return ret;
}

/*
 * "Flush" all pages (and tmem_objs) from this tmem_pool and disable
 * all subsequent access to this tmem_pool.
 */
int tmem_destroy_pool(struct tmem_pool *pool)
{
	int ret = -1;

	if (pool == NULL)
		goto out;
	tmem_pool_flush(pool, 1);
	ret = 0;
out:
	return ret;
}
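/*
 * Illustrative only (not part of the original file): the three flush
 * scopes defined above, side by side.  Each call invalidates strictly
 * more state than the one before it.
 */
#if 0	/* example sketch, not compiled */
	tmem_flush_page(pool, &oid, 0);	/* one (pool, oid, index) handle */
	tmem_flush_object(pool, &oid);	/* every index under this oid */
	tmem_destroy_pool(pool);	/* every object in the pool */
#endif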
static LIST_HEAD(tmem_global_pool_list);

/*
 * Initialize a new tmem_pool with the provided flags.  The pool id itself
 * is chosen and tracked by the tmem host implementation.
 */
void tmem_new_pool(struct tmem_pool *pool, uint32_t flags)
{
	int persistent = flags & TMEM_POOL_PERSIST;
	int shared = flags & TMEM_POOL_SHARED;
	struct tmem_hashbucket *hb = &pool->hashbucket[0];
	int i;

	for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
		hb->obj_rb_root = RB_ROOT;
		spin_lock_init(&hb->lock);
	}
	INIT_LIST_HEAD(&pool->pool_list);
	atomic_set(&pool->obj_count, 0);
	SET_SENTINEL(pool, POOL);
	list_add_tail(&pool->pool_list, &tmem_global_pool_list);
	pool->persistent = persistent;
	pool->shared = shared;
}
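/*
 * Illustrative only (not part of the original file): a hypothetical host
 * creating a persistent pool.  The host allocates the tmem_pool itself
 * (my_alloc_pool is an assumed host-side helper); tmem_new_pool only
 * initializes the hash buckets and the attribute bits.
 */
#if 0	/* example sketch, not compiled */
	struct tmem_pool *pool = my_alloc_pool();

	tmem_new_pool(pool, TMEM_POOL_PERSIST);	/* persistent, not shared */
#endif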