/*
 * In-kernel transcendent memory (generic implementation)
 *
 * Copyright (c) 2009-2011, Dan Magenheimer, Oracle Corp.
 *
 * The primary purpose of Transcendent Memory ("tmem") is to map
 * object-oriented "handles" (triples containing a pool id, an object id,
 * and an index) to pages in a page-accessible memory (PAM).  Tmem
 * references the PAM pages via an abstract "pampd" (PAM page-descriptor),
 * which can be operated on by a set of functions (pamops).  Each pampd
 * contains some representation of PAGE_SIZE bytes worth of data.  Tmem
 * must support potentially millions of pages and must be able to insert,
 * find, and delete these pages at a potential frequency of thousands per
 * second concurrently across many CPUs (and, if used with KVM, across
 * many vcpus across many guests).
 *
 * Tmem is tracked with a hierarchy of data structures, organized by
 * the elements in a handle-tuple: pool_id, object_id, and page index.
 * One or more "clients" (e.g. guests) each provide one or more tmem_pools.
 * Each pool contains a hash table of rb_trees of tmem_objs.  Each
 * tmem_obj contains a radix-tree-like tree of pointers, with intermediate
 * nodes called tmem_objnodes.  Each leaf pointer in this tree points to
 * a pampd, which is accessible only through a small set of callbacks
 * registered by the PAM implementation (see tmem_register_pamops).  Tmem
 * does all memory allocation via a set of callbacks registered by the tmem
 * host implementation (e.g. see tmem_register_hostops).
 */
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>

#include "tmem.h"
/* data structure sentinels used for debugging... see tmem.h */
#define POOL_SENTINEL 0x87658765
#define OBJ_SENTINEL 0x12345678
#define OBJNODE_SENTINEL 0xfedcba09
/*
 * A tmem host implementation must use this function to register callbacks
 * for memory allocation.
 */
static struct tmem_hostops tmem_hostops;

static void tmem_objnode_tree_init(void);

void tmem_register_hostops(struct tmem_hostops *m)
{
	tmem_objnode_tree_init();
	tmem_hostops = *m;
}
/*
 * A tmem host implementation must use this function to register
 * callbacks for a page-accessible memory (PAM) implementation.
 */
static struct tmem_pamops tmem_pamops;

void tmem_register_pamops(struct tmem_pamops *m)
{
	tmem_pamops = *m;
}
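
/*
 * Illustrative sketch, not part of the original file: a tmem host (e.g.
 * zcache) wires itself up roughly as below.  The my_* callbacks are
 * hypothetical host-side implementations; the field names are the ones
 * this file invokes through tmem_hostops and tmem_pamops, and are assumed
 * to match their declarations in tmem.h:
 *
 *	static struct tmem_hostops my_hostops = {
 *		.obj_alloc = my_obj_alloc,
 *		.obj_free = my_obj_free,
 *		.objnode_alloc = my_objnode_alloc,
 *		.objnode_free = my_objnode_free,
 *	};
 *
 *	tmem_register_hostops(&my_hostops);
 *	tmem_register_pamops(&my_pamops);
 */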
/*
 * Oids are potentially very sparse and tmem_objs may have an indeterminately
 * short life, being added and deleted at a relatively high frequency.
 * So an rb_tree is an ideal data structure to manage tmem_objs.  But because
 * of the potentially huge number of tmem_objs, each pool manages a hashtable
 * of rb_trees to reduce search, insert, delete, and rebalancing time.
 * Each hashbucket also has a lock to manage concurrent access.
 *
 * The following routines manage tmem_objs.  When any tmem_obj is accessed,
 * the hashbucket lock must be held.
 */
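
/*
 * Sketch of the resulting access pattern (illustration only; this is the
 * shape every tmem core operation below follows):
 *
 *	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
 *	spin_lock(&hb->lock);
 *	obj = tmem_obj_find(hb, oidp);
 *	... operate on obj and its pampds ...
 *	spin_unlock(&hb->lock);
 */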
/* searches for object==oid in pool, returns object if found; the caller
 * must hold the hashbucket lock */
static struct tmem_obj *tmem_obj_find(struct tmem_hashbucket *hb,
					struct tmem_oid *oidp)
{
	struct rb_node *rbnode;
	struct tmem_obj *obj;

	rbnode = hb->obj_rb_root.rb_node;
	while (rbnode) {
		BUG_ON(RB_EMPTY_NODE(rbnode));
		obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
		switch (tmem_oid_compare(oidp, &obj->oid)) {
		case 0: /* equal */
			goto out;
		case -1:
			rbnode = rbnode->rb_left;
			break;
		case 1:
			rbnode = rbnode->rb_right;
			break;
		}
	}
	obj = NULL;
out:
	return obj;
}
static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *);

/* free an object that has no more pampds in it */
static void tmem_obj_free(struct tmem_obj *obj, struct tmem_hashbucket *hb)
{
	struct tmem_pool *pool;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pampd_count > 0);
	pool = obj->pool;
	BUG_ON(pool == NULL);
	if (obj->objnode_tree_root != NULL) /* may be "stump" with no leaves */
		tmem_pampd_destroy_all_in_obj(obj);
	BUG_ON(obj->objnode_tree_root != NULL);
	BUG_ON((long)obj->objnode_count != 0);
	atomic_dec(&pool->obj_count);
	BUG_ON(atomic_read(&pool->obj_count) < 0);
	INVERT_SENTINEL(obj, OBJ);
	obj->pool = NULL;
	tmem_oid_set_invalid(&obj->oid);
	rb_erase(&obj->rb_tree_node, &hb->obj_rb_root);
}
/*
 * initialize and insert a tmem_object_root (called only if find failed)
 */
static void tmem_obj_init(struct tmem_obj *obj, struct tmem_hashbucket *hb,
					struct tmem_pool *pool,
					struct tmem_oid *oidp)
{
	struct rb_root *root = &hb->obj_rb_root;
	struct rb_node **new = &(root->rb_node), *parent = NULL;
	struct tmem_obj *this;

	BUG_ON(pool == NULL);
	atomic_inc(&pool->obj_count);
	obj->objnode_tree_height = 0;
	obj->objnode_tree_root = NULL;
	obj->pool = pool;
	obj->oid = *oidp;
	obj->objnode_count = 0;
	obj->pampd_count = 0;
	(*tmem_pamops.new_obj)(obj);
	SET_SENTINEL(obj, OBJ);
	while (*new) {
		BUG_ON(RB_EMPTY_NODE(*new));
		this = rb_entry(*new, struct tmem_obj, rb_tree_node);
		parent = *new;
		switch (tmem_oid_compare(oidp, &this->oid)) {
		case 0:
			BUG(); /* already present; should never happen! */
			break;
		case -1:
			new = &(*new)->rb_left;
			break;
		case 1:
			new = &(*new)->rb_right;
			break;
		}
	}
	rb_link_node(&obj->rb_tree_node, parent, new);
	rb_insert_color(&obj->rb_tree_node, root);
}
/*
 * Tmem is managed as a set of tmem_pools with certain attributes, such as
 * "ephemeral" vs "persistent".  These attributes apply to all tmem_objs
 * and all pampds that belong to a tmem_pool.  A tmem_pool is created
 * or deleted relatively rarely (for example, when a filesystem is
 * mounted or unmounted).
 */
/* flush all data from a pool and, optionally, free it */
static void tmem_pool_flush(struct tmem_pool *pool, bool destroy)
{
	struct rb_node *rbnode;
	struct tmem_obj *obj;
	struct tmem_hashbucket *hb = &pool->hashbucket[0];
	int i;

	BUG_ON(pool == NULL);
	for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
		spin_lock(&hb->lock);
		rbnode = rb_first(&hb->obj_rb_root);
		while (rbnode != NULL) {
			obj = rb_entry(rbnode, struct tmem_obj, rb_tree_node);
			rbnode = rb_next(rbnode);
			tmem_pampd_destroy_all_in_obj(obj);
			tmem_obj_free(obj, hb);
			(*tmem_hostops.obj_free)(obj, pool);
		}
		spin_unlock(&hb->lock);
	}
	if (destroy)
		list_del(&pool->pool_list);
}
/*
 * A tmem_obj contains a radix-tree-like tree in which the intermediate
 * nodes are called tmem_objnodes.  (The kernel lib/radix-tree.c
 * implementation is very specialized and tuned for specific uses and is
 * not particularly suited for use from this code, though some code from
 * the core algorithms has been reused, thus the copyright notices below.)
 * Each tmem_objnode contains a set of pointers which point to either a
 * set of intermediate tmem_objnodes or a set of pampds.
 *
 * Portions Copyright (C) 2001 Momchil Velikov
 * Portions Copyright (C) 2001 Christoph Hellwig
 * Portions Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
 */
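
/*
 * Illustration (not in the original source; assumes the common
 * configuration of OBJNODE_TREE_MAP_SHIFT == 6, i.e. 64 slots per
 * objnode): a page index is consumed six bits at a time, most-significant
 * slice first.  For a tree of height 2, index 0x9c selects root slot
 * (0x9c >> 6) & 0x3f == 2, and the pampd lives in leaf slot
 * 0x9c & 0x3f == 0x1c of the objnode found there.
 */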
struct tmem_objnode_tree_path {
	struct tmem_objnode *objnode;
	int offset;
};
/* objnode height_to_maxindex translation */
static unsigned long tmem_objnode_tree_h2max[OBJNODE_TREE_MAX_PATH + 1];
static void tmem_objnode_tree_init(void)
{
	unsigned int ht, tmp;

	for (ht = 0; ht < ARRAY_SIZE(tmem_objnode_tree_h2max); ht++) {
		tmp = ht * OBJNODE_TREE_MAP_SHIFT;
		if (tmp >= OBJNODE_TREE_INDEX_BITS)
			tmem_objnode_tree_h2max[ht] = ~0UL;
		else
			tmem_objnode_tree_h2max[ht] =
			    (~0UL >> (OBJNODE_TREE_INDEX_BITS - tmp - 1)) >> 1;
	}
}
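
/*
 * Worked example (added for illustration; assumes a 64-bit unsigned long,
 * so OBJNODE_TREE_INDEX_BITS == 64, and OBJNODE_TREE_MAP_SHIFT == 6):
 *
 *	ht == 0: (~0UL >> 63) >> 1 == 0      (a height-0 tree holds index 0)
 *	ht == 1: (~0UL >> 57) >> 1 == 63     (2^6 - 1)
 *	ht == 2: (~0UL >> 51) >> 1 == 4095   (2^12 - 1)
 *
 * i.e. a tree of height ht covers indices up to 2^(6*ht) - 1.
 */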
static struct tmem_objnode *tmem_objnode_alloc(struct tmem_obj *obj)
{
	struct tmem_objnode *objnode;

	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);
	objnode = (*tmem_hostops.objnode_alloc)(obj->pool);
	if (unlikely(objnode == NULL))
		goto out;
	objnode->obj = obj;
	SET_SENTINEL(objnode, OBJNODE);
	memset(&objnode->slots, 0, sizeof(objnode->slots));
	objnode->slots_in_use = 0;
	obj->objnode_count++;
out:
	return objnode;
}
static void tmem_objnode_free(struct tmem_objnode *objnode)
{
	struct tmem_pool *pool;
	int i;

	BUG_ON(objnode == NULL);
	for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++)
		BUG_ON(objnode->slots[i] != NULL);
	ASSERT_SENTINEL(objnode, OBJNODE);
	INVERT_SENTINEL(objnode, OBJNODE);
	BUG_ON(objnode->obj == NULL);
	ASSERT_SENTINEL(objnode->obj, OBJ);
	pool = objnode->obj->pool;
	BUG_ON(pool == NULL);
	ASSERT_SENTINEL(pool, POOL);
	objnode->obj->objnode_count--;
	objnode->obj = NULL;
	(*tmem_hostops.objnode_free)(objnode, pool);
}
/*
 * lookup index in object and return associated pampd (or NULL if not found)
 */
static void **__tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
{
	unsigned int height, shift;
	struct tmem_objnode **slot = NULL;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);

	height = obj->objnode_tree_height;
	if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height])
		goto out;
	if (height == 0 && obj->objnode_tree_root) {
		slot = &obj->objnode_tree_root;
		goto out;
	}
	shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
	slot = &obj->objnode_tree_root;
	while (height > 0) {
		if (*slot == NULL)
			goto out;
		slot = (struct tmem_objnode **)
			((*slot)->slots +
			 ((index >> shift) & OBJNODE_TREE_MAP_MASK));
		shift -= OBJNODE_TREE_MAP_SHIFT;
		height--;
	}
out:
	return slot != NULL ? (void **)slot : NULL;
}
static void *tmem_pampd_lookup_in_obj(struct tmem_obj *obj, uint32_t index)
{
	struct tmem_objnode **slot;

	slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
	return slot != NULL ? *slot : NULL;
}
static void *tmem_pampd_replace_in_obj(struct tmem_obj *obj, uint32_t index,
					void *new_pampd)
{
	struct tmem_objnode **slot;
	void *ret = NULL;

	slot = (struct tmem_objnode **)__tmem_pampd_lookup_in_obj(obj, index);
	if ((slot != NULL) && (*slot != NULL)) {
		void *old_pampd = *(void **)slot;
		*(void **)slot = new_pampd;
		(*tmem_pamops.free)(old_pampd, obj->pool, NULL, 0);
		ret = new_pampd;
	}
	return ret;
}
static int tmem_pampd_add_to_obj(struct tmem_obj *obj, uint32_t index,
					void *pampd)
{
	int ret = 0;
	struct tmem_objnode *objnode = NULL, *newnode, *slot;
	unsigned int height, shift;
	int offset = 0;

	/* if necessary, extend the tree to be higher */
	if (index > tmem_objnode_tree_h2max[obj->objnode_tree_height]) {
		height = obj->objnode_tree_height + 1;
		if (index > tmem_objnode_tree_h2max[height])
			while (index > tmem_objnode_tree_h2max[height])
				height++;
		if (obj->objnode_tree_root == NULL) {
			obj->objnode_tree_height = height;
			goto insert;
		}
		do {
			newnode = tmem_objnode_alloc(obj);
			if (!newnode) {
				ret = -ENOMEM;
				goto out;
			}
			newnode->slots[0] = obj->objnode_tree_root;
			newnode->slots_in_use = 1;
			obj->objnode_tree_root = newnode;
			obj->objnode_tree_height++;
		} while (height > obj->objnode_tree_height);
	}
insert:
	slot = obj->objnode_tree_root;
	height = obj->objnode_tree_height;
	shift = (height-1) * OBJNODE_TREE_MAP_SHIFT;
	while (height > 0) {
		if (slot == NULL) {
			/* add a child objnode. */
			slot = tmem_objnode_alloc(obj);
			if (!slot) {
				ret = -ENOMEM;
				goto out;
			}
			if (objnode) {
				objnode->slots[offset] = slot;
				objnode->slots_in_use++;
			} else
				obj->objnode_tree_root = slot;
		}
		/* go down a level */
		offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
		objnode = slot;
		slot = objnode->slots[offset];
		shift -= OBJNODE_TREE_MAP_SHIFT;
		height--;
	}
	BUG_ON(slot != NULL);
	if (objnode) {
		objnode->slots_in_use++;
		objnode->slots[offset] = pampd;
	} else
		obj->objnode_tree_root = pampd;
	obj->pampd_count++;
out:
	return ret;
}
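
/*
 * Worked example (added for illustration; assumes OBJNODE_TREE_MAP_SHIFT
 * == 6): adding index 0x9c to an empty object first grows the tree to
 * height 2 (since 0x9c > h2max[1] == 63; the root is NULL so only the
 * height field is bumped), then descends, allocating a root objnode and
 * a child hung off root slot (0x9c >> 6) & 0x3f == 2, and finally stores
 * the pampd in the child's slot 0x9c & 0x3f == 0x1c.
 */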
static void *tmem_pampd_delete_from_obj(struct tmem_obj *obj, uint32_t index)
{
	struct tmem_objnode_tree_path path[OBJNODE_TREE_MAX_PATH + 1];
	struct tmem_objnode_tree_path *pathp = path;
	struct tmem_objnode *slot = NULL;
	unsigned int height, shift;
	int offset;

	BUG_ON(obj == NULL);
	ASSERT_SENTINEL(obj, OBJ);
	BUG_ON(obj->pool == NULL);
	ASSERT_SENTINEL(obj->pool, POOL);
	height = obj->objnode_tree_height;
	if (index > tmem_objnode_tree_h2max[height])
		goto out;
	slot = obj->objnode_tree_root;
	if (height == 0 && obj->objnode_tree_root) {
		obj->objnode_tree_root = NULL;
		goto out;
	}
	shift = (height - 1) * OBJNODE_TREE_MAP_SHIFT;
	pathp->objnode = NULL;
	do {
		if (slot == NULL)
			goto out;
		pathp++;
		offset = (index >> shift) & OBJNODE_TREE_MAP_MASK;
		pathp->offset = offset;
		pathp->objnode = slot;
		slot = slot->slots[offset];
		shift -= OBJNODE_TREE_MAP_SHIFT;
		height--;
	} while (height > 0);
	if (slot == NULL)
		goto out;
	while (pathp->objnode) {
		pathp->objnode->slots[pathp->offset] = NULL;
		pathp->objnode->slots_in_use--;
		if (pathp->objnode->slots_in_use) {
			if (pathp->objnode == obj->objnode_tree_root) {
				while (obj->objnode_tree_height > 0 &&
				  obj->objnode_tree_root->slots_in_use == 1 &&
				  obj->objnode_tree_root->slots[0]) {
					struct tmem_objnode *to_free =
						obj->objnode_tree_root;

					obj->objnode_tree_root =
							to_free->slots[0];
					obj->objnode_tree_height--;
					to_free->slots[0] = NULL;
					to_free->slots_in_use = 0;
					tmem_objnode_free(to_free);
				}
			}
			goto out;
		}
		tmem_objnode_free(pathp->objnode); /* 0 slots used, free it */
		pathp--;
	}
	obj->objnode_tree_height = 0;
	obj->objnode_tree_root = NULL;

out:
	if (slot != NULL)
		obj->pampd_count--;
	BUG_ON(obj->pampd_count < 0);
	return slot;
}
/* recursively walk the objnode_tree destroying pampds and objnodes */
static void tmem_objnode_node_destroy(struct tmem_obj *obj,
					struct tmem_objnode *objnode,
					unsigned int ht)
{
	int i;

	if (ht == 0)
		return;
	for (i = 0; i < OBJNODE_TREE_MAP_SIZE; i++) {
		if (objnode->slots[i]) {
			if (ht == 1) {
				obj->pampd_count--;
				(*tmem_pamops.free)(objnode->slots[i],
						obj->pool, NULL, 0);
				objnode->slots[i] = NULL;
				continue;
			}
			tmem_objnode_node_destroy(obj, objnode->slots[i],
						ht-1);
			tmem_objnode_free(objnode->slots[i]);
			objnode->slots[i] = NULL;
		}
	}
}
static void tmem_pampd_destroy_all_in_obj(struct tmem_obj *obj)
{
	if (obj->objnode_tree_root == NULL)
		return;
	if (obj->objnode_tree_height == 0) {
		obj->pampd_count--;
		(*tmem_pamops.free)(obj->objnode_tree_root, obj->pool,
					NULL, 0);
	} else {
		tmem_objnode_node_destroy(obj, obj->objnode_tree_root,
					obj->objnode_tree_height);
		tmem_objnode_free(obj->objnode_tree_root);
		obj->objnode_tree_height = 0;
	}
	obj->objnode_tree_root = NULL;
	(*tmem_pamops.free_obj)(obj->pool, obj);
}
/*
 * Tmem is operated on by a set of well-defined actions:
 * "put", "get", "flush", "flush_object", "new pool" and "destroy pool".
 * (The tmem ABI allows for subpages and exchanges but these operations
 * are not included in this implementation.)
 *
 * These "tmem core" operations are implemented in the following functions.
 */
/*
 * "Put" a page, e.g. copy a page from the kernel into newly allocated
 * PAM space (if such space is available).  Tmem_put is complicated by
 * a corner case: What if a page with matching handle already exists in
 * tmem?  To guarantee coherency, one of two actions is necessary: Either
 * the data for the page must be overwritten, or the page must be
 * "flushed" so that the data is not accessible to a subsequent "get".
 * Since these "duplicate puts" are relatively rare, this implementation
 * always flushes for simplicity.
 */
int tmem_put(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
		char *data, size_t size, bool raw, bool ephemeral)
{
	struct tmem_obj *obj = NULL, *objfound = NULL, *objnew = NULL;
	void *pampd = NULL, *pampd_del = NULL;
	int ret = -ENOMEM;
	struct tmem_hashbucket *hb;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = objfound = tmem_obj_find(hb, oidp);
	if (obj != NULL) {
		pampd = tmem_pampd_lookup_in_obj(objfound, index);
		if (pampd != NULL) {
			/* if found, is a dup put, flush the old one */
			pampd_del = tmem_pampd_delete_from_obj(obj, index);
			BUG_ON(pampd_del != pampd);
			(*tmem_pamops.free)(pampd, pool, oidp, index);
			if (obj->pampd_count == 0) {
				objnew = obj;
				objfound = NULL;
			}
			pampd = NULL;
		}
	} else {
		obj = objnew = (*tmem_hostops.obj_alloc)(pool);
		if (unlikely(obj == NULL)) {
			ret = -ENOMEM;
			goto out;
		}
		tmem_obj_init(obj, hb, pool, oidp);
	}
	BUG_ON(obj == NULL);
	BUG_ON(((objnew != obj) && (objfound != obj)) || (objnew == objfound));
	pampd = (*tmem_pamops.create)(data, size, raw, ephemeral,
					obj->pool, &obj->oid, index);
	if (unlikely(pampd == NULL))
		goto free;
	ret = tmem_pampd_add_to_obj(obj, index, pampd);
	if (unlikely(ret == -ENOMEM))
		/* may have partially built objnode tree ("stump") */
		goto delete_and_free;
	goto out;

delete_and_free:
	(void)tmem_pampd_delete_from_obj(obj, index);
free:
	if (pampd)
		(*tmem_pamops.free)(pampd, pool, NULL, 0);
	if (objnew) {
		tmem_obj_free(objnew, hb);
		(*tmem_hostops.obj_free)(objnew, pool);
	}
out:
	spin_unlock(&hb->lock);
	return ret;
}
/*
 * "Get" a page, e.g. if one can be found, copy the tmem page with the
 * matching handle from PAM space to the kernel.  By tmem definition,
 * when a "get" is successful on an ephemeral page, the page is "flushed",
 * and when a "get" is successful on a persistent page, the page is
 * retained in tmem.  Note that to preserve coherency, "get" can never
 * be skipped if tmem contains the data.  That is, if a get is done with
 * a certain handle and fails, any subsequent "get" must also fail (unless
 * of course there is a "put" done with the same handle).
 */
int tmem_get(struct tmem_pool *pool, struct tmem_oid *oidp, uint32_t index,
		char *data, size_t *size, bool raw, int get_and_free)
{
	struct tmem_obj *obj;
	void *pampd;
	bool ephemeral = is_ephemeral(pool);
	int ret = -1;
	struct tmem_hashbucket *hb;
	bool free = (get_and_free == 1) || ((get_and_free == 0) && ephemeral);
	bool lock_held = false;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	lock_held = true;
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	if (free)
		pampd = tmem_pampd_delete_from_obj(obj, index);
	else
		pampd = tmem_pampd_lookup_in_obj(obj, index);
	if (pampd == NULL)
		goto out;
	if (free) {
		if (obj->pampd_count == 0) {
			tmem_obj_free(obj, hb);
			(*tmem_hostops.obj_free)(obj, pool);
			obj = NULL;
		}
	}
	if (tmem_pamops.is_remote(pampd)) {
		lock_held = false;
		spin_unlock(&hb->lock);
	}
	if (free)
		ret = (*tmem_pamops.get_data_and_free)(
				data, size, raw, pampd, pool, oidp, index);
	else
		ret = (*tmem_pamops.get_data)(
				data, size, raw, pampd, pool, oidp, index);
	if (ret < 0)
		goto out;
	ret = 0;
out:
	if (lock_held)
		spin_unlock(&hb->lock);
	return ret;
}
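
/*
 * Usage sketch (illustration only, not from the original file): a host
 * caching a clean page in an ephemeral pool might do, under its own
 * locking and with oid/index derived from the page's handle:
 *
 *	tmem_put(pool, &oid, index, page_address(page), PAGE_SIZE,
 *		 false, true);
 *	...
 *	sz = PAGE_SIZE;
 *	if (tmem_get(pool, &oid, index, page_address(page), &sz,
 *		     false, 0) == 0)
 *		... hit; per the comment above, an ephemeral get also
 *		    flushes the page from tmem ...
 */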
/*
 * If a page in tmem matches the handle, "flush" this page from tmem such
 * that any subsequent "get" does not succeed (unless, of course, there
 * was another "put" with the same handle).
 */
int tmem_flush_page(struct tmem_pool *pool,
				struct tmem_oid *oidp, uint32_t index)
{
	struct tmem_obj *obj;
	void *pampd;
	int ret = -1;
	struct tmem_hashbucket *hb;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	pampd = tmem_pampd_delete_from_obj(obj, index);
	if (pampd == NULL)
		goto out;
	(*tmem_pamops.free)(pampd, pool, oidp, index);
	if (obj->pampd_count == 0) {
		tmem_obj_free(obj, hb);
		(*tmem_hostops.obj_free)(obj, pool);
	}
	ret = 0;

out:
	spin_unlock(&hb->lock);
	return ret;
}
/*
 * If a page in tmem matches the handle, replace the page so that any
 * subsequent "get" gets the new page.  Returns 0 if there was a page
 * to replace, else returns -1.
 */
int tmem_replace(struct tmem_pool *pool, struct tmem_oid *oidp,
			uint32_t index, void *new_pampd)
{
	struct tmem_obj *obj;
	int ret = -1;
	struct tmem_hashbucket *hb;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	new_pampd = tmem_pampd_replace_in_obj(obj, index, new_pampd);
	ret = (*tmem_pamops.replace_in_obj)(new_pampd, obj);
out:
	spin_unlock(&hb->lock);
	return ret;
}
/*
 * "Flush" all pages in tmem matching this oid.
 */
int tmem_flush_object(struct tmem_pool *pool, struct tmem_oid *oidp)
{
	struct tmem_obj *obj;
	struct tmem_hashbucket *hb;
	int ret = -1;

	hb = &pool->hashbucket[tmem_oid_hash(oidp)];
	spin_lock(&hb->lock);
	obj = tmem_obj_find(hb, oidp);
	if (obj == NULL)
		goto out;
	tmem_pampd_destroy_all_in_obj(obj);
	tmem_obj_free(obj, hb);
	(*tmem_hostops.obj_free)(obj, pool);
	ret = 0;

out:
	spin_unlock(&hb->lock);
	return ret;
}
/*
 * "Flush" all pages (and tmem_objs) from this tmem_pool and disable
 * all subsequent access to this tmem_pool.
 */
int tmem_destroy_pool(struct tmem_pool *pool)
{
	int ret = -1;

	if (pool == NULL)
		goto out;
	tmem_pool_flush(pool, 1);
	ret = 0;
out:
	return ret;
}
static LIST_HEAD(tmem_global_pool_list);
/*
 * Create a new tmem_pool with the provided flags and return
 * a pool id provided by the tmem host implementation.
 */
void tmem_new_pool(struct tmem_pool *pool, uint32_t flags)
{
	int persistent = flags & TMEM_POOL_PERSIST;
	int shared = flags & TMEM_POOL_SHARED;
	struct tmem_hashbucket *hb = &pool->hashbucket[0];
	int i;

	for (i = 0; i < TMEM_HASH_BUCKETS; i++, hb++) {
		hb->obj_rb_root = RB_ROOT;
		spin_lock_init(&hb->lock);
	}
	INIT_LIST_HEAD(&pool->pool_list);
	atomic_set(&pool->obj_count, 0);
	SET_SENTINEL(pool, POOL);
	list_add_tail(&pool->pool_list, &tmem_global_pool_list);
	pool->persistent = persistent;
	pool->shared = shared;
}
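
/*
 * Sketch of pool creation (illustration; my_alloc_pool is a hypothetical
 * host-side allocator, and the TMEM_POOL_* flags are assumed to be
 * defined in tmem.h): a host backing swap would typically ask for a
 * persistent pool, while a page cache user would pass no flags to get
 * an ephemeral one:
 *
 *	struct tmem_pool *pool = my_alloc_pool();
 *	tmem_new_pool(pool, TMEM_POOL_PERSIST);
 */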