1 /******************************************************************************
2 * tmem.c
3 *
4 * Transcendent memory
5 *
6 * Copyright (c) 2009, Dan Magenheimer, Oracle Corp.
7 */
8
9 /* TODO list: 090129 (updated 100318)
10 - any better reclamation policy?
11 - use different tlsf pools for each client (maybe each pool)
12 - test shared access more completely (ocfs2)
13 - add feedback-driven compression (not for persistent pools though!)
14 - add data-structure total bytes overhead stats
15 */
16
17 #ifdef __XEN__
18 #include <xen/tmem_xen.h> /* host-specific (eg Xen) code goes here. */
19 #endif
20
21 #include <public/sysctl.h>
22 #include <xen/tmem.h>
23 #include <xen/rbtree.h>
24 #include <xen/radix-tree.h>
25 #include <xen/list.h>
26 #include <xen/init.h>
27
28 #define TMEM_SPEC_VERSION 1
29
30 struct tmem_statistics tmem_stats = {
31 .global_obj_count = ATOMIC_INIT(0),
32 .global_pgp_count = ATOMIC_INIT(0),
33 .global_pcd_count = ATOMIC_INIT(0),
34 .global_page_count = ATOMIC_INIT(0),
35 .global_rtree_node_count = ATOMIC_INIT(0),
36 };
37
38 /************ CORE DATA STRUCTURES ************************************/
39
40 struct tmem_object_root {
41 struct xen_tmem_oid oid;
42 struct rb_node rb_tree_node; /* Protected by pool->pool_rwlock. */
43 unsigned long objnode_count; /* Atomicity depends on obj_spinlock. */
44 long pgp_count; /* Atomicity depends on obj_spinlock. */
45 struct radix_tree_root tree_root; /* Tree of pages within object. */
46 struct tmem_pool *pool;
47 domid_t last_client;
48 spinlock_t obj_spinlock;
49 };
50
51 struct tmem_object_node {
52 struct tmem_object_root *obj;
53 struct radix_tree_node rtn;
54 };
55
56 struct tmem_page_descriptor {
57 union {
58 struct list_head global_eph_pages;
59 struct list_head client_inv_pages;
60 };
61 union {
62 struct {
63 union {
64 struct list_head client_eph_pages;
65 struct list_head pool_pers_pages;
66 };
67 struct tmem_object_root *obj;
68 } us;
69 struct xen_tmem_oid inv_oid; /* Used for invalid list only. */
70 };
71 pagesize_t size; /* 0 == PAGE_SIZE (pfp), -1 == data invalid,
72 else compressed data (cdata). */
73 uint32_t index;
74 bool eviction_attempted; /* CHANGE TO lifetimes? (settable). */
75 union {
76 struct page_info *pfp; /* Page frame pointer. */
77 char *cdata; /* Compressed data. */
78 struct tmem_page_content_descriptor *pcd; /* Page dedup. */
79 };
80 union {
81 uint64_t timestamp;
82 uint32_t pool_id; /* Used for invalid list only. */
83 };
84 };
85
86 #define PCD_TZE_MAX_SIZE (PAGE_SIZE - (PAGE_SIZE/64))
87
88 struct tmem_page_content_descriptor {
89 union {
90 struct page_info *pfp; /* Page frame pointer. */
91 char *cdata; /* If compression_enabled. */
92 };
93 pagesize_t size; /* If compression_enabled -> 0<size<PAGE_SIZE (*cdata)
94 * else if tze, 0<=size<PAGE_SIZE, rounded up to mult of 8
95 * else PAGE_SIZE -> *pfp. */
96 };
97
98 static int tmem_initialized = 0;
99
100 struct xmem_pool *tmem_mempool = 0;
101 unsigned int tmem_mempool_maxalloc = 0;
102
103 DEFINE_SPINLOCK(tmem_page_list_lock);
104 PAGE_LIST_HEAD(tmem_page_list);
105 unsigned long tmem_page_list_pages = 0;
106
107 DEFINE_RWLOCK(tmem_rwlock);
108 static DEFINE_SPINLOCK(eph_lists_spinlock); /* Protects global AND clients. */
109 static DEFINE_SPINLOCK(pers_lists_spinlock);
110
111 #define ASSERT_SPINLOCK(_l) ASSERT(spin_is_locked(_l))
112 #define ASSERT_WRITELOCK(_l) ASSERT(rw_is_write_locked(_l))
113
114 atomic_t client_weight_total;
115
116 struct tmem_global tmem_global = {
117 .ephemeral_page_list = LIST_HEAD_INIT(tmem_global.ephemeral_page_list),
118 .client_list = LIST_HEAD_INIT(tmem_global.client_list),
119 .client_weight_total = ATOMIC_INIT(0),
120 };
121
/*
 * There are two types of memory allocation interfaces in tmem.
 * One is based on xmem_pool and the other is used to allocate a whole page.
 * Both are based on the low-level functions __tmem_alloc_page/_thispool().
 * The call trace of the allocation path is as below.
 * Persistent pool:
 *     1. tmem_malloc()
 *         > xmem_pool_alloc()
 *             > tmem_persistent_pool_page_get()
 *                 > __tmem_alloc_page_thispool()
 *     2. tmem_alloc_page()
 *         > __tmem_alloc_page_thispool()
 *
 * Ephemeral pool:
 *     1. tmem_malloc()
 *         > xmem_pool_alloc()
 *             > tmem_mempool_page_get()
 *                 > __tmem_alloc_page()
 *     2. tmem_alloc_page()
 *         > __tmem_alloc_page()
 *
 * The free path is done in the same manner.
 */
static void *tmem_malloc(size_t size, struct tmem_pool *pool)
146 {
147 void *v = NULL;
148
149 if ( (pool != NULL) && is_persistent(pool) ) {
150 if ( pool->client->persistent_pool )
151 v = xmem_pool_alloc(size, pool->client->persistent_pool);
152 }
153 else
154 {
155 ASSERT( size < tmem_mempool_maxalloc );
156 ASSERT( tmem_mempool != NULL );
157 v = xmem_pool_alloc(size, tmem_mempool);
158 }
159 if ( v == NULL )
160 tmem_stats.alloc_failed++;
161 return v;
162 }
163
static void tmem_free(void *p, struct tmem_pool *pool)
165 {
166 if ( pool == NULL || !is_persistent(pool) )
167 {
168 ASSERT( tmem_mempool != NULL );
169 xmem_pool_free(p, tmem_mempool);
170 }
171 else
172 {
173 ASSERT( pool->client->persistent_pool != NULL );
174 xmem_pool_free(p, pool->client->persistent_pool);
175 }
176 }
177
static struct page_info *tmem_alloc_page(struct tmem_pool *pool)
179 {
180 struct page_info *pfp = NULL;
181
182 if ( pool != NULL && is_persistent(pool) )
183 pfp = __tmem_alloc_page_thispool(pool->client->domain);
184 else
185 pfp = __tmem_alloc_page();
186 if ( pfp == NULL )
187 tmem_stats.alloc_page_failed++;
188 else
189 atomic_inc_and_max(global_page_count);
190 return pfp;
191 }
192
static void tmem_free_page(struct tmem_pool *pool, struct page_info *pfp)
194 {
195 ASSERT(pfp);
196 if ( pool == NULL || !is_persistent(pool) )
197 __tmem_free_page(pfp);
198 else
199 __tmem_free_page_thispool(pfp);
200 atomic_dec_and_assert(global_page_count);
201 }
202
static void *tmem_mempool_page_get(unsigned long size)
204 {
205 struct page_info *pi;
206
207 ASSERT(size == PAGE_SIZE);
208 if ( (pi = __tmem_alloc_page()) == NULL )
209 return NULL;
210 return page_to_virt(pi);
211 }
212
static void tmem_mempool_page_put(void *page_va)
214 {
215 ASSERT(IS_PAGE_ALIGNED(page_va));
216 __tmem_free_page(virt_to_page(page_va));
217 }
218
static int __init tmem_mempool_init(void)
220 {
221 tmem_mempool = xmem_pool_create("tmem", tmem_mempool_page_get,
222 tmem_mempool_page_put, PAGE_SIZE, 0, PAGE_SIZE);
223 if ( tmem_mempool )
224 tmem_mempool_maxalloc = xmem_pool_maxalloc(tmem_mempool);
225 return tmem_mempool != NULL;
226 }
227
228 /* Persistent pools are per-domain. */
static void *tmem_persistent_pool_page_get(unsigned long size)
230 {
231 struct page_info *pi;
232 struct domain *d = current->domain;
233
234 ASSERT(size == PAGE_SIZE);
235 if ( (pi = __tmem_alloc_page_thispool(d)) == NULL )
236 return NULL;
237 ASSERT(IS_VALID_PAGE(pi));
238 return page_to_virt(pi);
239 }
240
static void tmem_persistent_pool_page_put(void *page_va)
242 {
243 struct page_info *pi;
244
245 ASSERT(IS_PAGE_ALIGNED(page_va));
246 pi = mfn_to_page(virt_to_mfn(page_va));
247 ASSERT(IS_VALID_PAGE(pi));
248 __tmem_free_page_thispool(pi);
249 }
250
251 /*
252 * Page content descriptor manipulation routines.
253 */
254 #define NOT_SHAREABLE ((uint16_t)-1UL)
255
256 /************ PAGE DESCRIPTOR MANIPULATION ROUTINES *******************/
257
258 /* Allocate a struct tmem_page_descriptor and associate it with an object. */
static struct tmem_page_descriptor *pgp_alloc(struct tmem_object_root *obj)
260 {
261 struct tmem_page_descriptor *pgp;
262 struct tmem_pool *pool;
263
264 ASSERT(obj != NULL);
265 ASSERT(obj->pool != NULL);
266 pool = obj->pool;
267 if ( (pgp = tmem_malloc(sizeof(struct tmem_page_descriptor), pool)) == NULL )
268 return NULL;
269 pgp->us.obj = obj;
270 INIT_LIST_HEAD(&pgp->global_eph_pages);
271 INIT_LIST_HEAD(&pgp->us.client_eph_pages);
272 pgp->pfp = NULL;
273 pgp->size = -1;
274 pgp->index = -1;
275 pgp->timestamp = get_cycles();
276 atomic_inc_and_max(global_pgp_count);
277 atomic_inc(&pool->pgp_count);
278 if ( _atomic_read(pool->pgp_count) > pool->pgp_count_max )
279 pool->pgp_count_max = _atomic_read(pool->pgp_count);
280 return pgp;
281 }
282
static struct tmem_page_descriptor *pgp_lookup_in_obj(struct tmem_object_root *obj, uint32_t index)
284 {
285 ASSERT(obj != NULL);
286 ASSERT_SPINLOCK(&obj->obj_spinlock);
287 ASSERT(obj->pool != NULL);
288 return radix_tree_lookup(&obj->tree_root, index);
289 }
290
static void pgp_free_data(struct tmem_page_descriptor *pgp, struct tmem_pool *pool)
292 {
293 pagesize_t pgp_size = pgp->size;
294
295 if ( pgp->pfp == NULL )
296 return;
297 if ( pgp_size )
298 tmem_free(pgp->cdata, pool);
299 else
300 tmem_free_page(pgp->us.obj->pool,pgp->pfp);
301 if ( pool != NULL && pgp_size )
302 {
303 pool->client->compressed_pages--;
304 pool->client->compressed_sum_size -= pgp_size;
305 }
306 pgp->pfp = NULL;
307 pgp->size = -1;
308 }
309
static void __pgp_free(struct tmem_page_descriptor *pgp, struct tmem_pool *pool)
311 {
312 pgp->us.obj = NULL;
313 pgp->index = -1;
314 tmem_free(pgp, pool);
315 }
316
static void pgp_free(struct tmem_page_descriptor *pgp)
318 {
319 struct tmem_pool *pool = NULL;
320
321 ASSERT(pgp->us.obj != NULL);
322 ASSERT(pgp->us.obj->pool != NULL);
323 ASSERT(pgp->us.obj->pool->client != NULL);
324
325 pool = pgp->us.obj->pool;
326 if ( !is_persistent(pool) )
327 {
328 ASSERT(list_empty(&pgp->global_eph_pages));
329 ASSERT(list_empty(&pgp->us.client_eph_pages));
330 }
331 pgp_free_data(pgp, pool);
332 atomic_dec_and_assert(global_pgp_count);
333 atomic_dec(&pool->pgp_count);
334 ASSERT(_atomic_read(pool->pgp_count) >= 0);
335 pgp->size = -1;
336 if ( is_persistent(pool) && pool->client->info.flags.u.migrating )
337 {
338 pgp->inv_oid = pgp->us.obj->oid;
339 pgp->pool_id = pool->pool_id;
340 return;
341 }
342 __pgp_free(pgp, pool);
343 }
344
345 /* Remove pgp from global/pool/client lists and free it. */
static void pgp_delist_free(struct tmem_page_descriptor *pgp)
347 {
348 struct client *client;
349 uint64_t life;
350
351 ASSERT(pgp != NULL);
352 ASSERT(pgp->us.obj != NULL);
353 ASSERT(pgp->us.obj->pool != NULL);
354 client = pgp->us.obj->pool->client;
355 ASSERT(client != NULL);
356
357 /* Delist pgp. */
358 if ( !is_persistent(pgp->us.obj->pool) )
359 {
360 spin_lock(&eph_lists_spinlock);
361 if ( !list_empty(&pgp->us.client_eph_pages) )
362 client->eph_count--;
363 ASSERT(client->eph_count >= 0);
364 list_del_init(&pgp->us.client_eph_pages);
365 if ( !list_empty(&pgp->global_eph_pages) )
366 tmem_global.eph_count--;
367 ASSERT(tmem_global.eph_count >= 0);
368 list_del_init(&pgp->global_eph_pages);
369 spin_unlock(&eph_lists_spinlock);
370 }
371 else
372 {
373 if ( client->info.flags.u.migrating )
374 {
375 spin_lock(&pers_lists_spinlock);
376 list_add_tail(&pgp->client_inv_pages,
377 &client->persistent_invalidated_list);
378 if ( pgp != pgp->us.obj->pool->cur_pgp )
379 list_del_init(&pgp->us.pool_pers_pages);
380 spin_unlock(&pers_lists_spinlock);
381 }
382 else
383 {
384 spin_lock(&pers_lists_spinlock);
385 list_del_init(&pgp->us.pool_pers_pages);
386 spin_unlock(&pers_lists_spinlock);
387 }
388 }
389 life = get_cycles() - pgp->timestamp;
390 pgp->us.obj->pool->sum_life_cycles += life;
391
392 /* Free pgp. */
393 pgp_free(pgp);
394 }
395
396 /* Called only indirectly by radix_tree_destroy. */
static void pgp_destroy(void *v)
398 {
399 struct tmem_page_descriptor *pgp = (struct tmem_page_descriptor *)v;
400
401 pgp->us.obj->pgp_count--;
402 pgp_delist_free(pgp);
403 }
404
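/*
 * Insert pgp at the given index in the object's radix tree; returns 0 on
 * success or an error from radix_tree_insert().
 */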
static int pgp_add_to_obj(struct tmem_object_root *obj, uint32_t index, struct tmem_page_descriptor *pgp)
406 {
407 int ret;
408
409 ASSERT_SPINLOCK(&obj->obj_spinlock);
410 ret = radix_tree_insert(&obj->tree_root, index, pgp);
411 if ( !ret )
412 obj->pgp_count++;
413 return ret;
414 }
415
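/* Remove and return the pgp at the given index, or NULL if none is present. */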
static struct tmem_page_descriptor *pgp_delete_from_obj(struct tmem_object_root *obj, uint32_t index)
417 {
418 struct tmem_page_descriptor *pgp;
419
420 ASSERT(obj != NULL);
421 ASSERT_SPINLOCK(&obj->obj_spinlock);
422 ASSERT(obj->pool != NULL);
423 pgp = radix_tree_delete(&obj->tree_root, index);
424 if ( pgp != NULL )
425 obj->pgp_count--;
426 ASSERT(obj->pgp_count >= 0);
427
428 return pgp;
429 }
430
431 /************ RADIX TREE NODE MANIPULATION ROUTINES *******************/
432
433 /* Called only indirectly from radix_tree_insert. */
static struct radix_tree_node *rtn_alloc(void *arg)
435 {
436 struct tmem_object_node *objnode;
437 struct tmem_object_root *obj = (struct tmem_object_root *)arg;
438
439 ASSERT(obj->pool != NULL);
440 objnode = tmem_malloc(sizeof(struct tmem_object_node),obj->pool);
441 if (objnode == NULL)
442 return NULL;
443 objnode->obj = obj;
444 memset(&objnode->rtn, 0, sizeof(struct radix_tree_node));
445 if (++obj->pool->objnode_count > obj->pool->objnode_count_max)
446 obj->pool->objnode_count_max = obj->pool->objnode_count;
447 atomic_inc_and_max(global_rtree_node_count);
448 obj->objnode_count++;
449 return &objnode->rtn;
450 }
451
452 /* Called only indirectly from radix_tree_delete/destroy. */
static void rtn_free(struct radix_tree_node *rtn, void *arg)
454 {
455 struct tmem_pool *pool;
456 struct tmem_object_node *objnode;
457
458 ASSERT(rtn != NULL);
459 objnode = container_of(rtn,struct tmem_object_node,rtn);
460 ASSERT(objnode->obj != NULL);
461 ASSERT_SPINLOCK(&objnode->obj->obj_spinlock);
462 pool = objnode->obj->pool;
463 ASSERT(pool != NULL);
464 pool->objnode_count--;
465 objnode->obj->objnode_count--;
466 objnode->obj = NULL;
467 tmem_free(objnode, pool);
468 atomic_dec_and_assert(global_rtree_node_count);
469 }
470
471 /************ POOL OBJECT COLLECTION MANIPULATION ROUTINES *******************/
472
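/* Three-way comparison of two OIDs, most-significant word (oid[2]) first; returns -1, 0 or 1. */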
static int oid_compare(struct xen_tmem_oid *left,
                       struct xen_tmem_oid *right)
475 {
476 if ( left->oid[2] == right->oid[2] )
477 {
478 if ( left->oid[1] == right->oid[1] )
479 {
480 if ( left->oid[0] == right->oid[0] )
481 return 0;
482 else if ( left->oid[0] < right->oid[0] )
483 return -1;
484 else
485 return 1;
486 }
487 else if ( left->oid[1] < right->oid[1] )
488 return -1;
489 else
490 return 1;
491 }
492 else if ( left->oid[2] < right->oid[2] )
493 return -1;
494 else
495 return 1;
496 }
497
static void oid_set_invalid(struct xen_tmem_oid *oidp)
499 {
500 oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = -1UL;
501 }
502
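/* Hash an OID into an obj_rb_root bucket index by XORing its three words together. */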
static unsigned oid_hash(struct xen_tmem_oid *oidp)
504 {
505 return (tmem_hash(oidp->oid[0] ^ oidp->oid[1] ^ oidp->oid[2],
506 BITS_PER_LONG) & OBJ_HASH_BUCKETS_MASK);
507 }
508
509 /* Searches for object==oid in pool, returns locked object if found. */
static struct tmem_object_root * obj_find(struct tmem_pool *pool,
                                          struct xen_tmem_oid *oidp)
512 {
513 struct rb_node *node;
514 struct tmem_object_root *obj;
515
516 restart_find:
517 read_lock(&pool->pool_rwlock);
518 node = pool->obj_rb_root[oid_hash(oidp)].rb_node;
519 while ( node )
520 {
521 obj = container_of(node, struct tmem_object_root, rb_tree_node);
522 switch ( oid_compare(&obj->oid, oidp) )
523 {
524 case 0: /* Equal. */
525 if ( !spin_trylock(&obj->obj_spinlock) )
526 {
527 read_unlock(&pool->pool_rwlock);
528 goto restart_find;
529 }
530 read_unlock(&pool->pool_rwlock);
531 return obj;
532 case -1:
533 node = node->rb_left;
534 break;
535 case 1:
536 node = node->rb_right;
537 }
538 }
539 read_unlock(&pool->pool_rwlock);
540 return NULL;
541 }
542
543 /* Free an object that has no more pgps in it. */
static void obj_free(struct tmem_object_root *obj)
545 {
546 struct tmem_pool *pool;
547 struct xen_tmem_oid old_oid;
548
549 ASSERT_SPINLOCK(&obj->obj_spinlock);
550 ASSERT(obj != NULL);
551 ASSERT(obj->pgp_count == 0);
552 pool = obj->pool;
553 ASSERT(pool != NULL);
554 ASSERT(pool->client != NULL);
555 ASSERT_WRITELOCK(&pool->pool_rwlock);
556 if ( obj->tree_root.rnode != NULL ) /* May be a "stump" with no leaves. */
557 radix_tree_destroy(&obj->tree_root, pgp_destroy);
558 ASSERT((long)obj->objnode_count == 0);
559 ASSERT(obj->tree_root.rnode == NULL);
560 pool->obj_count--;
561 ASSERT(pool->obj_count >= 0);
562 obj->pool = NULL;
563 old_oid = obj->oid;
564 oid_set_invalid(&obj->oid);
565 obj->last_client = TMEM_CLI_ID_NULL;
566 atomic_dec_and_assert(global_obj_count);
567 rb_erase(&obj->rb_tree_node, &pool->obj_rb_root[oid_hash(&old_oid)]);
568 spin_unlock(&obj->obj_spinlock);
569 tmem_free(obj, pool);
570 }
571
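/*
 * Insert obj into the rbtree rooted at root. Returns 0 if an object with the
 * same oid is already present, 1 on successful insertion.
 */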
static int obj_rb_insert(struct rb_root *root, struct tmem_object_root *obj)
573 {
574 struct rb_node **new, *parent = NULL;
575 struct tmem_object_root *this;
576
577 ASSERT(obj->pool);
578 ASSERT_WRITELOCK(&obj->pool->pool_rwlock);
579
580 new = &(root->rb_node);
581 while ( *new )
582 {
583 this = container_of(*new, struct tmem_object_root, rb_tree_node);
584 parent = *new;
585 switch ( oid_compare(&this->oid, &obj->oid) )
586 {
587 case 0:
588 return 0;
589 case -1:
590 new = &((*new)->rb_left);
591 break;
592 case 1:
593 new = &((*new)->rb_right);
594 break;
595 }
596 }
597 rb_link_node(&obj->rb_tree_node, parent, new);
598 rb_insert_color(&obj->rb_tree_node, root);
599 return 1;
600 }
601
/*
 * Allocate, initialize, and insert a tmem_object_root
 * (should be called only if find failed).
 */
static struct tmem_object_root * obj_alloc(struct tmem_pool *pool,
                                           struct xen_tmem_oid *oidp)
608 {
609 struct tmem_object_root *obj;
610
611 ASSERT(pool != NULL);
612 if ( (obj = tmem_malloc(sizeof(struct tmem_object_root), pool)) == NULL )
613 return NULL;
614 pool->obj_count++;
615 if (pool->obj_count > pool->obj_count_max)
616 pool->obj_count_max = pool->obj_count;
617 atomic_inc_and_max(global_obj_count);
618 radix_tree_init(&obj->tree_root);
619 radix_tree_set_alloc_callbacks(&obj->tree_root, rtn_alloc, rtn_free, obj);
620 spin_lock_init(&obj->obj_spinlock);
621 obj->pool = pool;
622 obj->oid = *oidp;
623 obj->objnode_count = 0;
624 obj->pgp_count = 0;
625 obj->last_client = TMEM_CLI_ID_NULL;
626 return obj;
627 }
628
629 /* Free an object after destroying any pgps in it. */
static void obj_destroy(struct tmem_object_root *obj)
631 {
632 ASSERT_WRITELOCK(&obj->pool->pool_rwlock);
633 radix_tree_destroy(&obj->tree_root, pgp_destroy);
634 obj_free(obj);
635 }
636
637 /* Destroys all objs in a pool, or only if obj->last_client matches cli_id. */
static void pool_destroy_objs(struct tmem_pool *pool, domid_t cli_id)
639 {
640 struct rb_node *node;
641 struct tmem_object_root *obj;
642 int i;
643
644 write_lock(&pool->pool_rwlock);
645 pool->is_dying = 1;
646 for (i = 0; i < OBJ_HASH_BUCKETS; i++)
647 {
648 node = rb_first(&pool->obj_rb_root[i]);
649 while ( node != NULL )
650 {
651 obj = container_of(node, struct tmem_object_root, rb_tree_node);
652 spin_lock(&obj->obj_spinlock);
653 node = rb_next(node);
654 if ( obj->last_client == cli_id )
655 obj_destroy(obj);
656 else
657 spin_unlock(&obj->obj_spinlock);
658 }
659 }
660 write_unlock(&pool->pool_rwlock);
661 }
662
663
664 /************ POOL MANIPULATION ROUTINES ******************************/
665
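/*
 * Allocate and initialize an empty pool: hash-bucket rbtree roots, the
 * persistent page list and the pool rwlock.
 */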
static struct tmem_pool * pool_alloc(void)
667 {
668 struct tmem_pool *pool;
669 int i;
670
671 if ( (pool = xzalloc(struct tmem_pool)) == NULL )
672 return NULL;
673 for (i = 0; i < OBJ_HASH_BUCKETS; i++)
674 pool->obj_rb_root[i] = RB_ROOT;
675 INIT_LIST_HEAD(&pool->persistent_page_list);
676 rwlock_init(&pool->pool_rwlock);
677 return pool;
678 }
679
static void pool_free(struct tmem_pool *pool)
681 {
682 pool->client = NULL;
683 xfree(pool);
684 }
685
/*
 * Register new_client as a user of this shared pool and return 0 on success.
 */
static int shared_pool_join(struct tmem_pool *pool, struct client *new_client)
690 {
691 struct share_list *sl;
692 ASSERT(is_shared(pool));
693
694 if ( (sl = tmem_malloc(sizeof(struct share_list), NULL)) == NULL )
695 return -1;
696 sl->client = new_client;
697 list_add_tail(&sl->share_list, &pool->share_list);
698 if ( new_client->cli_id != pool->client->cli_id )
699 tmem_client_info("adding new %s %d to shared pool owned by %s %d\n",
700 tmem_client_str, new_client->cli_id, tmem_client_str,
701 pool->client->cli_id);
702 else if ( pool->shared_count )
703 tmem_client_info("inter-guest sharing of shared pool %s by client %d\n",
704 tmem_client_str, pool->client->cli_id);
705 ++pool->shared_count;
706 return 0;
707 }
708
709 /* Reassign "ownership" of the pool to another client that shares this pool. */
static void shared_pool_reassign(struct tmem_pool *pool)
711 {
712 struct share_list *sl;
713 int poolid;
714 struct client *old_client = pool->client, *new_client;
715
716 ASSERT(is_shared(pool));
717 if ( list_empty(&pool->share_list) )
718 {
719 ASSERT(pool->shared_count == 0);
720 return;
721 }
722 old_client->pools[pool->pool_id] = NULL;
723 sl = list_entry(pool->share_list.next, struct share_list, share_list);
/*
 * The sl->client can be old_client if there are multiple shared pools
 * within a guest.
 */
728 pool->client = new_client = sl->client;
729 for (poolid = 0; poolid < MAX_POOLS_PER_DOMAIN; poolid++)
730 if (new_client->pools[poolid] == pool)
731 break;
732 ASSERT(poolid != MAX_POOLS_PER_DOMAIN);
733 new_client->eph_count += _atomic_read(pool->pgp_count);
734 old_client->eph_count -= _atomic_read(pool->pgp_count);
735 list_splice_init(&old_client->ephemeral_page_list,
736 &new_client->ephemeral_page_list);
737 tmem_client_info("reassigned shared pool from %s=%d to %s=%d pool_id=%d\n",
738 tmem_cli_id_str, old_client->cli_id, tmem_cli_id_str, new_client->cli_id, poolid);
739 pool->pool_id = poolid;
740 }
741
/*
 * Destroy all objects whose last_client matches the passed cli_id, and
 * remove cli_id from the list of sharers of this pool.
 */
static int shared_pool_quit(struct tmem_pool *pool, domid_t cli_id)
747 {
748 struct share_list *sl;
749 int s_poolid;
750
751 ASSERT(is_shared(pool));
752 ASSERT(pool->client != NULL);
753
754 ASSERT_WRITELOCK(&tmem_rwlock);
755 pool_destroy_objs(pool, cli_id);
756 list_for_each_entry(sl,&pool->share_list, share_list)
757 {
758 if (sl->client->cli_id != cli_id)
759 continue;
760 list_del(&sl->share_list);
761 tmem_free(sl, pool);
762 --pool->shared_count;
763 if (pool->client->cli_id == cli_id)
764 shared_pool_reassign(pool);
765 if (pool->shared_count)
766 return pool->shared_count;
767 for (s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++)
768 if ( (tmem_global.shared_pools[s_poolid]) == pool )
769 {
770 tmem_global.shared_pools[s_poolid] = NULL;
771 break;
772 }
773 return 0;
774 }
775 tmem_client_warn("tmem: no match unsharing pool, %s=%d\n",
776 tmem_cli_id_str,pool->client->cli_id);
777 return -1;
778 }
779
780 /* Flush all data (owned by cli_id) from a pool and, optionally, free it. */
static void pool_flush(struct tmem_pool *pool, domid_t cli_id)
782 {
783 ASSERT(pool != NULL);
784 if ( (is_shared(pool)) && (shared_pool_quit(pool,cli_id) > 0) )
785 {
786 tmem_client_warn("tmem: %s=%d no longer using shared pool %d owned by %s=%d\n",
787 tmem_cli_id_str, cli_id, pool->pool_id, tmem_cli_id_str,pool->client->cli_id);
788 return;
789 }
790 tmem_client_info("Destroying %s-%s tmem pool %s=%d pool_id=%d\n",
791 is_persistent(pool) ? "persistent" : "ephemeral" ,
792 is_shared(pool) ? "shared" : "private",
793 tmem_cli_id_str, pool->client->cli_id, pool->pool_id);
794 if ( pool->client->info.flags.u.migrating )
795 {
796 tmem_client_warn("can't destroy pool while %s is live-migrating\n",
797 tmem_client_str);
798 return;
799 }
800 pool_destroy_objs(pool, TMEM_CLI_ID_NULL);
801 pool->client->pools[pool->pool_id] = NULL;
802 pool_free(pool);
803 }
804
805 /************ CLIENT MANIPULATION OPERATIONS **************************/
806
struct client *client_create(domid_t cli_id)
808 {
809 struct client *client = xzalloc(struct client);
810 int i, shift;
811 char name[5];
812 struct domain *d;
813
814 tmem_client_info("tmem: initializing tmem capability for %s=%d...",
815 tmem_cli_id_str, cli_id);
816 if ( client == NULL )
817 {
818 tmem_client_err("failed... out of memory\n");
819 goto fail;
820 }
821
822 for (i = 0, shift = 12; i < 4; shift -=4, i++)
823 name[i] = (((unsigned short)cli_id >> shift) & 0xf) + '0';
824 name[4] = '\0';
825 client->persistent_pool = xmem_pool_create(name, tmem_persistent_pool_page_get,
826 tmem_persistent_pool_page_put, PAGE_SIZE, 0, PAGE_SIZE);
827 if ( client->persistent_pool == NULL )
828 {
829 tmem_client_err("failed... can't alloc persistent pool\n");
830 goto fail;
831 }
832
833 d = rcu_lock_domain_by_id(cli_id);
834 if ( d == NULL ) {
835 tmem_client_err("failed... can't set client\n");
836 xmem_pool_destroy(client->persistent_pool);
837 goto fail;
838 }
839 if ( !d->is_dying ) {
840 d->tmem_client = client;
841 client->domain = d;
842 }
843 rcu_unlock_domain(d);
844
845 client->cli_id = cli_id;
846 client->info.version = TMEM_SPEC_VERSION;
847 client->info.maxpools = MAX_POOLS_PER_DOMAIN;
848 client->info.flags.u.compress = tmem_compression_enabled();
849 for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++)
850 client->shared_auth_uuid[i][0] =
851 client->shared_auth_uuid[i][1] = -1L;
852 list_add_tail(&client->client_list, &tmem_global.client_list);
853 INIT_LIST_HEAD(&client->ephemeral_page_list);
854 INIT_LIST_HEAD(&client->persistent_invalidated_list);
855 tmem_client_info("ok\n");
856 return client;
857
858 fail:
859 xfree(client);
860 return NULL;
861 }
862
static void client_free(struct client *client)
864 {
865 list_del(&client->client_list);
866 xmem_pool_destroy(client->persistent_pool);
867 xfree(client);
868 }
869
870 /* Flush all data from a client and, optionally, free it. */
static void client_flush(struct client *client)
872 {
873 int i;
874 struct tmem_pool *pool;
875
876 for (i = 0; i < MAX_POOLS_PER_DOMAIN; i++)
877 {
878 if ( (pool = client->pools[i]) == NULL )
879 continue;
880 pool_flush(pool, client->cli_id);
881 client->pools[i] = NULL;
882 client->info.nr_pools--;
883 }
884 client_free(client);
885 }
886
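/*
 * Weight-based fairness check: compare this client's share of the global
 * ephemeral page count against its share of the total client weight.
 */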
static bool client_over_quota(const struct client *client)
888 {
889 int total = _atomic_read(tmem_global.client_weight_total);
890
891 ASSERT(client != NULL);
892 if ( (total == 0) || (client->info.weight == 0) ||
893 (client->eph_count == 0) )
894 return false;
895
896 return (((tmem_global.eph_count * 100L) / client->eph_count) >
897 ((total * 100L) / client->info.weight));
898 }
899
900 /************ MEMORY REVOCATION ROUTINES *******************************/
901
static bool tmem_try_to_evict_pgp(struct tmem_page_descriptor *pgp,
                                  bool *hold_pool_rwlock)
904 {
905 struct tmem_object_root *obj = pgp->us.obj;
906 struct tmem_pool *pool = obj->pool;
907
908 if ( pool->is_dying )
909 return false;
910 if ( spin_trylock(&obj->obj_spinlock) )
911 {
912 if ( obj->pgp_count > 1 )
913 return true;
914 if ( write_trylock(&pool->pool_rwlock) )
915 {
916 *hold_pool_rwlock = 1;
917 return true;
918 }
919 spin_unlock(&obj->obj_spinlock);
920 }
921 return false;
922 }
923
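/*
 * Evict a single ephemeral page: prefer the current client's own list when it
 * is over quota, otherwise scan the global ephemeral page list.
 * Returns 1 if a page was evicted, 0 otherwise.
 */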
int tmem_evict(void)
925 {
926 struct client *client = current->domain->tmem_client;
927 struct tmem_page_descriptor *pgp = NULL, *pgp_del;
928 struct tmem_object_root *obj;
929 struct tmem_pool *pool;
930 int ret = 0;
931 bool hold_pool_rwlock = false;
932
933 tmem_stats.evict_attempts++;
934 spin_lock(&eph_lists_spinlock);
935 if ( (client != NULL) && client_over_quota(client) &&
936 !list_empty(&client->ephemeral_page_list) )
937 {
938 list_for_each_entry(pgp, &client->ephemeral_page_list, us.client_eph_pages)
939 if ( tmem_try_to_evict_pgp(pgp, &hold_pool_rwlock) )
940 goto found;
941 }
942 else if ( !list_empty(&tmem_global.ephemeral_page_list) )
943 {
944 list_for_each_entry(pgp, &tmem_global.ephemeral_page_list, global_eph_pages)
945 if ( tmem_try_to_evict_pgp(pgp, &hold_pool_rwlock) )
946 {
947 client = pgp->us.obj->pool->client;
948 goto found;
949 }
950 }
/* The global ephemeral page list is empty, so we bail out. */
952 spin_unlock(&eph_lists_spinlock);
953 goto out;
954
955 found:
956 /* Delist. */
957 list_del_init(&pgp->us.client_eph_pages);
958 client->eph_count--;
959 list_del_init(&pgp->global_eph_pages);
960 tmem_global.eph_count--;
961 ASSERT(tmem_global.eph_count >= 0);
962 ASSERT(client->eph_count >= 0);
963 spin_unlock(&eph_lists_spinlock);
964
965 ASSERT(pgp != NULL);
966 obj = pgp->us.obj;
967 ASSERT(obj != NULL);
968 ASSERT(obj->pool != NULL);
969 pool = obj->pool;
970
971 ASSERT_SPINLOCK(&obj->obj_spinlock);
972 pgp_del = pgp_delete_from_obj(obj, pgp->index);
973 ASSERT(pgp_del == pgp);
974
/* pgp is already delisted, so call pgp_free() directly. */
976 pgp_free(pgp);
977 if ( obj->pgp_count == 0 )
978 {
979 ASSERT_WRITELOCK(&pool->pool_rwlock);
980 obj_free(obj);
981 }
982 else
983 spin_unlock(&obj->obj_spinlock);
984 if ( hold_pool_rwlock )
985 write_unlock(&pool->pool_rwlock);
986 tmem_stats.evicted_pgs++;
987 ret = 1;
988 out:
989 return ret;
990 }
991
992
993 /*
994 * Under certain conditions (e.g. if each client is putting pages for exactly
995 * one object), once locks are held, freeing up memory may
996 * result in livelocks and very long "put" times, so we try to ensure there
997 * is a minimum amount of memory (1MB) available BEFORE any data structure
998 * locks are held.
999 */
static inline bool tmem_ensure_avail_pages(void)
1001 {
1002 int failed_evict = 10;
1003 unsigned long free_mem;
1004
1005 do {
1006 free_mem = (tmem_page_list_pages + total_free_pages())
1007 >> (20 - PAGE_SHIFT);
1008 if ( free_mem )
1009 return true;
1010 if ( !tmem_evict() )
1011 failed_evict--;
1012 } while ( failed_evict > 0 );
1013
1014 return false;
1015 }
1016
1017 /************ TMEM CORE OPERATIONS ************************************/
1018
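/*
 * Compress the client's page and store the result as pgp->cdata.
 * Returns 1 on success, 0 if the data did not compress usefully, or a
 * negative errno (e.g. -ENOMEM, -EFAULT) on failure.
 */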
static int do_tmem_put_compress(struct tmem_page_descriptor *pgp, xen_pfn_t cmfn,
                                tmem_cli_va_param_t clibuf)
1021 {
1022 void *dst, *p;
1023 size_t size;
1024 int ret = 0;
1025
1026 ASSERT(pgp != NULL);
1027 ASSERT(pgp->us.obj != NULL);
1028 ASSERT_SPINLOCK(&pgp->us.obj->obj_spinlock);
1029 ASSERT(pgp->us.obj->pool != NULL);
1030 ASSERT(pgp->us.obj->pool->client != NULL);
1031
1032 if ( pgp->pfp != NULL )
1033 pgp_free_data(pgp, pgp->us.obj->pool);
1034 ret = tmem_compress_from_client(cmfn, &dst, &size, clibuf);
1035 if ( ret <= 0 )
1036 goto out;
1037 else if ( (size == 0) || (size >= tmem_mempool_maxalloc) ) {
1038 ret = 0;
1039 goto out;
1040 } else if ( (p = tmem_malloc(size,pgp->us.obj->pool)) == NULL ) {
1041 ret = -ENOMEM;
1042 goto out;
1043 } else {
1044 memcpy(p,dst,size);
1045 pgp->cdata = p;
1046 }
1047 pgp->size = size;
1048 pgp->us.obj->pool->client->compressed_pages++;
1049 pgp->us.obj->pool->client->compressed_sum_size += size;
1050 ret = 1;
1051
1052 out:
1053 return ret;
1054 }
1055
static int do_tmem_dup_put(struct tmem_page_descriptor *pgp, xen_pfn_t cmfn,
                           tmem_cli_va_param_t clibuf)
1058 {
1059 struct tmem_pool *pool;
1060 struct tmem_object_root *obj;
1061 struct client *client;
1062 struct tmem_page_descriptor *pgpfound = NULL;
1063 int ret;
1064
1065 ASSERT(pgp != NULL);
1066 ASSERT(pgp->pfp != NULL);
1067 ASSERT(pgp->size != -1);
1068 obj = pgp->us.obj;
1069 ASSERT_SPINLOCK(&obj->obj_spinlock);
1070 ASSERT(obj != NULL);
1071 pool = obj->pool;
1072 ASSERT(pool != NULL);
1073 client = pool->client;
1074 if ( client->info.flags.u.migrating )
1075 goto failed_dup; /* No dups allowed when migrating. */
1076 /* Can we successfully manipulate pgp to change out the data? */
1077 if ( client->info.flags.u.compress && pgp->size != 0 )
1078 {
1079 ret = do_tmem_put_compress(pgp, cmfn, clibuf);
1080 if ( ret == 1 )
1081 goto done;
1082 else if ( ret == 0 )
1083 goto copy_uncompressed;
1084 else if ( ret == -ENOMEM )
1085 goto failed_dup;
1086 else if ( ret == -EFAULT )
1087 goto bad_copy;
1088 }
1089
1090 copy_uncompressed:
1091 if ( pgp->pfp )
1092 pgp_free_data(pgp, pool);
1093 if ( ( pgp->pfp = tmem_alloc_page(pool) ) == NULL )
1094 goto failed_dup;
1095 pgp->size = 0;
1096 ret = tmem_copy_from_client(pgp->pfp, cmfn, tmem_cli_buf_null);
1097 if ( ret < 0 )
1098 goto bad_copy;
1099
1100 done:
1101 /* Successfully replaced data, clean up and return success. */
1102 if ( is_shared(pool) )
1103 obj->last_client = client->cli_id;
1104 spin_unlock(&obj->obj_spinlock);
1105 pool->dup_puts_replaced++;
1106 pool->good_puts++;
1107 if ( is_persistent(pool) )
1108 client->succ_pers_puts++;
1109 return 1;
1110
1111 bad_copy:
1112 tmem_stats.failed_copies++;
1113 goto cleanup;
1114
1115 failed_dup:
1116 /*
1117 * Couldn't change out the data, flush the old data and return
1118 * -ENOSPC instead of -ENOMEM to differentiate failed _dup_ put.
1119 */
1120 ret = -ENOSPC;
1121 cleanup:
1122 pgpfound = pgp_delete_from_obj(obj, pgp->index);
1123 ASSERT(pgpfound == pgp);
1124 pgp_delist_free(pgpfound);
1125 if ( obj->pgp_count == 0 )
1126 {
1127 write_lock(&pool->pool_rwlock);
1128 obj_free(obj);
1129 write_unlock(&pool->pool_rwlock);
1130 } else {
1131 spin_unlock(&obj->obj_spinlock);
1132 }
1133 pool->dup_puts_flushed++;
1134 return ret;
1135 }
1136
static int do_tmem_put(struct tmem_pool *pool,
                       struct xen_tmem_oid *oidp, uint32_t index,
                       xen_pfn_t cmfn, tmem_cli_va_param_t clibuf)
1140 {
1141 struct tmem_object_root *obj = NULL;
1142 struct tmem_page_descriptor *pgp = NULL;
1143 struct client *client;
1144 int ret, newobj = 0;
1145
1146 ASSERT(pool != NULL);
1147 client = pool->client;
1148 ASSERT(client != NULL);
1149 ret = client->info.flags.u.frozen ? -EFROZEN : -ENOMEM;
1150 pool->puts++;
1151
1152 refind:
/* Does the page already exist (dup)? If so, handle specially. */
1154 if ( (obj = obj_find(pool, oidp)) != NULL )
1155 {
1156 if ((pgp = pgp_lookup_in_obj(obj, index)) != NULL)
1157 {
1158 return do_tmem_dup_put(pgp, cmfn, clibuf);
1159 }
1160 else
1161 {
1162 /* No puts allowed into a frozen pool (except dup puts). */
1163 if ( client->info.flags.u.frozen )
1164 goto unlock_obj;
1165 }
1166 }
1167 else
1168 {
1169 /* No puts allowed into a frozen pool (except dup puts). */
1170 if ( client->info.flags.u.frozen )
1171 return ret;
1172 if ( (obj = obj_alloc(pool, oidp)) == NULL )
1173 return -ENOMEM;
1174
1175 write_lock(&pool->pool_rwlock);
/*
 * Parallel callers may have already allocated the obj and inserted it into
 * obj_rb_root before us.
 */
1180 if ( !obj_rb_insert(&pool->obj_rb_root[oid_hash(oidp)], obj) )
1181 {
1182 tmem_free(obj, pool);
1183 write_unlock(&pool->pool_rwlock);
1184 goto refind;
1185 }
1186
1187 spin_lock(&obj->obj_spinlock);
1188 newobj = 1;
1189 write_unlock(&pool->pool_rwlock);
1190 }
1191
/* When we arrive here, we have a spinlocked obj for use. */
1193 ASSERT_SPINLOCK(&obj->obj_spinlock);
1194 if ( (pgp = pgp_alloc(obj)) == NULL )
1195 goto unlock_obj;
1196
1197 ret = pgp_add_to_obj(obj, index, pgp);
1198 if ( ret == -ENOMEM )
1199 /* Warning: may result in partially built radix tree ("stump"). */
1200 goto free_pgp;
1201
1202 pgp->index = index;
1203 pgp->size = 0;
1204
1205 if ( client->info.flags.u.compress )
1206 {
1207 ASSERT(pgp->pfp == NULL);
1208 ret = do_tmem_put_compress(pgp, cmfn, clibuf);
1209 if ( ret == 1 )
1210 goto insert_page;
1211 if ( ret == -ENOMEM )
1212 {
1213 client->compress_nomem++;
1214 goto del_pgp_from_obj;
1215 }
1216 if ( ret == 0 )
1217 {
1218 client->compress_poor++;
1219 goto copy_uncompressed;
1220 }
1221 if ( ret == -EFAULT )
1222 goto bad_copy;
1223 }
1224
1225 copy_uncompressed:
1226 if ( ( pgp->pfp = tmem_alloc_page(pool) ) == NULL )
1227 {
1228 ret = -ENOMEM;
1229 goto del_pgp_from_obj;
1230 }
1231 ret = tmem_copy_from_client(pgp->pfp, cmfn, clibuf);
1232 if ( ret < 0 )
1233 goto bad_copy;
1234
1235 insert_page:
1236 if ( !is_persistent(pool) )
1237 {
1238 spin_lock(&eph_lists_spinlock);
1239 list_add_tail(&pgp->global_eph_pages, &tmem_global.ephemeral_page_list);
1240 if (++tmem_global.eph_count > tmem_stats.global_eph_count_max)
1241 tmem_stats.global_eph_count_max = tmem_global.eph_count;
1242 list_add_tail(&pgp->us.client_eph_pages,
1243 &client->ephemeral_page_list);
1244 if (++client->eph_count > client->eph_count_max)
1245 client->eph_count_max = client->eph_count;
1246 spin_unlock(&eph_lists_spinlock);
1247 }
1248 else
1249 { /* is_persistent. */
1250 spin_lock(&pers_lists_spinlock);
1251 list_add_tail(&pgp->us.pool_pers_pages,
1252 &pool->persistent_page_list);
1253 spin_unlock(&pers_lists_spinlock);
1254 }
1255
1256 if ( is_shared(pool) )
1257 obj->last_client = client->cli_id;
1258
1259 /* Free the obj spinlock. */
1260 spin_unlock(&obj->obj_spinlock);
1261 pool->good_puts++;
1262
1263 if ( is_persistent(pool) )
1264 client->succ_pers_puts++;
1265 else
1266 tmem_stats.tot_good_eph_puts++;
1267 return 1;
1268
1269 bad_copy:
1270 tmem_stats.failed_copies++;
1271
1272 del_pgp_from_obj:
1273 ASSERT((obj != NULL) && (pgp != NULL) && (pgp->index != -1));
1274 pgp_delete_from_obj(obj, pgp->index);
1275
1276 free_pgp:
1277 pgp_free(pgp);
1278 unlock_obj:
1279 if ( newobj )
1280 {
1281 write_lock(&pool->pool_rwlock);
1282 obj_free(obj);
1283 write_unlock(&pool->pool_rwlock);
1284 }
1285 else
1286 {
1287 spin_unlock(&obj->obj_spinlock);
1288 }
1289 pool->no_mem_puts++;
1290 return ret;
1291 }
1292
static int do_tmem_get(struct tmem_pool *pool,
                       struct xen_tmem_oid *oidp, uint32_t index,
                       xen_pfn_t cmfn, tmem_cli_va_param_t clibuf)
1296 {
1297 struct tmem_object_root *obj;
1298 struct tmem_page_descriptor *pgp;
1299 struct client *client = pool->client;
1300 int rc;
1301
1302 if ( !_atomic_read(pool->pgp_count) )
1303 return -EEMPTY;
1304
1305 pool->gets++;
1306 obj = obj_find(pool,oidp);
1307 if ( obj == NULL )
1308 return 0;
1309
1310 ASSERT_SPINLOCK(&obj->obj_spinlock);
1311 if (is_shared(pool) || is_persistent(pool) )
1312 pgp = pgp_lookup_in_obj(obj, index);
1313 else
1314 pgp = pgp_delete_from_obj(obj, index);
1315 if ( pgp == NULL )
1316 {
1317 spin_unlock(&obj->obj_spinlock);
1318 return 0;
1319 }
1320 ASSERT(pgp->size != -1);
1321 if ( pgp->size != 0 )
1322 {
1323 rc = tmem_decompress_to_client(cmfn, pgp->cdata, pgp->size, clibuf);
1324 }
1325 else
1326 rc = tmem_copy_to_client(cmfn, pgp->pfp, clibuf);
1327 if ( rc <= 0 )
1328 goto bad_copy;
1329
1330 if ( !is_persistent(pool) )
1331 {
1332 if ( !is_shared(pool) )
1333 {
1334 pgp_delist_free(pgp);
1335 if ( obj->pgp_count == 0 )
1336 {
1337 write_lock(&pool->pool_rwlock);
1338 obj_free(obj);
1339 obj = NULL;
1340 write_unlock(&pool->pool_rwlock);
1341 }
1342 } else {
1343 spin_lock(&eph_lists_spinlock);
1344 list_del(&pgp->global_eph_pages);
1345 list_add_tail(&pgp->global_eph_pages,&tmem_global.ephemeral_page_list);
1346 list_del(&pgp->us.client_eph_pages);
1347 list_add_tail(&pgp->us.client_eph_pages,&client->ephemeral_page_list);
1348 spin_unlock(&eph_lists_spinlock);
1349 obj->last_client = current->domain->domain_id;
1350 }
1351 }
1352 if ( obj != NULL )
1353 {
1354 spin_unlock(&obj->obj_spinlock);
1355 }
1356 pool->found_gets++;
1357 if ( is_persistent(pool) )
1358 client->succ_pers_gets++;
1359 else
1360 client->succ_eph_gets++;
1361 return 1;
1362
1363 bad_copy:
1364 spin_unlock(&obj->obj_spinlock);
1365 tmem_stats.failed_copies++;
1366 return rc;
1367 }
1368
static int do_tmem_flush_page(struct tmem_pool *pool,
                              struct xen_tmem_oid *oidp, uint32_t index)
1371 {
1372 struct tmem_object_root *obj;
1373 struct tmem_page_descriptor *pgp;
1374
1375 pool->flushs++;
1376 obj = obj_find(pool,oidp);
1377 if ( obj == NULL )
1378 goto out;
1379 pgp = pgp_delete_from_obj(obj, index);
1380 if ( pgp == NULL )
1381 {
1382 spin_unlock(&obj->obj_spinlock);
1383 goto out;
1384 }
1385 pgp_delist_free(pgp);
1386 if ( obj->pgp_count == 0 )
1387 {
1388 write_lock(&pool->pool_rwlock);
1389 obj_free(obj);
1390 write_unlock(&pool->pool_rwlock);
1391 } else {
1392 spin_unlock(&obj->obj_spinlock);
1393 }
1394 pool->flushs_found++;
1395
1396 out:
1397 if ( pool->client->info.flags.u.frozen )
1398 return -EFROZEN;
1399 else
1400 return 1;
1401 }
1402
static int do_tmem_flush_object(struct tmem_pool *pool,
                                struct xen_tmem_oid *oidp)
1405 {
1406 struct tmem_object_root *obj;
1407
1408 pool->flush_objs++;
1409 obj = obj_find(pool,oidp);
1410 if ( obj == NULL )
1411 goto out;
1412 write_lock(&pool->pool_rwlock);
1413 obj_destroy(obj);
1414 pool->flush_objs_found++;
1415 write_unlock(&pool->pool_rwlock);
1416
1417 out:
1418 if ( pool->client->info.flags.u.frozen )
1419 return -EFROZEN;
1420 else
1421 return 1;
1422 }
1423
static int do_tmem_destroy_pool(uint32_t pool_id)
1425 {
1426 struct client *client = current->domain->tmem_client;
1427 struct tmem_pool *pool;
1428
1429 if ( pool_id >= MAX_POOLS_PER_DOMAIN )
1430 return 0;
1431 if ( (pool = client->pools[pool_id]) == NULL )
1432 return 0;
1433 client->pools[pool_id] = NULL;
1434 pool_flush(pool, client->cli_id);
1435 client->info.nr_pools--;
1436 return 1;
1437 }
1438
int do_tmem_new_pool(domid_t this_cli_id,
                     uint32_t d_poolid, uint32_t flags,
                     uint64_t uuid_lo, uint64_t uuid_hi)
1442 {
1443 struct client *client;
1444 domid_t cli_id;
1445 int persistent = flags & TMEM_POOL_PERSIST;
1446 int shared = flags & TMEM_POOL_SHARED;
1447 int pagebits = (flags >> TMEM_POOL_PAGESIZE_SHIFT)
1448 & TMEM_POOL_PAGESIZE_MASK;
1449 int specversion = (flags >> TMEM_POOL_VERSION_SHIFT)
1450 & TMEM_POOL_VERSION_MASK;
1451 struct tmem_pool *pool, *shpool;
1452 int i, first_unused_s_poolid;
1453
1454 if ( this_cli_id == TMEM_CLI_ID_NULL )
1455 cli_id = current->domain->domain_id;
1456 else
1457 cli_id = this_cli_id;
1458 tmem_client_info("tmem: allocating %s-%s tmem pool for %s=%d...",
1459 persistent ? "persistent" : "ephemeral" ,
1460 shared ? "shared" : "private", tmem_cli_id_str, cli_id);
1461 if ( specversion != TMEM_SPEC_VERSION )
1462 {
1463 tmem_client_err("failed... unsupported spec version\n");
1464 return -EPERM;
1465 }
1466 if ( shared && persistent )
1467 {
tmem_client_err("failed... unable to create a shared-persistent pool\n");
1469 return -EPERM;
1470 }
1471 if ( pagebits != (PAGE_SHIFT - 12) )
1472 {
1473 tmem_client_err("failed... unsupported pagesize %d\n",
1474 1 << (pagebits + 12));
1475 return -EPERM;
1476 }
1477 if ( flags & TMEM_POOL_PRECOMPRESSED )
1478 {
1479 tmem_client_err("failed... precompression flag set but unsupported\n");
1480 return -EPERM;
1481 }
1482 if ( flags & TMEM_POOL_RESERVED_BITS )
1483 {
1484 tmem_client_err("failed... reserved bits must be zero\n");
1485 return -EPERM;
1486 }
1487 if ( this_cli_id != TMEM_CLI_ID_NULL )
1488 {
1489 if ( (client = tmem_client_from_cli_id(this_cli_id)) == NULL
1490 || d_poolid >= MAX_POOLS_PER_DOMAIN
1491 || client->pools[d_poolid] != NULL )
1492 return -EPERM;
1493 }
1494 else
1495 {
1496 client = current->domain->tmem_client;
1497 ASSERT(client != NULL);
1498 for ( d_poolid = 0; d_poolid < MAX_POOLS_PER_DOMAIN; d_poolid++ )
1499 if ( client->pools[d_poolid] == NULL )
1500 break;
1501 if ( d_poolid >= MAX_POOLS_PER_DOMAIN )
1502 {
1503 tmem_client_err("failed... no more pool slots available for this %s\n",
1504 tmem_client_str);
1505 return -EPERM;
1506 }
1507 }
1508
1509 if ( (pool = pool_alloc()) == NULL )
1510 {
1511 tmem_client_err("failed... out of memory\n");
1512 return -ENOMEM;
1513 }
1514 client->pools[d_poolid] = pool;
1515 pool->client = client;
1516 pool->pool_id = d_poolid;
1517 pool->shared = shared;
1518 pool->persistent = persistent;
1519 pool->uuid[0] = uuid_lo;
1520 pool->uuid[1] = uuid_hi;
1521
/*
 * A pool has already been created by the time we arrive here, but shared
 * pools need some special handling.
 */
1526 if ( shared )
1527 {
1528 if ( uuid_lo == -1L && uuid_hi == -1L )
1529 {
1530 tmem_client_info("Invalid uuid, create non shared pool instead!\n");
1531 pool->shared = 0;
1532 goto out;
1533 }
1534 if ( !tmem_global.shared_auth )
1535 {
1536 for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++)
1537 if ( (client->shared_auth_uuid[i][0] == uuid_lo) &&
1538 (client->shared_auth_uuid[i][1] == uuid_hi) )
1539 break;
1540 if ( i == MAX_GLOBAL_SHARED_POOLS )
1541 {
1542 tmem_client_info("Shared auth failed, create non shared pool instead!\n");
1543 pool->shared = 0;
1544 goto out;
1545 }
1546 }
1547
/*
 * Authorization OK; match an existing global shared pool or use the newly
 * allocated one.
 */
1552 first_unused_s_poolid = MAX_GLOBAL_SHARED_POOLS;
1553 for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++ )
1554 {
1555 if ( (shpool = tmem_global.shared_pools[i]) != NULL )
1556 {
1557 if ( shpool->uuid[0] == uuid_lo && shpool->uuid[1] == uuid_hi )
1558 {
/* Succeeded in matching a global shared pool. */
1560 tmem_client_info("(matches shared pool uuid=%"PRIx64".%"PRIx64") pool_id=%d\n",
1561 uuid_hi, uuid_lo, d_poolid);
1562 client->pools[d_poolid] = shpool;
1563 if ( !shared_pool_join(shpool, client) )
1564 {
1565 pool_free(pool);
1566 goto out;
1567 }
1568 else
1569 goto fail;
1570 }
1571 }
1572 else
1573 {
1574 if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
1575 first_unused_s_poolid = i;
1576 }
1577 }
1578
1579 /* Failed to find a global shared pool slot. */
1580 if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
1581 {
1582 tmem_client_warn("tmem: failed... no global shared pool slots available\n");
1583 goto fail;
1584 }
/* Add the pool to the set of global shared pools. */
1586 else
1587 {
1588 INIT_LIST_HEAD(&pool->share_list);
1589 pool->shared_count = 0;
1590 if ( shared_pool_join(pool, client) )
1591 goto fail;
1592 tmem_global.shared_pools[first_unused_s_poolid] = pool;
1593 }
1594 }
1595
1596 out:
1597 tmem_client_info("pool_id=%d\n", d_poolid);
1598 client->info.nr_pools++;
1599 return d_poolid;
1600
1601 fail:
1602 pool_free(pool);
1603 return -EPERM;
1604 }
1605
1606 /************ TMEM CONTROL OPERATIONS ************************************/
1607
int tmemc_shared_pool_auth(domid_t cli_id, uint64_t uuid_lo,
                           uint64_t uuid_hi, bool auth)
1610 {
1611 struct client *client;
1612 int i, free = -1;
1613
1614 if ( cli_id == TMEM_CLI_ID_NULL )
1615 {
1616 tmem_global.shared_auth = auth;
1617 return 1;
1618 }
1619 client = tmem_client_from_cli_id(cli_id);
1620 if ( client == NULL )
1621 return -EINVAL;
1622
1623 for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++)
1624 {
1625 if ( auth == 0 )
1626 {
1627 if ( (client->shared_auth_uuid[i][0] == uuid_lo) &&
1628 (client->shared_auth_uuid[i][1] == uuid_hi) )
1629 {
1630 client->shared_auth_uuid[i][0] = -1L;
1631 client->shared_auth_uuid[i][1] = -1L;
1632 return 1;
1633 }
1634 }
1635 else
1636 {
1637 if ( (client->shared_auth_uuid[i][0] == -1L) &&
1638 (client->shared_auth_uuid[i][1] == -1L) )
1639 {
1640 free = i;
1641 break;
1642 }
1643 }
1644 }
1645 if ( auth == 0 )
1646 return 0;
1647 else if ( free == -1)
1648 return -ENOMEM;
1649 else
1650 {
1651 client->shared_auth_uuid[free][0] = uuid_lo;
1652 client->shared_auth_uuid[free][1] = uuid_hi;
1653 return 1;
1654 }
1655 }
1656
static int tmemc_save_subop(int cli_id, uint32_t pool_id,
                            uint32_t subop, tmem_cli_va_param_t buf, uint32_t arg)
1659 {
1660 struct client *client = tmem_client_from_cli_id(cli_id);
1661 uint32_t p;
1662 struct tmem_page_descriptor *pgp, *pgp2;
1663 int rc = -ENOENT;
1664
1665 switch(subop)
1666 {
1667 case XEN_SYSCTL_TMEM_OP_SAVE_BEGIN:
1668 if ( client == NULL )
1669 break;
1670 for (p = 0; p < MAX_POOLS_PER_DOMAIN; p++)
1671 if ( client->pools[p] != NULL )
1672 break;
1673
1674 if ( p == MAX_POOLS_PER_DOMAIN )
1675 break;
1676
1677 client->was_frozen = client->info.flags.u.frozen;
1678 client->info.flags.u.frozen = 1;
1679 if ( arg != 0 )
1680 client->info.flags.u.migrating = 1;
1681 rc = 0;
1682 break;
1683 case XEN_SYSCTL_TMEM_OP_RESTORE_BEGIN:
1684 if ( client == NULL )
1685 rc = client_create(cli_id) ? 0 : -ENOMEM;
1686 else
1687 rc = -EEXIST;
1688 break;
1689 case XEN_SYSCTL_TMEM_OP_SAVE_END:
1690 if ( client == NULL )
1691 break;
1692 client->info.flags.u.migrating = 0;
1693 if ( !list_empty(&client->persistent_invalidated_list) )
1694 list_for_each_entry_safe(pgp,pgp2,
1695 &client->persistent_invalidated_list, client_inv_pages)
1696 __pgp_free(pgp, client->pools[pgp->pool_id]);
1697 client->info.flags.u.frozen = client->was_frozen;
1698 rc = 0;
1699 break;
1700 }
1701 return rc;
1702 }
1703
static int tmemc_save_get_next_page(int cli_id, uint32_t pool_id,
                                    tmem_cli_va_param_t buf, uint32_t bufsize)
1706 {
1707 struct client *client = tmem_client_from_cli_id(cli_id);
1708 struct tmem_pool *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
1709 ? NULL : client->pools[pool_id];
1710 struct tmem_page_descriptor *pgp;
1711 struct xen_tmem_oid *oid;
1712 int ret = 0;
1713 struct tmem_handle h;
1714
1715 if ( pool == NULL || !is_persistent(pool) )
1716 return -1;
1717
1718 if ( bufsize < PAGE_SIZE + sizeof(struct tmem_handle) )
1719 return -ENOMEM;
1720
1721 spin_lock(&pers_lists_spinlock);
1722 if ( list_empty(&pool->persistent_page_list) )
1723 {
1724 ret = -1;
1725 goto out;
1726 }
1727 /* Note: pool->cur_pgp is the pgp last returned by get_next_page. */
1728 if ( pool->cur_pgp == NULL )
1729 {
1730 /* Process the first one. */
1731 pool->cur_pgp = pgp = list_entry((&pool->persistent_page_list)->next,
1732 struct tmem_page_descriptor,us.pool_pers_pages);
1733 } else if ( list_is_last(&pool->cur_pgp->us.pool_pers_pages,
1734 &pool->persistent_page_list) )
1735 {
1736 /* Already processed the last one in the list. */
1737 ret = -1;
1738 goto out;
1739 }
1740 pgp = list_entry((&pool->cur_pgp->us.pool_pers_pages)->next,
1741 struct tmem_page_descriptor,us.pool_pers_pages);
1742 pool->cur_pgp = pgp;
1743 oid = &pgp->us.obj->oid;
1744 h.pool_id = pool_id;
1745 BUILD_BUG_ON(sizeof(h.oid) != sizeof(*oid));
1746 memcpy(&(h.oid), oid, sizeof(h.oid));
1747 h.index = pgp->index;
1748 if ( copy_to_guest(guest_handle_cast(buf, void), &h, 1) )
1749 {
1750 ret = -EFAULT;
1751 goto out;
1752 }
1753 guest_handle_add_offset(buf, sizeof(h));
1754 ret = do_tmem_get(pool, oid, pgp->index, 0, buf);
1755
1756 out:
1757 spin_unlock(&pers_lists_spinlock);
1758 return ret;
1759 }
1760
static int tmemc_save_get_next_inv(int cli_id, tmem_cli_va_param_t buf,
                                   uint32_t bufsize)
1763 {
1764 struct client *client = tmem_client_from_cli_id(cli_id);
1765 struct tmem_page_descriptor *pgp;
1766 struct tmem_handle h;
1767 int ret = 0;
1768
1769 if ( client == NULL )
1770 return 0;
1771 if ( bufsize < sizeof(struct tmem_handle) )
1772 return 0;
1773 spin_lock(&pers_lists_spinlock);
1774 if ( list_empty(&client->persistent_invalidated_list) )
1775 goto out;
1776 if ( client->cur_pgp == NULL )
1777 {
1778 pgp = list_entry((&client->persistent_invalidated_list)->next,
1779 struct tmem_page_descriptor,client_inv_pages);
1780 client->cur_pgp = pgp;
1781 } else if ( list_is_last(&client->cur_pgp->client_inv_pages,
1782 &client->persistent_invalidated_list) )
1783 {
1784 client->cur_pgp = NULL;
1785 ret = 0;
1786 goto out;
1787 } else {
1788 pgp = list_entry((&client->cur_pgp->client_inv_pages)->next,
1789 struct tmem_page_descriptor,client_inv_pages);
1790 client->cur_pgp = pgp;
1791 }
1792 h.pool_id = pgp->pool_id;
1793 BUILD_BUG_ON(sizeof(h.oid) != sizeof(pgp->inv_oid));
1794 memcpy(&(h.oid), &(pgp->inv_oid), sizeof(h.oid));
1795 h.index = pgp->index;
1796 ret = 1;
1797 if ( copy_to_guest(guest_handle_cast(buf, void), &h, 1) )
1798 ret = -EFAULT;
1799 out:
1800 spin_unlock(&pers_lists_spinlock);
1801 return ret;
1802 }
1803
static int tmemc_restore_put_page(int cli_id, uint32_t pool_id,
                                  struct xen_tmem_oid *oidp,
                                  uint32_t index, tmem_cli_va_param_t buf,
                                  uint32_t bufsize)
1808 {
1809 struct client *client = tmem_client_from_cli_id(cli_id);
1810 struct tmem_pool *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
1811 ? NULL : client->pools[pool_id];
1812
1813 if ( pool == NULL )
1814 return -1;
1815 if (bufsize != PAGE_SIZE) {
1816 tmem_client_err("tmem: %s: invalid parameter bufsize(%d) != (%ld)\n",
1817 __func__, bufsize, PAGE_SIZE);
1818 return -EINVAL;
1819 }
1820 return do_tmem_put(pool, oidp, index, 0, buf);
1821 }
1822
static int tmemc_restore_flush_page(int cli_id, uint32_t pool_id,
                                    struct xen_tmem_oid *oidp,
                                    uint32_t index)
1826 {
1827 struct client *client = tmem_client_from_cli_id(cli_id);
1828 struct tmem_pool *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
1829 ? NULL : client->pools[pool_id];
1830
1831 if ( pool == NULL )
1832 return -1;
1833 return do_tmem_flush_page(pool,oidp,index);
1834 }
1835
int do_tmem_control(struct xen_sysctl_tmem_op *op)
1837 {
1838 int ret;
1839 uint32_t pool_id = op->pool_id;
1840 uint32_t cmd = op->cmd;
1841 struct xen_tmem_oid *oidp = &op->oid;
1842
1843 ASSERT(rw_is_write_locked(&tmem_rwlock));
1844
1845 switch (cmd)
1846 {
1847 case XEN_SYSCTL_TMEM_OP_SAVE_BEGIN:
1848 case XEN_SYSCTL_TMEM_OP_RESTORE_BEGIN:
1849 case XEN_SYSCTL_TMEM_OP_SAVE_END:
1850 ret = tmemc_save_subop(op->cli_id, pool_id, cmd,
1851 guest_handle_cast(op->u.buf, char), op->arg);
1852 break;
1853 case XEN_SYSCTL_TMEM_OP_SAVE_GET_NEXT_PAGE:
1854 ret = tmemc_save_get_next_page(op->cli_id, pool_id,
1855 guest_handle_cast(op->u.buf, char), op->len);
1856 break;
1857 case XEN_SYSCTL_TMEM_OP_SAVE_GET_NEXT_INV:
1858 ret = tmemc_save_get_next_inv(op->cli_id,
1859 guest_handle_cast(op->u.buf, char), op->len);
1860 break;
1861 case XEN_SYSCTL_TMEM_OP_RESTORE_PUT_PAGE:
1862 ret = tmemc_restore_put_page(op->cli_id, pool_id, oidp, op->arg,
1863 guest_handle_cast(op->u.buf, char), op->len);
1864 break;
1865 case XEN_SYSCTL_TMEM_OP_RESTORE_FLUSH_PAGE:
1866 ret = tmemc_restore_flush_page(op->cli_id, pool_id, oidp, op->arg);
1867 break;
1868 default:
1869 ret = -1;
1870 }
1871
1872 return ret;
1873 }
1874
1875 /************ EXPORTed FUNCTIONS **************************************/
1876
long do_tmem_op(tmem_cli_op_t uops)
1878 {
1879 struct tmem_op op;
1880 struct client *client = current->domain->tmem_client;
1881 struct tmem_pool *pool = NULL;
1882 struct xen_tmem_oid *oidp;
1883 int rc = 0;
1884
1885 if ( !tmem_initialized )
1886 return -ENODEV;
1887
1888 if ( xsm_tmem_op(XSM_HOOK) )
1889 return -EPERM;
1890
1891 tmem_stats.total_tmem_ops++;
1892
1893 if ( client != NULL && client->domain->is_dying )
1894 {
1895 tmem_stats.errored_tmem_ops++;
1896 return -ENODEV;
1897 }
1898
1899 if ( unlikely(tmem_get_tmemop_from_client(&op, uops) != 0) )
1900 {
1901 tmem_client_err("tmem: can't get tmem struct from %s\n", tmem_client_str);
1902 tmem_stats.errored_tmem_ops++;
1903 return -EFAULT;
1904 }
1905
1906 /* Acquire write lock for all commands at first. */
1907 write_lock(&tmem_rwlock);
1908
1909 switch ( op.cmd )
1910 {
1911 case TMEM_CONTROL:
1912 case TMEM_RESTORE_NEW:
1913 case TMEM_AUTH:
1914 rc = -EOPNOTSUPP;
1915 break;
1916
1917 default:
1918 /*
1919 * For other commands, create per-client tmem structure dynamically on
1920 * first use by client.
1921 */
1922 if ( client == NULL )
1923 {
1924 if ( (client = client_create(current->domain->domain_id)) == NULL )
1925 {
1926 tmem_client_err("tmem: can't create tmem structure for %s\n",
1927 tmem_client_str);
1928 rc = -ENOMEM;
1929 goto out;
1930 }
1931 }
1932
1933 if ( op.cmd == TMEM_NEW_POOL || op.cmd == TMEM_DESTROY_POOL )
1934 {
1935 if ( op.cmd == TMEM_NEW_POOL )
1936 rc = do_tmem_new_pool(TMEM_CLI_ID_NULL, 0, op.u.creat.flags,
1937 op.u.creat.uuid[0], op.u.creat.uuid[1]);
1938 else
1939 rc = do_tmem_destroy_pool(op.pool_id);
1940 }
1941 else
1942 {
1943 if ( ((uint32_t)op.pool_id >= MAX_POOLS_PER_DOMAIN) ||
1944 ((pool = client->pools[op.pool_id]) == NULL) )
1945 {
1946 tmem_client_err("tmem: operation requested on uncreated pool\n");
1947 rc = -ENODEV;
1948 goto out;
1949 }
1950 /* Commands that only need read lock. */
1951 write_unlock(&tmem_rwlock);
1952 read_lock(&tmem_rwlock);
1953
1954 oidp = &op.u.gen.oid;
1955 switch ( op.cmd )
1956 {
1957 case TMEM_NEW_POOL:
1958 case TMEM_DESTROY_POOL:
1959 BUG(); /* Done earlier. */
1960 break;
1961 case TMEM_PUT_PAGE:
1962 if (tmem_ensure_avail_pages())
1963 rc = do_tmem_put(pool, oidp, op.u.gen.index, op.u.gen.cmfn,
1964 tmem_cli_buf_null);
1965 else
1966 rc = -ENOMEM;
1967 break;
1968 case TMEM_GET_PAGE:
1969 rc = do_tmem_get(pool, oidp, op.u.gen.index, op.u.gen.cmfn,
1970 tmem_cli_buf_null);
1971 break;
1972 case TMEM_FLUSH_PAGE:
1973 rc = do_tmem_flush_page(pool, oidp, op.u.gen.index);
1974 break;
1975 case TMEM_FLUSH_OBJECT:
1976 rc = do_tmem_flush_object(pool, oidp);
1977 break;
1978 default:
1979 tmem_client_warn("tmem: op %d not implemented\n", op.cmd);
1980 rc = -ENOSYS;
1981 break;
1982 }
1983 read_unlock(&tmem_rwlock);
1984 if ( rc < 0 )
1985 tmem_stats.errored_tmem_ops++;
1986 return rc;
1987 }
1988 break;
1989
1990 }
1991 out:
1992 write_unlock(&tmem_rwlock);
1993 if ( rc < 0 )
1994 tmem_stats.errored_tmem_ops++;
1995 return rc;
1996 }
1997
1998 /* This should be called when the host is destroying a client (domain). */
void tmem_destroy(void *v)
2000 {
2001 struct client *client = (struct client *)v;
2002
2003 if ( client == NULL )
2004 return;
2005
2006 if ( !client->domain->is_dying )
2007 {
2008 printk("tmem: tmem_destroy can only destroy dying client\n");
2009 return;
2010 }
2011
2012 write_lock(&tmem_rwlock);
2013
2014 printk("tmem: flushing tmem pools for %s=%d\n",
2015 tmem_cli_id_str, client->cli_id);
2016 client_flush(client);
2017
2018 write_unlock(&tmem_rwlock);
2019 }
2020
2021 #define MAX_EVICTS 10 /* Should be variable or set via XEN_SYSCTL_TMEM_OP_ ?? */
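/*
 * Give back one tmem-held page, evicting ephemeral pages as needed to find
 * one; only order-0 requests are supported.
 */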
void *tmem_relinquish_pages(unsigned int order, unsigned int memflags)
2023 {
2024 struct page_info *pfp;
2025 unsigned long evicts_per_relinq = 0;
2026 int max_evictions = 10;
2027
2028 if (!tmem_enabled() || !tmem_freeable_pages())
2029 return NULL;
2030
2031 tmem_stats.relinq_attempts++;
2032 if ( order > 0 )
2033 {
2034 #ifndef NDEBUG
2035 printk("tmem_relinquish_page: failing order=%d\n", order);
2036 #endif
2037 return NULL;
2038 }
2039
2040 while ( (pfp = tmem_page_list_get()) == NULL )
2041 {
2042 if ( (max_evictions-- <= 0) || !tmem_evict())
2043 break;
2044 evicts_per_relinq++;
2045 }
2046 if ( evicts_per_relinq > tmem_stats.max_evicts_per_relinq )
2047 tmem_stats.max_evicts_per_relinq = evicts_per_relinq;
2048 if ( pfp != NULL )
2049 {
2050 if ( !(memflags & MEMF_tmem) )
2051 scrub_one_page(pfp);
2052 tmem_stats.relinq_pgs++;
2053 }
2054
2055 return pfp;
2056 }
2057
unsigned long tmem_freeable_pages(void)
2059 {
2060 if ( !tmem_enabled() )
2061 return 0;
2062
2063 return tmem_page_list_pages + _atomic_read(freeable_page_count);
2064 }
2065
2066 /* Called at hypervisor startup. */
static int __init init_tmem(void)
2068 {
2069 if ( !tmem_enabled() )
2070 return 0;
2071
2072 if ( !tmem_mempool_init() )
2073 return 0;
2074
2075 if ( tmem_init() )
2076 {
2077 printk("tmem: initialized comp=%d\n", tmem_compression_enabled());
2078 tmem_initialized = 1;
2079 }
2080 else
2081 printk("tmem: initialization FAILED\n");
2082
2083 return 0;
2084 }
2085 __initcall(init_tmem);
2086
2087 /*
2088 * Local variables:
2089 * mode: C
2090 * c-file-style: "BSD"
2091 * c-basic-offset: 4
2092 * tab-width: 4
2093 * indent-tabs-mode: nil
2094 * End:
2095 */
2096