1 /******************************************************************************
2  * tmem.c
3  *
4  * Transcendent memory
5  *
6  * Copyright (c) 2009, Dan Magenheimer, Oracle Corp.
7  */
8 
9 /* TODO list: 090129 (updated 100318)
10    - any better reclamation policy?
11    - use different tlsf pools for each client (maybe each pool)
12    - test shared access more completely (ocfs2)
13    - add feedback-driven compression (not for persistent pools though!)
14    - add data-structure total bytes overhead stats
15  */
16 
17 #ifdef __XEN__
18 #include <xen/tmem_xen.h> /* host-specific (e.g. Xen) code goes here. */
19 #endif
20 
21 #include <public/sysctl.h>
22 #include <xen/tmem.h>
23 #include <xen/rbtree.h>
24 #include <xen/radix-tree.h>
25 #include <xen/list.h>
26 #include <xen/init.h>
27 
28 #define TMEM_SPEC_VERSION 1
29 
30 struct tmem_statistics tmem_stats = {
31     .global_obj_count = ATOMIC_INIT(0),
32     .global_pgp_count = ATOMIC_INIT(0),
33     .global_pcd_count = ATOMIC_INIT(0),
34     .global_page_count = ATOMIC_INIT(0),
35     .global_rtree_node_count = ATOMIC_INIT(0),
36 };
37 
38 /************ CORE DATA STRUCTURES ************************************/
39 
40 struct tmem_object_root {
41     struct xen_tmem_oid oid;
42     struct rb_node rb_tree_node; /* Protected by pool->pool_rwlock. */
43     unsigned long objnode_count; /* Atomicity depends on obj_spinlock. */
44     long pgp_count; /* Atomicity depends on obj_spinlock. */
45     struct radix_tree_root tree_root; /* Tree of pages within object. */
46     struct tmem_pool *pool;
47     domid_t last_client;
48     spinlock_t obj_spinlock;
49 };
50 
51 struct tmem_object_node {
52     struct tmem_object_root *obj;
53     struct radix_tree_node rtn;
54 };
55 
56 struct tmem_page_descriptor {
57     union {
58         struct list_head global_eph_pages;
59         struct list_head client_inv_pages;
60     };
61     union {
62         struct {
63             union {
64                 struct list_head client_eph_pages;
65                 struct list_head pool_pers_pages;
66             };
67             struct tmem_object_root *obj;
68         } us;
69         struct xen_tmem_oid inv_oid;  /* Used for invalid list only. */
70     };
71     pagesize_t size; /* 0 == PAGE_SIZE (pfp), -1 == data invalid,
72                     else compressed data (cdata). */
73     uint32_t index;
74     bool eviction_attempted;  /* CHANGE TO lifetimes? (settable). */
75     union {
76         struct page_info *pfp;  /* Page frame pointer. */
77         char *cdata; /* Compressed data. */
78         struct tmem_page_content_descriptor *pcd; /* Page dedup. */
79     };
80     union {
81         uint64_t timestamp;
82         uint32_t pool_id;  /* Used for invalid list only. */
83     };
84 };
85 
86 #define PCD_TZE_MAX_SIZE (PAGE_SIZE - (PAGE_SIZE/64))
87 
88 struct tmem_page_content_descriptor {
89     union {
90         struct page_info *pfp;  /* Page frame pointer. */
91         char *cdata; /* If compression_enabled. */
92     };
93     pagesize_t size; /* If compression_enabled -> 0<size<PAGE_SIZE (*cdata)
94                      * else if tze, 0<=size<PAGE_SIZE, rounded up to mult of 8
95                      * else PAGE_SIZE -> *pfp. */
96 };
97 
98 static int tmem_initialized = 0;
99 
100 struct xmem_pool *tmem_mempool = 0;
101 unsigned int tmem_mempool_maxalloc = 0;
102 
103 DEFINE_SPINLOCK(tmem_page_list_lock);
104 PAGE_LIST_HEAD(tmem_page_list);
105 unsigned long tmem_page_list_pages = 0;
106 
107 DEFINE_RWLOCK(tmem_rwlock);
108 static DEFINE_SPINLOCK(eph_lists_spinlock); /* Protects global AND clients. */
109 static DEFINE_SPINLOCK(pers_lists_spinlock);
110 
111 #define ASSERT_SPINLOCK(_l) ASSERT(spin_is_locked(_l))
112 #define ASSERT_WRITELOCK(_l) ASSERT(rw_is_write_locked(_l))
113 
114 atomic_t client_weight_total;
115 
116 struct tmem_global tmem_global = {
117     .ephemeral_page_list = LIST_HEAD_INIT(tmem_global.ephemeral_page_list),
118     .client_list = LIST_HEAD_INIT(tmem_global.client_list),
119     .client_weight_total = ATOMIC_INIT(0),
120 };
121 
122 /*
123  * There are two types of memory allocation interfaces in tmem.
124  * One is based on xmem_pool and the other is used to allocate a whole page.
125  * Both are based on the low-level functions __tmem_alloc_page/_thispool().
126  * The call trace of the alloc path is shown below.
127  * Persistent pool:
128  *     1.tmem_malloc()
129  *         > xmem_pool_alloc()
130  *             > tmem_persistent_pool_page_get()
131  *                 > __tmem_alloc_page_thispool()
132  *     2.tmem_alloc_page()
133  *         > __tmem_alloc_page_thispool()
134  *
135  * Ephemeral pool:
136  *     1.tmem_malloc()
137  *         > xmem_pool_alloc()
138  *             > tmem_mempool_page_get()
139  *                 > __tmem_alloc_page()
140  *     2.tmem_alloc_page()
141  *         > __tmem_alloc_page()
142  *
143  * The free path is done in the same manner.
144  */
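/*
 * Allocate tmem metadata from the per-client persistent xmem pool for
 * persistent pools, otherwise from the global tmem_mempool.
 */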
145 static void *tmem_malloc(size_t size, struct tmem_pool *pool)
146 {
147     void *v = NULL;
148 
149     if ( (pool != NULL) && is_persistent(pool) ) {
150         if ( pool->client->persistent_pool )
151             v = xmem_pool_alloc(size, pool->client->persistent_pool);
152     }
153     else
154     {
155         ASSERT( size < tmem_mempool_maxalloc );
156         ASSERT( tmem_mempool != NULL );
157         v = xmem_pool_alloc(size, tmem_mempool);
158     }
159     if ( v == NULL )
160         tmem_stats.alloc_failed++;
161     return v;
162 }
163 
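/* Return memory obtained from tmem_malloc() to the xmem pool it came from. */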
164 static void tmem_free(void *p, struct tmem_pool *pool)
165 {
166     if ( pool == NULL || !is_persistent(pool) )
167     {
168         ASSERT( tmem_mempool != NULL );
169         xmem_pool_free(p, tmem_mempool);
170     }
171     else
172     {
173         ASSERT( pool->client->persistent_pool != NULL );
174         xmem_pool_free(p, pool->client->persistent_pool);
175     }
176 }
177 
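/*
 * Allocate a whole page: from the client's own domain for persistent pools,
 * otherwise from the global tmem allocator.
 */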
178 static struct page_info *tmem_alloc_page(struct tmem_pool *pool)
179 {
180     struct page_info *pfp = NULL;
181 
182     if ( pool != NULL && is_persistent(pool) )
183         pfp = __tmem_alloc_page_thispool(pool->client->domain);
184     else
185         pfp = __tmem_alloc_page();
186     if ( pfp == NULL )
187         tmem_stats.alloc_page_failed++;
188     else
189         atomic_inc_and_max(global_page_count);
190     return pfp;
191 }
192 
193 static void tmem_free_page(struct tmem_pool *pool, struct page_info *pfp)
194 {
195     ASSERT(pfp);
196     if ( pool == NULL || !is_persistent(pool) )
197         __tmem_free_page(pfp);
198     else
199         __tmem_free_page_thispool(pfp);
200     atomic_dec_and_assert(global_page_count);
201 }
202 
203 static void *tmem_mempool_page_get(unsigned long size)
204 {
205     struct page_info *pi;
206 
207     ASSERT(size == PAGE_SIZE);
208     if ( (pi = __tmem_alloc_page()) == NULL )
209         return NULL;
210     return page_to_virt(pi);
211 }
212 
213 static void tmem_mempool_page_put(void *page_va)
214 {
215     ASSERT(IS_PAGE_ALIGNED(page_va));
216     __tmem_free_page(virt_to_page(page_va));
217 }
218 
219 static int __init tmem_mempool_init(void)
220 {
221     tmem_mempool = xmem_pool_create("tmem", tmem_mempool_page_get,
222         tmem_mempool_page_put, PAGE_SIZE, 0, PAGE_SIZE);
223     if ( tmem_mempool )
224         tmem_mempool_maxalloc = xmem_pool_maxalloc(tmem_mempool);
225     return tmem_mempool != NULL;
226 }
227 
228 /* Persistent pools are per-domain. */
229 static void *tmem_persistent_pool_page_get(unsigned long size)
230 {
231     struct page_info *pi;
232     struct domain *d = current->domain;
233 
234     ASSERT(size == PAGE_SIZE);
235     if ( (pi = __tmem_alloc_page_thispool(d)) == NULL )
236         return NULL;
237     ASSERT(IS_VALID_PAGE(pi));
238     return page_to_virt(pi);
239 }
240 
241 static void tmem_persistent_pool_page_put(void *page_va)
242 {
243     struct page_info *pi;
244 
245     ASSERT(IS_PAGE_ALIGNED(page_va));
246     pi = mfn_to_page(virt_to_mfn(page_va));
247     ASSERT(IS_VALID_PAGE(pi));
248     __tmem_free_page_thispool(pi);
249 }
250 
251 /*
252  * Page content descriptor manipulation routines.
253  */
254 #define NOT_SHAREABLE ((uint16_t)-1UL)
255 
256 /************ PAGE DESCRIPTOR MANIPULATION ROUTINES *******************/
257 
258 /* Allocate a struct tmem_page_descriptor and associate it with an object. */
259 static struct tmem_page_descriptor *pgp_alloc(struct tmem_object_root *obj)
260 {
261     struct tmem_page_descriptor *pgp;
262     struct tmem_pool *pool;
263 
264     ASSERT(obj != NULL);
265     ASSERT(obj->pool != NULL);
266     pool = obj->pool;
267     if ( (pgp = tmem_malloc(sizeof(struct tmem_page_descriptor), pool)) == NULL )
268         return NULL;
269     pgp->us.obj = obj;
270     INIT_LIST_HEAD(&pgp->global_eph_pages);
271     INIT_LIST_HEAD(&pgp->us.client_eph_pages);
272     pgp->pfp = NULL;
273     pgp->size = -1;
274     pgp->index = -1;
275     pgp->timestamp = get_cycles();
276     atomic_inc_and_max(global_pgp_count);
277     atomic_inc(&pool->pgp_count);
278     if ( _atomic_read(pool->pgp_count) > pool->pgp_count_max )
279         pool->pgp_count_max = _atomic_read(pool->pgp_count);
280     return pgp;
281 }
282 
283 static struct tmem_page_descriptor *pgp_lookup_in_obj(struct tmem_object_root *obj, uint32_t index)
284 {
285     ASSERT(obj != NULL);
286     ASSERT_SPINLOCK(&obj->obj_spinlock);
287     ASSERT(obj->pool != NULL);
288     return radix_tree_lookup(&obj->tree_root, index);
289 }
290 
291 static void pgp_free_data(struct tmem_page_descriptor *pgp, struct tmem_pool *pool)
292 {
293     pagesize_t pgp_size = pgp->size;
294 
295     if ( pgp->pfp == NULL )
296         return;
297     if ( pgp_size )
298         tmem_free(pgp->cdata, pool);
299     else
300         tmem_free_page(pgp->us.obj->pool,pgp->pfp);
301     if ( pool != NULL && pgp_size )
302     {
303         pool->client->compressed_pages--;
304         pool->client->compressed_sum_size -= pgp_size;
305     }
306     pgp->pfp = NULL;
307     pgp->size = -1;
308 }
309 
310 static void __pgp_free(struct tmem_page_descriptor *pgp, struct tmem_pool *pool)
311 {
312     pgp->us.obj = NULL;
313     pgp->index = -1;
314     tmem_free(pgp, pool);
315 }
316 
317 static void pgp_free(struct tmem_page_descriptor *pgp)
318 {
319     struct tmem_pool *pool = NULL;
320 
321     ASSERT(pgp->us.obj != NULL);
322     ASSERT(pgp->us.obj->pool != NULL);
323     ASSERT(pgp->us.obj->pool->client != NULL);
324 
325     pool = pgp->us.obj->pool;
326     if ( !is_persistent(pool) )
327     {
328         ASSERT(list_empty(&pgp->global_eph_pages));
329         ASSERT(list_empty(&pgp->us.client_eph_pages));
330     }
331     pgp_free_data(pgp, pool);
332     atomic_dec_and_assert(global_pgp_count);
333     atomic_dec(&pool->pgp_count);
334     ASSERT(_atomic_read(pool->pgp_count) >= 0);
335     pgp->size = -1;
336     if ( is_persistent(pool) && pool->client->info.flags.u.migrating )
337     {
338         pgp->inv_oid = pgp->us.obj->oid;
339         pgp->pool_id = pool->pool_id;
340         return;
341     }
342     __pgp_free(pgp, pool);
343 }
344 
345 /* Remove pgp from global/pool/client lists and free it. */
346 static void pgp_delist_free(struct tmem_page_descriptor *pgp)
347 {
348     struct client *client;
349     uint64_t life;
350 
351     ASSERT(pgp != NULL);
352     ASSERT(pgp->us.obj != NULL);
353     ASSERT(pgp->us.obj->pool != NULL);
354     client = pgp->us.obj->pool->client;
355     ASSERT(client != NULL);
356 
357     /* Delist pgp. */
358     if ( !is_persistent(pgp->us.obj->pool) )
359     {
360         spin_lock(&eph_lists_spinlock);
361         if ( !list_empty(&pgp->us.client_eph_pages) )
362             client->eph_count--;
363         ASSERT(client->eph_count >= 0);
364         list_del_init(&pgp->us.client_eph_pages);
365         if ( !list_empty(&pgp->global_eph_pages) )
366             tmem_global.eph_count--;
367         ASSERT(tmem_global.eph_count >= 0);
368         list_del_init(&pgp->global_eph_pages);
369         spin_unlock(&eph_lists_spinlock);
370     }
371     else
372     {
373         if ( client->info.flags.u.migrating )
374         {
375             spin_lock(&pers_lists_spinlock);
376             list_add_tail(&pgp->client_inv_pages,
377                           &client->persistent_invalidated_list);
378             if ( pgp != pgp->us.obj->pool->cur_pgp )
379                 list_del_init(&pgp->us.pool_pers_pages);
380             spin_unlock(&pers_lists_spinlock);
381         }
382         else
383         {
384             spin_lock(&pers_lists_spinlock);
385             list_del_init(&pgp->us.pool_pers_pages);
386             spin_unlock(&pers_lists_spinlock);
387         }
388     }
389     life = get_cycles() - pgp->timestamp;
390     pgp->us.obj->pool->sum_life_cycles += life;
391 
392     /* Free pgp. */
393     pgp_free(pgp);
394 }
395 
396 /* Called only indirectly by radix_tree_destroy. */
397 static void pgp_destroy(void *v)
398 {
399     struct tmem_page_descriptor *pgp = (struct tmem_page_descriptor *)v;
400 
401     pgp->us.obj->pgp_count--;
402     pgp_delist_free(pgp);
403 }
404 
405 static int pgp_add_to_obj(struct tmem_object_root *obj, uint32_t index, struct tmem_page_descriptor *pgp)
406 {
407     int ret;
408 
409     ASSERT_SPINLOCK(&obj->obj_spinlock);
410     ret = radix_tree_insert(&obj->tree_root, index, pgp);
411     if ( !ret )
412         obj->pgp_count++;
413     return ret;
414 }
415 
416 static struct tmem_page_descriptor *pgp_delete_from_obj(struct tmem_object_root *obj, uint32_t index)
417 {
418     struct tmem_page_descriptor *pgp;
419 
420     ASSERT(obj != NULL);
421     ASSERT_SPINLOCK(&obj->obj_spinlock);
422     ASSERT(obj->pool != NULL);
423     pgp = radix_tree_delete(&obj->tree_root, index);
424     if ( pgp != NULL )
425         obj->pgp_count--;
426     ASSERT(obj->pgp_count >= 0);
427 
428     return pgp;
429 }
430 
431 /************ RADIX TREE NODE MANIPULATION ROUTINES *******************/
432 
433 /* Called only indirectly from radix_tree_insert. */
434 static struct radix_tree_node *rtn_alloc(void *arg)
435 {
436     struct tmem_object_node *objnode;
437     struct tmem_object_root *obj = (struct tmem_object_root *)arg;
438 
439     ASSERT(obj->pool != NULL);
440     objnode = tmem_malloc(sizeof(struct tmem_object_node),obj->pool);
441     if (objnode == NULL)
442         return NULL;
443     objnode->obj = obj;
444     memset(&objnode->rtn, 0, sizeof(struct radix_tree_node));
445     if (++obj->pool->objnode_count > obj->pool->objnode_count_max)
446         obj->pool->objnode_count_max = obj->pool->objnode_count;
447     atomic_inc_and_max(global_rtree_node_count);
448     obj->objnode_count++;
449     return &objnode->rtn;
450 }
451 
452 /* Called only indirectly from radix_tree_delete/destroy. */
453 static void rtn_free(struct radix_tree_node *rtn, void *arg)
454 {
455     struct tmem_pool *pool;
456     struct tmem_object_node *objnode;
457 
458     ASSERT(rtn != NULL);
459     objnode = container_of(rtn,struct tmem_object_node,rtn);
460     ASSERT(objnode->obj != NULL);
461     ASSERT_SPINLOCK(&objnode->obj->obj_spinlock);
462     pool = objnode->obj->pool;
463     ASSERT(pool != NULL);
464     pool->objnode_count--;
465     objnode->obj->objnode_count--;
466     objnode->obj = NULL;
467     tmem_free(objnode, pool);
468     atomic_dec_and_assert(global_rtree_node_count);
469 }
470 
471 /************ POOL OBJECT COLLECTION MANIPULATION ROUTINES *******************/
472 
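/*
 * Three-way comparison of two object ids: compares oid[2], then oid[1],
 * then oid[0], returning -1, 0 or 1.
 */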
473 static int oid_compare(struct xen_tmem_oid *left,
474                        struct xen_tmem_oid *right)
475 {
476     if ( left->oid[2] == right->oid[2] )
477     {
478         if ( left->oid[1] == right->oid[1] )
479         {
480             if ( left->oid[0] == right->oid[0] )
481                 return 0;
482             else if ( left->oid[0] < right->oid[0] )
483                 return -1;
484             else
485                 return 1;
486         }
487         else if ( left->oid[1] < right->oid[1] )
488             return -1;
489         else
490             return 1;
491     }
492     else if ( left->oid[2] < right->oid[2] )
493         return -1;
494     else
495         return 1;
496 }
497 
498 static void oid_set_invalid(struct xen_tmem_oid *oidp)
499 {
500     oidp->oid[0] = oidp->oid[1] = oidp->oid[2] = -1UL;
501 }
502 
503 static unsigned oid_hash(struct xen_tmem_oid *oidp)
504 {
505     return (tmem_hash(oidp->oid[0] ^ oidp->oid[1] ^ oidp->oid[2],
506                      BITS_PER_LONG) & OBJ_HASH_BUCKETS_MASK);
507 }
508 
509 /* Searches for object==oid in pool, returns locked object if found. */
510 static struct tmem_object_root * obj_find(struct tmem_pool *pool,
511                                           struct xen_tmem_oid *oidp)
512 {
513     struct rb_node *node;
514     struct tmem_object_root *obj;
515 
516 restart_find:
517     read_lock(&pool->pool_rwlock);
518     node = pool->obj_rb_root[oid_hash(oidp)].rb_node;
519     while ( node )
520     {
521         obj = container_of(node, struct tmem_object_root, rb_tree_node);
522         switch ( oid_compare(&obj->oid, oidp) )
523         {
524             case 0: /* Equal. */
525                 if ( !spin_trylock(&obj->obj_spinlock) )
526                 {
527                     read_unlock(&pool->pool_rwlock);
528                     goto restart_find;
529                 }
530                 read_unlock(&pool->pool_rwlock);
531                 return obj;
532             case -1:
533                 node = node->rb_left;
534                 break;
535             case 1:
536                 node = node->rb_right;
537         }
538     }
539     read_unlock(&pool->pool_rwlock);
540     return NULL;
541 }
542 
543 /* Free an object that has no more pgps in it. */
544 static void obj_free(struct tmem_object_root *obj)
545 {
546     struct tmem_pool *pool;
547     struct xen_tmem_oid old_oid;
548 
549     ASSERT_SPINLOCK(&obj->obj_spinlock);
550     ASSERT(obj != NULL);
551     ASSERT(obj->pgp_count == 0);
552     pool = obj->pool;
553     ASSERT(pool != NULL);
554     ASSERT(pool->client != NULL);
555     ASSERT_WRITELOCK(&pool->pool_rwlock);
556     if ( obj->tree_root.rnode != NULL ) /* May be a "stump" with no leaves. */
557         radix_tree_destroy(&obj->tree_root, pgp_destroy);
558     ASSERT((long)obj->objnode_count == 0);
559     ASSERT(obj->tree_root.rnode == NULL);
560     pool->obj_count--;
561     ASSERT(pool->obj_count >= 0);
562     obj->pool = NULL;
563     old_oid = obj->oid;
564     oid_set_invalid(&obj->oid);
565     obj->last_client = TMEM_CLI_ID_NULL;
566     atomic_dec_and_assert(global_obj_count);
567     rb_erase(&obj->rb_tree_node, &pool->obj_rb_root[oid_hash(&old_oid)]);
568     spin_unlock(&obj->obj_spinlock);
569     tmem_free(obj, pool);
570 }
571 
572 static int obj_rb_insert(struct rb_root *root, struct tmem_object_root *obj)
573 {
574     struct rb_node **new, *parent = NULL;
575     struct tmem_object_root *this;
576 
577     ASSERT(obj->pool);
578     ASSERT_WRITELOCK(&obj->pool->pool_rwlock);
579 
580     new = &(root->rb_node);
581     while ( *new )
582     {
583         this = container_of(*new, struct tmem_object_root, rb_tree_node);
584         parent = *new;
585         switch ( oid_compare(&this->oid, &obj->oid) )
586         {
587             case 0:
588                 return 0;
589             case -1:
590                 new = &((*new)->rb_left);
591                 break;
592             case 1:
593                 new = &((*new)->rb_right);
594                 break;
595         }
596     }
597     rb_link_node(&obj->rb_tree_node, parent, new);
598     rb_insert_color(&obj->rb_tree_node, root);
599     return 1;
600 }
601 
602 /*
603  * Allocate, initialize, and insert a tmem_object_root
604  * (should be called only if find failed).
605  */
606 static struct tmem_object_root * obj_alloc(struct tmem_pool *pool,
607                                            struct xen_tmem_oid *oidp)
608 {
609     struct tmem_object_root *obj;
610 
611     ASSERT(pool != NULL);
612     if ( (obj = tmem_malloc(sizeof(struct tmem_object_root), pool)) == NULL )
613         return NULL;
614     pool->obj_count++;
615     if (pool->obj_count > pool->obj_count_max)
616         pool->obj_count_max = pool->obj_count;
617     atomic_inc_and_max(global_obj_count);
618     radix_tree_init(&obj->tree_root);
619     radix_tree_set_alloc_callbacks(&obj->tree_root, rtn_alloc, rtn_free, obj);
620     spin_lock_init(&obj->obj_spinlock);
621     obj->pool = pool;
622     obj->oid = *oidp;
623     obj->objnode_count = 0;
624     obj->pgp_count = 0;
625     obj->last_client = TMEM_CLI_ID_NULL;
626     return obj;
627 }
628 
629 /* Free an object after destroying any pgps in it. */
630 static void obj_destroy(struct tmem_object_root *obj)
631 {
632     ASSERT_WRITELOCK(&obj->pool->pool_rwlock);
633     radix_tree_destroy(&obj->tree_root, pgp_destroy);
634     obj_free(obj);
635 }
636 
637 /* Destroys all objs in a pool, or only if obj->last_client matches cli_id. */
638 static void pool_destroy_objs(struct tmem_pool *pool, domid_t cli_id)
639 {
640     struct rb_node *node;
641     struct tmem_object_root *obj;
642     int i;
643 
644     write_lock(&pool->pool_rwlock);
645     pool->is_dying = 1;
646     for (i = 0; i < OBJ_HASH_BUCKETS; i++)
647     {
648         node = rb_first(&pool->obj_rb_root[i]);
649         while ( node != NULL )
650         {
651             obj = container_of(node, struct tmem_object_root, rb_tree_node);
652             spin_lock(&obj->obj_spinlock);
653             node = rb_next(node);
654             if ( obj->last_client == cli_id )
655                 obj_destroy(obj);
656             else
657                 spin_unlock(&obj->obj_spinlock);
658         }
659     }
660     write_unlock(&pool->pool_rwlock);
661 }
662 
663 
664 /************ POOL MANIPULATION ROUTINES ******************************/
665 
666 static struct tmem_pool * pool_alloc(void)
667 {
668     struct tmem_pool *pool;
669     int i;
670 
671     if ( (pool = xzalloc(struct tmem_pool)) == NULL )
672         return NULL;
673     for (i = 0; i < OBJ_HASH_BUCKETS; i++)
674         pool->obj_rb_root[i] = RB_ROOT;
675     INIT_LIST_HEAD(&pool->persistent_page_list);
676     rwlock_init(&pool->pool_rwlock);
677     return pool;
678 }
679 
680 static void pool_free(struct tmem_pool *pool)
681 {
682     pool->client = NULL;
683     xfree(pool);
684 }
685 
686 /*
687  * Register new_client as a user of this shared pool and return 0 on success.
688  */
689 static int shared_pool_join(struct tmem_pool *pool, struct client *new_client)
690 {
691     struct share_list *sl;
692     ASSERT(is_shared(pool));
693 
694     if ( (sl = tmem_malloc(sizeof(struct share_list), NULL)) == NULL )
695         return -1;
696     sl->client = new_client;
697     list_add_tail(&sl->share_list, &pool->share_list);
698     if ( new_client->cli_id != pool->client->cli_id )
699         tmem_client_info("adding new %s %d to shared pool owned by %s %d\n",
700                     tmem_client_str, new_client->cli_id, tmem_client_str,
701                     pool->client->cli_id);
702     else if ( pool->shared_count )
703         tmem_client_info("inter-guest sharing of shared pool %s by client %d\n",
704                          tmem_client_str, pool->client->cli_id);
705     ++pool->shared_count;
706     return 0;
707 }
708 
709 /* Reassign "ownership" of the pool to another client that shares this pool. */
710 static void shared_pool_reassign(struct tmem_pool *pool)
711 {
712     struct share_list *sl;
713     int poolid;
714     struct client *old_client = pool->client, *new_client;
715 
716     ASSERT(is_shared(pool));
717     if ( list_empty(&pool->share_list) )
718     {
719         ASSERT(pool->shared_count == 0);
720         return;
721     }
722     old_client->pools[pool->pool_id] = NULL;
723     sl = list_entry(pool->share_list.next, struct share_list, share_list);
724     /*
725      * The sl->client can be old_client if there are multiple shared pools
726  * within a guest.
727      */
728     pool->client = new_client = sl->client;
729     for (poolid = 0; poolid < MAX_POOLS_PER_DOMAIN; poolid++)
730         if (new_client->pools[poolid] == pool)
731             break;
732     ASSERT(poolid != MAX_POOLS_PER_DOMAIN);
733     new_client->eph_count += _atomic_read(pool->pgp_count);
734     old_client->eph_count -= _atomic_read(pool->pgp_count);
735     list_splice_init(&old_client->ephemeral_page_list,
736                      &new_client->ephemeral_page_list);
737     tmem_client_info("reassigned shared pool from %s=%d to %s=%d pool_id=%d\n",
738         tmem_cli_id_str, old_client->cli_id, tmem_cli_id_str, new_client->cli_id, poolid);
739     pool->pool_id = poolid;
740 }
741 
742 /*
743  * Destroy all objects whose last_client matches the passed cli_id and
744  * remove cli_id from this pool's list of sharers.
745  */
746 static int shared_pool_quit(struct tmem_pool *pool, domid_t cli_id)
747 {
748     struct share_list *sl;
749     int s_poolid;
750 
751     ASSERT(is_shared(pool));
752     ASSERT(pool->client != NULL);
753 
754     ASSERT_WRITELOCK(&tmem_rwlock);
755     pool_destroy_objs(pool, cli_id);
756     list_for_each_entry(sl,&pool->share_list, share_list)
757     {
758         if (sl->client->cli_id != cli_id)
759             continue;
760         list_del(&sl->share_list);
761         tmem_free(sl, pool);
762         --pool->shared_count;
763         if (pool->client->cli_id == cli_id)
764             shared_pool_reassign(pool);
765         if (pool->shared_count)
766             return pool->shared_count;
767         for (s_poolid = 0; s_poolid < MAX_GLOBAL_SHARED_POOLS; s_poolid++)
768             if ( (tmem_global.shared_pools[s_poolid]) == pool )
769             {
770                 tmem_global.shared_pools[s_poolid] = NULL;
771                 break;
772             }
773         return 0;
774     }
775     tmem_client_warn("tmem: no match unsharing pool, %s=%d\n",
776         tmem_cli_id_str,pool->client->cli_id);
777     return -1;
778 }
779 
780 /* Flush all data (owned by cli_id) from a pool and, optionally, free it. */
781 static void pool_flush(struct tmem_pool *pool, domid_t cli_id)
782 {
783     ASSERT(pool != NULL);
784     if ( (is_shared(pool)) && (shared_pool_quit(pool,cli_id) > 0) )
785     {
786         tmem_client_warn("tmem: %s=%d no longer using shared pool %d owned by %s=%d\n",
787            tmem_cli_id_str, cli_id, pool->pool_id, tmem_cli_id_str,pool->client->cli_id);
788         return;
789     }
790     tmem_client_info("Destroying %s-%s tmem pool %s=%d pool_id=%d\n",
791                     is_persistent(pool) ? "persistent" : "ephemeral" ,
792                     is_shared(pool) ? "shared" : "private",
793                     tmem_cli_id_str, pool->client->cli_id, pool->pool_id);
794     if ( pool->client->info.flags.u.migrating )
795     {
796         tmem_client_warn("can't destroy pool while %s is live-migrating\n",
797                     tmem_client_str);
798         return;
799     }
800     pool_destroy_objs(pool, TMEM_CLI_ID_NULL);
801     pool->client->pools[pool->pool_id] = NULL;
802     pool_free(pool);
803 }
804 
805 /************ CLIENT MANIPULATION OPERATIONS **************************/
806 
807 struct client *client_create(domid_t cli_id)
808 {
809     struct client *client = xzalloc(struct client);
810     int i, shift;
811     char name[5];
812     struct domain *d;
813 
814     tmem_client_info("tmem: initializing tmem capability for %s=%d...",
815                     tmem_cli_id_str, cli_id);
816     if ( client == NULL )
817     {
818         tmem_client_err("failed... out of memory\n");
819         goto fail;
820     }
821 
822     for (i = 0, shift = 12; i < 4; shift -=4, i++)
823         name[i] = (((unsigned short)cli_id >> shift) & 0xf) + '0';
824     name[4] = '\0';
825     client->persistent_pool = xmem_pool_create(name, tmem_persistent_pool_page_get,
826         tmem_persistent_pool_page_put, PAGE_SIZE, 0, PAGE_SIZE);
827     if ( client->persistent_pool == NULL )
828     {
829         tmem_client_err("failed... can't alloc persistent pool\n");
830         goto fail;
831     }
832 
833     d = rcu_lock_domain_by_id(cli_id);
834     if ( d == NULL ) {
835         tmem_client_err("failed... can't set client\n");
836         xmem_pool_destroy(client->persistent_pool);
837         goto fail;
838     }
839     if ( !d->is_dying ) {
840         d->tmem_client = client;
841         client->domain = d;
842     }
843     rcu_unlock_domain(d);
844 
845     client->cli_id = cli_id;
846     client->info.version = TMEM_SPEC_VERSION;
847     client->info.maxpools = MAX_POOLS_PER_DOMAIN;
848     client->info.flags.u.compress = tmem_compression_enabled();
849     for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++)
850         client->shared_auth_uuid[i][0] =
851             client->shared_auth_uuid[i][1] = -1L;
852     list_add_tail(&client->client_list, &tmem_global.client_list);
853     INIT_LIST_HEAD(&client->ephemeral_page_list);
854     INIT_LIST_HEAD(&client->persistent_invalidated_list);
855     tmem_client_info("ok\n");
856     return client;
857 
858  fail:
859     xfree(client);
860     return NULL;
861 }
862 
863 static void client_free(struct client *client)
864 {
865     list_del(&client->client_list);
866     xmem_pool_destroy(client->persistent_pool);
867     xfree(client);
868 }
869 
870 /* Flush all data from a client and, optionally, free it. */
871 static void client_flush(struct client *client)
872 {
873     int i;
874     struct tmem_pool *pool;
875 
876     for  (i = 0; i < MAX_POOLS_PER_DOMAIN; i++)
877     {
878         if ( (pool = client->pools[i]) == NULL )
879             continue;
880         pool_flush(pool, client->cli_id);
881         client->pools[i] = NULL;
882         client->info.nr_pools--;
883     }
884     client_free(client);
885 }
886 
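/*
 * Weight-based check used by tmem_evict(): compares the ratio of global to
 * client ephemeral pages against the ratio of total to client weight.
 * Returns false if the total weight, the client's weight, or the client's
 * ephemeral page count is zero.
 */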
887 static bool client_over_quota(const struct client *client)
888 {
889     int total = _atomic_read(tmem_global.client_weight_total);
890 
891     ASSERT(client != NULL);
892     if ( (total == 0) || (client->info.weight == 0) ||
893           (client->eph_count == 0) )
894         return false;
895 
896     return (((tmem_global.eph_count * 100L) / client->eph_count) >
897             ((total * 100L) / client->info.weight));
898 }
899 
900 /************ MEMORY REVOCATION ROUTINES *******************************/
901 
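/*
 * Try to take the locks needed to evict pgp: the owning object's spinlock,
 * plus the pool rwlock (for write) if this is the object's last page.
 * Returns true with the lock(s) held, or false if the pool is dying or a
 * trylock failed.
 */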
902 static bool tmem_try_to_evict_pgp(struct tmem_page_descriptor *pgp,
903                                   bool *hold_pool_rwlock)
904 {
905     struct tmem_object_root *obj = pgp->us.obj;
906     struct tmem_pool *pool = obj->pool;
907 
908     if ( pool->is_dying )
909         return false;
910     if ( spin_trylock(&obj->obj_spinlock) )
911     {
912         if ( obj->pgp_count > 1 )
913             return true;
914         if ( write_trylock(&pool->pool_rwlock) )
915         {
916             *hold_pool_rwlock = 1;
917             return true;
918         }
919         spin_unlock(&obj->obj_spinlock);
920     }
921     return false;
922 }
923 
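/*
 * Evict a single ephemeral page: scan the calling client's own list if it
 * is over quota, otherwise the global ephemeral list.  Returns 1 if a page
 * was evicted, 0 otherwise.
 */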
924 int tmem_evict(void)
925 {
926     struct client *client = current->domain->tmem_client;
927     struct tmem_page_descriptor *pgp = NULL, *pgp_del;
928     struct tmem_object_root *obj;
929     struct tmem_pool *pool;
930     int ret = 0;
931     bool hold_pool_rwlock = false;
932 
933     tmem_stats.evict_attempts++;
934     spin_lock(&eph_lists_spinlock);
935     if ( (client != NULL) && client_over_quota(client) &&
936          !list_empty(&client->ephemeral_page_list) )
937     {
938         list_for_each_entry(pgp, &client->ephemeral_page_list, us.client_eph_pages)
939             if ( tmem_try_to_evict_pgp(pgp, &hold_pool_rwlock) )
940                 goto found;
941     }
942     else if ( !list_empty(&tmem_global.ephemeral_page_list) )
943     {
944         list_for_each_entry(pgp, &tmem_global.ephemeral_page_list, global_eph_pages)
945             if ( tmem_try_to_evict_pgp(pgp, &hold_pool_rwlock) )
946             {
947                 client = pgp->us.obj->pool->client;
948                 goto found;
949             }
950     }
951     /* The global ephemeral page list is empty, so we bail out. */
952     spin_unlock(&eph_lists_spinlock);
953     goto out;
954 
955 found:
956     /* Delist. */
957     list_del_init(&pgp->us.client_eph_pages);
958     client->eph_count--;
959     list_del_init(&pgp->global_eph_pages);
960     tmem_global.eph_count--;
961     ASSERT(tmem_global.eph_count >= 0);
962     ASSERT(client->eph_count >= 0);
963     spin_unlock(&eph_lists_spinlock);
964 
965     ASSERT(pgp != NULL);
966     obj = pgp->us.obj;
967     ASSERT(obj != NULL);
968     ASSERT(obj->pool != NULL);
969     pool = obj->pool;
970 
971     ASSERT_SPINLOCK(&obj->obj_spinlock);
972     pgp_del = pgp_delete_from_obj(obj, pgp->index);
973     ASSERT(pgp_del == pgp);
974 
975     /* pgp is already delisted, so call pgp_free directly. */
976     pgp_free(pgp);
977     if ( obj->pgp_count == 0 )
978     {
979         ASSERT_WRITELOCK(&pool->pool_rwlock);
980         obj_free(obj);
981     }
982     else
983         spin_unlock(&obj->obj_spinlock);
984     if ( hold_pool_rwlock )
985         write_unlock(&pool->pool_rwlock);
986     tmem_stats.evicted_pgs++;
987     ret = 1;
988 out:
989     return ret;
990 }
991 
992 
993 /*
994  * Under certain conditions (e.g. if each client is putting pages for exactly
995  * one object), once locks are held, freeing up memory may
996  * result in livelocks and very long "put" times, so we try to ensure there
997  * is a minimum amount of memory (1MB) available BEFORE any data structure
998  * locks are held.
999  */
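/*
 * The shift by (20 - PAGE_SHIFT) below converts a page count into whole
 * MiB, so free_mem is non-zero only once at least 1MB of pages (tmem page
 * list plus host free pages) is available.
 */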
1000 static inline bool tmem_ensure_avail_pages(void)
1001 {
1002     int failed_evict = 10;
1003     unsigned long free_mem;
1004 
1005     do {
1006         free_mem = (tmem_page_list_pages + total_free_pages())
1007                         >> (20 - PAGE_SHIFT);
1008         if ( free_mem )
1009             return true;
1010         if ( !tmem_evict() )
1011             failed_evict--;
1012     } while ( failed_evict > 0 );
1013 
1014     return false;
1015 }
1016 
1017 /************ TMEM CORE OPERATIONS ************************************/
1018 
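/*
 * Compress the client page at cmfn (or from clibuf) and store the result in
 * pool memory.  Returns 1 on success, 0 if the page is not worth storing
 * compressed (e.g. too large after compression), or a negative errno on
 * failure.
 */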
1019 static int do_tmem_put_compress(struct tmem_page_descriptor *pgp, xen_pfn_t cmfn,
1020                                          tmem_cli_va_param_t clibuf)
1021 {
1022     void *dst, *p;
1023     size_t size;
1024     int ret = 0;
1025 
1026     ASSERT(pgp != NULL);
1027     ASSERT(pgp->us.obj != NULL);
1028     ASSERT_SPINLOCK(&pgp->us.obj->obj_spinlock);
1029     ASSERT(pgp->us.obj->pool != NULL);
1030     ASSERT(pgp->us.obj->pool->client != NULL);
1031 
1032     if ( pgp->pfp != NULL )
1033         pgp_free_data(pgp, pgp->us.obj->pool);
1034     ret = tmem_compress_from_client(cmfn, &dst, &size, clibuf);
1035     if ( ret <= 0 )
1036         goto out;
1037     else if ( (size == 0) || (size >= tmem_mempool_maxalloc) ) {
1038         ret = 0;
1039         goto out;
1040     } else if ( (p = tmem_malloc(size,pgp->us.obj->pool)) == NULL ) {
1041         ret = -ENOMEM;
1042         goto out;
1043     } else {
1044         memcpy(p,dst,size);
1045         pgp->cdata = p;
1046     }
1047     pgp->size = size;
1048     pgp->us.obj->pool->client->compressed_pages++;
1049     pgp->us.obj->pool->client->compressed_sum_size += size;
1050     ret = 1;
1051 
1052 out:
1053     return ret;
1054 }
1055 
1056 static int do_tmem_dup_put(struct tmem_page_descriptor *pgp, xen_pfn_t cmfn,
1057        tmem_cli_va_param_t clibuf)
1058 {
1059     struct tmem_pool *pool;
1060     struct tmem_object_root *obj;
1061     struct client *client;
1062     struct tmem_page_descriptor *pgpfound = NULL;
1063     int ret;
1064 
1065     ASSERT(pgp != NULL);
1066     ASSERT(pgp->pfp != NULL);
1067     ASSERT(pgp->size != -1);
1068     obj = pgp->us.obj;
1069     ASSERT_SPINLOCK(&obj->obj_spinlock);
1070     ASSERT(obj != NULL);
1071     pool = obj->pool;
1072     ASSERT(pool != NULL);
1073     client = pool->client;
1074     if ( client->info.flags.u.migrating )
1075         goto failed_dup; /* No dups allowed when migrating. */
1076     /* Can we successfully manipulate pgp to change out the data? */
1077     if ( client->info.flags.u.compress && pgp->size != 0 )
1078     {
1079         ret = do_tmem_put_compress(pgp, cmfn, clibuf);
1080         if ( ret == 1 )
1081             goto done;
1082         else if ( ret == 0 )
1083             goto copy_uncompressed;
1084         else if ( ret == -ENOMEM )
1085             goto failed_dup;
1086         else if ( ret == -EFAULT )
1087             goto bad_copy;
1088     }
1089 
1090 copy_uncompressed:
1091     if ( pgp->pfp )
1092         pgp_free_data(pgp, pool);
1093     if ( ( pgp->pfp = tmem_alloc_page(pool) ) == NULL )
1094         goto failed_dup;
1095     pgp->size = 0;
1096     ret = tmem_copy_from_client(pgp->pfp, cmfn, tmem_cli_buf_null);
1097     if ( ret < 0 )
1098         goto bad_copy;
1099 
1100 done:
1101     /* Successfully replaced data, clean up and return success. */
1102     if ( is_shared(pool) )
1103         obj->last_client = client->cli_id;
1104     spin_unlock(&obj->obj_spinlock);
1105     pool->dup_puts_replaced++;
1106     pool->good_puts++;
1107     if ( is_persistent(pool) )
1108         client->succ_pers_puts++;
1109     return 1;
1110 
1111 bad_copy:
1112     tmem_stats.failed_copies++;
1113     goto cleanup;
1114 
1115 failed_dup:
1116     /*
1117      * Couldn't change out the data, flush the old data and return
1118      * -ENOSPC instead of -ENOMEM to differentiate failed _dup_ put.
1119      */
1120     ret = -ENOSPC;
1121 cleanup:
1122     pgpfound = pgp_delete_from_obj(obj, pgp->index);
1123     ASSERT(pgpfound == pgp);
1124     pgp_delist_free(pgpfound);
1125     if ( obj->pgp_count == 0 )
1126     {
1127         write_lock(&pool->pool_rwlock);
1128         obj_free(obj);
1129         write_unlock(&pool->pool_rwlock);
1130     } else {
1131         spin_unlock(&obj->obj_spinlock);
1132     }
1133     pool->dup_puts_flushed++;
1134     return ret;
1135 }
1136 
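/*
 * Store the client page at cmfn into the pool at (oidp, index).  Duplicate
 * puts are handed off to do_tmem_dup_put(); otherwise the object is found
 * or newly created, a page descriptor is allocated, and the data is stored
 * either compressed or as a whole page.  Returns 1 on success, negative
 * errno on failure.
 */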
1137 static int do_tmem_put(struct tmem_pool *pool,
1138                        struct xen_tmem_oid *oidp, uint32_t index,
1139                        xen_pfn_t cmfn, tmem_cli_va_param_t clibuf)
1140 {
1141     struct tmem_object_root *obj = NULL;
1142     struct tmem_page_descriptor *pgp = NULL;
1143     struct client *client;
1144     int ret, newobj = 0;
1145 
1146     ASSERT(pool != NULL);
1147     client = pool->client;
1148     ASSERT(client != NULL);
1149     ret = client->info.flags.u.frozen  ? -EFROZEN : -ENOMEM;
1150     pool->puts++;
1151 
1152 refind:
1153     /* Does the page already exist (dup)? If so, handle specially. */
1154     if ( (obj = obj_find(pool, oidp)) != NULL )
1155     {
1156         if ((pgp = pgp_lookup_in_obj(obj, index)) != NULL)
1157         {
1158             return do_tmem_dup_put(pgp, cmfn, clibuf);
1159         }
1160         else
1161         {
1162             /* No puts allowed into a frozen pool (except dup puts). */
1163             if ( client->info.flags.u.frozen )
1164                 goto unlock_obj;
1165         }
1166     }
1167     else
1168     {
1169         /* No puts allowed into a frozen pool (except dup puts). */
1170         if ( client->info.flags.u.frozen )
1171             return ret;
1172         if ( (obj = obj_alloc(pool, oidp)) == NULL )
1173             return -ENOMEM;
1174 
1175         write_lock(&pool->pool_rwlock);
1176         /*
1177          * Parallel callers may have already allocated the obj and inserted it
1178          * into obj_rb_root before us.
1179          */
1180         if ( !obj_rb_insert(&pool->obj_rb_root[oid_hash(oidp)], obj) )
1181         {
1182             tmem_free(obj, pool);
1183             write_unlock(&pool->pool_rwlock);
1184             goto refind;
1185         }
1186 
1187         spin_lock(&obj->obj_spinlock);
1188         newobj = 1;
1189         write_unlock(&pool->pool_rwlock);
1190     }
1191 
1192     /* When we arrive here, we have a spinlocked obj for use. */
1193     ASSERT_SPINLOCK(&obj->obj_spinlock);
1194     if ( (pgp = pgp_alloc(obj)) == NULL )
1195         goto unlock_obj;
1196 
1197     ret = pgp_add_to_obj(obj, index, pgp);
1198     if ( ret == -ENOMEM  )
1199         /* Warning: may result in partially built radix tree ("stump"). */
1200         goto free_pgp;
1201 
1202     pgp->index = index;
1203     pgp->size = 0;
1204 
1205     if ( client->info.flags.u.compress )
1206     {
1207         ASSERT(pgp->pfp == NULL);
1208         ret = do_tmem_put_compress(pgp, cmfn, clibuf);
1209         if ( ret == 1 )
1210             goto insert_page;
1211         if ( ret == -ENOMEM )
1212         {
1213             client->compress_nomem++;
1214             goto del_pgp_from_obj;
1215         }
1216         if ( ret == 0 )
1217         {
1218             client->compress_poor++;
1219             goto copy_uncompressed;
1220         }
1221         if ( ret == -EFAULT )
1222             goto bad_copy;
1223     }
1224 
1225 copy_uncompressed:
1226     if ( ( pgp->pfp = tmem_alloc_page(pool) ) == NULL )
1227     {
1228         ret = -ENOMEM;
1229         goto del_pgp_from_obj;
1230     }
1231     ret = tmem_copy_from_client(pgp->pfp, cmfn, clibuf);
1232     if ( ret < 0 )
1233         goto bad_copy;
1234 
1235 insert_page:
1236     if ( !is_persistent(pool) )
1237     {
1238         spin_lock(&eph_lists_spinlock);
1239         list_add_tail(&pgp->global_eph_pages, &tmem_global.ephemeral_page_list);
1240         if (++tmem_global.eph_count > tmem_stats.global_eph_count_max)
1241             tmem_stats.global_eph_count_max = tmem_global.eph_count;
1242         list_add_tail(&pgp->us.client_eph_pages,
1243             &client->ephemeral_page_list);
1244         if (++client->eph_count > client->eph_count_max)
1245             client->eph_count_max = client->eph_count;
1246         spin_unlock(&eph_lists_spinlock);
1247     }
1248     else
1249     { /* is_persistent. */
1250         spin_lock(&pers_lists_spinlock);
1251         list_add_tail(&pgp->us.pool_pers_pages,
1252             &pool->persistent_page_list);
1253         spin_unlock(&pers_lists_spinlock);
1254     }
1255 
1256     if ( is_shared(pool) )
1257         obj->last_client = client->cli_id;
1258 
1259     /* Release the obj spinlock. */
1260     spin_unlock(&obj->obj_spinlock);
1261     pool->good_puts++;
1262 
1263     if ( is_persistent(pool) )
1264         client->succ_pers_puts++;
1265     else
1266         tmem_stats.tot_good_eph_puts++;
1267     return 1;
1268 
1269 bad_copy:
1270     tmem_stats.failed_copies++;
1271 
1272 del_pgp_from_obj:
1273     ASSERT((obj != NULL) && (pgp != NULL) && (pgp->index != -1));
1274     pgp_delete_from_obj(obj, pgp->index);
1275 
1276 free_pgp:
1277     pgp_free(pgp);
1278 unlock_obj:
1279     if ( newobj )
1280     {
1281         write_lock(&pool->pool_rwlock);
1282         obj_free(obj);
1283         write_unlock(&pool->pool_rwlock);
1284     }
1285     else
1286     {
1287         spin_unlock(&obj->obj_spinlock);
1288     }
1289     pool->no_mem_puts++;
1290     return ret;
1291 }
1292 
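/*
 * Retrieve the page at (oidp, index) and copy or decompress it back to the
 * client.  For private ephemeral pools the page is removed (an exclusive
 * get); for shared or persistent pools it stays, and ephemeral pages are
 * moved to the list tails to refresh their eviction order.  Returns 1 on
 * success, 0 if the page is not present, -EEMPTY if the pool holds no
 * pages, or an error from the copy.
 */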
1293 static int do_tmem_get(struct tmem_pool *pool,
1294                        struct xen_tmem_oid *oidp, uint32_t index,
1295                        xen_pfn_t cmfn, tmem_cli_va_param_t clibuf)
1296 {
1297     struct tmem_object_root *obj;
1298     struct tmem_page_descriptor *pgp;
1299     struct client *client = pool->client;
1300     int rc;
1301 
1302     if ( !_atomic_read(pool->pgp_count) )
1303         return -EEMPTY;
1304 
1305     pool->gets++;
1306     obj = obj_find(pool,oidp);
1307     if ( obj == NULL )
1308         return 0;
1309 
1310     ASSERT_SPINLOCK(&obj->obj_spinlock);
1311     if (is_shared(pool) || is_persistent(pool) )
1312         pgp = pgp_lookup_in_obj(obj, index);
1313     else
1314         pgp = pgp_delete_from_obj(obj, index);
1315     if ( pgp == NULL )
1316     {
1317         spin_unlock(&obj->obj_spinlock);
1318         return 0;
1319     }
1320     ASSERT(pgp->size != -1);
1321     if ( pgp->size != 0 )
1322     {
1323         rc = tmem_decompress_to_client(cmfn, pgp->cdata, pgp->size, clibuf);
1324     }
1325     else
1326         rc = tmem_copy_to_client(cmfn, pgp->pfp, clibuf);
1327     if ( rc <= 0 )
1328         goto bad_copy;
1329 
1330     if ( !is_persistent(pool) )
1331     {
1332         if ( !is_shared(pool) )
1333         {
1334             pgp_delist_free(pgp);
1335             if ( obj->pgp_count == 0 )
1336             {
1337                 write_lock(&pool->pool_rwlock);
1338                 obj_free(obj);
1339                 obj = NULL;
1340                 write_unlock(&pool->pool_rwlock);
1341             }
1342         } else {
1343             spin_lock(&eph_lists_spinlock);
1344             list_del(&pgp->global_eph_pages);
1345             list_add_tail(&pgp->global_eph_pages,&tmem_global.ephemeral_page_list);
1346             list_del(&pgp->us.client_eph_pages);
1347             list_add_tail(&pgp->us.client_eph_pages,&client->ephemeral_page_list);
1348             spin_unlock(&eph_lists_spinlock);
1349             obj->last_client = current->domain->domain_id;
1350         }
1351     }
1352     if ( obj != NULL )
1353     {
1354         spin_unlock(&obj->obj_spinlock);
1355     }
1356     pool->found_gets++;
1357     if ( is_persistent(pool) )
1358         client->succ_pers_gets++;
1359     else
1360         client->succ_eph_gets++;
1361     return 1;
1362 
1363 bad_copy:
1364     spin_unlock(&obj->obj_spinlock);
1365     tmem_stats.failed_copies++;
1366     return rc;
1367 }
1368 
1369 static int do_tmem_flush_page(struct tmem_pool *pool,
1370                               struct xen_tmem_oid *oidp, uint32_t index)
1371 {
1372     struct tmem_object_root *obj;
1373     struct tmem_page_descriptor *pgp;
1374 
1375     pool->flushs++;
1376     obj = obj_find(pool,oidp);
1377     if ( obj == NULL )
1378         goto out;
1379     pgp = pgp_delete_from_obj(obj, index);
1380     if ( pgp == NULL )
1381     {
1382         spin_unlock(&obj->obj_spinlock);
1383         goto out;
1384     }
1385     pgp_delist_free(pgp);
1386     if ( obj->pgp_count == 0 )
1387     {
1388         write_lock(&pool->pool_rwlock);
1389         obj_free(obj);
1390         write_unlock(&pool->pool_rwlock);
1391     } else {
1392         spin_unlock(&obj->obj_spinlock);
1393     }
1394     pool->flushs_found++;
1395 
1396 out:
1397     if ( pool->client->info.flags.u.frozen )
1398         return -EFROZEN;
1399     else
1400         return 1;
1401 }
1402 
1403 static int do_tmem_flush_object(struct tmem_pool *pool,
1404                                 struct xen_tmem_oid *oidp)
1405 {
1406     struct tmem_object_root *obj;
1407 
1408     pool->flush_objs++;
1409     obj = obj_find(pool,oidp);
1410     if ( obj == NULL )
1411         goto out;
1412     write_lock(&pool->pool_rwlock);
1413     obj_destroy(obj);
1414     pool->flush_objs_found++;
1415     write_unlock(&pool->pool_rwlock);
1416 
1417 out:
1418     if ( pool->client->info.flags.u.frozen )
1419         return -EFROZEN;
1420     else
1421         return 1;
1422 }
1423 
1424 static int do_tmem_destroy_pool(uint32_t pool_id)
1425 {
1426     struct client *client = current->domain->tmem_client;
1427     struct tmem_pool *pool;
1428 
1429     if ( pool_id >= MAX_POOLS_PER_DOMAIN )
1430         return 0;
1431     if ( (pool = client->pools[pool_id]) == NULL )
1432         return 0;
1433     client->pools[pool_id] = NULL;
1434     pool_flush(pool, client->cli_id);
1435     client->info.nr_pools--;
1436     return 1;
1437 }
1438 
1439 int do_tmem_new_pool(domid_t this_cli_id,
1440                      uint32_t d_poolid, uint32_t flags,
1441                      uint64_t uuid_lo, uint64_t uuid_hi)
1442 {
1443     struct client *client;
1444     domid_t cli_id;
1445     int persistent = flags & TMEM_POOL_PERSIST;
1446     int shared = flags & TMEM_POOL_SHARED;
1447     int pagebits = (flags >> TMEM_POOL_PAGESIZE_SHIFT)
1448          & TMEM_POOL_PAGESIZE_MASK;
1449     int specversion = (flags >> TMEM_POOL_VERSION_SHIFT)
1450          & TMEM_POOL_VERSION_MASK;
1451     struct tmem_pool *pool, *shpool;
1452     int i, first_unused_s_poolid;
1453 
1454     if ( this_cli_id == TMEM_CLI_ID_NULL )
1455         cli_id = current->domain->domain_id;
1456     else
1457         cli_id = this_cli_id;
1458     tmem_client_info("tmem: allocating %s-%s tmem pool for %s=%d...",
1459         persistent ? "persistent" : "ephemeral" ,
1460         shared ? "shared" : "private", tmem_cli_id_str, cli_id);
1461     if ( specversion != TMEM_SPEC_VERSION )
1462     {
1463         tmem_client_err("failed... unsupported spec version\n");
1464         return -EPERM;
1465     }
1466     if ( shared && persistent )
1467     {
1468         tmem_client_err("failed... unable to create a shared-persistent pool\n");
1469         return -EPERM;
1470     }
1471     if ( pagebits != (PAGE_SHIFT - 12) )
1472     {
1473         tmem_client_err("failed... unsupported pagesize %d\n",
1474                        1 << (pagebits + 12));
1475         return -EPERM;
1476     }
1477     if ( flags & TMEM_POOL_PRECOMPRESSED )
1478     {
1479         tmem_client_err("failed... precompression flag set but unsupported\n");
1480         return -EPERM;
1481     }
1482     if ( flags & TMEM_POOL_RESERVED_BITS )
1483     {
1484         tmem_client_err("failed... reserved bits must be zero\n");
1485         return -EPERM;
1486     }
1487     if ( this_cli_id != TMEM_CLI_ID_NULL )
1488     {
1489         if ( (client = tmem_client_from_cli_id(this_cli_id)) == NULL
1490              || d_poolid >= MAX_POOLS_PER_DOMAIN
1491              || client->pools[d_poolid] != NULL )
1492             return -EPERM;
1493     }
1494     else
1495     {
1496         client = current->domain->tmem_client;
1497         ASSERT(client != NULL);
1498         for ( d_poolid = 0; d_poolid < MAX_POOLS_PER_DOMAIN; d_poolid++ )
1499             if ( client->pools[d_poolid] == NULL )
1500                 break;
1501         if ( d_poolid >= MAX_POOLS_PER_DOMAIN )
1502         {
1503             tmem_client_err("failed... no more pool slots available for this %s\n",
1504                    tmem_client_str);
1505             return -EPERM;
1506         }
1507     }
1508 
1509     if ( (pool = pool_alloc()) == NULL )
1510     {
1511         tmem_client_err("failed... out of memory\n");
1512         return -ENOMEM;
1513     }
1514     client->pools[d_poolid] = pool;
1515     pool->client = client;
1516     pool->pool_id = d_poolid;
1517     pool->shared = shared;
1518     pool->persistent = persistent;
1519     pool->uuid[0] = uuid_lo;
1520     pool->uuid[1] = uuid_hi;
1521 
1522     /*
1523      * A pool has already been created when we arrive here, but a shared
1524      * pool needs some extra processing.
1525      */
1526     if ( shared )
1527     {
1528         if ( uuid_lo == -1L && uuid_hi == -1L )
1529         {
1530             tmem_client_info("Invalid uuid, create non shared pool instead!\n");
1531             pool->shared = 0;
1532             goto out;
1533         }
1534         if ( !tmem_global.shared_auth )
1535         {
1536             for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++)
1537                 if ( (client->shared_auth_uuid[i][0] == uuid_lo) &&
1538                      (client->shared_auth_uuid[i][1] == uuid_hi) )
1539                     break;
1540             if ( i == MAX_GLOBAL_SHARED_POOLS )
1541             {
1542                 tmem_client_info("Shared auth failed, create non shared pool instead!\n");
1543                 pool->shared = 0;
1544                 goto out;
1545             }
1546         }
1547 
1548         /*
1549          * Authorization okay; match an existing global shared pool or use
1550          * the newly allocated one.
1551          */
1552         first_unused_s_poolid = MAX_GLOBAL_SHARED_POOLS;
1553         for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++ )
1554         {
1555             if ( (shpool = tmem_global.shared_pools[i]) != NULL )
1556             {
1557                 if ( shpool->uuid[0] == uuid_lo && shpool->uuid[1] == uuid_hi )
1558                 {
1559                     /* Succeeded in matching a global shared pool. */
1560                     tmem_client_info("(matches shared pool uuid=%"PRIx64".%"PRIx64") pool_id=%d\n",
1561                         uuid_hi, uuid_lo, d_poolid);
1562                     client->pools[d_poolid] = shpool;
1563                     if ( !shared_pool_join(shpool, client) )
1564                     {
1565                         pool_free(pool);
1566                         goto out;
1567                     }
1568                     else
1569                         goto fail;
1570                 }
1571             }
1572             else
1573             {
1574                 if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
1575                     first_unused_s_poolid = i;
1576             }
1577         }
1578 
1579         /* Failed to find a global shared pool slot. */
1580         if ( first_unused_s_poolid == MAX_GLOBAL_SHARED_POOLS )
1581         {
1582             tmem_client_warn("tmem: failed... no global shared pool slots available\n");
1583             goto fail;
1584         }
1585         /* Add pool to global shared pool. */
1586         else
1587         {
1588             INIT_LIST_HEAD(&pool->share_list);
1589             pool->shared_count = 0;
1590             if ( shared_pool_join(pool, client) )
1591                 goto fail;
1592             tmem_global.shared_pools[first_unused_s_poolid] = pool;
1593         }
1594     }
1595 
1596 out:
1597     tmem_client_info("pool_id=%d\n", d_poolid);
1598     client->info.nr_pools++;
1599     return d_poolid;
1600 
1601 fail:
1602     pool_free(pool);
1603     return -EPERM;
1604 }
1605 
1606 /************ TMEM CONTROL OPERATIONS ************************************/
1607 
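/*
 * Add (auth != 0) or remove (auth == 0) a shared pool uuid in a client's
 * shared_auth_uuid table; cli_id == TMEM_CLI_ID_NULL instead sets the
 * global shared_auth flag.  Returns 1 on success, 0 if a uuid to be removed
 * was not found, -EINVAL for an unknown client, or -ENOMEM if the table is
 * full.
 */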
1608 int tmemc_shared_pool_auth(domid_t cli_id, uint64_t uuid_lo,
1609                            uint64_t uuid_hi, bool auth)
1610 {
1611     struct client *client;
1612     int i, free = -1;
1613 
1614     if ( cli_id == TMEM_CLI_ID_NULL )
1615     {
1616         tmem_global.shared_auth = auth;
1617         return 1;
1618     }
1619     client = tmem_client_from_cli_id(cli_id);
1620     if ( client == NULL )
1621         return -EINVAL;
1622 
1623     for ( i = 0; i < MAX_GLOBAL_SHARED_POOLS; i++)
1624     {
1625         if ( auth == 0 )
1626         {
1627             if ( (client->shared_auth_uuid[i][0] == uuid_lo) &&
1628                     (client->shared_auth_uuid[i][1] == uuid_hi) )
1629             {
1630                 client->shared_auth_uuid[i][0] = -1L;
1631                 client->shared_auth_uuid[i][1] = -1L;
1632                 return 1;
1633             }
1634         }
1635         else
1636         {
1637             if ( (client->shared_auth_uuid[i][0] == -1L) &&
1638                     (client->shared_auth_uuid[i][1] == -1L) )
1639             {
1640                 free = i;
1641                 break;
1642             }
1643 	}
1644     }
1645     if ( auth == 0 )
1646         return 0;
1647     else if ( free == -1)
1648         return -ENOMEM;
1649     else
1650     {
1651         client->shared_auth_uuid[free][0] = uuid_lo;
1652         client->shared_auth_uuid[free][1] = uuid_hi;
1653         return 1;
1654     }
1655 }
1656 
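/*
 * Save/restore control subops: SAVE_BEGIN freezes the client (and marks it
 * migrating when arg != 0), RESTORE_BEGIN creates a fresh client for the
 * incoming domain, and SAVE_END releases the pages on the persistent
 * invalidated list and restores the client's previous frozen state.
 */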
static int tmemc_save_subop(int cli_id, uint32_t pool_id,
                            uint32_t subop, tmem_cli_va_param_t buf, uint32_t arg)
{
    struct client *client = tmem_client_from_cli_id(cli_id);
    uint32_t p;
    struct tmem_page_descriptor *pgp, *pgp2;
    int rc = -ENOENT;

    switch ( subop )
    {
    case XEN_SYSCTL_TMEM_OP_SAVE_BEGIN:
        if ( client == NULL )
            break;
        for ( p = 0; p < MAX_POOLS_PER_DOMAIN; p++ )
            if ( client->pools[p] != NULL )
                break;

        if ( p == MAX_POOLS_PER_DOMAIN )
            break;

        client->was_frozen = client->info.flags.u.frozen;
        client->info.flags.u.frozen = 1;
        if ( arg != 0 )
            client->info.flags.u.migrating = 1;
        rc = 0;
        break;
    case XEN_SYSCTL_TMEM_OP_RESTORE_BEGIN:
        if ( client == NULL )
            rc = client_create(cli_id) ? 0 : -ENOMEM;
        else
            rc = -EEXIST;
        break;
    case XEN_SYSCTL_TMEM_OP_SAVE_END:
        if ( client == NULL )
            break;
        client->info.flags.u.migrating = 0;
        if ( !list_empty(&client->persistent_invalidated_list) )
            list_for_each_entry_safe(pgp, pgp2,
                &client->persistent_invalidated_list, client_inv_pages)
                __pgp_free(pgp, client->pools[pgp->pool_id]);
        client->info.flags.u.frozen = client->was_frozen;
        rc = 0;
        break;
    }
    return rc;
}

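/*
 * Copy the next persistent page of a pool to the guest buffer: a struct
 * tmem_handle (pool id, object id, index) followed by the page data, fetched
 * via do_tmem_get().  pool->cur_pgp is the iteration cursor; returns -1 once
 * the pool's persistent page list is exhausted.
 */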
static int tmemc_save_get_next_page(int cli_id, uint32_t pool_id,
                                    tmem_cli_va_param_t buf, uint32_t bufsize)
{
    struct client *client = tmem_client_from_cli_id(cli_id);
    struct tmem_pool *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
                   ? NULL : client->pools[pool_id];
    struct tmem_page_descriptor *pgp;
    struct xen_tmem_oid *oid;
    int ret = 0;
    struct tmem_handle h;

    if ( pool == NULL || !is_persistent(pool) )
        return -1;

    if ( bufsize < PAGE_SIZE + sizeof(struct tmem_handle) )
        return -ENOMEM;

    spin_lock(&pers_lists_spinlock);
    if ( list_empty(&pool->persistent_page_list) )
    {
        ret = -1;
        goto out;
    }
    /* Note: pool->cur_pgp is the pgp last returned by get_next_page. */
    if ( pool->cur_pgp == NULL )
    {
        /* Process the first one. */
        pool->cur_pgp = pgp = list_entry((&pool->persistent_page_list)->next,
                                         struct tmem_page_descriptor, us.pool_pers_pages);
    }
    else if ( list_is_last(&pool->cur_pgp->us.pool_pers_pages,
                           &pool->persistent_page_list) )
    {
        /* Already processed the last one in the list. */
        ret = -1;
        goto out;
    }
    pgp = list_entry((&pool->cur_pgp->us.pool_pers_pages)->next,
                     struct tmem_page_descriptor, us.pool_pers_pages);
    pool->cur_pgp = pgp;
    oid = &pgp->us.obj->oid;
    h.pool_id = pool_id;
    BUILD_BUG_ON(sizeof(h.oid) != sizeof(*oid));
    memcpy(&(h.oid), oid, sizeof(h.oid));
    h.index = pgp->index;
    if ( copy_to_guest(guest_handle_cast(buf, void), &h, 1) )
    {
        ret = -EFAULT;
        goto out;
    }
    guest_handle_add_offset(buf, sizeof(h));
    ret = do_tmem_get(pool, oid, pgp->index, 0, buf);

out:
    spin_unlock(&pers_lists_spinlock);
    return ret;
}

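/*
 * Report the next entry on the client's persistent_invalidated_list, i.e. a
 * persistent page flushed while the save was in progress, presumably so that
 * the restore side can discard its stale copy.  Returns 1 when a handle was
 * copied to the buffer and 0 once the list is exhausted.
 */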
static int tmemc_save_get_next_inv(int cli_id, tmem_cli_va_param_t buf,
                                   uint32_t bufsize)
{
    struct client *client = tmem_client_from_cli_id(cli_id);
    struct tmem_page_descriptor *pgp;
    struct tmem_handle h;
    int ret = 0;

    if ( client == NULL )
        return 0;
    if ( bufsize < sizeof(struct tmem_handle) )
        return 0;
    spin_lock(&pers_lists_spinlock);
    if ( list_empty(&client->persistent_invalidated_list) )
        goto out;
    if ( client->cur_pgp == NULL )
    {
        pgp = list_entry((&client->persistent_invalidated_list)->next,
                         struct tmem_page_descriptor, client_inv_pages);
        client->cur_pgp = pgp;
    }
    else if ( list_is_last(&client->cur_pgp->client_inv_pages,
                           &client->persistent_invalidated_list) )
    {
        client->cur_pgp = NULL;
        ret = 0;
        goto out;
    }
    else
    {
        pgp = list_entry((&client->cur_pgp->client_inv_pages)->next,
                         struct tmem_page_descriptor, client_inv_pages);
        client->cur_pgp = pgp;
    }
    h.pool_id = pgp->pool_id;
    BUILD_BUG_ON(sizeof(h.oid) != sizeof(pgp->inv_oid));
    memcpy(&(h.oid), &(pgp->inv_oid), sizeof(h.oid));
    h.index = pgp->index;
    ret = 1;
    if ( copy_to_guest(guest_handle_cast(buf, void), &h, 1) )
        ret = -EFAULT;
out:
    spin_unlock(&pers_lists_spinlock);
    return ret;
}

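/*
 * Restore a single saved page into the given pool.  The buffer must be
 * exactly PAGE_SIZE bytes and is fed through the normal do_tmem_put() path.
 */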
static int tmemc_restore_put_page(int cli_id, uint32_t pool_id,
                                  struct xen_tmem_oid *oidp,
                                  uint32_t index, tmem_cli_va_param_t buf,
                                  uint32_t bufsize)
{
    struct client *client = tmem_client_from_cli_id(cli_id);
    struct tmem_pool *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
                   ? NULL : client->pools[pool_id];

    if ( pool == NULL )
        return -1;
    if ( bufsize != PAGE_SIZE )
    {
        tmem_client_err("tmem: %s: invalid parameter bufsize(%d) != (%ld)\n",
                        __func__, bufsize, PAGE_SIZE);
        return -EINVAL;
    }
    return do_tmem_put(pool, oidp, index, 0, buf);
}

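/*
 * Flush one page during restore, presumably matching a handle reported by
 * tmemc_save_get_next_inv() on the save side.
 */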
static int tmemc_restore_flush_page(int cli_id, uint32_t pool_id,
                                    struct xen_tmem_oid *oidp,
                                    uint32_t index)
{
    struct client *client = tmem_client_from_cli_id(cli_id);
    struct tmem_pool *pool = (client == NULL || pool_id >= MAX_POOLS_PER_DOMAIN)
                   ? NULL : client->pools[pool_id];

    if ( pool == NULL )
        return -1;
    return do_tmem_flush_page(pool, oidp, index);
}

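/*
 * Dispatch a XEN_SYSCTL_TMEM_OP_* command to the handlers above.  The caller
 * must already hold tmem_rwlock for writing (asserted below).
 */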
int do_tmem_control(struct xen_sysctl_tmem_op *op)
{
    int ret;
    uint32_t pool_id = op->pool_id;
    uint32_t cmd = op->cmd;
    struct xen_tmem_oid *oidp = &op->oid;

    ASSERT(rw_is_write_locked(&tmem_rwlock));

    switch ( cmd )
    {
    case XEN_SYSCTL_TMEM_OP_SAVE_BEGIN:
    case XEN_SYSCTL_TMEM_OP_RESTORE_BEGIN:
    case XEN_SYSCTL_TMEM_OP_SAVE_END:
        ret = tmemc_save_subop(op->cli_id, pool_id, cmd,
                               guest_handle_cast(op->u.buf, char), op->arg);
        break;
    case XEN_SYSCTL_TMEM_OP_SAVE_GET_NEXT_PAGE:
        ret = tmemc_save_get_next_page(op->cli_id, pool_id,
                                       guest_handle_cast(op->u.buf, char), op->len);
        break;
    case XEN_SYSCTL_TMEM_OP_SAVE_GET_NEXT_INV:
        ret = tmemc_save_get_next_inv(op->cli_id,
                                      guest_handle_cast(op->u.buf, char), op->len);
        break;
    case XEN_SYSCTL_TMEM_OP_RESTORE_PUT_PAGE:
        ret = tmemc_restore_put_page(op->cli_id, pool_id, oidp, op->arg,
                                     guest_handle_cast(op->u.buf, char), op->len);
        break;
    case XEN_SYSCTL_TMEM_OP_RESTORE_FLUSH_PAGE:
        ret = tmemc_restore_flush_page(op->cli_id, pool_id, oidp, op->arg);
        break;
    default:
        ret = -1;
    }

    return ret;
}

/************ EXPORTed FUNCTIONS **************************************/

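/*
 * Main tmem hypercall entry point: copy the guest's tmem_op, take
 * tmem_rwlock for writing, create the per-client state on first use, and
 * drop to the read lock for the ordinary put/get/flush data path.
 */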
long do_tmem_op(tmem_cli_op_t uops)
{
    struct tmem_op op;
    struct client *client = current->domain->tmem_client;
    struct tmem_pool *pool = NULL;
    struct xen_tmem_oid *oidp;
    int rc = 0;

    if ( !tmem_initialized )
        return -ENODEV;

    if ( xsm_tmem_op(XSM_HOOK) )
        return -EPERM;

    tmem_stats.total_tmem_ops++;

    if ( client != NULL && client->domain->is_dying )
    {
        tmem_stats.errored_tmem_ops++;
        return -ENODEV;
    }

    if ( unlikely(tmem_get_tmemop_from_client(&op, uops) != 0) )
    {
        tmem_client_err("tmem: can't get tmem struct from %s\n", tmem_client_str);
        tmem_stats.errored_tmem_ops++;
        return -EFAULT;
    }

    /* All commands initially take the write lock. */
    write_lock(&tmem_rwlock);

    switch ( op.cmd )
    {
    case TMEM_CONTROL:
    case TMEM_RESTORE_NEW:
    case TMEM_AUTH:
        rc = -EOPNOTSUPP;
        break;

    default:
        /*
         * For other commands, create the per-client tmem structure
         * dynamically on first use by the client.
         */
        if ( client == NULL )
        {
            if ( (client = client_create(current->domain->domain_id)) == NULL )
            {
                tmem_client_err("tmem: can't create tmem structure for %s\n",
                                tmem_client_str);
                rc = -ENOMEM;
                goto out;
            }
        }

        if ( op.cmd == TMEM_NEW_POOL || op.cmd == TMEM_DESTROY_POOL )
        {
            if ( op.cmd == TMEM_NEW_POOL )
                rc = do_tmem_new_pool(TMEM_CLI_ID_NULL, 0, op.u.creat.flags,
                                      op.u.creat.uuid[0], op.u.creat.uuid[1]);
            else
                rc = do_tmem_destroy_pool(op.pool_id);
        }
        else
        {
            if ( ((uint32_t)op.pool_id >= MAX_POOLS_PER_DOMAIN) ||
                 ((pool = client->pools[op.pool_id]) == NULL) )
            {
                tmem_client_err("tmem: operation requested on uncreated pool\n");
                rc = -ENODEV;
                goto out;
            }
            /* Commands that only need read lock. */
            write_unlock(&tmem_rwlock);
            read_lock(&tmem_rwlock);

            oidp = &op.u.gen.oid;
            switch ( op.cmd )
            {
            case TMEM_NEW_POOL:
            case TMEM_DESTROY_POOL:
                BUG(); /* Done earlier. */
                break;
            case TMEM_PUT_PAGE:
                if ( tmem_ensure_avail_pages() )
                    rc = do_tmem_put(pool, oidp, op.u.gen.index, op.u.gen.cmfn,
                                     tmem_cli_buf_null);
                else
                    rc = -ENOMEM;
                break;
            case TMEM_GET_PAGE:
                rc = do_tmem_get(pool, oidp, op.u.gen.index, op.u.gen.cmfn,
                                 tmem_cli_buf_null);
                break;
            case TMEM_FLUSH_PAGE:
                rc = do_tmem_flush_page(pool, oidp, op.u.gen.index);
                break;
            case TMEM_FLUSH_OBJECT:
                rc = do_tmem_flush_object(pool, oidp);
                break;
            default:
                tmem_client_warn("tmem: op %d not implemented\n", op.cmd);
                rc = -ENOSYS;
                break;
            }
            read_unlock(&tmem_rwlock);
            if ( rc < 0 )
                tmem_stats.errored_tmem_ops++;
            return rc;
        }
        break;
    }
out:
    write_unlock(&tmem_rwlock);
    if ( rc < 0 )
        tmem_stats.errored_tmem_ops++;
    return rc;
}

/* This should be called when the host is destroying a client (domain). */
void tmem_destroy(void *v)
{
    struct client *client = (struct client *)v;

    if ( client == NULL )
        return;

    if ( !client->domain->is_dying )
    {
        printk("tmem: tmem_destroy can only destroy dying client\n");
        return;
    }

    write_lock(&tmem_rwlock);

    printk("tmem: flushing tmem pools for %s=%d\n",
           tmem_cli_id_str, client->cli_id);
    client_flush(client);

    write_unlock(&tmem_rwlock);
}

#define MAX_EVICTS 10  /* Should be variable or set via XEN_SYSCTL_TMEM_OP_ ?? */
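/*
 * Give back one (order-0) page to the host, evidently for use when the
 * allocator is short on memory: evict ephemeral pages until one can be taken
 * from tmem_page_list, and scrub it unless MEMF_tmem is set.
 */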
void *tmem_relinquish_pages(unsigned int order, unsigned int memflags)
{
    struct page_info *pfp;
    unsigned long evicts_per_relinq = 0;
    int max_evictions = MAX_EVICTS;

    if ( !tmem_enabled() || !tmem_freeable_pages() )
        return NULL;

    tmem_stats.relinq_attempts++;
    if ( order > 0 )
    {
#ifndef NDEBUG
        printk("tmem_relinquish_pages: failing order=%d\n", order);
#endif
        return NULL;
    }

    while ( (pfp = tmem_page_list_get()) == NULL )
    {
        if ( (max_evictions-- <= 0) || !tmem_evict() )
            break;
        evicts_per_relinq++;
    }
    if ( evicts_per_relinq > tmem_stats.max_evicts_per_relinq )
        tmem_stats.max_evicts_per_relinq = evicts_per_relinq;
    if ( pfp != NULL )
    {
        if ( !(memflags & MEMF_tmem) )
            scrub_one_page(pfp);
        tmem_stats.relinq_pgs++;
    }

    return pfp;
}

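/* Number of pages tmem could hand back to the allocator right now. */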
unsigned long tmem_freeable_pages(void)
{
    if ( !tmem_enabled() )
        return 0;

    return tmem_page_list_pages + _atomic_read(freeable_page_count);
}

/* Called at hypervisor startup. */
static int __init init_tmem(void)
{
    if ( !tmem_enabled() )
        return 0;

    if ( !tmem_mempool_init() )
        return 0;

    if ( tmem_init() )
    {
        printk("tmem: initialized comp=%d\n", tmem_compression_enabled());
        tmem_initialized = 1;
    }
    else
        printk("tmem: initialization FAILED\n");

    return 0;
}
__initcall(init_tmem);

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */