/******************************************************************************
 * arch/x86/mm/p2m-pod.c
 *
 * Populate-on-demand p2m entries.
 *
 * Copyright (c) 2009-2011 Citrix Systems, Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; If not, see <http://www.gnu.org/licenses/>.
 */

#include <xen/event.h>
#include <xen/mm.h>
#include <xen/sched.h>
#include <xen/trace.h>
#include <asm/page.h>
#include <asm/paging.h>
#include <asm/p2m.h>

#include "mm-locks.h"

/* Override macros from asm/page.h to make them work with mfn_t */
#undef mfn_to_page
#define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
#undef page_to_mfn
#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))

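/* True iff the given frame number is aligned to a (2MB) superpage boundary. */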
#define superpage_aligned(_x)  (((_x)&(SUPERPAGE_PAGES-1))==0)

/* Enforce lock ordering when grabbing the "external" page_alloc lock */
static inline void lock_page_alloc(struct p2m_domain *p2m)
{
    page_alloc_mm_pre_lock();
    spin_lock(&(p2m->domain->page_alloc_lock));
    page_alloc_mm_post_lock(p2m->domain->arch.page_alloc_unlock_level);
}

static inline void unlock_page_alloc(struct p2m_domain *p2m)
{
    page_alloc_mm_unlock(p2m->domain->arch.page_alloc_unlock_level);
    spin_unlock(&(p2m->domain->page_alloc_lock));
}

/*
 * Populate-on-demand functionality
 */

static int
p2m_pod_cache_add(struct p2m_domain *p2m,
                  struct page_info *page,
                  unsigned int order)
{
    unsigned long i;
    struct page_info *p;
    struct domain *d = p2m->domain;

#ifndef NDEBUG
    mfn_t mfn;

    mfn = page_to_mfn(page);

    /* Check to make sure this is a contiguous region */
    if ( mfn_x(mfn) & ((1UL << order) - 1) )
    {
        printk("%s: mfn %lx not aligned order %u! (mask %lx)\n",
               __func__, mfn_x(mfn), order, ((1UL << order) - 1));
        return -1;
    }

    for ( i = 0; i < 1UL << order ; i++)
    {
        struct domain * od;

        p = mfn_to_page(_mfn(mfn_x(mfn) + i));
        od = page_get_owner(p);
        if ( od != d )
        {
            printk("%s: mfn %lx expected owner d%d, got owner d%d!\n",
                   __func__, mfn_x(mfn), d->domain_id,
                   od ? od->domain_id : -1);
            return -1;
        }
    }
#endif

    ASSERT(pod_locked_by_me(p2m));

    /*
     * Pages from domain_alloc and returned by the balloon driver aren't
     * guaranteed to be zero; but by reclaiming zero pages, we implicitly
     * promise to provide zero pages. So we scrub pages before using.
     */
    for ( i = 0; i < (1UL << order); i++ )
        clear_domain_page(mfn_add(page_to_mfn(page), i));

    /* First, take all pages off the domain list */
    lock_page_alloc(p2m);
    for ( i = 0; i < 1UL << order ; i++ )
    {
        p = page + i;
        page_list_del(p, &d->page_list);
    }

    unlock_page_alloc(p2m);

    /* Then add to the appropriate populate-on-demand list. */
    switch ( order )
    {
    case PAGE_ORDER_1G:
        for ( i = 0; i < (1UL << PAGE_ORDER_1G); i += 1UL << PAGE_ORDER_2M )
            page_list_add_tail(page + i, &p2m->pod.super);
        break;
    case PAGE_ORDER_2M:
        page_list_add_tail(page, &p2m->pod.super);
        break;
    case PAGE_ORDER_4K:
        page_list_add_tail(page, &p2m->pod.single);
        break;
    default:
        BUG();
    }
    p2m->pod.count += 1UL << order;

    return 0;
}

/* Get a page of size order from the populate-on-demand cache.  Will break
 * down 2-meg pages into singleton pages automatically.  Returns null if
 * a superpage is requested and no superpages are available. */
static struct page_info * p2m_pod_cache_get(struct p2m_domain *p2m,
                                            unsigned int order)
{
    struct page_info *p = NULL;
    unsigned long i;

    ASSERT(pod_locked_by_me(p2m));

    if ( order == PAGE_ORDER_2M && page_list_empty(&p2m->pod.super) )
    {
        return NULL;
    }
    else if ( order == PAGE_ORDER_4K && page_list_empty(&p2m->pod.single) )
    {
        unsigned long mfn;
        struct page_info *q;

        BUG_ON( page_list_empty(&p2m->pod.super) );

        /*
         * Break up a superpage to make single pages. NB count doesn't
         * need to be adjusted.
         */
        p = page_list_remove_head(&p2m->pod.super);
        mfn = mfn_x(page_to_mfn(p));

        for ( i = 0; i < SUPERPAGE_PAGES; i++ )
        {
            q = mfn_to_page(_mfn(mfn+i));
            page_list_add_tail(q, &p2m->pod.single);
        }
    }

    switch ( order )
    {
    case PAGE_ORDER_2M:
        BUG_ON( page_list_empty(&p2m->pod.super) );
        p = page_list_remove_head(&p2m->pod.super);
        p2m->pod.count -= 1UL << order;
        break;
    case PAGE_ORDER_4K:
        BUG_ON( page_list_empty(&p2m->pod.single) );
        p = page_list_remove_head(&p2m->pod.single);
        p2m->pod.count -= 1UL;
        break;
    default:
        BUG();
    }

    /* Put the pages back on the domain page_list */
    lock_page_alloc(p2m);
    for ( i = 0 ; i < (1UL << order); i++ )
    {
        BUG_ON(page_get_owner(p + i) != p2m->domain);
        page_list_add_tail(p + i, &p2m->domain->page_list);
    }
    unlock_page_alloc(p2m);

    return p;
}

/* Set the size of the cache, allocating or freeing as necessary. */
static int
p2m_pod_set_cache_target(struct p2m_domain *p2m, unsigned long pod_target, int preemptible)
{
    struct domain *d = p2m->domain;
    int ret = 0;

    ASSERT(pod_locked_by_me(p2m));

    /* Increasing the target */
    while ( pod_target > p2m->pod.count )
    {
        struct page_info * page;
        int order;

        if ( (pod_target - p2m->pod.count) >= SUPERPAGE_PAGES )
            order = PAGE_ORDER_2M;
        else
            order = PAGE_ORDER_4K;
    retry:
        page = alloc_domheap_pages(d, order, 0);
        if ( unlikely(page == NULL) )
        {
            if ( order == PAGE_ORDER_2M )
            {
                /* If we can't allocate a superpage, try singleton pages */
                order = PAGE_ORDER_4K;
                goto retry;
            }

            printk("%s: Unable to allocate page for PoD cache (target=%lu cache=%ld)\n",
                   __func__, pod_target, p2m->pod.count);
            ret = -ENOMEM;
            goto out;
        }

        p2m_pod_cache_add(p2m, page, order);

        if ( preemptible && pod_target != p2m->pod.count &&
             hypercall_preempt_check() )
        {
            ret = -ERESTART;
            goto out;
        }
    }

    /* Decreasing the target */
    /*
     * We hold the pod lock here, so we don't need to worry about
     * cache disappearing under our feet.
     */
    while ( pod_target < p2m->pod.count )
    {
        struct page_info * page;
        unsigned int order;
        unsigned long i;

        if ( (p2m->pod.count - pod_target) > SUPERPAGE_PAGES
             && !page_list_empty(&p2m->pod.super) )
            order = PAGE_ORDER_2M;
        else
            order = PAGE_ORDER_4K;

        page = p2m_pod_cache_get(p2m, order);

        ASSERT(page != NULL);

        /* Then free them */
        for ( i = 0 ; i < (1UL << order) ; i++ )
        {
            /* Copied from common/memory.c:guest_remove_page() */
            if ( unlikely(!get_page(page + i, d)) )
            {
                gdprintk(XENLOG_INFO, "Bad page free for domain %u\n", d->domain_id);
                ret = -EINVAL;
                goto out;
            }

            if ( test_and_clear_bit(_PGT_pinned, &(page+i)->u.inuse.type_info) )
                put_page_and_type(page + i);

            if ( test_and_clear_bit(_PGC_allocated, &(page+i)->count_info) )
                put_page(page + i);

            put_page(page + i);

            if ( preemptible && pod_target != p2m->pod.count &&
                 hypercall_preempt_check() )
            {
                ret = -ERESTART;
                goto out;
            }
        }
    }

out:
    return ret;
}

/*
 * The "right behavior" here requires some careful thought.  First, some
 * definitions:
 * + M: static_max
 * + B: number of pages the balloon driver has ballooned down to.
 * + P: Number of populated pages.
 * + T: Old target
 * + T': New target
 *
 * The following equations should hold:
 *  0 <= P <= T <= B <= M
 *  d->arch.p2m->pod.entry_count == B - P
 *  d->tot_pages == P + d->arch.p2m->pod.count
 *
 * Now we have the following potential cases to cover:
 *     B <T': Set the PoD cache size equal to the number of outstanding PoD
 *   entries.  The balloon driver will deflate the balloon to give back
 *   the remainder of the ram to the guest OS.
 *  T <T'<B : Increase PoD cache size.
 *  T'<T<=B : Here we have a choice.  We can decrease the size of the cache,
 *   get the memory right away.  However, that means every time we
 *   reduce the memory target we risk the guest attempting to populate the
 *   memory before the balloon driver has reached its new target.  Safer to
 *   never reduce the cache size here, but only when the balloon driver frees
 *   PoD ranges.
 *
 * If there are many zero pages, we could reach the target also by doing
 * zero sweeps and marking the ranges PoD; but the balloon driver will have
 * to free this memory eventually anyway, so we don't actually gain that much
 * by doing so.
 *
 * NB that the equation (B<T') may require adjustment to the cache
 * size as PoD pages are freed as well; i.e., freeing a PoD-backed
 * entry when pod.entry_count == pod.count requires us to reduce both
 * pod.entry_count and pod.count.
 */
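/*
 * A worked example (all numbers hypothetical): say B = 768, P = 256 and the
 * old target T = 512.  Then pod.entry_count == B - P == 512, the cache was
 * sized at T - P == 256, and d->tot_pages == 256 + 256 == 512.  A new target
 * T' = 640 falls in the "T < T' < B" case above, so the cache simply grows
 * to T' - P == 384 pages and the rest is left to the balloon driver.
 */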
int
p2m_pod_set_mem_target(struct domain *d, unsigned long target)
{
    struct p2m_domain *p2m = p2m_get_hostp2m(d);
    int ret = 0;
    unsigned long populated, pod_target;

    pod_lock(p2m);

    /* P == B: Nothing to do (unless the guest is being created). */
    populated = d->tot_pages - p2m->pod.count;
    if ( populated > 0 && p2m->pod.entry_count == 0 )
        goto out;

    /* Don't do anything if the domain is being torn down */
    if ( d->is_dying )
        goto out;

    /*
     * T' < B: Don't reduce the cache size; let the balloon driver
     * take care of it.
     */
    if ( target < d->tot_pages )
        goto out;

    pod_target = target - populated;

    /*
     * B < T': Set the cache size equal to # of outstanding entries,
     * let the balloon driver fill in the rest.
     */
    if ( populated > 0 && pod_target > p2m->pod.entry_count )
        pod_target = p2m->pod.entry_count;

    ASSERT( pod_target >= p2m->pod.count );

    ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);

out:
    pod_unlock(p2m);

    return ret;
}

int p2m_pod_empty_cache(struct domain *d)
{
    struct p2m_domain *p2m = p2m_get_hostp2m(d);
    struct page_info *page;
    unsigned int i;

    /* After this barrier no new PoD activities can happen. */
    BUG_ON(!d->is_dying);
    spin_barrier(&p2m->pod.lock.lock);

    lock_page_alloc(p2m);

    while ( (page = page_list_remove_head(&p2m->pod.super)) )
    {
        for ( i = 0 ; i < SUPERPAGE_PAGES ; i++ )
        {
            BUG_ON(page_get_owner(page + i) != d);
            page_list_add_tail(page + i, &d->page_list);
        }

        p2m->pod.count -= SUPERPAGE_PAGES;

        if ( hypercall_preempt_check() )
            goto out;
    }

    for ( i = 0; (page = page_list_remove_head(&p2m->pod.single)); ++i )
    {
        BUG_ON(page_get_owner(page) != d);
        page_list_add_tail(page, &d->page_list);

        p2m->pod.count -= 1;

        if ( i && !(i & 511) && hypercall_preempt_check() )
            goto out;
    }

    BUG_ON(p2m->pod.count != 0);

 out:
    unlock_page_alloc(p2m);
    return p2m->pod.count ? -ERESTART : 0;
}

int
p2m_pod_offline_or_broken_hit(struct page_info *p)
{
    struct domain *d;
    struct p2m_domain *p2m;
    struct page_info *q, *tmp;
    unsigned long mfn, bmfn;

    if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) )
        return 0;

    pod_lock(p2m);
    bmfn = mfn_x(page_to_mfn(p));
    page_list_for_each_safe(q, tmp, &p2m->pod.super)
    {
        mfn = mfn_x(page_to_mfn(q));
        if ( (bmfn >= mfn) && ((bmfn - mfn) < SUPERPAGE_PAGES) )
        {
            unsigned long i;
            page_list_del(q, &p2m->pod.super);
            for ( i = 0; i < SUPERPAGE_PAGES; i++)
            {
                q = mfn_to_page(_mfn(mfn + i));
                page_list_add_tail(q, &p2m->pod.single);
            }
            page_list_del(p, &p2m->pod.single);
            p2m->pod.count--;
            goto pod_hit;
        }
    }

    page_list_for_each_safe(q, tmp, &p2m->pod.single)
    {
        mfn = mfn_x(page_to_mfn(q));
        if ( mfn == bmfn )
        {
            page_list_del(p, &p2m->pod.single);
            p2m->pod.count--;
            goto pod_hit;
        }
    }

    pod_unlock(p2m);
    return 0;

pod_hit:
    lock_page_alloc(p2m);
    /* Insertion must be at list head (see iommu_populate_page_table()). */
    page_list_add(p, &d->arch.relmem_list);
    unlock_page_alloc(p2m);
    pod_unlock(p2m);
    return 1;
}

void
p2m_pod_offline_or_broken_replace(struct page_info *p)
{
    struct domain *d;
    struct p2m_domain *p2m;
    nodeid_t node = phys_to_nid(page_to_maddr(p));

    if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) )
        return;

    free_domheap_page(p);

    p = alloc_domheap_page(d, MEMF_node(node));
    if ( unlikely(!p) )
        return;

    pod_lock(p2m);
    p2m_pod_cache_add(p2m, p, PAGE_ORDER_4K);
    pod_unlock(p2m);
    return;
}

static int
p2m_pod_zero_check_superpage(struct p2m_domain *p2m, gfn_t gfn);


/*
 * This function is needed for two reasons:
 * + To properly handle clearing of PoD entries
 * + To "steal back" memory being freed for the PoD cache, rather than
 *   releasing it.
 *
 * Once both of these functions have been completed, we can return and
 * allow decrease_reservation() to handle everything else.
 */
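/*
 * The return value tells the caller whether anything is left for it to do:
 * non-zero means the entire range has been dealt with here, so the generic
 * decrease_reservation() code can skip it; zero means the non-PoD part of
 * the range still needs to be handled by the caller.
 */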
int
p2m_pod_decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order)
{
    int ret = 0;
    unsigned long i, n;
    struct p2m_domain *p2m = p2m_get_hostp2m(d);
    bool_t steal_for_cache;
    long pod, nonpod, ram;

    gfn_lock(p2m, gfn, order);
    pod_lock(p2m);

    /*
     * If we don't have any outstanding PoD entries, let things take their
     * course.
     */
    if ( p2m->pod.entry_count == 0 )
        goto out_unlock;

    if ( unlikely(d->is_dying) )
        goto out_unlock;

    pod = nonpod = ram = 0;

    /* Figure out if we need to steal some freed memory for our cache */
    steal_for_cache =  ( p2m->pod.entry_count > p2m->pod.count );

    for ( i = 0; i < (1UL << order); i += n )
    {
        p2m_access_t a;
        p2m_type_t t;
        unsigned int cur_order;

        p2m->get_entry(p2m, gfn_add(gfn, i), &t, &a, 0, &cur_order, NULL);
        n = 1UL << min(order, cur_order);
        if ( t == p2m_populate_on_demand )
            pod += n;
        else
        {
            nonpod += n;
            if ( p2m_is_ram(t) )
                ram += n;
        }
    }

    /* No populate-on-demand?  Don't need to steal anything?  Then we're done!*/
    if ( !pod && !steal_for_cache )
        goto out_unlock;

    if ( !nonpod )
    {
        /*
         * All PoD: Mark the whole region invalid and tell caller
         * we're done.
         */
        if ( p2m_set_entry(p2m, gfn, INVALID_MFN, order, p2m_invalid,
                           p2m->default_access) )
        {
            /*
             * If this fails, we can't tell how much of the range was changed.
             * Best to crash the domain unless we're sure a partial change is
             * impossible.
             */
            if ( order != 0 )
                domain_crash(d);
            goto out_unlock;
        }
        p2m->pod.entry_count -= 1UL << order;
        BUG_ON(p2m->pod.entry_count < 0);
        ret = 1;
        goto out_entry_check;
    }

    /*
     * Try to grab entire superpages if possible.  Since the common case is for
     * drivers to pass back singleton pages, see if we can take the whole page
     * back and mark the rest PoD.
     * No need to do this though if
     * - order >= SUPERPAGE_ORDER (the loop below will take care of this)
     * - not all of the pages were RAM (now knowing order < SUPERPAGE_ORDER)
     */
    if ( steal_for_cache && order < SUPERPAGE_ORDER && ram == (1UL << order) &&
         p2m_pod_zero_check_superpage(p2m, _gfn(gfn_x(gfn) & ~(SUPERPAGE_PAGES - 1))) )
    {
        pod = 1UL << order;
        ram = nonpod = 0;
        ASSERT(steal_for_cache == (p2m->pod.entry_count > p2m->pod.count));
    }

    /*
     * Process as long as:
     * + There are PoD entries to handle, or
     * + There is ram left, and we want to steal it
     */
    for ( i = 0;
          i < (1UL << order) && (pod > 0 || (steal_for_cache && ram > 0));
          i += n )
    {
        mfn_t mfn;
        p2m_type_t t;
        p2m_access_t a;
        unsigned int cur_order;

        mfn = p2m->get_entry(p2m, gfn_add(gfn, i), &t, &a, 0, &cur_order, NULL);
        if ( order < cur_order )
            cur_order = order;
        n = 1UL << cur_order;
        if ( t == p2m_populate_on_demand )
        {
            /* This shouldn't be able to fail */
            if ( p2m_set_entry(p2m, gfn_add(gfn, i), INVALID_MFN, cur_order,
                               p2m_invalid, p2m->default_access) )
            {
                ASSERT_UNREACHABLE();
                domain_crash(d);
                goto out_unlock;
            }
            p2m->pod.entry_count -= n;
            BUG_ON(p2m->pod.entry_count < 0);
            pod -= n;
        }
        else if ( steal_for_cache && p2m_is_ram(t) )
        {
            /*
             * If we need less than 1 << cur_order, we may end up stealing
             * more memory here than we actually need. This will be rectified
             * below, however; and stealing too much and then freeing what we
             * need may allow us to free smaller pages from the cache, and
             * avoid breaking up superpages.
             */
            struct page_info *page;
            unsigned long j;

            ASSERT(mfn_valid(mfn));

            page = mfn_to_page(mfn);

            /* This shouldn't be able to fail */
            if ( p2m_set_entry(p2m, gfn_add(gfn, i), INVALID_MFN, cur_order,
                               p2m_invalid, p2m->default_access) )
            {
                ASSERT_UNREACHABLE();
                domain_crash(d);
                goto out_unlock;
            }
            p2m_tlb_flush_sync(p2m);
            for ( j = 0; j < n; ++j )
                set_gpfn_from_mfn(mfn_x(mfn), INVALID_M2P_ENTRY);
            p2m_pod_cache_add(p2m, page, cur_order);

            steal_for_cache =  ( p2m->pod.entry_count > p2m->pod.count );

            nonpod -= n;
            ram -= n;
        }
    }

    /*
     * If there are no more non-PoD entries, tell decrease_reservation() that
     * there's nothing left to do.
     */
    if ( nonpod == 0 )
        ret = 1;

out_entry_check:
    /* If we've reduced our "liabilities" beyond our "assets", free some */
    if ( p2m->pod.entry_count < p2m->pod.count )
    {
        p2m_pod_set_cache_target(p2m, p2m->pod.entry_count, 0/*can't preempt*/);
    }

out_unlock:
    pod_unlock(p2m);
    gfn_unlock(p2m, gfn, order);
    return ret;
}

void p2m_pod_dump_data(struct domain *d)
{
    struct p2m_domain *p2m = p2m_get_hostp2m(d);

    printk("    PoD entries=%ld cachesize=%ld\n",
           p2m->pod.entry_count, p2m->pod.count);
}


/*
 * Search for all-zero superpages to be reclaimed as superpages for the
 * PoD cache. Must be called w/ pod lock held, must lock the superpage
 * in the p2m.
 */
static int
p2m_pod_zero_check_superpage(struct p2m_domain *p2m, gfn_t gfn)
{
    mfn_t mfn, mfn0 = INVALID_MFN;
    p2m_type_t type, type0 = 0;
    unsigned long * map = NULL;
    int ret=0, reset = 0;
    unsigned long i, n;
    unsigned int j;
    int max_ref = 1;
    struct domain *d = p2m->domain;

    ASSERT(pod_locked_by_me(p2m));

    if ( !superpage_aligned(gfn_x(gfn)) )
        goto out;

    /* Allow an extra refcount for one shadow pt mapping in shadowed domains */
    if ( paging_mode_shadow(d) )
        max_ref++;

    /*
     * NOTE: this is why we don't enforce deadlock constraints between p2m
     * and pod locks.
     */
    gfn_lock(p2m, gfn, SUPERPAGE_ORDER);

    /*
     * Look up the mfns, checking to make sure they're the same mfn
     * and aligned, and mapping them.
     */
    for ( i = 0; i < SUPERPAGE_PAGES; i += n )
    {
        p2m_access_t a;
        unsigned int cur_order;
        unsigned long k;
        const struct page_info *page;

        mfn = p2m->get_entry(p2m, gfn_add(gfn, i), &type, &a, 0,
                             &cur_order, NULL);

        /*
         * Conditions that must be met for superpage-superpage:
         * + All gfns are ram types
         * + All gfns have the same type
         * + All of the mfns are allocated to a domain
         * + None of the mfns are used as pagetables, or allocated via xenheap
         * + The first mfn is 2-meg aligned
         * + All the other mfns are in sequence
         * Adding for good measure:
         * + None of the mfns are likely to be mapped elsewhere (refcount
         *   2 or less for shadow, 1 for hap)
         */
        if ( !p2m_is_ram(type) )
            goto out;

        if ( i == 0 )
        {
            if ( !superpage_aligned(mfn_x(mfn)) )
                goto out;
            mfn0 = mfn;
            type0 = type;
        }
        else if ( type != type0 || !mfn_eq(mfn, mfn_add(mfn0, i)) )
            goto out;

        n = 1UL << min(cur_order, SUPERPAGE_ORDER + 0U);
        for ( k = 0, page = mfn_to_page(mfn); k < n; ++k, ++page )
            if ( !(page->count_info & PGC_allocated) ||
                 (page->count_info & (PGC_page_table | PGC_xen_heap)) ||
                 (page->count_info & PGC_count_mask) > max_ref )
                goto out;
    }

    /* Now, do a quick check to see if it may be zero before unmapping. */
    for ( i = 0; i < SUPERPAGE_PAGES; i++ )
    {
        /* Quick zero-check */
        map = map_domain_page(mfn_add(mfn0, i));

        for ( j = 0; j < 16; j++ )
            if ( *(map + j) != 0 )
                break;

        unmap_domain_page(map);

        if ( j < 16 )
            goto out;

    }

    /* Try to remove the page, restoring old mapping if it fails. */
    if ( p2m_set_entry(p2m, gfn, INVALID_MFN, PAGE_ORDER_2M,
                       p2m_populate_on_demand, p2m->default_access) )
        goto out;

    p2m_tlb_flush_sync(p2m);

    /*
     * Make sure none of the MFNs are used elsewhere... for example, mapped
     * via the grant table interface, or by qemu.  Allow one refcount for
     * being allocated to the domain.
     */
    for ( i = 0; i < SUPERPAGE_PAGES; i++ )
    {
        mfn = mfn_add(mfn0, i);
        if ( (mfn_to_page(mfn)->count_info & PGC_count_mask) > 1 )
        {
            reset = 1;
            goto out_reset;
        }
    }

    /* Finally, do a full zero-check */
    for ( i = 0; i < SUPERPAGE_PAGES; i++ )
    {
        map = map_domain_page(mfn_add(mfn0, i));

        for ( j = 0; j < (PAGE_SIZE / sizeof(*map)); j++ )
            if ( *(map+j) != 0 )
            {
                reset = 1;
                break;
            }

        unmap_domain_page(map);

        if ( reset )
            goto out_reset;
    }

    if ( tb_init_done )
    {
        struct {
            u64 gfn, mfn;
            int d:16,order:16;
        } t;

        t.gfn = gfn_x(gfn);
        t.mfn = mfn_x(mfn);
        t.d = d->domain_id;
        t.order = 9;

        __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t);
    }

    /*
     * Finally!  We've passed all the checks, and can add the mfn superpage
     * back on the PoD cache, and account for the new p2m PoD entries.
     */
    p2m_pod_cache_add(p2m, mfn_to_page(mfn0), PAGE_ORDER_2M);
    p2m->pod.entry_count += SUPERPAGE_PAGES;

    ret = SUPERPAGE_PAGES;

out_reset:
    /*
     * This p2m_set_entry() call shouldn't be able to fail, since the same order
     * on the same gfn succeeded above.  If that turns out to be false, crashing
     * the domain should be the safest way of making sure we don't leak memory.
     */
    if ( reset && p2m_set_entry(p2m, gfn, mfn0, PAGE_ORDER_2M,
                                type0, p2m->default_access) )
    {
        ASSERT_UNREACHABLE();
        domain_crash(d);
    }

out:
    gfn_unlock(p2m, gfn, SUPERPAGE_ORDER);
    return ret;
}

static void
p2m_pod_zero_check(struct p2m_domain *p2m, const gfn_t *gfns, int count)
{
    mfn_t mfns[count];
    p2m_type_t types[count];
    unsigned long *map[count];
    struct domain *d = p2m->domain;

    int i, j;
    int max_ref = 1;

    /* Allow an extra refcount for one shadow pt mapping in shadowed domains */
    if ( paging_mode_shadow(d) )
        max_ref++;

    /* First, get the gfn list, translate to mfns, and map the pages. */
    for ( i = 0; i < count; i++ )
    {
        p2m_access_t a;
        struct page_info *pg;

        mfns[i] = p2m->get_entry(p2m, gfns[i], types + i, &a,
                                 0, NULL, NULL);
        pg = mfn_to_page(mfns[i]);

        /*
         * If this is ram, and not a pagetable or from the xen heap, and
         * probably not mapped elsewhere, map it; otherwise, skip.
         */
        if ( p2m_is_ram(types[i]) && (pg->count_info & PGC_allocated) &&
             !(pg->count_info & (PGC_page_table | PGC_xen_heap)) &&
             ((pg->count_info & PGC_count_mask) <= max_ref) )
            map[i] = map_domain_page(mfns[i]);
        else
            map[i] = NULL;
    }

    /*
     * Then, go through and check for zeroed pages, removing write permission
     * for those with zeroes.
     */
    for ( i = 0; i < count; i++ )
    {
        if ( !map[i] )
            continue;

        /* Quick zero-check */
        for ( j = 0; j < 16; j++ )
            if ( *(map[i] + j) != 0 )
                break;

        if ( j < 16 )
        {
            unmap_domain_page(map[i]);
            map[i] = NULL;
            continue;
        }

        /* Try to remove the page, restoring old mapping if it fails. */
        if ( p2m_set_entry(p2m, gfns[i], INVALID_MFN, PAGE_ORDER_4K,
                           p2m_populate_on_demand, p2m->default_access) )
            goto skip;

        /*
         * See if the page was successfully unmapped.  (Allow one refcount
         * for being allocated to a domain.)
         */
        if ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) > 1 )
        {
            /*
             * If the previous p2m_set_entry call succeeded, this one shouldn't
             * be able to fail.  If it does, crashing the domain should be safe.
             */
            if ( p2m_set_entry(p2m, gfns[i], mfns[i], PAGE_ORDER_4K,
                               types[i], p2m->default_access) )
            {
                ASSERT_UNREACHABLE();
                domain_crash(d);
                goto out_unmap;
            }

        skip:
            unmap_domain_page(map[i]);
            map[i] = NULL;

            continue;
        }
    }

    p2m_tlb_flush_sync(p2m);

    /* Now check each page for real */
    for ( i = 0; i < count; i++ )
    {
        if ( !map[i] )
            continue;

        for ( j = 0; j < (PAGE_SIZE / sizeof(*map[i])); j++ )
            if ( *(map[i] + j) != 0 )
                break;

        unmap_domain_page(map[i]);

        map[i] = NULL;

        /*
         * See comment in p2m_pod_zero_check_superpage() re gnttab
         * check timing.
         */
        if ( j < (PAGE_SIZE / sizeof(*map[i])) )
        {
            /*
             * If the previous p2m_set_entry call succeeded, this one shouldn't
             * be able to fail.  If it does, crashing the domain should be safe.
             */
            if ( p2m_set_entry(p2m, gfns[i], mfns[i], PAGE_ORDER_4K,
                               types[i], p2m->default_access) )
            {
                ASSERT_UNREACHABLE();
                domain_crash(d);
                goto out_unmap;
            }
        }
        else
        {
            if ( tb_init_done )
            {
                struct {
                    u64 gfn, mfn;
                    int d:16,order:16;
                } t;

                t.gfn = gfn_x(gfns[i]);
                t.mfn = mfn_x(mfns[i]);
                t.d = d->domain_id;
                t.order = 0;

                __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t);
            }

            /* Add to cache, and account for the new p2m PoD entry */
            p2m_pod_cache_add(p2m, mfn_to_page(mfns[i]), PAGE_ORDER_4K);
            p2m->pod.entry_count++;
        }
    }

    return;

out_unmap:
    /*
     * Something went wrong, probably crashing the domain.  Unmap
     * everything and return.
     */
    for ( i = 0; i < count; i++ )
        if ( map[i] )
            unmap_domain_page(map[i]);
}

#define POD_SWEEP_LIMIT 1024
#define POD_SWEEP_STRIDE  16
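/*
 * Scan backwards from pod.reclaim_single, zero-checking gfns in batches of
 * POD_SWEEP_STRIDE, and stop early once we are more than POD_SWEEP_LIMIT
 * gfns in and have either reclaimed something or need to preempt.
 */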
static void
p2m_pod_emergency_sweep(struct p2m_domain *p2m)
{
    gfn_t gfns[POD_SWEEP_STRIDE];
    unsigned long i, j = 0, start, limit;
    p2m_type_t t;


    if ( gfn_eq(p2m->pod.reclaim_single, _gfn(0)) )
        p2m->pod.reclaim_single = p2m->pod.max_guest;

    start = gfn_x(p2m->pod.reclaim_single);
    limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;

    /* FIXME: Figure out how to avoid superpages */
    /*
     * NOTE: Promote to globally locking the p2m. This will get complicated
     * in a fine-grained scenario. If we lock each gfn individually we must be
     * careful about spinlock recursion limits and POD_SWEEP_STRIDE.
     */
    p2m_lock(p2m);
    for ( i = gfn_x(p2m->pod.reclaim_single); i > 0 ; i-- )
    {
        p2m_access_t a;
        (void)p2m->get_entry(p2m, _gfn(i), &t, &a, 0, NULL, NULL);
        if ( p2m_is_ram(t) )
        {
            gfns[j] = _gfn(i);
            j++;
            BUG_ON(j > POD_SWEEP_STRIDE);
            if ( j == POD_SWEEP_STRIDE )
            {
                p2m_pod_zero_check(p2m, gfns, j);
                j = 0;
            }
        }
        /*
         * Stop if we're past our limit and we have found *something*.
         *
         * NB that this is a zero-sum game; we're increasing our cache size
         * by re-increasing our 'debt'.  Since we hold the pod lock,
         * (entry_count - count) must remain the same.
         */
        if ( i < limit && (p2m->pod.count > 0 || hypercall_preempt_check()) )
            break;
    }

    if ( j )
        p2m_pod_zero_check(p2m, gfns, j);

    p2m_unlock(p2m);
    p2m->pod.reclaim_single = _gfn(i ? i - 1 : i);

}

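/*
 * p2m->pod.mrp is a small ring buffer of recently populated gfns, filled by
 * pod_eager_record() below.  pod_eager_reclaim() walks it looking for pages
 * that have since been zeroed and can be taken back into the PoD cache
 * cheaply, before we have to fall back to a full emergency sweep.
 */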
static void pod_eager_reclaim(struct p2m_domain *p2m)
{
    struct pod_mrp_list *mrp = &p2m->pod.mrp;
    unsigned int i = 0;

    /*
     * Always check one page for reclamation.
     *
     * If the PoD pool is empty, keep checking until some space is found, or
     * all entries have been exhausted.
     */
    do
    {
        unsigned int idx = (mrp->idx + i++) % ARRAY_SIZE(mrp->list);
        gfn_t gfn = _gfn(mrp->list[idx]);

        if ( !gfn_eq(gfn, INVALID_GFN) )
        {
            if ( gfn_x(gfn) & POD_LAST_SUPERPAGE )
            {
                gfn = _gfn(gfn_x(gfn) & ~POD_LAST_SUPERPAGE);

                if ( p2m_pod_zero_check_superpage(p2m, gfn) == 0 )
                {
                    unsigned int x;

                    for ( x = 0; x < SUPERPAGE_PAGES; ++x, gfn = gfn_add(gfn, 1) )
                        p2m_pod_zero_check(p2m, &gfn, 1);
                }
            }
            else
                p2m_pod_zero_check(p2m, &gfn, 1);

            mrp->list[idx] = gfn_x(INVALID_GFN);
        }

    } while ( (p2m->pod.count == 0) && (i < ARRAY_SIZE(mrp->list)) );
}

static void pod_eager_record(struct p2m_domain *p2m, gfn_t gfn,
                             unsigned int order)
{
    struct pod_mrp_list *mrp = &p2m->pod.mrp;

    ASSERT(!gfn_eq(gfn, INVALID_GFN));

    mrp->list[mrp->idx++] =
        gfn_x(gfn) | (order == PAGE_ORDER_2M ? POD_LAST_SUPERPAGE : 0);
    mrp->idx %= ARRAY_SIZE(mrp->list);
}

bool
p2m_pod_demand_populate(struct p2m_domain *p2m, gfn_t gfn,
                        unsigned int order)
{
    struct domain *d = p2m->domain;
    struct page_info *p = NULL; /* Compiler warnings */
    gfn_t gfn_aligned = _gfn((gfn_x(gfn) >> order) << order);
    mfn_t mfn;
    unsigned long i;

    ASSERT(gfn_locked_by_me(p2m, gfn));
    pod_lock(p2m);

    /*
     * This check is done with the pod lock held.  This will make sure that
     * even if d->is_dying changes under our feet, p2m_pod_empty_cache()
     * won't start until we're done.
     */
    if ( unlikely(d->is_dying) )
        goto out_fail;


    /*
     * Because PoD does not have a cache list for 1GB pages, it has to remap
     * the 1GB region in 2MB chunks for a retry.
     */
    if ( order == PAGE_ORDER_1G )
    {
        pod_unlock(p2m);
        /*
         * Note that we are supposed to call p2m_set_entry() 512 times to
         * split 1GB into 512 2MB pages here.  But we only do it once because
         * p2m_set_entry() should automatically shatter the 1GB page into
         * 512 2MB pages.  The remaining 511 calls are unnecessary.
         *
         * NOTE: In a fine-grained p2m locking scenario this operation
         * may need to promote its locking from gfn->1g superpage
         */
        return !p2m_set_entry(p2m, gfn_aligned, INVALID_MFN, PAGE_ORDER_2M,
                              p2m_populate_on_demand, p2m->default_access);
    }

    /* Only reclaim if we're in actual need of more cache. */
    if ( p2m->pod.entry_count > p2m->pod.count )
        pod_eager_reclaim(p2m);

    /*
     * Only sweep if we're actually out of memory.  Doing anything else
     * causes unnecessary time and fragmentation of superpages in the p2m.
     */
    if ( p2m->pod.count == 0 )
        p2m_pod_emergency_sweep(p2m);

    /* If the sweep failed, give up. */
    if ( p2m->pod.count == 0 )
        goto out_of_memory;

    /* Keep track of the highest gfn demand-populated by a guest fault */
    p2m->pod.max_guest = gfn_max(gfn, p2m->pod.max_guest);

    /*
     * Get a page from the cache.  A NULL return value indicates that the
     * 2-meg range should be marked singleton PoD, and retried.
     */
    if ( (p = p2m_pod_cache_get(p2m, order)) == NULL )
        goto remap_and_retry;

    mfn = page_to_mfn(p);

    BUG_ON((mfn_x(mfn) & ((1UL << order) - 1)) != 0);

    if ( p2m_set_entry(p2m, gfn_aligned, mfn, order, p2m_ram_rw,
                       p2m->default_access) )
    {
        p2m_pod_cache_add(p2m, p, order);
        goto out_fail;
    }

    for( i = 0; i < (1UL << order); i++ )
    {
        set_gpfn_from_mfn(mfn_x(mfn) + i, gfn_x(gfn_aligned) + i);
        paging_mark_dirty(d, mfn_add(mfn, i));
    }

    p2m->pod.entry_count -= (1UL << order);
    BUG_ON(p2m->pod.entry_count < 0);

    pod_eager_record(p2m, gfn_aligned, order);

    if ( tb_init_done )
    {
        struct {
            u64 gfn, mfn;
            int d:16,order:16;
        } t;

        t.gfn = gfn_x(gfn);
        t.mfn = mfn_x(mfn);
        t.d = d->domain_id;
        t.order = order;

        __trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t);
    }

    pod_unlock(p2m);
    return true;
out_of_memory:
    pod_unlock(p2m);

    printk("%s: Dom%d out of PoD memory! (tot=%"PRIu32" ents=%ld dom%d)\n",
           __func__, d->domain_id, d->tot_pages, p2m->pod.entry_count,
           current->domain->domain_id);
    domain_crash(d);
    return false;
out_fail:
    pod_unlock(p2m);
    return false;
remap_and_retry:
    BUG_ON(order != PAGE_ORDER_2M);
    pod_unlock(p2m);

    /*
     * Remap this 2-meg region in singleton chunks. See the comment on the
     * 1G page splitting path above for why a single call suffices.
     *
     * NOTE: In a p2m fine-grained lock scenario this might
     * need promoting the gfn lock from gfn->2M superpage.
     */
    if ( p2m_set_entry(p2m, gfn_aligned, INVALID_MFN, PAGE_ORDER_4K,
                       p2m_populate_on_demand, p2m->default_access) )
        return false;

    if ( tb_init_done )
    {
        struct {
            u64 gfn;
            int d:16;
        } t;

        t.gfn = gfn_x(gfn);
        t.d = d->domain_id;

        __trace_var(TRC_MEM_POD_SUPERPAGE_SPLINTER, 0, sizeof(t), &t);
    }

    return true;
}


int
guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn_l,
                                      unsigned int order)
{
    struct p2m_domain *p2m = p2m_get_hostp2m(d);
    gfn_t gfn = _gfn(gfn_l);
    unsigned long i, n, pod_count = 0;
    int rc = 0;

    if ( !paging_mode_translate(d) )
        return -EINVAL;

    gfn_lock(p2m, gfn, order);

    P2M_DEBUG("mark pod gfn=%#lx\n", gfn_l);

    /* Make sure all gpfns are unused */
    for ( i = 0; i < (1UL << order); i += n )
    {
        p2m_type_t ot;
        p2m_access_t a;
        unsigned int cur_order;

        p2m->get_entry(p2m, gfn_add(gfn, i), &ot, &a, 0, &cur_order, NULL);
        n = 1UL << min(order, cur_order);
        if ( p2m_is_ram(ot) )
        {
            P2M_DEBUG("gfn_to_mfn returned type %d!\n", ot);
            rc = -EBUSY;
            goto out;
        }
        else if ( ot == p2m_populate_on_demand )
        {
            /* Count how many PoD entries we'll be replacing if successful */
            pod_count += n;
        }
    }

    /* Now, actually do the two-way mapping */
    rc = p2m_set_entry(p2m, gfn, INVALID_MFN, order,
                       p2m_populate_on_demand, p2m->default_access);
    if ( rc == 0 )
    {
        pod_lock(p2m);
        p2m->pod.entry_count += 1UL << order;
        p2m->pod.entry_count -= pod_count;
        BUG_ON(p2m->pod.entry_count < 0);
        pod_unlock(p2m);
    }

out:
    gfn_unlock(p2m, gfn, order);

    return rc;
}
