1 /******************************************************************************
2 * arch/x86/mm/p2m-pod.c
3 *
4 * Populate-on-demand p2m entries.
5 *
6 * Copyright (c) 2009-2011 Citrix Systems, Inc.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; If not, see <http://www.gnu.org/licenses/>.
20 */
21
22 #include <xen/event.h>
23 #include <xen/mm.h>
24 #include <xen/sched.h>
25 #include <xen/trace.h>
26 #include <asm/page.h>
27 #include <asm/paging.h>
28 #include <asm/p2m.h>
29
30 #include "mm-locks.h"
31
32 /* Override macros from asm/page.h to make them work with mfn_t */
33 #undef mfn_to_page
34 #define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
35 #undef page_to_mfn
36 #define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
37
38 #define superpage_aligned(_x) (((_x)&(SUPERPAGE_PAGES-1))==0)
39
40 /* Enforce lock ordering when grabbing the "external" page_alloc lock */
41 static inline void lock_page_alloc(struct p2m_domain *p2m)
42 {
43 page_alloc_mm_pre_lock();
44 spin_lock(&(p2m->domain->page_alloc_lock));
45 page_alloc_mm_post_lock(p2m->domain->arch.page_alloc_unlock_level);
46 }
47
48 static inline void unlock_page_alloc(struct p2m_domain *p2m)
49 {
50 page_alloc_mm_unlock(p2m->domain->arch.page_alloc_unlock_level);
51 spin_unlock(&(p2m->domain->page_alloc_lock));
52 }
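
/*
 * Illustrative usage sketch (not part of the original file): with the pod
 * lock already held, any manipulation of the domain's page_list is expected
 * to be bracketed by the helpers above, e.g.:
 *
 *     lock_page_alloc(p2m);
 *     page_list_del(page, &p2m->domain->page_list);
 *     unlock_page_alloc(p2m);
 *
 * as p2m_pod_cache_add() and p2m_pod_cache_get() do below.
 */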
53
54 /*
55 * Populate-on-demand functionality
56 */
57
58 static int
59 p2m_pod_cache_add(struct p2m_domain *p2m,
60 struct page_info *page,
61 unsigned int order)
62 {
63 unsigned long i;
64 struct page_info *p;
65 struct domain *d = p2m->domain;
66
67 #ifndef NDEBUG
68 mfn_t mfn;
69
70 mfn = page_to_mfn(page);
71
72 /* Check to make sure this is a contiguous region */
73 if ( mfn_x(mfn) & ((1UL << order) - 1) )
74 {
75 printk("%s: mfn %lx not aligned order %u! (mask %lx)\n",
76 __func__, mfn_x(mfn), order, ((1UL << order) - 1));
77 return -1;
78 }
79
80 for ( i = 0; i < 1UL << order ; i++)
81 {
82 struct domain * od;
83
84 p = mfn_to_page(_mfn(mfn_x(mfn) + i));
85 od = page_get_owner(p);
86 if ( od != d )
87 {
88 printk("%s: mfn %lx expected owner d%d, got owner d%d!\n",
89 __func__, mfn_x(mfn), d->domain_id,
90 od ? od->domain_id : -1);
91 return -1;
92 }
93 }
94 #endif
95
96 ASSERT(pod_locked_by_me(p2m));
97
98 /*
99 * Pages from domain_alloc and returned by the balloon driver aren't
100 * guaranteed to be zero; but by reclaiming zero pages, we implicitly
101 * promise to provide zero pages. So we scrub pages before using.
102 */
103 for ( i = 0; i < (1UL << order); i++ )
104 clear_domain_page(mfn_add(page_to_mfn(page), i));
105
106 /* First, take all pages off the domain list */
107 lock_page_alloc(p2m);
108 for ( i = 0; i < 1UL << order ; i++ )
109 {
110 p = page + i;
111 page_list_del(p, &d->page_list);
112 }
113
114 unlock_page_alloc(p2m);
115
116 /* Then add to the appropriate populate-on-demand list. */
117 switch ( order )
118 {
119 case PAGE_ORDER_1G:
120 for ( i = 0; i < (1UL << PAGE_ORDER_1G); i += 1UL << PAGE_ORDER_2M )
121 page_list_add_tail(page + i, &p2m->pod.super);
122 break;
123 case PAGE_ORDER_2M:
124 page_list_add_tail(page, &p2m->pod.super);
125 break;
126 case PAGE_ORDER_4K:
127 page_list_add_tail(page, &p2m->pod.single);
128 break;
129 default:
130 BUG();
131 }
132 p2m->pod.count += 1UL << order;
133
134 return 0;
135 }
136
137 /* Get a page of size order from the populate-on-demand cache. Will break
138 * down 2-meg pages into singleton pages automatically. Returns null if
139 * a superpage is requested and no superpages are available. */
140 static struct page_info * p2m_pod_cache_get(struct p2m_domain *p2m,
141 unsigned int order)
142 {
143 struct page_info *p = NULL;
144 unsigned long i;
145
146 ASSERT(pod_locked_by_me(p2m));
147
148 if ( order == PAGE_ORDER_2M && page_list_empty(&p2m->pod.super) )
149 {
150 return NULL;
151 }
152 else if ( order == PAGE_ORDER_4K && page_list_empty(&p2m->pod.single) )
153 {
154 unsigned long mfn;
155 struct page_info *q;
156
157 BUG_ON( page_list_empty(&p2m->pod.super) );
158
159 /*
160 * Break up a superpage to make single pages. NB count doesn't
161 * need to be adjusted.
162 */
163 p = page_list_remove_head(&p2m->pod.super);
164 mfn = mfn_x(page_to_mfn(p));
165
166 for ( i = 0; i < SUPERPAGE_PAGES; i++ )
167 {
168 q = mfn_to_page(_mfn(mfn+i));
169 page_list_add_tail(q, &p2m->pod.single);
170 }
171 }
172
173 switch ( order )
174 {
175 case PAGE_ORDER_2M:
176 BUG_ON( page_list_empty(&p2m->pod.super) );
177 p = page_list_remove_head(&p2m->pod.super);
178 p2m->pod.count -= 1UL << order;
179 break;
180 case PAGE_ORDER_4K:
181 BUG_ON( page_list_empty(&p2m->pod.single) );
182 p = page_list_remove_head(&p2m->pod.single);
183 p2m->pod.count -= 1UL;
184 break;
185 default:
186 BUG();
187 }
188
189 /* Put the pages back on the domain page_list */
190 lock_page_alloc(p2m);
191 for ( i = 0 ; i < (1UL << order); i++ )
192 {
193 BUG_ON(page_get_owner(p + i) != p2m->domain);
194 page_list_add_tail(p + i, &p2m->domain->page_list);
195 }
196 unlock_page_alloc(p2m);
197
198 return p;
199 }
200
201 /* Set the size of the cache, allocating or freeing as necessary. */
202 static int
203 p2m_pod_set_cache_target(struct p2m_domain *p2m, unsigned long pod_target, int preemptible)
204 {
205 struct domain *d = p2m->domain;
206 int ret = 0;
207
208 ASSERT(pod_locked_by_me(p2m));
209
210 /* Increasing the target */
211 while ( pod_target > p2m->pod.count )
212 {
213 struct page_info * page;
214 int order;
215
216 if ( (pod_target - p2m->pod.count) >= SUPERPAGE_PAGES )
217 order = PAGE_ORDER_2M;
218 else
219 order = PAGE_ORDER_4K;
220 retry:
221 page = alloc_domheap_pages(d, order, 0);
222 if ( unlikely(page == NULL) )
223 {
224 if ( order == PAGE_ORDER_2M )
225 {
226 /* If we can't allocate a superpage, try singleton pages */
227 order = PAGE_ORDER_4K;
228 goto retry;
229 }
230
231 printk("%s: Unable to allocate page for PoD cache (target=%lu cache=%ld)\n",
232 __func__, pod_target, p2m->pod.count);
233 ret = -ENOMEM;
234 goto out;
235 }
236
237 p2m_pod_cache_add(p2m, page, order);
238
239 if ( preemptible && pod_target != p2m->pod.count &&
240 hypercall_preempt_check() )
241 {
242 ret = -ERESTART;
243 goto out;
244 }
245 }
246
247 /* Decreasing the target */
248 /*
249 * We hold the pod lock here, so we don't need to worry about
250 * cache disappearing under our feet.
251 */
252 while ( pod_target < p2m->pod.count )
253 {
254 struct page_info * page;
255 unsigned int order;
256 unsigned long i;
257
258 if ( (p2m->pod.count - pod_target) > SUPERPAGE_PAGES
259 && !page_list_empty(&p2m->pod.super) )
260 order = PAGE_ORDER_2M;
261 else
262 order = PAGE_ORDER_4K;
263
264 page = p2m_pod_cache_get(p2m, order);
265
266 ASSERT(page != NULL);
267
268 /* Then free them */
269 for ( i = 0 ; i < (1UL << order) ; i++ )
270 {
271 /* Copied from common/memory.c:guest_remove_page() */
272 if ( unlikely(!get_page(page + i, d)) )
273 {
274 gdprintk(XENLOG_INFO, "Bad page free for domain %u\n", d->domain_id);
275 ret = -EINVAL;
276 goto out;
277 }
278
279 if ( test_and_clear_bit(_PGT_pinned, &(page+i)->u.inuse.type_info) )
280 put_page_and_type(page + i);
281
282 if ( test_and_clear_bit(_PGC_allocated, &(page+i)->count_info) )
283 put_page(page + i);
284
285 put_page(page + i);
286
287 if ( preemptible && pod_target != p2m->pod.count &&
288 hypercall_preempt_check() )
289 {
290 ret = -ERESTART;
291 goto out;
292 }
293 }
294 }
295
296 out:
297 return ret;
298 }
299
300 /*
301 * The "right behavior" here requires some careful thought. First, some
302 * definitions:
303 * + M: static_max
304 * + B: number of pages the balloon driver has ballooned down to.
305 * + P: Number of populated pages.
306 * + T: Old target
307 * + T': New target
308 *
309 * The following equations should hold:
310 * 0 <= P <= T <= B <= M
311 * d->arch.p2m->pod.entry_count == B - P
312 * d->tot_pages == P + d->arch.p2m->pod.count
313 *
314 * Now we have the following potential cases to cover:
315 * B < T': Set the PoD cache size equal to the number of outstanding PoD
316 * entries. The balloon driver will deflate the balloon to give back
317 * the remainder of the RAM to the guest OS.
318 * T < T' < B: Increase the PoD cache size.
319 * T' < T <= B: Here we have a choice. We could decrease the size of the
320 * cache and get the memory back right away. However, that means every
321 * time we reduce the memory target we risk the guest attempting to
322 * populate the memory before the balloon driver has reached its new
323 * target. It is safer never to reduce the cache size here, and to do so
324 * only when the balloon driver frees PoD ranges.
325 *
326 * If there are many zero pages, we could reach the target also by doing
327 * zero sweeps and marking the ranges PoD; but the balloon driver will have
328 * to free this memory eventually anyway, so we don't actually gain that much
329 * by doing so.
330 *
331 * NB that the equation (B<T') may require adjustment to the cache
332 * size as PoD pages are freed as well; i.e., freeing a PoD-backed
333 * entry when pod.entry_count == pod.count requires us to reduce both
334 * pod.entry_count and pod.count.
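 *
 * Worked example (illustrative numbers only): M = 4096, B = 3072, P = 1024,
 * so pod.entry_count = B - P = 2048. If the cache holds pod.count = 1024
 * pages, then d->tot_pages = P + pod.count = 2048. Raising the target to
 * T' = 3584 (the B < T' case) gives pod_target = T' - P = 2560, which is
 * clamped to pod.entry_count = 2048; the cache then covers every
 * outstanding PoD entry, and the balloon driver is expected to hand back
 * the remaining T' - B = 512 pages itself.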
335 */
336 int
337 p2m_pod_set_mem_target(struct domain *d, unsigned long target)
338 {
339 struct p2m_domain *p2m = p2m_get_hostp2m(d);
340 int ret = 0;
341 unsigned long populated, pod_target;
342
343 pod_lock(p2m);
344
345 /* P == B: Nothing to do (unless the guest is being created). */
346 populated = d->tot_pages - p2m->pod.count;
347 if ( populated > 0 && p2m->pod.entry_count == 0 )
348 goto out;
349
350 /* Don't do anything if the domain is being torn down */
351 if ( d->is_dying )
352 goto out;
353
354 /*
355 * T' < B: Don't reduce the cache size; let the balloon driver
356 * take care of it.
357 */
358 if ( target < d->tot_pages )
359 goto out;
360
361 pod_target = target - populated;
362
363 /*
364 * B < T': Set the cache size equal to # of outstanding entries,
365 * let the balloon driver fill in the rest.
366 */
367 if ( populated > 0 && pod_target > p2m->pod.entry_count )
368 pod_target = p2m->pod.entry_count;
369
370 ASSERT( pod_target >= p2m->pod.count );
371
372 ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
373
374 out:
375 pod_unlock(p2m);
376
377 return ret;
378 }
379
380 int p2m_pod_empty_cache(struct domain *d)
381 {
382 struct p2m_domain *p2m = p2m_get_hostp2m(d);
383 struct page_info *page;
384 unsigned int i;
385
386 /* After this barrier no new PoD activities can happen. */
387 BUG_ON(!d->is_dying);
388 spin_barrier(&p2m->pod.lock.lock);
389
390 lock_page_alloc(p2m);
391
392 while ( (page = page_list_remove_head(&p2m->pod.super)) )
393 {
394 for ( i = 0 ; i < SUPERPAGE_PAGES ; i++ )
395 {
396 BUG_ON(page_get_owner(page + i) != d);
397 page_list_add_tail(page + i, &d->page_list);
398 }
399
400 p2m->pod.count -= SUPERPAGE_PAGES;
401
402 if ( hypercall_preempt_check() )
403 goto out;
404 }
405
406 for ( i = 0; (page = page_list_remove_head(&p2m->pod.single)); ++i )
407 {
408 BUG_ON(page_get_owner(page) != d);
409 page_list_add_tail(page, &d->page_list);
410
411 p2m->pod.count -= 1;
412
413 if ( i && !(i & 511) && hypercall_preempt_check() )
414 goto out;
415 }
416
417 BUG_ON(p2m->pod.count != 0);
418
419 out:
420 unlock_page_alloc(p2m);
421 return p2m->pod.count ? -ERESTART : 0;
422 }
423
424 int
425 p2m_pod_offline_or_broken_hit(struct page_info *p)
426 {
427 struct domain *d;
428 struct p2m_domain *p2m;
429 struct page_info *q, *tmp;
430 unsigned long mfn, bmfn;
431
432 if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) )
433 return 0;
434
435 pod_lock(p2m);
436 bmfn = mfn_x(page_to_mfn(p));
437 page_list_for_each_safe(q, tmp, &p2m->pod.super)
438 {
439 mfn = mfn_x(page_to_mfn(q));
440 if ( (bmfn >= mfn) && ((bmfn - mfn) < SUPERPAGE_PAGES) )
441 {
442 unsigned long i;
443 page_list_del(q, &p2m->pod.super);
444 for ( i = 0; i < SUPERPAGE_PAGES; i++)
445 {
446 q = mfn_to_page(_mfn(mfn + i));
447 page_list_add_tail(q, &p2m->pod.single);
448 }
449 page_list_del(p, &p2m->pod.single);
450 p2m->pod.count--;
451 goto pod_hit;
452 }
453 }
454
455 page_list_for_each_safe(q, tmp, &p2m->pod.single)
456 {
457 mfn = mfn_x(page_to_mfn(q));
458 if ( mfn == bmfn )
459 {
460 page_list_del(p, &p2m->pod.single);
461 p2m->pod.count--;
462 goto pod_hit;
463 }
464 }
465
466 pod_unlock(p2m);
467 return 0;
468
469 pod_hit:
470 lock_page_alloc(p2m);
471 /* Insertion must be at list head (see iommu_populate_page_table()). */
472 page_list_add(p, &d->arch.relmem_list);
473 unlock_page_alloc(p2m);
474 pod_unlock(p2m);
475 return 1;
476 }
477
478 void
479 p2m_pod_offline_or_broken_replace(struct page_info *p)
480 {
481 struct domain *d;
482 struct p2m_domain *p2m;
483 nodeid_t node = phys_to_nid(page_to_maddr(p));
484
485 if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) )
486 return;
487
488 free_domheap_page(p);
489
490 p = alloc_domheap_page(d, MEMF_node(node));
491 if ( unlikely(!p) )
492 return;
493
494 pod_lock(p2m);
495 p2m_pod_cache_add(p2m, p, PAGE_ORDER_4K);
496 pod_unlock(p2m);
497 return;
498 }
499
500 static int
501 p2m_pod_zero_check_superpage(struct p2m_domain *p2m, gfn_t gfn);
502
503
504 /*
505 * This function is needed for two reasons:
506 * + To properly handle clearing of PoD entries
507 * + To "steal back" memory being freed for the PoD cache, rather than
508 * releasing it.
509 *
510 * Once both of these tasks have been completed, we can return and
511 * allow decrease_reservation() to handle everything else.
512 */
513 int
514 p2m_pod_decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order)
515 {
516 int ret = 0;
517 unsigned long i, n;
518 struct p2m_domain *p2m = p2m_get_hostp2m(d);
519 bool_t steal_for_cache;
520 long pod, nonpod, ram;
521
522 gfn_lock(p2m, gfn, order);
523 pod_lock(p2m);
524
525 /*
526 * If we don't have any outstanding PoD entries, let things take their
527 * course.
528 */
529 if ( p2m->pod.entry_count == 0 )
530 goto out_unlock;
531
532 if ( unlikely(d->is_dying) )
533 goto out_unlock;
534
535 pod = nonpod = ram = 0;
536
537 /* Figure out if we need to steal some freed memory for our cache */
538 steal_for_cache = ( p2m->pod.entry_count > p2m->pod.count );
539
540 for ( i = 0; i < (1UL << order); i += n )
541 {
542 p2m_access_t a;
543 p2m_type_t t;
544 unsigned int cur_order;
545
546 p2m->get_entry(p2m, gfn_add(gfn, i), &t, &a, 0, &cur_order, NULL);
547 n = 1UL << min(order, cur_order);
548 if ( t == p2m_populate_on_demand )
549 pod += n;
550 else
551 {
552 nonpod += n;
553 if ( p2m_is_ram(t) )
554 ram += n;
555 }
556 }
557
558 /* No populate-on-demand? Don't need to steal anything? Then we're done! */
559 if ( !pod && !steal_for_cache )
560 goto out_unlock;
561
562 if ( !nonpod )
563 {
564 /*
565 * All PoD: Mark the whole region invalid and tell caller
566 * we're done.
567 */
568 if ( p2m_set_entry(p2m, gfn, INVALID_MFN, order, p2m_invalid,
569 p2m->default_access) )
570 {
571 /*
572 * If this fails, we can't tell how much of the range was changed.
573 * Best to crash the domain unless we're sure a partial change is
574 * impossible.
575 */
576 if ( order != 0 )
577 domain_crash(d);
578 goto out_unlock;
579 }
580 p2m->pod.entry_count -= 1UL << order;
581 BUG_ON(p2m->pod.entry_count < 0);
582 ret = 1;
583 goto out_entry_check;
584 }
585
586 /*
587 * Try to grab entire superpages if possible. Since the common case is for
588 * drivers to pass back singleton pages, see if we can take the whole superpage
589 * back and mark the rest PoD.
590 * No need to do this though if
591 * - order >= SUPERPAGE_ORDER (the loop below will take care of this)
592 * - not all of the pages were RAM (now knowing order < SUPERPAGE_ORDER)
593 */
594 if ( steal_for_cache && order < SUPERPAGE_ORDER && ram == (1UL << order) &&
595 p2m_pod_zero_check_superpage(p2m, _gfn(gfn_x(gfn) & ~(SUPERPAGE_PAGES - 1))) )
596 {
597 pod = 1UL << order;
598 ram = nonpod = 0;
599 ASSERT(steal_for_cache == (p2m->pod.entry_count > p2m->pod.count));
600 }
601
602 /*
603 * Process as long as:
604 * + There are PoD entries to handle, or
605 * + There is ram left, and we want to steal it
606 */
607 for ( i = 0;
608 i < (1UL << order) && (pod > 0 || (steal_for_cache && ram > 0));
609 i += n )
610 {
611 mfn_t mfn;
612 p2m_type_t t;
613 p2m_access_t a;
614 unsigned int cur_order;
615
616 mfn = p2m->get_entry(p2m, gfn_add(gfn, i), &t, &a, 0, &cur_order, NULL);
617 if ( order < cur_order )
618 cur_order = order;
619 n = 1UL << cur_order;
620 if ( t == p2m_populate_on_demand )
621 {
622 /* This shouldn't be able to fail */
623 if ( p2m_set_entry(p2m, gfn_add(gfn, i), INVALID_MFN, cur_order,
624 p2m_invalid, p2m->default_access) )
625 {
626 ASSERT_UNREACHABLE();
627 domain_crash(d);
628 goto out_unlock;
629 }
630 p2m->pod.entry_count -= n;
631 BUG_ON(p2m->pod.entry_count < 0);
632 pod -= n;
633 }
634 else if ( steal_for_cache && p2m_is_ram(t) )
635 {
636 /*
637 * If we need less than 1 << cur_order, we may end up stealing
638 * more memory here than we actually need. This will be rectified
639 * below, however; and stealing too much and then freeing what we
640 * need may allow us to free smaller pages from the cache, and
641 * avoid breaking up superpages.
642 */
643 struct page_info *page;
644 unsigned long j;
645
646 ASSERT(mfn_valid(mfn));
647
648 page = mfn_to_page(mfn);
649
650 /* This shouldn't be able to fail */
651 if ( p2m_set_entry(p2m, gfn_add(gfn, i), INVALID_MFN, cur_order,
652 p2m_invalid, p2m->default_access) )
653 {
654 ASSERT_UNREACHABLE();
655 domain_crash(d);
656 goto out_unlock;
657 }
658 p2m_tlb_flush_sync(p2m);
659 for ( j = 0; j < n; ++j )
660 set_gpfn_from_mfn(mfn_x(mfn), INVALID_M2P_ENTRY);
661 p2m_pod_cache_add(p2m, page, cur_order);
662
663 steal_for_cache = ( p2m->pod.entry_count > p2m->pod.count );
664
665 nonpod -= n;
666 ram -= n;
667 }
668 }
669
670 /*
671 * If there are no more non-PoD entries, tell decrease_reservation() that
672 * there's nothing left to do.
673 */
674 if ( nonpod == 0 )
675 ret = 1;
676
677 out_entry_check:
678 /* If we've reduced our "liabilities" beyond our "assets", free some */
679 if ( p2m->pod.entry_count < p2m->pod.count )
680 {
681 p2m_pod_set_cache_target(p2m, p2m->pod.entry_count, 0/*can't preempt*/);
682 }
683
684 out_unlock:
685 pod_unlock(p2m);
686 gfn_unlock(p2m, gfn, order);
687 return ret;
688 }
689
690 void p2m_pod_dump_data(struct domain *d)
691 {
692 struct p2m_domain *p2m = p2m_get_hostp2m(d);
693
694 printk(" PoD entries=%ld cachesize=%ld\n",
695 p2m->pod.entry_count, p2m->pod.count);
696 }
697
698
699 /*
700 * Search for all-zero superpages to be reclaimed as superpages for the
701 * PoD cache. Must be called with the pod lock held; this function takes
702 * the p2m lock for the superpage range itself.
703 */
704 static int
705 p2m_pod_zero_check_superpage(struct p2m_domain *p2m, gfn_t gfn)
706 {
707 mfn_t mfn, mfn0 = INVALID_MFN;
708 p2m_type_t type, type0 = 0;
709 unsigned long * map = NULL;
710 int ret=0, reset = 0;
711 unsigned long i, n;
712 unsigned int j;
713 int max_ref = 1;
714 struct domain *d = p2m->domain;
715
716 ASSERT(pod_locked_by_me(p2m));
717
718 if ( !superpage_aligned(gfn_x(gfn)) )
719 goto out;
720
721 /* Allow an extra refcount for one shadow pt mapping in shadowed domains */
722 if ( paging_mode_shadow(d) )
723 max_ref++;
724
725 /*
726 * NOTE: this is why we don't enforce deadlock constraints between p2m
727 * and pod locks.
728 */
729 gfn_lock(p2m, gfn, SUPERPAGE_ORDER);
730
731 /*
732 * Look up the mfns, checking to make sure they're the same mfn
733 * and aligned, and mapping them.
734 */
735 for ( i = 0; i < SUPERPAGE_PAGES; i += n )
736 {
737 p2m_access_t a;
738 unsigned int cur_order;
739 unsigned long k;
740 const struct page_info *page;
741
742 mfn = p2m->get_entry(p2m, gfn_add(gfn, i), &type, &a, 0,
743 &cur_order, NULL);
744
745 /*
746 * Conditions that must be met for superpage-superpage:
747 * + All gfns are ram types
748 * + All gfns have the same type
749 * + All of the mfns are allocated to a domain
750 * + None of the mfns are used as pagetables, or allocated via xenheap
751 * + The first mfn is 2-meg aligned
752 * + All the other mfns are in sequence
753 * Adding for good measure:
754 * + None of the mfns are likely to be mapped elsewhere (refcount
755 * 2 or less for shadow, 1 for hap)
756 */
757 if ( !p2m_is_ram(type) )
758 goto out;
759
760 if ( i == 0 )
761 {
762 if ( !superpage_aligned(mfn_x(mfn)) )
763 goto out;
764 mfn0 = mfn;
765 type0 = type;
766 }
767 else if ( type != type0 || !mfn_eq(mfn, mfn_add(mfn0, i)) )
768 goto out;
769
770 n = 1UL << min(cur_order, SUPERPAGE_ORDER + 0U);
771 for ( k = 0, page = mfn_to_page(mfn); k < n; ++k, ++page )
772 if ( !(page->count_info & PGC_allocated) ||
773 (page->count_info & (PGC_page_table | PGC_xen_heap)) ||
774 (page->count_info & PGC_count_mask) > max_ref )
775 goto out;
776 }
777
778 /* Now, do a quick check to see if it may be zero before unmapping. */
779 for ( i = 0; i < SUPERPAGE_PAGES; i++ )
780 {
781 /* Quick zero-check */
782 map = map_domain_page(mfn_add(mfn0, i));
783
784 for ( j = 0; j < 16; j++ )
785 if ( *(map + j) != 0 )
786 break;
787
788 unmap_domain_page(map);
789
790 if ( j < 16 )
791 goto out;
792
793 }
794
795 /* Try to remove the page, restoring old mapping if it fails. */
796 if ( p2m_set_entry(p2m, gfn, INVALID_MFN, PAGE_ORDER_2M,
797 p2m_populate_on_demand, p2m->default_access) )
798 goto out;
799
800 p2m_tlb_flush_sync(p2m);
801
802 /*
803 * Make sure none of the MFNs are used elsewhere... for example, mapped
804 * via the grant table interface, or by qemu. Allow one refcount for
805 * being allocated to the domain.
806 */
807 for ( i = 0; i < SUPERPAGE_PAGES; i++ )
808 {
809 mfn = mfn_add(mfn0, i);
810 if ( (mfn_to_page(mfn)->count_info & PGC_count_mask) > 1 )
811 {
812 reset = 1;
813 goto out_reset;
814 }
815 }
816
817 /* Finally, do a full zero-check */
818 for ( i = 0; i < SUPERPAGE_PAGES; i++ )
819 {
820 map = map_domain_page(mfn_add(mfn0, i));
821
822 for ( j = 0; j < (PAGE_SIZE / sizeof(*map)); j++ )
823 if ( *(map+j) != 0 )
824 {
825 reset = 1;
826 break;
827 }
828
829 unmap_domain_page(map);
830
831 if ( reset )
832 goto out_reset;
833 }
834
835 if ( tb_init_done )
836 {
837 struct {
838 u64 gfn, mfn;
839 int d:16,order:16;
840 } t;
841
842 t.gfn = gfn_x(gfn);
843 t.mfn = mfn_x(mfn);
844 t.d = d->domain_id;
845 t.order = 9;
846
847 __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t);
848 }
849
850 /*
851 * Finally! We've passed all the checks, and can add the mfn superpage
852 * back on the PoD cache, and account for the new p2m PoD entries.
853 */
854 p2m_pod_cache_add(p2m, mfn_to_page(mfn0), PAGE_ORDER_2M);
855 p2m->pod.entry_count += SUPERPAGE_PAGES;
856
857 ret = SUPERPAGE_PAGES;
858
859 out_reset:
860 /*
861 * This p2m_set_entry() call shouldn't be able to fail, since the same order
862 * on the same gfn succeeded above. If that turns out to be false, crashing
863 * the domain should be the safest way of making sure we don't leak memory.
864 */
865 if ( reset && p2m_set_entry(p2m, gfn, mfn0, PAGE_ORDER_2M,
866 type0, p2m->default_access) )
867 {
868 ASSERT_UNREACHABLE();
869 domain_crash(d);
870 }
871
872 out:
873 gfn_unlock(p2m, gfn, SUPERPAGE_ORDER);
874 return ret;
875 }
876
877 static void
878 p2m_pod_zero_check(struct p2m_domain *p2m, const gfn_t *gfns, int count)
879 {
880 mfn_t mfns[count];
881 p2m_type_t types[count];
882 unsigned long *map[count];
883 struct domain *d = p2m->domain;
884
885 int i, j;
886 int max_ref = 1;
887
888 /* Allow an extra refcount for one shadow pt mapping in shadowed domains */
889 if ( paging_mode_shadow(d) )
890 max_ref++;
891
892 /* First, get the gfn list, translate to mfns, and map the pages. */
893 for ( i = 0; i < count; i++ )
894 {
895 p2m_access_t a;
896 struct page_info *pg;
897
898 mfns[i] = p2m->get_entry(p2m, gfns[i], types + i, &a,
899 0, NULL, NULL);
900 pg = mfn_to_page(mfns[i]);
901
902 /*
903 * If this is ram, and not a pagetable or from the xen heap, and
904 * probably not mapped elsewhere, map it; otherwise, skip.
905 */
906 if ( p2m_is_ram(types[i]) && (pg->count_info & PGC_allocated) &&
907 !(pg->count_info & (PGC_page_table | PGC_xen_heap)) &&
908 ((pg->count_info & PGC_count_mask) <= max_ref) )
909 map[i] = map_domain_page(mfns[i]);
910 else
911 map[i] = NULL;
912 }
913
914 /*
915 * Then, go through and check for zeroed pages, removing write permission
916 * for those with zeroes.
917 */
918 for ( i = 0; i < count; i++ )
919 {
920 if ( !map[i] )
921 continue;
922
923 /* Quick zero-check */
924 for ( j = 0; j < 16; j++ )
925 if ( *(map[i] + j) != 0 )
926 break;
927
928 if ( j < 16 )
929 {
930 unmap_domain_page(map[i]);
931 map[i] = NULL;
932 continue;
933 }
934
935 /* Try to remove the page, restoring old mapping if it fails. */
936 if ( p2m_set_entry(p2m, gfns[i], INVALID_MFN, PAGE_ORDER_4K,
937 p2m_populate_on_demand, p2m->default_access) )
938 goto skip;
939
940 /*
941 * See if the page was successfully unmapped. (Allow one refcount
942 * for being allocated to a domain.)
943 */
944 if ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) > 1 )
945 {
946 /*
947 * If the previous p2m_set_entry call succeeded, this one shouldn't
948 * be able to fail. If it does, crashing the domain should be safe.
949 */
950 if ( p2m_set_entry(p2m, gfns[i], mfns[i], PAGE_ORDER_4K,
951 types[i], p2m->default_access) )
952 {
953 ASSERT_UNREACHABLE();
954 domain_crash(d);
955 goto out_unmap;
956 }
957
958 skip:
959 unmap_domain_page(map[i]);
960 map[i] = NULL;
961
962 continue;
963 }
964 }
965
966 p2m_tlb_flush_sync(p2m);
967
968 /* Now check each page for real */
969 for ( i = 0; i < count; i++ )
970 {
971 if ( !map[i] )
972 continue;
973
974 for ( j = 0; j < (PAGE_SIZE / sizeof(*map[i])); j++ )
975 if ( *(map[i] + j) != 0 )
976 break;
977
978 unmap_domain_page(map[i]);
979
980 map[i] = NULL;
981
982 /*
983 * See comment in p2m_pod_zero_check_superpage() re gnttab
984 * check timing.
985 */
986 if ( j < (PAGE_SIZE / sizeof(*map[i])) )
987 {
988 /*
989 * If the previous p2m_set_entry call succeeded, this one shouldn't
990 * be able to fail. If it does, crashing the domain should be safe.
991 */
992 if ( p2m_set_entry(p2m, gfns[i], mfns[i], PAGE_ORDER_4K,
993 types[i], p2m->default_access) )
994 {
995 ASSERT_UNREACHABLE();
996 domain_crash(d);
997 goto out_unmap;
998 }
999 }
1000 else
1001 {
1002 if ( tb_init_done )
1003 {
1004 struct {
1005 u64 gfn, mfn;
1006 int d:16,order:16;
1007 } t;
1008
1009 t.gfn = gfn_x(gfns[i]);
1010 t.mfn = mfn_x(mfns[i]);
1011 t.d = d->domain_id;
1012 t.order = 0;
1013
1014 __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t);
1015 }
1016
1017 /* Add to cache, and account for the new p2m PoD entry */
1018 p2m_pod_cache_add(p2m, mfn_to_page(mfns[i]), PAGE_ORDER_4K);
1019 p2m->pod.entry_count++;
1020 }
1021 }
1022
1023 return;
1024
1025 out_unmap:
1026 /*
1027 * Something went wrong, probably crashing the domain. Unmap
1028 * everything and return.
1029 */
1030 for ( i = 0; i < count; i++ )
1031 if ( map[i] )
1032 unmap_domain_page(map[i]);
1033 }
1034
1035 #define POD_SWEEP_LIMIT 1024
1036 #define POD_SWEEP_STRIDE 16
1037 static void
1038 p2m_pod_emergency_sweep(struct p2m_domain *p2m)
1039 {
1040 gfn_t gfns[POD_SWEEP_STRIDE];
1041 unsigned long i, j = 0, start, limit;
1042 p2m_type_t t;
1043
1044
1045 if ( gfn_eq(p2m->pod.reclaim_single, _gfn(0)) )
1046 p2m->pod.reclaim_single = p2m->pod.max_guest;
1047
1048 start = gfn_x(p2m->pod.reclaim_single);
1049 limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
1050
1051 /* FIXME: Figure out how to avoid superpages */
1052 /*
1053 * NOTE: Promote to globally locking the p2m. This will get complicated
1054 * in a fine-grained scenario. If we lock each gfn individually we must be
1055 * careful about spinlock recursion limits and POD_SWEEP_STRIDE.
1056 */
1057 p2m_lock(p2m);
1058 for ( i = gfn_x(p2m->pod.reclaim_single); i > 0 ; i-- )
1059 {
1060 p2m_access_t a;
1061 (void)p2m->get_entry(p2m, _gfn(i), &t, &a, 0, NULL, NULL);
1062 if ( p2m_is_ram(t) )
1063 {
1064 gfns[j] = _gfn(i);
1065 j++;
1066 BUG_ON(j > POD_SWEEP_STRIDE);
1067 if ( j == POD_SWEEP_STRIDE )
1068 {
1069 p2m_pod_zero_check(p2m, gfns, j);
1070 j = 0;
1071 }
1072 }
1073 /*
1074 * Stop if we're past our limit and we have found *something*.
1075 *
1076 * NB that this is a zero-sum game; we're increasing our cache size
1077 * by re-increasing our 'debt'. Since we hold the pod lock,
1078 * (entry_count - count) must remain the same.
1079 */
1080 if ( i < limit && (p2m->pod.count > 0 || hypercall_preempt_check()) )
1081 break;
1082 }
1083
1084 if ( j )
1085 p2m_pod_zero_check(p2m, gfns, j);
1086
1087 p2m_unlock(p2m);
1088 p2m->pod.reclaim_single = _gfn(i ? i - 1 : i);
1089
1090 }
1091
1092 static void pod_eager_reclaim(struct p2m_domain *p2m)
1093 {
1094 struct pod_mrp_list *mrp = &p2m->pod.mrp;
1095 unsigned int i = 0;
1096
1097 /*
1098 * Always check one page for reclamation.
1099 *
1100 * If the PoD pool is empty, keep checking until some space is found or
1101 * all entries have been exhausted.
1102 */
1103 do
1104 {
1105 unsigned int idx = (mrp->idx + i++) % ARRAY_SIZE(mrp->list);
1106 gfn_t gfn = _gfn(mrp->list[idx]);
1107
1108 if ( !gfn_eq(gfn, INVALID_GFN) )
1109 {
1110 if ( gfn_x(gfn) & POD_LAST_SUPERPAGE )
1111 {
1112 gfn = _gfn(gfn_x(gfn) & ~POD_LAST_SUPERPAGE);
1113
1114 if ( p2m_pod_zero_check_superpage(p2m, gfn) == 0 )
1115 {
1116 unsigned int x;
1117
1118 for ( x = 0; x < SUPERPAGE_PAGES; ++x, gfn = gfn_add(gfn, 1) )
1119 p2m_pod_zero_check(p2m, &gfn, 1);
1120 }
1121 }
1122 else
1123 p2m_pod_zero_check(p2m, &gfn, 1);
1124
1125 mrp->list[idx] = gfn_x(INVALID_GFN);
1126 }
1127
1128 } while ( (p2m->pod.count == 0) && (i < ARRAY_SIZE(mrp->list)) );
1129 }
1130
1131 static void pod_eager_record(struct p2m_domain *p2m, gfn_t gfn,
1132 unsigned int order)
1133 {
1134 struct pod_mrp_list *mrp = &p2m->pod.mrp;
1135
1136 ASSERT(!gfn_eq(gfn, INVALID_GFN));
1137
1138 mrp->list[mrp->idx++] =
1139 gfn_x(gfn) | (order == PAGE_ORDER_2M ? POD_LAST_SUPERPAGE : 0);
1140 mrp->idx %= ARRAY_SIZE(mrp->list);
1141 }
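
/*
 * Illustrative example (hypothetical gfn value, not from the original file):
 * recording a 2M PoD entry at gfn 0x80200 stores
 *
 *     mrp->list[idx] = 0x80200 | POD_LAST_SUPERPAGE;
 *
 * pod_eager_reclaim() above strips POD_LAST_SUPERPAGE again and first tries
 * p2m_pod_zero_check_superpage() on the aligned gfn, falling back to
 * per-page zero checks if that fails.
 */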
1142
1143 bool
1144 p2m_pod_demand_populate(struct p2m_domain *p2m, gfn_t gfn,
1145 unsigned int order)
1146 {
1147 struct domain *d = p2m->domain;
1148 struct page_info *p = NULL; /* Compiler warnings */
1149 gfn_t gfn_aligned = _gfn((gfn_x(gfn) >> order) << order);
1150 mfn_t mfn;
1151 unsigned long i;
1152
1153 ASSERT(gfn_locked_by_me(p2m, gfn));
1154 pod_lock(p2m);
1155
1156 /*
1157 * This check is done with the pod lock held. This will make sure that
1158 * even if d->is_dying changes under our feet, p2m_pod_empty_cache()
1159 * won't start until we're done.
1160 */
1161 if ( unlikely(d->is_dying) )
1162 goto out_fail;
1163
1164
1165 /*
1166 * Because PoD does not have a cache list for 1GB pages, it has to remap
1167 * the 1GB region in 2MB chunks for a retry.
1168 */
1169 if ( order == PAGE_ORDER_1G )
1170 {
1171 pod_unlock(p2m);
1172 /*
1173 * Note that we are supposed to call p2m_set_entry() 512 times to
1174 * split 1GB into 512 2MB pages here. But we only do it once, because
1175 * p2m_set_entry() should automatically shatter the 1GB page into
1176 * 512 2MB pages; the remaining 511 calls are unnecessary.
1177 *
1178 * NOTE: In a fine-grained p2m locking scenario this operation
1179 * may need to promote its locking from gfn->1g superpage
1180 */
1181 return !p2m_set_entry(p2m, gfn_aligned, INVALID_MFN, PAGE_ORDER_2M,
1182 p2m_populate_on_demand, p2m->default_access);
1183 }
1184
1185 /* Only reclaim if we're in actual need of more cache. */
1186 if ( p2m->pod.entry_count > p2m->pod.count )
1187 pod_eager_reclaim(p2m);
1188
1189 /*
1190 * Only sweep if we're actually out of memory. Doing anything else
1191 * wastes time and needlessly fragments superpages in the p2m.
1192 */
1193 if ( p2m->pod.count == 0 )
1194 p2m_pod_emergency_sweep(p2m);
1195
1196 /* If the sweep failed, give up. */
1197 if ( p2m->pod.count == 0 )
1198 goto out_of_memory;
1199
1200 /* Keep track of the highest gfn demand-populated by a guest fault */
1201 p2m->pod.max_guest = gfn_max(gfn, p2m->pod.max_guest);
1202
1203 /*
1204 * Get a page from the cache. A NULL return value indicates that the
1205 * 2-meg range should be marked singleton PoD, and retried.
1206 */
1207 if ( (p = p2m_pod_cache_get(p2m, order)) == NULL )
1208 goto remap_and_retry;
1209
1210 mfn = page_to_mfn(p);
1211
1212 BUG_ON((mfn_x(mfn) & ((1UL << order) - 1)) != 0);
1213
1214 if ( p2m_set_entry(p2m, gfn_aligned, mfn, order, p2m_ram_rw,
1215 p2m->default_access) )
1216 {
1217 p2m_pod_cache_add(p2m, p, order);
1218 goto out_fail;
1219 }
1220
1221 for( i = 0; i < (1UL << order); i++ )
1222 {
1223 set_gpfn_from_mfn(mfn_x(mfn) + i, gfn_x(gfn_aligned) + i);
1224 paging_mark_dirty(d, mfn_add(mfn, i));
1225 }
1226
1227 p2m->pod.entry_count -= (1UL << order);
1228 BUG_ON(p2m->pod.entry_count < 0);
1229
1230 pod_eager_record(p2m, gfn_aligned, order);
1231
1232 if ( tb_init_done )
1233 {
1234 struct {
1235 u64 gfn, mfn;
1236 int d:16,order:16;
1237 } t;
1238
1239 t.gfn = gfn_x(gfn);
1240 t.mfn = mfn_x(mfn);
1241 t.d = d->domain_id;
1242 t.order = order;
1243
1244 __trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t);
1245 }
1246
1247 pod_unlock(p2m);
1248 return true;
1249 out_of_memory:
1250 pod_unlock(p2m);
1251
1252 printk("%s: Dom%d out of PoD memory! (tot=%"PRIu32" ents=%ld dom%d)\n",
1253 __func__, d->domain_id, d->tot_pages, p2m->pod.entry_count,
1254 current->domain->domain_id);
1255 domain_crash(d);
1256 return false;
1257 out_fail:
1258 pod_unlock(p2m);
1259 return false;
1260 remap_and_retry:
1261 BUG_ON(order != PAGE_ORDER_2M);
1262 pod_unlock(p2m);
1263
1264 /*
1265 * Remap this 2-meg region in singleton chunks. See the comment on the
1266 * 1G page splitting path above for why a single call suffices.
1267 *
1268 * NOTE: In a p2m fine-grained lock scenario this might
1269 * need promoting the gfn lock from gfn->2M superpage.
1270 */
1271 if ( p2m_set_entry(p2m, gfn_aligned, INVALID_MFN, PAGE_ORDER_4K,
1272 p2m_populate_on_demand, p2m->default_access) )
1273 return false;
1274
1275 if ( tb_init_done )
1276 {
1277 struct {
1278 u64 gfn;
1279 int d:16;
1280 } t;
1281
1282 t.gfn = gfn_x(gfn);
1283 t.d = d->domain_id;
1284
1285 __trace_var(TRC_MEM_POD_SUPERPAGE_SPLINTER, 0, sizeof(t), &t);
1286 }
1287
1288 return true;
1289 }
1290
1291
1292 int
1293 guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn_l,
1294 unsigned int order)
1295 {
1296 struct p2m_domain *p2m = p2m_get_hostp2m(d);
1297 gfn_t gfn = _gfn(gfn_l);
1298 unsigned long i, n, pod_count = 0;
1299 int rc = 0;
1300
1301 if ( !paging_mode_translate(d) )
1302 return -EINVAL;
1303
1304 gfn_lock(p2m, gfn, order);
1305
1306 P2M_DEBUG("mark pod gfn=%#lx\n", gfn_l);
1307
1308 /* Make sure all gpfns are unused */
1309 for ( i = 0; i < (1UL << order); i += n )
1310 {
1311 p2m_type_t ot;
1312 p2m_access_t a;
1313 unsigned int cur_order;
1314
1315 p2m->get_entry(p2m, gfn_add(gfn, i), &ot, &a, 0, &cur_order, NULL);
1316 n = 1UL << min(order, cur_order);
1317 if ( p2m_is_ram(ot) )
1318 {
1319 P2M_DEBUG("gfn_to_mfn returned type %d!\n", ot);
1320 rc = -EBUSY;
1321 goto out;
1322 }
1323 else if ( ot == p2m_populate_on_demand )
1324 {
1325 /* Count how many PoD entries we'll be replacing if successful */
1326 pod_count += n;
1327 }
1328 }
1329
1330 /* Now, actually do the two-way mapping */
1331 rc = p2m_set_entry(p2m, gfn, INVALID_MFN, order,
1332 p2m_populate_on_demand, p2m->default_access);
1333 if ( rc == 0 )
1334 {
1335 pod_lock(p2m);
1336 p2m->pod.entry_count += 1UL << order;
1337 p2m->pod.entry_count -= pod_count;
1338 BUG_ON(p2m->pod.entry_count < 0);
1339 pod_unlock(p2m);
1340 }
1341
1342 out:
1343 gfn_unlock(p2m, gfn, order);
1344
1345 return rc;
1346 }
1347
1348