1 /******************************************************************************
2 * page_alloc.c
3 *
4 * Simple buddy heap allocator for Xen.
5 *
6 * Copyright (c) 2002-2004 K A Fraser
7 * Copyright (c) 2006 IBM Ryan Harper <ryanh@us.ibm.com>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; If not, see <http://www.gnu.org/licenses/>.
21 */
22
23 /*
24 * In general Xen maintains two pools of memory:
25 *
26 * - Xen heap: Memory which is always mapped (i.e. accessible by
27 * virtual address), via a permanent and contiguous
28 * "direct mapping". Macros like va() and pa() are valid
29 * for such memory and it is always permissible to stash
30 * pointers to Xen heap memory in data structures etc.
31 *
32 * Xen heap pages are always anonymous (that is, not tied
33 * or accounted to any particular domain).
34 *
35 * - Dom heap: Memory which must be explicitly mapped, usually
36 * transiently with map_domain_page(), in order to be
37 * used. va() and pa() are not valid for such memory. Care
38 * should be taken when stashing pointers to dom heap
39 * pages: the backing mappings must be permanent (e.g. vmap() or
40 * map_domain_page_global()); it is not safe to stash
41 * transient mappings such as those from map_domain_page().
42 *
43 * Dom heap pages are often tied to a particular domain,
44 * but need not be (passing domain==NULL results in an
45 * anonymous dom heap allocation).
46 *
47 * The exact nature of this split is a (sub)arch decision which can
48 * select one of three main variants:
49 *
50 * CONFIG_SEPARATE_XENHEAP=y
51 *
52 * The xen heap is maintained as an entirely separate heap.
53 *
54 * Arch code arranges for some (perhaps small) amount of physical
55 * memory to be covered by a direct mapping and registers that
56 * memory as the Xen heap (via init_xenheap_pages()) and the
57 * remainder as the dom heap.
58 *
59 * This mode of operation is most commonly used by 32-bit arches
60 * where the virtual address space is insufficient to map all RAM.
61 *
62 * CONFIG_SEPARATE_XENHEAP=n W/ DIRECT MAP OF ALL RAM
63 *
64 * All of RAM is covered by a permanent contiguous mapping and there
65 * is only a single heap.
66 *
67 * Memory allocated from the Xen heap is flagged (in
68 * page_info.count_info) with PGC_xen_heap. Memory allocated from
69 * the Dom heap must still be explicitly mapped before use
70 * (e.g. with map_domain_page) in particular in common code.
71 *
72 * xenheap_max_mfn() should not be called by arch code.
73 *
74 * This mode of operation is most commonly used by 64-bit arches
75 * which have sufficient free virtual address space to permanently
76 * map the largest practical amount of RAM currently expected on that
77 * arch.
78 *
79 * CONFIG_SEPARATE_XENHEAP=n W/ DIRECT MAP OF ONLY PARTIAL RAM
80 *
81 * There is a single heap, but only the beginning (up to some
82 * threshold) is covered by a permanent contiguous mapping.
83 *
84 * Memory allocated from the Xen heap is allocated from below the
85 * threshold and flagged with PGC_xen_heap. Memory allocated from
86 * the dom heap is allocated from anywhere in the heap (although it
87 * will prefer to allocate from as high as possible to try and keep
88 * Xen heap suitable memory available).
89 *
90 * Arch code must call xenheap_max_mfn() to signal the limit of the
91 * direct mapping.
92 *
93 * This mode of operation is most commonly used by 64-bit arches
94 * which have a restricted amount of virtual address space available
95 * for a direct map (due to e.g. reservations for other purposes)
96 * such that it is not possible to map all of RAM on systems with
97 * the largest practical amount of RAM currently expected on that
98 * arch.
99 *
100 * Boot Allocator
101 *
102 * In addition to the two primary pools (xen heap and dom heap) a
103 * third "boot allocator" is used at start of day. This is a
104 * simplified allocator which can be used before the heap allocators are ready.
105 *
106 * Typically all memory which is destined to be dom heap memory
107 * (which is everything in the CONFIG_SEPARATE_XENHEAP=n
108 * configurations) is first allocated to the boot allocator (with
109 * init_boot_pages()) and is then handed over to the main dom heap in
110 * end_boot_allocator().
111 *
112 * "Contiguous" mappings
113 *
114 * Note that although the above talks about "contiguous" mappings
115 * some architectures implement a scheme ("PDX compression") to
116 * compress unused portions of the machine address space (i.e. large
117 * gaps between distinct banks of memory) in order to avoid creating
118 * enormous frame tables and direct maps which mostly map
119 * nothing. Thus a contiguous mapping may still have distinct
120 * regions within it.
121 */
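/*
 * Illustrative call flow (a sketch only; the exact sequence and flags are
 * arch and configuration specific):
 *
 *   init_boot_pages(ps, pe);           - hand free RAM to the boot allocator
 *   alloc_boot_pages(nr, align);       - early allocations (frame table etc.)
 *   end_boot_allocator();              - remaining boot pages seed the heaps
 *   alloc_xenheap_pages(order, 0);     - always-mapped Xen heap memory
 *   alloc_domheap_pages(d, order, 0);  - dom heap memory, mapped on demand
 */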
122
123 #include <xen/init.h>
124 #include <xen/types.h>
125 #include <xen/lib.h>
126 #include <xen/sched.h>
127 #include <xen/spinlock.h>
128 #include <xen/mm.h>
129 #include <xen/irq.h>
130 #include <xen/softirq.h>
131 #include <xen/domain_page.h>
132 #include <xen/keyhandler.h>
133 #include <xen/perfc.h>
134 #include <xen/pfn.h>
135 #include <xen/numa.h>
136 #include <xen/nodemask.h>
137 #include <xen/event.h>
138 #include <xen/tmem.h>
139 #include <xen/tmem_xen.h>
140 #include <public/sysctl.h>
141 #include <public/sched.h>
142 #include <asm/page.h>
143 #include <asm/numa.h>
144 #include <asm/flushtlb.h>
145 #ifdef CONFIG_X86
146 #include <asm/guest.h>
147 #include <asm/p2m.h>
148 #include <asm/setup.h> /* for highmem_start only */
149 #else
150 #define p2m_pod_offline_or_broken_hit(pg) 0
151 #define p2m_pod_offline_or_broken_replace(pg) BUG_ON(pg != NULL)
152 #endif
153
154 /*
155 * Comma-separated list of hexadecimal page numbers containing bad bytes.
156 * e.g. 'badpage=0x3f45,0x8a321'.
157 */
158 static char __initdata opt_badpage[100] = "";
159 string_param("badpage", opt_badpage);
160
161 /*
162 * no-bootscrub -> Free pages are not zeroed during boot.
163 */
164 static bool_t opt_bootscrub __initdata = 1;
165 boolean_param("bootscrub", opt_bootscrub);
166
167 /*
168 * bootscrub_chunk -> Amount of bytes to scrub lockstep on non-SMT CPUs
169 * on all NUMA nodes.
170 */
171 static unsigned long __initdata opt_bootscrub_chunk = MB(128);
172 size_param("bootscrub_chunk", opt_bootscrub_chunk);
173
174 #ifdef CONFIG_SCRUB_DEBUG
175 static bool __read_mostly scrub_debug;
176 #else
177 #define scrub_debug false
178 #endif
179
180 /*
181 * Bit width of the DMA heap -- used to override NUMA-node-first
182 * allocation strategy, which can otherwise exhaust low memory.
183 */
184 static unsigned int dma_bitsize;
185 integer_param("dma_bits", dma_bitsize);
186
187 /* Offlined page list, protected by heap_lock. */
188 PAGE_LIST_HEAD(page_offlined_list);
189 /* Broken page list, protected by heap_lock. */
190 PAGE_LIST_HEAD(page_broken_list);
191
192 /*************************
193 * BOOT-TIME ALLOCATOR
194 */
195
196 /*
197 * first_valid_mfn is exported because it is used in ARM-specific NUMA
198 * helpers. See comment in asm-arm/numa.h.
199 */
200 unsigned long first_valid_mfn = ~0UL;
201
202 static struct bootmem_region {
203 unsigned long s, e; /* MFNs @s through @e-1 inclusive are free */
204 } *__initdata bootmem_region_list;
205 static unsigned int __initdata nr_bootmem_regions;
206
207 struct scrub_region {
208 unsigned long offset;
209 unsigned long start;
210 unsigned long per_cpu_sz;
211 unsigned long rem;
212 cpumask_t cpus;
213 };
214 static struct scrub_region __initdata region[MAX_NUMNODES];
215 static unsigned long __initdata chunk_size;
216
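/*
 * The boot allocator tracks free memory as a sorted array of disjoint
 * [s, e) MFN ranges. The array itself lives in the first free page handed
 * to bootmem_region_add() and is therefore limited to a single page's worth
 * of entries.
 */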
217 static void __init bootmem_region_add(unsigned long s, unsigned long e)
218 {
219 unsigned int i;
220
221 if ( (bootmem_region_list == NULL) && (s < e) )
222 bootmem_region_list = mfn_to_virt(s++);
223
224 if ( s >= e )
225 return;
226
227 for ( i = 0; i < nr_bootmem_regions; i++ )
228 if ( s < bootmem_region_list[i].e )
229 break;
230
231 BUG_ON((i < nr_bootmem_regions) && (e > bootmem_region_list[i].s));
232 BUG_ON(nr_bootmem_regions == (PAGE_SIZE / sizeof(struct bootmem_region)));
233
234 memmove(&bootmem_region_list[i+1], &bootmem_region_list[i],
235 (nr_bootmem_regions - i) * sizeof(*bootmem_region_list));
236 bootmem_region_list[i] = (struct bootmem_region) { s, e };
237 nr_bootmem_regions++;
238 }
239
240 static void __init bootmem_region_zap(unsigned long s, unsigned long e)
241 {
242 unsigned int i;
243
244 for ( i = 0; i < nr_bootmem_regions; i++ )
245 {
246 struct bootmem_region *r = &bootmem_region_list[i];
247 if ( e <= r->s )
248 break;
249 if ( s >= r->e )
250 continue;
251 if ( s <= r->s )
252 {
253 r->s = min(e, r->e);
254 }
255 else if ( e >= r->e )
256 {
257 r->e = s;
258 }
259 else
260 {
261 unsigned long _e = r->e;
262 r->e = s;
263 bootmem_region_add(e, _e);
264 }
265 }
266 }
267
268 void __init init_boot_pages(paddr_t ps, paddr_t pe)
269 {
270 unsigned long bad_spfn, bad_epfn;
271 const char *p;
272 #ifdef CONFIG_X86
273 const unsigned long *badpage = NULL;
274 unsigned int i, array_size;
275
276 BUILD_BUG_ON(8 * sizeof(frame_table->u.free.first_dirty) <
277 MAX_ORDER + 1);
278 #endif
279 BUILD_BUG_ON(sizeof(frame_table->u) != sizeof(unsigned long));
280
281 ps = round_pgup(ps);
282 pe = round_pgdown(pe);
283 if ( pe <= ps )
284 return;
285
286 first_valid_mfn = min_t(unsigned long, ps >> PAGE_SHIFT, first_valid_mfn);
287
288 bootmem_region_add(ps >> PAGE_SHIFT, pe >> PAGE_SHIFT);
289
290 #ifdef CONFIG_X86
291 /*
292 * Here we put platform-specific memory range workarounds, i.e.
293 * memory known to be corrupt or otherwise in need of being reserved on
294 * specific platforms.
295 * We retrieve those pages here and remove them from the memory region list.
296 */
297 badpage = get_platform_badpages(&array_size);
298 if ( badpage )
299 {
300 for ( i = 0; i < array_size; i++ )
301 {
302 bootmem_region_zap(*badpage >> PAGE_SHIFT,
303 (*badpage >> PAGE_SHIFT) + 1);
304 badpage++;
305 }
306 }
307
308 if ( xen_guest )
309 {
310 badpage = hypervisor_reserved_pages(&array_size);
311 if ( badpage )
312 {
313 for ( i = 0; i < array_size; i++ )
314 {
315 bootmem_region_zap(*badpage >> PAGE_SHIFT,
316 (*badpage >> PAGE_SHIFT) + 1);
317 badpage++;
318 }
319 }
320 }
321 #endif
322
323 /* Check new pages against the bad-page list. */
324 p = opt_badpage;
325 while ( *p != '\0' )
326 {
327 bad_spfn = simple_strtoul(p, &p, 0);
328 bad_epfn = bad_spfn;
329
330 if ( *p == '-' )
331 {
332 p++;
333 bad_epfn = simple_strtoul(p, &p, 0);
334 if ( bad_epfn < bad_spfn )
335 bad_epfn = bad_spfn;
336 }
337
338 if ( *p == ',' )
339 p++;
340 else if ( *p != '\0' )
341 break;
342
343 bootmem_region_zap(bad_spfn, bad_epfn+1);
344 }
345 }
346
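/*
 * Allocate @nr_pfns contiguous pages from the boot allocator, with the start
 * MFN aligned to @pfn_align. Regions are scanned from highest to lowest and
 * the allocation is carved from the top of a region, keeping lower memory
 * free for callers with stricter address constraints.
 */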
347 mfn_t __init alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align)
348 {
349 unsigned long pg, _e;
350 unsigned int i = nr_bootmem_regions;
351
352 BUG_ON(!nr_bootmem_regions);
353
354 while ( i-- )
355 {
356 struct bootmem_region *r = &bootmem_region_list[i];
357
358 pg = (r->e - nr_pfns) & ~(pfn_align - 1);
359 if ( pg >= r->e || pg < r->s )
360 continue;
361
362 #if defined(CONFIG_X86) && !defined(NDEBUG)
363 /*
364 * Filtering pfn_align == 1 since the only allocations using a bigger
365 * alignment are the ones used for setting up the frame table chunks.
366 * Those allocations get remapped anyway, i.e. them not having 1:1
367 * mappings always accessible is not a problem.
368 */
369 if ( highmem_start && pfn_align == 1 &&
370 r->e > PFN_DOWN(highmem_start) )
371 {
372 pg = r->s;
373 if ( pg + nr_pfns > PFN_DOWN(highmem_start) )
374 continue;
375 r->s = pg + nr_pfns;
376 return _mfn(pg);
377 }
378 #endif
379
380 _e = r->e;
381 r->e = pg;
382 bootmem_region_add(pg + nr_pfns, _e);
383 return _mfn(pg);
384 }
385
386 BUG();
387 }
388
389
390
391 /*************************
392 * BINARY BUDDY ALLOCATOR
393 */
394
395 #define MEMZONE_XEN 0
396 #define NR_ZONES (PADDR_BITS - PAGE_SHIFT + 1)
397
398 #define bits_to_zone(b) (((b) < (PAGE_SHIFT + 1)) ? 1 : ((b) - PAGE_SHIFT))
399 #define page_to_zone(pg) (is_xen_heap_page(pg) ? MEMZONE_XEN : \
400 (flsl(page_to_mfn(pg)) ? : 1))
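/*
 * Zone 0 (MEMZONE_XEN) holds Xen heap pages. For zone z >= 1, a page lives
 * in zone z when the highest set bit of its MFN is bit z-1, so every page
 * in zones 1..z has a machine address below (1UL << (z + PAGE_SHIFT)).
 * Conversely, bits_to_zone(b) gives the highest zone whose pages are all
 * addressable with b address bits.
 */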
401
402 typedef struct page_list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1];
403 static heap_by_zone_and_order_t *_heap[MAX_NUMNODES];
404 #define heap(node, zone, order) ((*_heap[node])[zone][order])
405
406 static unsigned long node_need_scrub[MAX_NUMNODES];
407
408 static unsigned long *avail[MAX_NUMNODES];
409 static long total_avail_pages;
410
411 /* TMEM: Reserve a fraction of memory for mid-size (0<order<9) allocations.*/
412 static long midsize_alloc_zone_pages;
413 #define MIDSIZE_ALLOC_FRAC 128
414
415 static DEFINE_SPINLOCK(heap_lock);
416 static long outstanding_claims; /* total outstanding claims by all domains */
417
418 unsigned long domain_adjust_tot_pages(struct domain *d, long pages)
419 {
420 long dom_before, dom_after, dom_claimed, sys_before, sys_after;
421
422 ASSERT(spin_is_locked(&d->page_alloc_lock));
423 d->tot_pages += pages;
424
425 /*
426 * can test d->outstanding_pages race-free because it can only change
427 * if d->page_alloc_lock and heap_lock are both held, see also
428 * domain_set_outstanding_pages below
429 */
430 if ( !d->outstanding_pages )
431 goto out;
432
433 spin_lock(&heap_lock);
434 /* adjust domain outstanding pages; may not go negative */
435 dom_before = d->outstanding_pages;
436 dom_after = dom_before - pages;
437 BUG_ON(dom_before < 0);
438 dom_claimed = dom_after < 0 ? 0 : dom_after;
439 d->outstanding_pages = dom_claimed;
440 /* flag accounting bug if system outstanding_claims would go negative */
441 sys_before = outstanding_claims;
442 sys_after = sys_before - (dom_before - dom_claimed);
443 BUG_ON(sys_after < 0);
444 outstanding_claims = sys_after;
445 spin_unlock(&heap_lock);
446
447 out:
448 return d->tot_pages;
449 }
450
451 int domain_set_outstanding_pages(struct domain *d, unsigned long pages)
452 {
453 int ret = -ENOMEM;
454 unsigned long claim, avail_pages;
455
456 /*
457 * take the domain's page_alloc_lock, else all d->tot_page adjustments
458 * must always take the global heap_lock rather than only in the much
459 * rarer case that d->outstanding_pages is non-zero
460 */
461 spin_lock(&d->page_alloc_lock);
462 spin_lock(&heap_lock);
463
464 /* pages==0 means "unset" the claim. */
465 if ( pages == 0 )
466 {
467 outstanding_claims -= d->outstanding_pages;
468 d->outstanding_pages = 0;
469 ret = 0;
470 goto out;
471 }
472
473 /* only one active claim per domain please */
474 if ( d->outstanding_pages )
475 {
476 ret = -EINVAL;
477 goto out;
478 }
479
480 /* disallow a claim not exceeding current tot_pages or above max_pages */
481 if ( (pages <= d->tot_pages) || (pages > d->max_pages) )
482 {
483 ret = -EINVAL;
484 goto out;
485 }
486
487 /* how much memory is available? */
488 avail_pages = total_avail_pages;
489
490 /* Note: The usage of claim means that allocation from a guest *might*
491 * have to come from freeable memory. Using free memory is always better, if
492 * it is available, than using freeable memory.
493 *
494 * But that is OK as once the claim has been made, it still can take minutes
495 * before the claim is fully satisfied. Tmem can make use of the unclaimed
496 * pages during this time (to store ephemeral/freeable pages only,
497 * not persistent pages).
498 */
499 avail_pages += tmem_freeable_pages();
500 avail_pages -= outstanding_claims;
501
502 /*
503 * Note, if domain has already allocated memory before making a claim
504 * then the claim must take tot_pages into account
505 */
506 claim = pages - d->tot_pages;
507 if ( claim > avail_pages )
508 goto out;
509
510 /* yay, claim fits in available memory, stake the claim, success! */
511 d->outstanding_pages = claim;
512 outstanding_claims += d->outstanding_pages;
513 ret = 0;
514
515 out:
516 spin_unlock(&heap_lock);
517 spin_unlock(&d->page_alloc_lock);
518 return ret;
519 }
520
521 void get_outstanding_claims(uint64_t *free_pages, uint64_t *outstanding_pages)
522 {
523 spin_lock(&heap_lock);
524 *outstanding_pages = outstanding_claims;
525 *free_pages = avail_domheap_pages();
526 spin_unlock(&heap_lock);
527 }
528
529 static bool_t __read_mostly first_node_initialised;
530 #ifndef CONFIG_SEPARATE_XENHEAP
531 static unsigned int __read_mostly xenheap_bits;
532 #else
533 #define xenheap_bits 0
534 #endif
535
536 static unsigned long init_node_heap(int node, unsigned long mfn,
537 unsigned long nr, bool_t *use_tail)
538 {
539 /* First node to be discovered has its heap metadata statically alloced. */
540 static heap_by_zone_and_order_t _heap_static;
541 static unsigned long avail_static[NR_ZONES];
542 unsigned long needed = (sizeof(**_heap) +
543 sizeof(**avail) * NR_ZONES +
544 PAGE_SIZE - 1) >> PAGE_SHIFT;
545 int i, j;
546
547 if ( !first_node_initialised )
548 {
549 _heap[node] = &_heap_static;
550 avail[node] = avail_static;
551 first_node_initialised = 1;
552 needed = 0;
553 }
554 else if ( *use_tail && nr >= needed &&
555 arch_mfn_in_directmap(mfn + nr) &&
556 (!xenheap_bits ||
557 !((mfn + nr - 1) >> (xenheap_bits - PAGE_SHIFT))) )
558 {
559 _heap[node] = mfn_to_virt(mfn + nr - needed);
560 avail[node] = mfn_to_virt(mfn + nr - 1) +
561 PAGE_SIZE - sizeof(**avail) * NR_ZONES;
562 }
563 else if ( nr >= needed &&
564 arch_mfn_in_directmap(mfn + needed) &&
565 (!xenheap_bits ||
566 !((mfn + needed - 1) >> (xenheap_bits - PAGE_SHIFT))) )
567 {
568 _heap[node] = mfn_to_virt(mfn);
569 avail[node] = mfn_to_virt(mfn + needed - 1) +
570 PAGE_SIZE - sizeof(**avail) * NR_ZONES;
571 *use_tail = 0;
572 }
573 else if ( get_order_from_bytes(sizeof(**_heap)) ==
574 get_order_from_pages(needed) )
575 {
576 _heap[node] = alloc_xenheap_pages(get_order_from_pages(needed), 0);
577 BUG_ON(!_heap[node]);
578 avail[node] = (void *)_heap[node] + (needed << PAGE_SHIFT) -
579 sizeof(**avail) * NR_ZONES;
580 needed = 0;
581 }
582 else
583 {
584 _heap[node] = xmalloc(heap_by_zone_and_order_t);
585 avail[node] = xmalloc_array(unsigned long, NR_ZONES);
586 BUG_ON(!_heap[node] || !avail[node]);
587 needed = 0;
588 }
589
590 memset(avail[node], 0, NR_ZONES * sizeof(long));
591
592 for ( i = 0; i < NR_ZONES; i++ )
593 for ( j = 0; j <= MAX_ORDER; j++ )
594 INIT_PAGE_LIST_HEAD(&heap(node, i, j));
595
596 return needed;
597 }
598
599 /* Default to 64 MiB */
600 #define DEFAULT_LOW_MEM_VIRQ (((paddr_t) 64) << 20)
601 #define MAX_LOW_MEM_VIRQ (((paddr_t) 1024) << 20)
602
603 static paddr_t __read_mostly opt_low_mem_virq = ((paddr_t) -1);
604 size_param("low_mem_virq_limit", opt_low_mem_virq);
605
606 /* Thresholds to control hysteresis. In pages */
607 /* When memory grows above this threshold, reset hysteresis.
608 * -1 initially to not reset until at least one virq issued. */
609 static unsigned long low_mem_virq_high = -1UL;
610 /* Threshold at which we issue virq */
611 static unsigned long low_mem_virq_th = 0;
612 /* Original threshold after all checks completed */
613 static unsigned long low_mem_virq_orig = 0;
614 /* Order for current threshold */
615 static unsigned int low_mem_virq_th_order = 0;
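/*
 * Hysteresis: when available memory drops below low_mem_virq_th,
 * VIRQ_ENOMEM is sent and the threshold is lowered by one order so we do
 * not fire again immediately. Once memory climbs back above
 * low_mem_virq_high, the threshold is raised again, up to its original
 * value (low_mem_virq_orig). See check_low_mem_virq() below.
 */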
616
617 /* Perform bootstrapping checks and set bounds */
618 static void __init setup_low_mem_virq(void)
619 {
620 unsigned int order;
621 paddr_t threshold;
622 bool_t halve;
623
624 /* If the user specifies zero, then he/she doesn't want this virq
625 * to ever trigger. */
626 if ( opt_low_mem_virq == 0 )
627 {
628 low_mem_virq_th = -1UL;
629 return;
630 }
631
632 /* If the user did not specify a knob, remember that */
633 halve = (opt_low_mem_virq == ((paddr_t) -1));
634 threshold = halve ? DEFAULT_LOW_MEM_VIRQ : opt_low_mem_virq;
635
636 /* Dom0 has already been allocated by now. So check we won't be
637 * complaining immediately with whatever's left of the heap. */
638 threshold = min(threshold,
639 ((paddr_t) total_avail_pages) << PAGE_SHIFT);
640
641 /* Then, cap to some predefined maximum */
642 threshold = min(threshold, MAX_LOW_MEM_VIRQ);
643
644 /* If the user specified no knob, and we are at the current available
645 * level, halve the threshold. */
646 if ( halve &&
647 (threshold == (((paddr_t) total_avail_pages) << PAGE_SHIFT)) )
648 threshold >>= 1;
649
650 /* Zero? Have to fire immediately */
651 threshold = max(threshold, (paddr_t) PAGE_SIZE);
652
653 /* Threshold bytes -> pages */
654 low_mem_virq_th = threshold >> PAGE_SHIFT;
655
656 /* Next, round the threshold down to the next order */
657 order = get_order_from_pages(low_mem_virq_th);
658 if ( (1UL << order) > low_mem_virq_th )
659 order--;
660
661 /* Set bounds, ready to go */
662 low_mem_virq_th = low_mem_virq_orig = 1UL << order;
663 low_mem_virq_th_order = order;
664
665 printk("Initial low memory virq threshold set at %#lx pages.\n",
666 low_mem_virq_th);
667 }
668
669 static void check_low_mem_virq(void)
670 {
671 unsigned long avail_pages = total_avail_pages +
672 tmem_freeable_pages() - outstanding_claims;
673
674 if ( unlikely(avail_pages <= low_mem_virq_th) )
675 {
676 send_global_virq(VIRQ_ENOMEM);
677
678 /* Update thresholds. Next warning will be when we drop below
679 * next order. However, we wait until we grow beyond one
680 * order above us to complain again at the current order */
681 low_mem_virq_high = 1UL << (low_mem_virq_th_order + 1);
682 if ( low_mem_virq_th_order > 0 )
683 low_mem_virq_th_order--;
684 low_mem_virq_th = 1UL << low_mem_virq_th_order;
685 return;
686 }
687
688 if ( unlikely(avail_pages >= low_mem_virq_high) )
689 {
690 /* Reset hysteresis. Bring threshold up one order.
691 * If we are back where originally set, set high
692 * threshold to -1 to avoid further growth of
693 * virq threshold. */
694 low_mem_virq_th_order++;
695 low_mem_virq_th = 1UL << low_mem_virq_th_order;
696 if ( low_mem_virq_th == low_mem_virq_orig )
697 low_mem_virq_high = -1UL;
698 else
699 low_mem_virq_high = 1UL << (low_mem_virq_th_order + 2);
700 }
701 }
702
703 /* Pages that need a scrub are added to tail, otherwise to head. */
704 static void page_list_add_scrub(struct page_info *pg, unsigned int node,
705 unsigned int zone, unsigned int order,
706 unsigned int first_dirty)
707 {
708 PFN_ORDER(pg) = order;
709 pg->u.free.first_dirty = first_dirty;
710 pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING;
711
712 if ( first_dirty != INVALID_DIRTY_IDX )
713 {
714 ASSERT(first_dirty < (1U << order));
715 page_list_add_tail(pg, &heap(node, zone, order));
716 }
717 else
718 page_list_add(pg, &heap(node, zone, order));
719 }
720
721 /* SCRUB_PATTERN needs to be a repeating series of bytes. */
722 #ifndef NDEBUG
723 #define SCRUB_PATTERN 0xc2c2c2c2c2c2c2c2ULL
724 #else
725 #define SCRUB_PATTERN 0ULL
726 #endif
727 #define SCRUB_BYTE_PATTERN (SCRUB_PATTERN & 0xff)
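/*
 * With CONFIG_SCRUB_DEBUG, a page freed without scrubbing is poisoned with
 * ~SCRUB_PATTERN by poison_one_page(); a page that is supposed to be clean
 * is expected to contain SCRUB_PATTERN, which check_one_page() verifies
 * when such a page is handed out.
 */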
728
729 static void poison_one_page(struct page_info *pg)
730 {
731 #ifdef CONFIG_SCRUB_DEBUG
732 mfn_t mfn = _mfn(page_to_mfn(pg));
733 uint64_t *ptr;
734
735 if ( !scrub_debug )
736 return;
737
738 ptr = map_domain_page(mfn);
739 *ptr = ~SCRUB_PATTERN;
740 unmap_domain_page(ptr);
741 #endif
742 }
743
744 static void check_one_page(struct page_info *pg)
745 {
746 #ifdef CONFIG_SCRUB_DEBUG
747 mfn_t mfn = _mfn(page_to_mfn(pg));
748 const uint64_t *ptr;
749 unsigned int i;
750
751 if ( !scrub_debug )
752 return;
753
754 ptr = map_domain_page(mfn);
755 for ( i = 0; i < PAGE_SIZE / sizeof (*ptr); i++ )
756 BUG_ON(ptr[i] != SCRUB_PATTERN);
757 unmap_domain_page(ptr);
758 #endif
759 }
760
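/*
 * Handshake with scrub_free_pages(): if this buddy is currently being
 * scrubbed, request an abort by setting BUDDY_SCRUB_ABORT and spin until
 * the scrubber notices, records a consistent first_dirty and resets the
 * state to BUDDY_NOT_SCRUBBING (either in its scrub loop or via the
 * scrub_continue() lock callback).
 */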
761 static void check_and_stop_scrub(struct page_info *head)
762 {
763 if ( head->u.free.scrub_state == BUDDY_SCRUBBING )
764 {
765 typeof(head->u.free) pgfree;
766
767 head->u.free.scrub_state = BUDDY_SCRUB_ABORT;
768 spin_lock_kick();
769 for ( ; ; )
770 {
771 /* Can't ACCESS_ONCE() a bitfield. */
772 pgfree.val = ACCESS_ONCE(head->u.free.val);
773 if ( pgfree.scrub_state != BUDDY_SCRUB_ABORT )
774 break;
775 cpu_relax();
776 }
777 }
778 }
779
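/*
 * Find a free buddy of at least 2^@order pages. The search starts at the
 * requested node (or the domain's affinity / the local node when none is
 * given), walking zones from @zone_hi down to @zone_lo and orders from
 * @order upwards. Unless MEMF_exact_node is set, other online nodes are
 * tried before giving up. Unscrubbed buddies are returned only for order-0
 * requests or when MEMF_no_scrub is set.
 */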
780 static struct page_info *get_free_buddy(unsigned int zone_lo,
781 unsigned int zone_hi,
782 unsigned int order, unsigned int memflags,
783 const struct domain *d)
784 {
785 nodeid_t first_node, node = MEMF_get_node(memflags), req_node = node;
786 nodemask_t nodemask = d ? d->node_affinity : node_online_map;
787 unsigned int j, zone, nodemask_retry = 0;
788 struct page_info *pg;
789 bool use_unscrubbed = (memflags & MEMF_no_scrub);
790
791 if ( node == NUMA_NO_NODE )
792 {
793 if ( d != NULL )
794 {
795 node = next_node(d->last_alloc_node, nodemask);
796 if ( node >= MAX_NUMNODES )
797 node = first_node(nodemask);
798 }
799 if ( node >= MAX_NUMNODES )
800 node = cpu_to_node(smp_processor_id());
801 }
802 else if ( unlikely(node >= MAX_NUMNODES) )
803 {
804 ASSERT_UNREACHABLE();
805 return NULL;
806 }
807 first_node = node;
808
809 /*
810 * Start with requested node, but exhaust all node memory in requested
811 * zone before failing, only calc new node value if we fail to find memory
812 * in target node, this avoids needless computation on fast-path.
813 */
814 for ( ; ; )
815 {
816 zone = zone_hi;
817 do {
818 /* Check if target node can support the allocation. */
819 if ( !avail[node] || (avail[node][zone] < (1UL << order)) )
820 continue;
821
822 /* Find smallest order which can satisfy the request. */
823 for ( j = order; j <= MAX_ORDER; j++ )
824 {
825 if ( (pg = page_list_remove_head(&heap(node, zone, j))) )
826 {
827 if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX )
828 return pg;
829 /*
830 * We grab single pages (order=0) even if they are
831 * unscrubbed. Given that scrubbing one page is fairly quick
832 * it is not worth breaking higher orders.
833 */
834 if ( (order == 0) || use_unscrubbed )
835 {
836 check_and_stop_scrub(pg);
837 return pg;
838 }
839
840 page_list_add_tail(pg, &heap(node, zone, j));
841 }
842 }
843 } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
844
845 if ( (memflags & MEMF_exact_node) && req_node != NUMA_NO_NODE )
846 return NULL;
847
848 /* Pick next node. */
849 if ( !node_isset(node, nodemask) )
850 {
851 /* Very first node may be caller-specified and outside nodemask. */
852 ASSERT(!nodemask_retry);
853 first_node = node = first_node(nodemask);
854 if ( node < MAX_NUMNODES )
855 continue;
856 }
857 else if ( (node = next_node(node, nodemask)) >= MAX_NUMNODES )
858 node = first_node(nodemask);
859 if ( node == first_node )
860 {
861 /* When we have tried all in nodemask, we fall back to others. */
862 if ( (memflags & MEMF_exact_node) || nodemask_retry++ )
863 return NULL;
864 nodes_andnot(nodemask, node_online_map, nodemask);
865 first_node = node = first_node(nodemask);
866 if ( node >= MAX_NUMNODES )
867 return NULL;
868 }
869 }
870 }
871
872 /* Allocate 2^@order contiguous pages. */
873 static struct page_info *alloc_heap_pages(
874 unsigned int zone_lo, unsigned int zone_hi,
875 unsigned int order, unsigned int memflags,
876 struct domain *d)
877 {
878 nodeid_t node;
879 unsigned int i, buddy_order, zone, first_dirty;
880 unsigned long request = 1UL << order;
881 struct page_info *pg;
882 bool need_tlbflush = false;
883 uint32_t tlbflush_timestamp = 0;
884 unsigned int dirty_cnt = 0;
885
886 /* Make sure there are enough bits in memflags for nodeID. */
887 BUILD_BUG_ON((_MEMF_bits - _MEMF_node) < (8 * sizeof(nodeid_t)));
888
889 ASSERT(zone_lo <= zone_hi);
890 ASSERT(zone_hi < NR_ZONES);
891
892 if ( unlikely(order > MAX_ORDER) )
893 return NULL;
894
895 spin_lock(&heap_lock);
896
897 /*
898 * Claimed memory is considered unavailable unless the request
899 * is made by a domain with sufficient unclaimed pages.
900 */
901 if ( (outstanding_claims + request >
902 total_avail_pages + tmem_freeable_pages()) &&
903 ((memflags & MEMF_no_refcount) ||
904 !d || d->outstanding_pages < request) )
905 {
906 spin_unlock(&heap_lock);
907 return NULL;
908 }
909
910 /*
911 * TMEM: When available memory is scarce due to tmem absorbing it, allow
912 * only mid-size allocations to avoid worst of fragmentation issues.
913 * Others try tmem pools then fail. This is a workaround until all
914 * post-dom0-creation-multi-page allocations can be eliminated.
915 */
916 if ( ((order == 0) || (order >= 9)) &&
917 (total_avail_pages <= midsize_alloc_zone_pages) &&
918 tmem_freeable_pages() )
919 {
920 /* Try to free memory from tmem. */
921 pg = tmem_relinquish_pages(order, memflags);
922 spin_unlock(&heap_lock);
923 return pg;
924 }
925
926 pg = get_free_buddy(zone_lo, zone_hi, order, memflags, d);
927 /* Try getting a dirty buddy if we couldn't get a clean one. */
928 if ( !pg && !(memflags & MEMF_no_scrub) )
929 pg = get_free_buddy(zone_lo, zone_hi, order,
930 memflags | MEMF_no_scrub, d);
931 if ( !pg )
932 {
933 /* No suitable memory blocks. Fail the request. */
934 spin_unlock(&heap_lock);
935 return NULL;
936 }
937
938 node = phys_to_nid(page_to_maddr(pg));
939 zone = page_to_zone(pg);
940 buddy_order = PFN_ORDER(pg);
941
942 first_dirty = pg->u.free.first_dirty;
943
944 /* We may have to halve the chunk a number of times. */
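/*
 * Example (illustrative values): buddy_order = 3, order = 1, first_dirty = 5.
 * The clean lower half (pages 0-3) goes back to the heap with
 * first_dirty = INVALID_DIRTY_IDX, pages 4-5 go back with first_dirty = 1
 * (page 5 is the first possibly-dirty one), and pages 6-7 are returned to
 * the caller with first_dirty = 0, i.e. possibly dirty from the start.
 */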
945 while ( buddy_order != order )
946 {
947 buddy_order--;
948 page_list_add_scrub(pg, node, zone, buddy_order,
949 (1U << buddy_order) > first_dirty ?
950 first_dirty : INVALID_DIRTY_IDX);
951 pg += 1U << buddy_order;
952
953 if ( first_dirty != INVALID_DIRTY_IDX )
954 {
955 /* Adjust first_dirty */
956 if ( first_dirty >= 1U << buddy_order )
957 first_dirty -= 1U << buddy_order;
958 else
959 first_dirty = 0; /* We've moved past original first_dirty */
960 }
961 }
962
963 ASSERT(avail[node][zone] >= request);
964 avail[node][zone] -= request;
965 total_avail_pages -= request;
966 ASSERT(total_avail_pages >= 0);
967
968 check_low_mem_virq();
969
970 if ( d != NULL )
971 d->last_alloc_node = node;
972
973 for ( i = 0; i < (1 << order); i++ )
974 {
975 /* Reference count must continuously be zero for free pages. */
976 BUG_ON((pg[i].count_info & ~PGC_need_scrub) != PGC_state_free);
977
978 /* PGC_need_scrub can only be set if first_dirty is valid */
979 ASSERT(first_dirty != INVALID_DIRTY_IDX || !(pg[i].count_info & PGC_need_scrub));
980
981 /* Preserve PGC_need_scrub so we can check it after lock is dropped. */
982 pg[i].count_info = PGC_state_inuse | (pg[i].count_info & PGC_need_scrub);
983
984 if ( !(memflags & MEMF_no_tlbflush) )
985 accumulate_tlbflush(&need_tlbflush, &pg[i],
986 &tlbflush_timestamp);
987
988 /* Initialise fields which have other uses for free pages. */
989 pg[i].u.inuse.type_info = 0;
990 page_set_owner(&pg[i], NULL);
991
992 /* Ensure cache and RAM are consistent for platforms where the
993 * guest can control its own visibility of/through the cache.
994 */
995 flush_page_to_ram(page_to_mfn(&pg[i]), !(memflags & MEMF_no_icache_flush));
996 }
997
998 spin_unlock(&heap_lock);
999
1000 if ( first_dirty != INVALID_DIRTY_IDX ||
1001 (scrub_debug && !(memflags & MEMF_no_scrub)) )
1002 {
1003 for ( i = 0; i < (1U << order); i++ )
1004 {
1005 if ( test_bit(_PGC_need_scrub, &pg[i].count_info) )
1006 {
1007 if ( !(memflags & MEMF_no_scrub) )
1008 scrub_one_page(&pg[i]);
1009
1010 dirty_cnt++;
1011
1012 spin_lock(&heap_lock);
1013 pg[i].count_info &= ~PGC_need_scrub;
1014 spin_unlock(&heap_lock);
1015 }
1016 else if ( !(memflags & MEMF_no_scrub) )
1017 check_one_page(&pg[i]);
1018 }
1019
1020 if ( dirty_cnt )
1021 {
1022 spin_lock(&heap_lock);
1023 node_need_scrub[node] -= dirty_cnt;
1024 spin_unlock(&heap_lock);
1025 }
1026 }
1027
1028 if ( need_tlbflush )
1029 filtered_flush_tlb_mask(tlbflush_timestamp);
1030
1031 return pg;
1032 }
1033
1034 /* Remove any offlined page in the buddy pointed to by head. */
1035 static int reserve_offlined_page(struct page_info *head)
1036 {
1037 unsigned int node = phys_to_nid(page_to_maddr(head));
1038 int zone = page_to_zone(head), i, head_order = PFN_ORDER(head), count = 0;
1039 struct page_info *cur_head;
1040 unsigned int cur_order, first_dirty;
1041
1042 ASSERT(spin_is_locked(&heap_lock));
1043
1044 cur_head = head;
1045
1046 check_and_stop_scrub(head);
1047 /*
1048 * We may break the buddy so let's mark the head as clean. Then, when
1049 * merging chunks back into the heap, we will see whether the chunk has
1050 * unscrubbed pages and set its first_dirty properly.
1051 */
1052 first_dirty = head->u.free.first_dirty;
1053 head->u.free.first_dirty = INVALID_DIRTY_IDX;
1054
1055 page_list_del(head, &heap(node, zone, head_order));
1056
1057 while ( cur_head < (head + (1 << head_order)) )
1058 {
1059 struct page_info *pg;
1060 int next_order;
1061
1062 if ( page_state_is(cur_head, offlined) )
1063 {
1064 cur_head++;
1065 if ( first_dirty != INVALID_DIRTY_IDX && first_dirty )
1066 first_dirty--;
1067 continue;
1068 }
1069
1070 next_order = cur_order = 0;
1071
1072 while ( cur_order < head_order )
1073 {
1074 next_order = cur_order + 1;
1075
1076 if ( (cur_head + (1 << next_order)) >= (head + ( 1 << head_order)) )
1077 goto merge;
1078
1079 for ( i = (1 << cur_order), pg = cur_head + (1 << cur_order );
1080 i < (1 << next_order);
1081 i++, pg++ )
1082 if ( page_state_is(pg, offlined) )
1083 break;
1084 if ( i == ( 1 << next_order) )
1085 {
1086 cur_order = next_order;
1087 continue;
1088 }
1089 else
1090 {
1091 merge:
1092 /* We don't consider merging outside the head_order. */
1093 page_list_add_scrub(cur_head, node, zone, cur_order,
1094 (1U << cur_order) > first_dirty ?
1095 first_dirty : INVALID_DIRTY_IDX);
1096 cur_head += (1 << cur_order);
1097
1098 /* Adjust first_dirty if needed. */
1099 if ( first_dirty != INVALID_DIRTY_IDX )
1100 {
1101 if ( first_dirty >= 1U << cur_order )
1102 first_dirty -= 1U << cur_order;
1103 else
1104 first_dirty = 0;
1105 }
1106
1107 break;
1108 }
1109 }
1110 }
1111
1112 for ( cur_head = head; cur_head < head + ( 1UL << head_order); cur_head++ )
1113 {
1114 if ( !page_state_is(cur_head, offlined) )
1115 continue;
1116
1117 avail[node][zone]--;
1118 total_avail_pages--;
1119 ASSERT(total_avail_pages >= 0);
1120
1121 page_list_add_tail(cur_head,
1122 test_bit(_PGC_broken, &cur_head->count_info) ?
1123 &page_broken_list : &page_offlined_list);
1124
1125 count++;
1126 }
1127
1128 return count;
1129 }
1130
1131 static nodemask_t node_scrubbing;
1132
1133 /*
1134 * If get_node is true this will return the closest node that needs to be scrubbed,
1135 * with the appropriate bit in node_scrubbing set.
1136 * If get_node is not set, this will return *a* node that needs to be scrubbed;
1137 * the node_scrubbing bitmask will not be updated.
1138 * If no node needs scrubbing then NUMA_NO_NODE is returned.
1139 */
1140 static unsigned int node_to_scrub(bool get_node)
1141 {
1142 nodeid_t node = cpu_to_node(smp_processor_id()), local_node;
1143 nodeid_t closest = NUMA_NO_NODE;
1144 u8 dist, shortest = 0xff;
1145
1146 if ( node == NUMA_NO_NODE )
1147 node = 0;
1148
1149 if ( node_need_scrub[node] &&
1150 (!get_node || !node_test_and_set(node, node_scrubbing)) )
1151 return node;
1152
1153 /*
1154 * See if there are memory-only nodes that need scrubbing and choose
1155 * the closest one.
1156 */
1157 local_node = node;
1158 for ( ; ; )
1159 {
1160 do {
1161 node = cycle_node(node, node_online_map);
1162 } while ( !cpumask_empty(&node_to_cpumask(node)) &&
1163 (node != local_node) );
1164
1165 if ( node == local_node )
1166 break;
1167
1168 if ( node_need_scrub[node] )
1169 {
1170 if ( !get_node )
1171 return node;
1172
1173 dist = __node_distance(local_node, node);
1174
1175 /*
1176 * Grab the node right away. If we find a closer node later we will
1177 * release this one. While there is a chance that another CPU will
1178 * not be able to scrub that node when it is searching for scrub work
1179 * at the same time, it will be able to do so next time it wakes up.
1180 * The alternative would be to perform this search under a lock but
1181 * then we'd need to take this lock every time we come in here.
1182 */
1183 if ( (dist < shortest || closest == NUMA_NO_NODE) &&
1184 !node_test_and_set(node, node_scrubbing) )
1185 {
1186 if ( closest != NUMA_NO_NODE )
1187 node_clear(closest, node_scrubbing);
1188 shortest = dist;
1189 closest = node;
1190 }
1191 }
1192 }
1193
1194 return closest;
1195 }
1196
1197 struct scrub_wait_state {
1198 struct page_info *pg;
1199 unsigned int first_dirty;
1200 bool drop;
1201 };
1202
1203 static void scrub_continue(void *data)
1204 {
1205 struct scrub_wait_state *st = data;
1206
1207 if ( st->drop )
1208 return;
1209
1210 if ( st->pg->u.free.scrub_state == BUDDY_SCRUB_ABORT )
1211 {
1212 /* There is a waiter for this buddy. Release it. */
1213 st->drop = true;
1214 st->pg->u.free.first_dirty = st->first_dirty;
1215 smp_wmb();
1216 st->pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING;
1217 }
1218 }
1219
1220 bool scrub_free_pages(void)
1221 {
1222 struct page_info *pg;
1223 unsigned int zone;
1224 unsigned int cpu = smp_processor_id();
1225 bool preempt = false;
1226 nodeid_t node;
1227 unsigned int cnt = 0;
1228
1229 node = node_to_scrub(true);
1230 if ( node == NUMA_NO_NODE )
1231 return false;
1232
1233 spin_lock(&heap_lock);
1234
1235 for ( zone = 0; zone < NR_ZONES; zone++ )
1236 {
1237 unsigned int order = MAX_ORDER;
1238
1239 do {
1240 while ( !page_list_empty(&heap(node, zone, order)) )
1241 {
1242 unsigned int i, dirty_cnt;
1243 struct scrub_wait_state st;
1244
1245 /* Unscrubbed pages are always at the end of the list. */
1246 pg = page_list_last(&heap(node, zone, order));
1247 if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX )
1248 break;
1249
1250 ASSERT(pg->u.free.scrub_state == BUDDY_NOT_SCRUBBING);
1251 pg->u.free.scrub_state = BUDDY_SCRUBBING;
1252
1253 spin_unlock(&heap_lock);
1254
1255 dirty_cnt = 0;
1256
1257 for ( i = pg->u.free.first_dirty; i < (1U << order); i++)
1258 {
1259 if ( test_bit(_PGC_need_scrub, &pg[i].count_info) )
1260 {
1261 scrub_one_page(&pg[i]);
1262 /*
1263 * We can modify count_info without holding heap
1264 * lock since we effectively locked this buddy by
1265 * setting its scrub_state.
1266 */
1267 pg[i].count_info &= ~PGC_need_scrub;
1268 dirty_cnt++;
1269 cnt += 100; /* scrubbed pages add heavier weight. */
1270 }
1271 else
1272 cnt++;
1273
1274 if ( pg->u.free.scrub_state == BUDDY_SCRUB_ABORT )
1275 {
1276 /* Someone wants this chunk. Drop everything. */
1277
1278 pg->u.free.first_dirty = (i == (1U << order) - 1) ?
1279 INVALID_DIRTY_IDX : i + 1;
1280 smp_wmb();
1281 pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING;
1282
1283 spin_lock(&heap_lock);
1284 node_need_scrub[node] -= dirty_cnt;
1285 spin_unlock(&heap_lock);
1286 goto out_nolock;
1287 }
1288
1289 /*
1290 * Scrub a few (8) pages before becoming eligible for
1291 * preemption. But also count non-scrubbing loop iterations
1292 * so that we don't get stuck here with an almost clean
1293 * heap.
1294 */
1295 if ( cnt > 800 && softirq_pending(cpu) )
1296 {
1297 preempt = true;
1298 break;
1299 }
1300 }
1301
1302 st.pg = pg;
1303 /*
1304 * get_free_buddy() grabs a buddy with first_dirty set to
1305 * INVALID_DIRTY_IDX so we can't set pg's first_dirty here.
1306 * It will be set either below or in the lock callback (in
1307 * scrub_continue()).
1308 */
1309 st.first_dirty = (i >= (1U << order) - 1) ?
1310 INVALID_DIRTY_IDX : i + 1;
1311 st.drop = false;
1312 spin_lock_cb(&heap_lock, scrub_continue, &st);
1313
1314 node_need_scrub[node] -= dirty_cnt;
1315
1316 if ( st.drop )
1317 goto out;
1318
1319 if ( i >= (1U << order) - 1 )
1320 {
1321 page_list_del(pg, &heap(node, zone, order));
1322 page_list_add_scrub(pg, node, zone, order, INVALID_DIRTY_IDX);
1323 }
1324 else
1325 pg->u.free.first_dirty = i + 1;
1326
1327 pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING;
1328
1329 if ( preempt || (node_need_scrub[node] == 0) )
1330 goto out;
1331 }
1332 } while ( order-- != 0 );
1333 }
1334
1335 out:
1336 spin_unlock(&heap_lock);
1337
1338 out_nolock:
1339 node_clear(node, node_scrubbing);
1340 return node_to_scrub(false) != NUMA_NO_NODE;
1341 }
1342
1343 /* Free 2^@order set of pages. */
1344 static void free_heap_pages(
1345 struct page_info *pg, unsigned int order, bool need_scrub)
1346 {
1347 unsigned long mask, mfn = page_to_mfn(pg);
1348 unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
1349 unsigned int zone = page_to_zone(pg);
1350
1351 ASSERT(order <= MAX_ORDER);
1352 ASSERT(node >= 0);
1353
1354 spin_lock(&heap_lock);
1355
1356 for ( i = 0; i < (1 << order); i++ )
1357 {
1358 /*
1359 * Cannot assume that count_info == 0, as there are some corner cases
1360 * where it isn't the case and yet it isn't a bug:
1361 * 1. page_get_owner() is NULL
1362 * 2. page_get_owner() is a domain that was never accessible by
1363 * its domid (e.g., failed to fully construct the domain).
1364 * 3. page was never addressable by the guest (e.g., it's an
1365 * auto-translate-physmap guest and the page was never included
1366 * in its pseudophysical address space).
1367 * In all the above cases there can be no guest mappings of this page.
1368 */
1369 ASSERT(!page_state_is(&pg[i], offlined));
1370 pg[i].count_info =
1371 ((pg[i].count_info & PGC_broken) |
1372 (page_state_is(&pg[i], offlining)
1373 ? PGC_state_offlined : PGC_state_free));
1374 if ( page_state_is(&pg[i], offlined) )
1375 tainted = 1;
1376
1377 /* If a page has no owner it will need no safety TLB flush. */
1378 pg[i].u.free.need_tlbflush = (page_get_owner(&pg[i]) != NULL);
1379 if ( pg[i].u.free.need_tlbflush )
1380 page_set_tlbflush_timestamp(&pg[i]);
1381
1382 /* This page is not a guest frame any more. */
1383 page_set_owner(&pg[i], NULL); /* set_gpfn_from_mfn snoops pg owner */
1384 set_gpfn_from_mfn(mfn + i, INVALID_M2P_ENTRY);
1385
1386 if ( need_scrub )
1387 {
1388 pg[i].count_info |= PGC_need_scrub;
1389 poison_one_page(&pg[i]);
1390 }
1391 }
1392
1393 avail[node][zone] += 1 << order;
1394 total_avail_pages += 1 << order;
1395 if ( need_scrub )
1396 {
1397 node_need_scrub[node] += 1 << order;
1398 pg->u.free.first_dirty = 0;
1399 }
1400 else
1401 pg->u.free.first_dirty = INVALID_DIRTY_IDX;
1402
1403 if ( tmem_enabled() )
1404 midsize_alloc_zone_pages = max(
1405 midsize_alloc_zone_pages, total_avail_pages / MIDSIZE_ALLOC_FRAC);
1406
1407 /* Merge chunks as far as possible. */
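/*
 * At each order the buddy of @pg is found by flipping bit @order of its MFN:
 * if that bit is set the buddy is the block immediately below (predecessor),
 * otherwise the block immediately above (successor). Merging stops as soon
 * as the buddy is not a free block of the same order on the same node.
 */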
1408 while ( order < MAX_ORDER )
1409 {
1410 mask = 1UL << order;
1411
1412 if ( (page_to_mfn(pg) & mask) )
1413 {
1414 struct page_info *predecessor = pg - mask;
1415
1416 /* Merge with predecessor block? */
1417 if ( !mfn_valid(_mfn(page_to_mfn(predecessor))) ||
1418 !page_state_is(predecessor, free) ||
1419 (PFN_ORDER(predecessor) != order) ||
1420 (phys_to_nid(page_to_maddr(predecessor)) != node) )
1421 break;
1422
1423 check_and_stop_scrub(predecessor);
1424
1425 page_list_del(predecessor, &heap(node, zone, order));
1426
1427 /* Keep predecessor's first_dirty if it is already set. */
1428 if ( predecessor->u.free.first_dirty == INVALID_DIRTY_IDX &&
1429 pg->u.free.first_dirty != INVALID_DIRTY_IDX )
1430 predecessor->u.free.first_dirty = (1U << order) +
1431 pg->u.free.first_dirty;
1432
1433 pg = predecessor;
1434 }
1435 else
1436 {
1437 struct page_info *successor = pg + mask;
1438
1439 /* Merge with successor block? */
1440 if ( !mfn_valid(_mfn(page_to_mfn(successor))) ||
1441 !page_state_is(successor, free) ||
1442 (PFN_ORDER(successor) != order) ||
1443 (phys_to_nid(page_to_maddr(successor)) != node) )
1444 break;
1445
1446 check_and_stop_scrub(successor);
1447
1448 page_list_del(successor, &heap(node, zone, order));
1449 }
1450
1451 order++;
1452 }
1453
1454 page_list_add_scrub(pg, node, zone, order, pg->u.free.first_dirty);
1455
1456 if ( tainted )
1457 reserve_offlined_page(pg);
1458
1459 spin_unlock(&heap_lock);
1460 }
1461
1462
1463 /*
1464 * Following rules applied for page offline:
1465 * Once a page is broken, it can't be assigned anymore
1466 * A page will be offlined only if it is free
1467 * return original count_info
1468 */
1469 static unsigned long mark_page_offline(struct page_info *pg, int broken)
1470 {
1471 unsigned long nx, x, y = pg->count_info;
1472
1473 ASSERT(page_is_ram_type(page_to_mfn(pg), RAM_TYPE_CONVENTIONAL));
1474 ASSERT(spin_is_locked(&heap_lock));
1475
1476 do {
1477 nx = x = y;
1478
1479 if ( ((x & PGC_state) != PGC_state_offlined) &&
1480 ((x & PGC_state) != PGC_state_offlining) )
1481 {
1482 nx &= ~PGC_state;
1483 nx |= (((x & PGC_state) == PGC_state_free)
1484 ? PGC_state_offlined : PGC_state_offlining);
1485 }
1486
1487 if ( broken )
1488 nx |= PGC_broken;
1489
1490 if ( x == nx )
1491 break;
1492 } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
1493
1494 return y;
1495 }
1496
1497 static int reserve_heap_page(struct page_info *pg)
1498 {
1499 struct page_info *head = NULL;
1500 unsigned int i, node = phys_to_nid(page_to_maddr(pg));
1501 unsigned int zone = page_to_zone(pg);
1502
1503 for ( i = 0; i <= MAX_ORDER; i++ )
1504 {
1505 struct page_info *tmp;
1506
1507 if ( page_list_empty(&heap(node, zone, i)) )
1508 continue;
1509
1510 page_list_for_each_safe ( head, tmp, &heap(node, zone, i) )
1511 {
1512 if ( (head <= pg) &&
1513 (head + (1UL << i) > pg) )
1514 return reserve_offlined_page(head);
1515 }
1516 }
1517
1518 return -EINVAL;
1519
1520 }
1521
1522 int offline_page(unsigned long mfn, int broken, uint32_t *status)
1523 {
1524 unsigned long old_info = 0;
1525 struct domain *owner;
1526 struct page_info *pg;
1527
1528 if ( !mfn_valid(_mfn(mfn)) )
1529 {
1530 dprintk(XENLOG_WARNING,
1531 "try to offline page out of range %lx\n", mfn);
1532 return -EINVAL;
1533 }
1534
1535 *status = 0;
1536 pg = mfn_to_page(mfn);
1537
1538 if ( is_xen_fixed_mfn(mfn) )
1539 {
1540 *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_FAILED |
1541 (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
1542 return -EPERM;
1543 }
1544
1545 /*
1546 * N.B. Xen's txt in x86_64 is marked reserved and handled already.
1547 * The kexec range is reserved as well.
1548 */
1549 if ( !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
1550 {
1551 *status = PG_OFFLINE_FAILED | PG_OFFLINE_NOT_CONV_RAM;
1552 return -EINVAL;
1553 }
1554
1555 /*
1556 * NB. When a broken page belongs to a guest, the hypervisor will usually
1557 * notify the guest to handle the broken page. However, the hypervisor
1558 * needs to prevent a malicious guest from accessing the broken page again.
1559 * In that case the hypervisor shuts the guest down, preventing recursive MCEs.
1560 */
1561 if ( (pg->count_info & PGC_broken) && (owner = page_get_owner(pg)) )
1562 {
1563 *status = PG_OFFLINE_AGAIN;
1564 domain_shutdown(owner, SHUTDOWN_crash);
1565 return 0;
1566 }
1567
1568 spin_lock(&heap_lock);
1569
1570 old_info = mark_page_offline(pg, broken);
1571
1572 if ( page_state_is(pg, offlined) )
1573 {
1574 reserve_heap_page(pg);
1575
1576 spin_unlock(&heap_lock);
1577
1578 *status = broken ? PG_OFFLINE_OFFLINED | PG_OFFLINE_BROKEN
1579 : PG_OFFLINE_OFFLINED;
1580 return 0;
1581 }
1582
1583 spin_unlock(&heap_lock);
1584
1585 if ( (owner = page_get_owner_and_reference(pg)) )
1586 {
1587 if ( p2m_pod_offline_or_broken_hit(pg) )
1588 {
1589 put_page(pg);
1590 p2m_pod_offline_or_broken_replace(pg);
1591 *status = PG_OFFLINE_OFFLINED;
1592 }
1593 else
1594 {
1595 *status = PG_OFFLINE_OWNED | PG_OFFLINE_PENDING |
1596 (owner->domain_id << PG_OFFLINE_OWNER_SHIFT);
1597 /* Release the reference since it will not be allocated anymore */
1598 put_page(pg);
1599 }
1600 }
1601 else if ( old_info & PGC_xen_heap )
1602 {
1603 *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_PENDING |
1604 (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
1605 }
1606 else
1607 {
1608 /*
1609 * assign_pages does not hold heap_lock, so there is a small window in
1610 * which the owner may be set later. Note that the owner can only change
1611 * from NULL to non-NULL, not vice versa, since the page is offlining now.
1612 * There is no such window if called from the #MC handler, since all CPUs
1613 * are in softirq context. If called from user space (e.g. CE handling),
1614 * tools can wait some time before calling again.
1615 */
1616 *status = PG_OFFLINE_ANONYMOUS | PG_OFFLINE_FAILED |
1617 (DOMID_INVALID << PG_OFFLINE_OWNER_SHIFT );
1618 }
1619
1620 if ( broken )
1621 *status |= PG_OFFLINE_BROKEN;
1622
1623 return 0;
1624 }
1625
1626 /*
1627 * Online the memory.
1628 * The caller should make sure end_pfn <= max_page,
1629 * if not, expand_pages() should be called prior to online_page().
1630 */
1631 unsigned int online_page(unsigned long mfn, uint32_t *status)
1632 {
1633 unsigned long x, nx, y;
1634 struct page_info *pg;
1635 int ret;
1636
1637 if ( !mfn_valid(_mfn(mfn)) )
1638 {
1639 dprintk(XENLOG_WARNING, "call expand_pages() first\n");
1640 return -EINVAL;
1641 }
1642
1643 pg = mfn_to_page(mfn);
1644
1645 spin_lock(&heap_lock);
1646
1647 y = pg->count_info;
1648 do {
1649 ret = *status = 0;
1650
1651 if ( y & PGC_broken )
1652 {
1653 ret = -EINVAL;
1654 *status = PG_ONLINE_FAILED |PG_ONLINE_BROKEN;
1655 break;
1656 }
1657
1658 if ( (y & PGC_state) == PGC_state_offlined )
1659 {
1660 page_list_del(pg, &page_offlined_list);
1661 *status = PG_ONLINE_ONLINED;
1662 }
1663 else if ( (y & PGC_state) == PGC_state_offlining )
1664 {
1665 *status = PG_ONLINE_ONLINED;
1666 }
1667 else
1668 {
1669 break;
1670 }
1671
1672 x = y;
1673 nx = (x & ~PGC_state) | PGC_state_inuse;
1674 } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
1675
1676 spin_unlock(&heap_lock);
1677
1678 if ( (y & PGC_state) == PGC_state_offlined )
1679 free_heap_pages(pg, 0, false);
1680
1681 return ret;
1682 }
1683
1684 int query_page_offline(unsigned long mfn, uint32_t *status)
1685 {
1686 struct page_info *pg;
1687
1688 if ( !mfn_valid(_mfn(mfn)) || !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
1689 {
1690 dprintk(XENLOG_WARNING, "call expand_pages() first\n");
1691 return -EINVAL;
1692 }
1693
1694 *status = 0;
1695 spin_lock(&heap_lock);
1696
1697 pg = mfn_to_page(mfn);
1698
1699 if ( page_state_is(pg, offlining) )
1700 *status |= PG_OFFLINE_STATUS_OFFLINE_PENDING;
1701 if ( pg->count_info & PGC_broken )
1702 *status |= PG_OFFLINE_STATUS_BROKEN;
1703 if ( page_state_is(pg, offlined) )
1704 *status |= PG_OFFLINE_STATUS_OFFLINED;
1705
1706 spin_unlock(&heap_lock);
1707
1708 return 0;
1709 }
1710
1711 /*
1712 * Hand the specified arbitrary page range to the specified heap zone
1713 * checking the node_id of the previous page. If they differ and the
1714 * latter is not on a MAX_ORDER boundary, then we reserve the page by
1715 * not freeing it to the buddy allocator.
1716 */
1717 static void init_heap_pages(
1718 struct page_info *pg, unsigned long nr_pages)
1719 {
1720 unsigned long i;
1721
1722 /*
1723 * Some pages may not go through the boot allocator (e.g. reserved
1724 * memory at boot but released just after --- kernel, initramfs,
1725 * etc.).
1726 * Update first_valid_mfn to ensure those regions are covered.
1727 */
1728 spin_lock(&heap_lock);
1729 first_valid_mfn = min_t(unsigned long, page_to_mfn(pg), first_valid_mfn);
1730 spin_unlock(&heap_lock);
1731
1732 for ( i = 0; i < nr_pages; i++ )
1733 {
1734 unsigned int nid = phys_to_nid(page_to_maddr(pg+i));
1735
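/*
 * First page seen on this node: set up the node's heap metadata. The
 * metadata may be carved out of this very range (from its tail when
 * use_tail allows, otherwise from its head); pages consumed that way are
 * skipped rather than freed below.
 */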
1736 if ( unlikely(!avail[nid]) )
1737 {
1738 unsigned long s = page_to_mfn(pg + i);
1739 unsigned long e = page_to_mfn(pg + nr_pages - 1) + 1;
1740 bool_t use_tail = (nid == phys_to_nid(pfn_to_paddr(e - 1))) &&
1741 !(s & ((1UL << MAX_ORDER) - 1)) &&
1742 (find_first_set_bit(e) <= find_first_set_bit(s));
1743 unsigned long n;
1744
1745 n = init_node_heap(nid, page_to_mfn(pg+i), nr_pages - i,
1746 &use_tail);
1747 BUG_ON(i + n > nr_pages);
1748 if ( n && !use_tail )
1749 {
1750 i += n - 1;
1751 continue;
1752 }
1753 if ( i + n == nr_pages )
1754 break;
1755 nr_pages -= n;
1756 }
1757
1758 free_heap_pages(pg + i, 0, scrub_debug);
1759 }
1760 }
1761
1762 static unsigned long avail_heap_pages(
1763 unsigned int zone_lo, unsigned int zone_hi, unsigned int node)
1764 {
1765 unsigned int i, zone;
1766 unsigned long free_pages = 0;
1767
1768 if ( zone_hi >= NR_ZONES )
1769 zone_hi = NR_ZONES - 1;
1770
1771 for_each_online_node(i)
1772 {
1773 if ( !avail[i] )
1774 continue;
1775 for ( zone = zone_lo; zone <= zone_hi; zone++ )
1776 if ( (node == -1) || (node == i) )
1777 free_pages += avail[i][zone];
1778 }
1779
1780 return free_pages;
1781 }
1782
1783 unsigned long total_free_pages(void)
1784 {
1785 return total_avail_pages - midsize_alloc_zone_pages;
1786 }
1787
1788 void __init end_boot_allocator(void)
1789 {
1790 unsigned int i;
1791
1792 /* Pages that are free now go to the domain sub-allocator. */
1793 for ( i = 0; i < nr_bootmem_regions; i++ )
1794 {
1795 struct bootmem_region *r = &bootmem_region_list[i];
1796 if ( (r->s < r->e) &&
1797 (phys_to_nid(pfn_to_paddr(r->s)) == cpu_to_node(0)) )
1798 {
1799 init_heap_pages(mfn_to_page(r->s), r->e - r->s);
1800 r->e = r->s;
1801 break;
1802 }
1803 }
1804 for ( i = nr_bootmem_regions; i-- > 0; )
1805 {
1806 struct bootmem_region *r = &bootmem_region_list[i];
1807 if ( r->s < r->e )
1808 init_heap_pages(mfn_to_page(r->s), r->e - r->s);
1809 }
1810 nr_bootmem_regions = 0;
1811 init_heap_pages(virt_to_page(bootmem_region_list), 1);
1812
1813 if ( !dma_bitsize && (num_online_nodes() > 1) )
1814 dma_bitsize = arch_get_dma_bitsize();
1815
1816 printk("Domain heap initialised");
1817 if ( dma_bitsize )
1818 printk(" DMA width %u bits", dma_bitsize);
1819 printk("\n");
1820 }
1821
1822 static void __init smp_scrub_heap_pages(void *data)
1823 {
1824 unsigned long mfn, start, end;
1825 struct page_info *pg;
1826 struct scrub_region *r;
1827 unsigned int temp_cpu, cpu_idx = 0;
1828 nodeid_t node;
1829 unsigned int cpu = smp_processor_id();
1830
1831 if ( data )
1832 r = data;
1833 else
1834 {
1835 node = cpu_to_node(cpu);
1836 if ( node == NUMA_NO_NODE )
1837 return;
1838 r = &region[node];
1839 }
1840
1841 /* Determine the current CPU's index into CPU's linked to this node. */
1842 for_each_cpu ( temp_cpu, &r->cpus )
1843 {
1844 if ( cpu == temp_cpu )
1845 break;
1846 cpu_idx++;
1847 }
1848
1849 /* Calculate the starting mfn for this CPU's memory block. */
1850 start = r->start + (r->per_cpu_sz * cpu_idx) + r->offset;
1851
1852 /* Calculate the end mfn into this CPU's memory block for this iteration. */
1853 if ( r->offset + chunk_size >= r->per_cpu_sz )
1854 {
1855 end = r->start + (r->per_cpu_sz * cpu_idx) + r->per_cpu_sz;
1856
1857 if ( r->rem && (cpumask_weight(&r->cpus) - 1 == cpu_idx) )
1858 end += r->rem;
1859 }
1860 else
1861 end = start + chunk_size;
1862
1863 for ( mfn = start; mfn < end; mfn++ )
1864 {
1865 pg = mfn_to_page(mfn);
1866
1867 /* Check the mfn is valid and page is free. */
1868 if ( !mfn_valid(_mfn(mfn)) || !page_state_is(pg, free) )
1869 continue;
1870
1871 scrub_one_page(pg);
1872 }
1873 }
1874
1875 static int __init find_non_smt(unsigned int node, cpumask_t *dest)
1876 {
1877 cpumask_t node_cpus;
1878 unsigned int i, cpu;
1879
1880 cpumask_and(&node_cpus, &node_to_cpumask(node), &cpu_online_map);
1881 cpumask_clear(dest);
1882 for_each_cpu ( i, &node_cpus )
1883 {
1884 if ( cpumask_intersects(dest, per_cpu(cpu_sibling_mask, i)) )
1885 continue;
1886 cpu = cpumask_first(per_cpu(cpu_sibling_mask, i));
1887 __cpumask_set_cpu(cpu, dest);
1888 }
1889 return cpumask_weight(dest);
1890 }

/*
 * Scrub all unallocated pages in all heap zones. This function uses all
 * online CPUs to scrub the memory in parallel (a worked example of how the
 * work is split follows the function).
 */
static void __init scrub_heap_pages(void)
{
    cpumask_t node_cpus, all_worker_cpus;
    unsigned int i, j;
    unsigned long offset, max_per_cpu_sz = 0;
    unsigned long start, end;
    unsigned long rem = 0;
    int last_distance, best_node;
    int cpus;

    cpumask_clear(&all_worker_cpus);
    /* Scrub block size. */
    chunk_size = opt_bootscrub_chunk >> PAGE_SHIFT;
    if ( chunk_size == 0 )
        chunk_size = MB(128) >> PAGE_SHIFT;

    /* Round #0 - figure out amounts and which CPUs to use. */
    for_each_online_node ( i )
    {
        if ( !node_spanned_pages(i) )
            continue;
        /* Calculate node memory start and end address. */
        start = max(node_start_pfn(i), first_valid_mfn);
        end = min(node_start_pfn(i) + node_spanned_pages(i), max_page);
        /* Just in case the node has 1 page and starts below first_valid_mfn. */
        end = max(end, start);
        /* CPUs that are online and on this node (if none, that is OK). */
        cpus = find_non_smt(i, &node_cpus);
        cpumask_or(&all_worker_cpus, &all_worker_cpus, &node_cpus);
        if ( cpus <= 0 )
        {
            /* No CPUs on this node. Round #2 will take care of it. */
            rem = 0;
            region[i].per_cpu_sz = (end - start);
        }
        else
        {
            rem = (end - start) % cpus;
            region[i].per_cpu_sz = (end - start) / cpus;
            if ( region[i].per_cpu_sz > max_per_cpu_sz )
                max_per_cpu_sz = region[i].per_cpu_sz;
        }
        region[i].start = start;
        region[i].rem = rem;
        cpumask_copy(&region[i].cpus, &node_cpus);
    }

    printk("Scrubbing Free RAM on %d nodes using %d CPUs\n", num_online_nodes(),
           cpumask_weight(&all_worker_cpus));

    /* Round #1 - do NUMA nodes with CPUs. */
    for ( offset = 0; offset < max_per_cpu_sz; offset += chunk_size )
    {
        for_each_online_node ( i )
            region[i].offset = offset;

        process_pending_softirqs();

        spin_lock(&heap_lock);
        on_selected_cpus(&all_worker_cpus, smp_scrub_heap_pages, NULL, 1);
        spin_unlock(&heap_lock);

        printk(".");
    }

    /*
     * Round #2: NUMA nodes with no CPUs of their own get scrubbed by CPUs
     * on the closest node that does have CPUs.
     */
    for_each_online_node ( i )
    {
        node_cpus = node_to_cpumask(i);

        if ( !cpumask_empty(&node_cpus) )
            continue;

        last_distance = INT_MAX;
        best_node = first_node(node_online_map);
        /* Figure out which node's CPUs are closest. */
        for_each_online_node ( j )
        {
            u8 distance;

            if ( cpumask_empty(&node_to_cpumask(j)) )
                continue;

            distance = __node_distance(i, j);
            if ( (distance < last_distance) && (distance != NUMA_NO_DISTANCE) )
            {
                last_distance = distance;
                best_node = j;
            }
        }
        /*
         * Use CPUs from the best node; if there are no CPUs on the
         * first node (the default) use the BSP.
         */
        cpus = find_non_smt(best_node, &node_cpus);
        if ( cpus == 0 )
        {
            __cpumask_set_cpu(smp_processor_id(), &node_cpus);
            cpus = 1;
        }
        /* We already have the node information from round #0. */
        region[i].rem = region[i].per_cpu_sz % cpus;
        region[i].per_cpu_sz /= cpus;
        max_per_cpu_sz = region[i].per_cpu_sz;
        cpumask_copy(&region[i].cpus, &node_cpus);

        for ( offset = 0; offset < max_per_cpu_sz; offset += chunk_size )
        {
            region[i].offset = offset;

            process_pending_softirqs();

            spin_lock(&heap_lock);
            on_selected_cpus(&node_cpus, smp_scrub_heap_pages, &region[i], 1);
            spin_unlock(&heap_lock);

            printk(".");
        }
    }

    printk("done.\n");

#ifdef CONFIG_SCRUB_DEBUG
    scrub_debug = true;
#endif
}
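
/*
 * Worked example for the round #0 split above (illustrative numbers): a node
 * spanning 0x200000 pages with 3 non-SMT worker CPUs gets
 * per_cpu_sz = 0x200000 / 3 = 0xaaaaa and rem = 2; the last of the three CPUs
 * scrubs the two remainder pages at the end of its final chunk.  A node with
 * no online CPUs keeps per_cpu_sz equal to its whole span and is only
 * scrubbed in round #2, by workers borrowed from the closest node that does
 * have CPUs.
 */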

void __init heap_init_late(void)
{
    /*
     * Now that the heap is initialized, set bounds
     * for the low mem virq algorithm.
     */
    setup_low_mem_virq();

    if ( opt_bootscrub )
        scrub_heap_pages();
}


/*************************
 * XEN-HEAP SUB-ALLOCATOR
 */

#if defined(CONFIG_SEPARATE_XENHEAP)

void init_xenheap_pages(paddr_t ps, paddr_t pe)
{
    ps = round_pgup(ps);
    pe = round_pgdown(pe);
    if ( pe <= ps )
        return;

    /*
     * Yuk! Ensure there is a one-page buffer between Xen and Dom zones, to
     * prevent merging of power-of-two blocks across the zone boundary.
     */
    if ( ps && !is_xen_heap_mfn(paddr_to_pfn(ps)-1) )
        ps += PAGE_SIZE;
    if ( !is_xen_heap_mfn(paddr_to_pfn(pe)) )
        pe -= PAGE_SIZE;

    memguard_guard_range(maddr_to_virt(ps), pe - ps);

    init_heap_pages(maddr_to_page(ps), (pe - ps) >> PAGE_SHIFT);
}


void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
{
    struct page_info *pg;

    ASSERT(!in_irq());

    pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN,
                          order, memflags | MEMF_no_scrub, NULL);
    if ( unlikely(pg == NULL) )
        return NULL;

    memguard_unguard_range(page_to_virt(pg), 1 << (order + PAGE_SHIFT));

    return page_to_virt(pg);
}


void free_xenheap_pages(void *v, unsigned int order)
{
    ASSERT(!in_irq());

    if ( v == NULL )
        return;

    memguard_guard_range(v, 1 << (order + PAGE_SHIFT));

    free_heap_pages(virt_to_page(v), order, false);
}

#else

void __init xenheap_max_mfn(unsigned long mfn)
{
    ASSERT(!first_node_initialised);
    ASSERT(!xenheap_bits);
    BUILD_BUG_ON(PADDR_BITS >= BITS_PER_LONG);
    xenheap_bits = min(flsl(mfn + 1) - 1 + PAGE_SHIFT, PADDR_BITS);
    printk(XENLOG_INFO "Xen heap: %u bits\n", xenheap_bits);
}
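
/*
 * Example (illustrative numbers): if the direct map covers MFNs up to and
 * including 0xfffff (i.e. 4GiB worth of 4KiB pages), then
 * flsl(0x100000) - 1 + PAGE_SHIFT == 20 + 12 == 32, so xenheap_bits becomes
 * min(32, PADDR_BITS) and Xen heap allocations are steered below 2^32 via
 * the MEMF_bits() clamping in alloc_xenheap_pages() below.
 */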

void init_xenheap_pages(paddr_t ps, paddr_t pe)
{
    init_domheap_pages(ps, pe);
}

void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
{
    struct page_info *pg;
    unsigned int i;

    ASSERT(!in_irq());

    if ( xenheap_bits && (memflags >> _MEMF_bits) > xenheap_bits )
        memflags &= ~MEMF_bits(~0U);
    if ( !(memflags >> _MEMF_bits) )
        memflags |= MEMF_bits(xenheap_bits);

    pg = alloc_domheap_pages(NULL, order, memflags | MEMF_no_scrub);
    if ( unlikely(pg == NULL) )
        return NULL;

    for ( i = 0; i < (1u << order); i++ )
        pg[i].count_info |= PGC_xen_heap;

    return page_to_virt(pg);
}
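
/*
 * Illustrative examples of the MEMF_bits() handling above, assuming
 * xenheap_bits == 32:
 *
 *   alloc_xenheap_pages(0, MEMF_bits(40)); // 40 > 32: request dropped, then
 *                                          // defaulted to MEMF_bits(32)
 *   alloc_xenheap_pages(0, 0);             // no width given: MEMF_bits(32)
 *   alloc_xenheap_pages(0, MEMF_bits(30)); // 30 <= 32: honoured as-is
 *
 * Either way the pages come from memory the (possibly partial) direct map
 * can reach, so the page_to_virt() above is valid.
 */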

void free_xenheap_pages(void *v, unsigned int order)
{
    struct page_info *pg;
    unsigned int i;

    ASSERT(!in_irq());

    if ( v == NULL )
        return;

    pg = virt_to_page(v);

    for ( i = 0; i < (1u << order); i++ )
        pg[i].count_info &= ~PGC_xen_heap;

    free_heap_pages(pg, order, true);
}

#endif



/*************************
 * DOMAIN-HEAP SUB-ALLOCATOR
 */

void init_domheap_pages(paddr_t ps, paddr_t pe)
{
    unsigned long smfn, emfn;

    ASSERT(!in_irq());

    smfn = round_pgup(ps) >> PAGE_SHIFT;
    emfn = round_pgdown(pe) >> PAGE_SHIFT;

    if ( emfn <= smfn )
        return;

    init_heap_pages(mfn_to_page(smfn), emfn - smfn);
}


int assign_pages(
    struct domain *d,
    struct page_info *pg,
    unsigned int order,
    unsigned int memflags)
{
    int rc = 0;
    unsigned long i;

    spin_lock(&d->page_alloc_lock);

    if ( unlikely(d->is_dying) )
    {
        gdprintk(XENLOG_INFO, "Cannot assign page to domain%d -- dying.\n",
                 d->domain_id);
        rc = -EINVAL;
        goto out;
    }

    if ( !(memflags & MEMF_no_refcount) )
    {
        if ( unlikely((d->tot_pages + (1 << order)) > d->max_pages) )
        {
            if ( !tmem_enabled() || order != 0 || d->tot_pages != d->max_pages )
                gprintk(XENLOG_INFO, "Over-allocation for domain %u: "
                        "%u > %u\n", d->domain_id,
                        d->tot_pages + (1 << order), d->max_pages);
            rc = -E2BIG;
            goto out;
        }

        if ( unlikely(d->tot_pages == 0) )
            get_knownalive_domain(d);

        domain_adjust_tot_pages(d, 1 << order);
    }

    for ( i = 0; i < (1 << order); i++ )
    {
        ASSERT(page_get_owner(&pg[i]) == NULL);
        ASSERT((pg[i].count_info & ~(PGC_allocated | 1)) == 0);
        page_set_owner(&pg[i], d);
        smp_wmb(); /* Domain pointer must be visible before updating refcnt. */
        pg[i].count_info = PGC_allocated | 1;
        page_list_add_tail(&pg[i], &d->page_list);
    }

 out:
    spin_unlock(&d->page_alloc_lock);
    return rc;
}
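
/*
 * Example of the over-allocation check above (illustrative values): with
 * d->max_pages == 1024 and d->tot_pages == 1020, an order-3 request would
 * take the domain to 1028 pages, so the call fails with -E2BIG.  A caller
 * passing MEMF_no_refcount skips that check entirely: the pages still end up
 * owned by and linked to the domain, but are not counted in d->tot_pages.
 */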


struct page_info *alloc_domheap_pages(
    struct domain *d, unsigned int order, unsigned int memflags)
{
    struct page_info *pg = NULL;
    unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1;
    unsigned int dma_zone;

    ASSERT(!in_irq());

    bits = domain_clamp_alloc_bitsize(memflags & MEMF_no_owner ? NULL : d,
                                      bits ? : (BITS_PER_LONG+PAGE_SHIFT));
    if ( (zone_hi = min_t(unsigned int, bits_to_zone(bits), zone_hi)) == 0 )
        return NULL;

    if ( memflags & MEMF_no_owner )
        memflags |= MEMF_no_refcount;

    if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) )
        pg = alloc_heap_pages(dma_zone + 1, zone_hi, order, memflags, d);

    if ( (pg == NULL) &&
         ((memflags & MEMF_no_dma) ||
          ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi, order,
                                  memflags, d)) == NULL)) )
         return NULL;

    if ( d && !(memflags & MEMF_no_owner) &&
         assign_pages(d, pg, order, memflags) )
    {
        free_heap_pages(pg, order, memflags & MEMF_no_scrub);
        return NULL;
    }

    return pg;
}
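
/*
 * Illustrative example of the zone selection above (assuming PAGE_SHIFT == 12,
 * dma_bitsize == 30, and the usual bits_to_zone() mapping of an address width
 * to width - PAGE_SHIFT): bits_to_zone(30) == 18, so the first attempt only
 * considers zones 19..zone_hi, i.e. memory at or above 1GiB.  Only if that
 * fails, and MEMF_no_dma is not set, does the second attempt widen the range
 * to MEMZONE_XEN+1..zone_hi, dipping into the scarcer low/DMA memory.
 */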

void free_domheap_pages(struct page_info *pg, unsigned int order)
{
    struct domain *d = page_get_owner(pg);
    unsigned int i;
    bool_t drop_dom_ref;

    ASSERT(!in_irq());

    if ( unlikely(is_xen_heap_page(pg)) )
    {
        /* NB. May recursively lock from relinquish_memory(). */
        spin_lock_recursive(&d->page_alloc_lock);

        for ( i = 0; i < (1 << order); i++ )
            arch_free_heap_page(d, &pg[i]);

        d->xenheap_pages -= 1 << order;
        drop_dom_ref = (d->xenheap_pages == 0);

        spin_unlock_recursive(&d->page_alloc_lock);
    }
    else
    {
        bool_t scrub;

        if ( likely(d) && likely(d != dom_cow) )
        {
            /* NB. May recursively lock from relinquish_memory(). */
            spin_lock_recursive(&d->page_alloc_lock);

            for ( i = 0; i < (1 << order); i++ )
            {
                BUG_ON((pg[i].u.inuse.type_info & PGT_count_mask) != 0);
                arch_free_heap_page(d, &pg[i]);
            }

            drop_dom_ref = !domain_adjust_tot_pages(d, -(1 << order));

            spin_unlock_recursive(&d->page_alloc_lock);

            /*
             * Normally we expect a domain to clear pages before freeing them,
             * if it cares about the secrecy of their contents. However, after
             * a domain has died we assume responsibility for erasure.
             */
            scrub = d->is_dying || scrub_debug;
        }
        else
        {
            /*
             * All we need to check is that on dom_cow only order-0 chunks
             * make it here. Due to the if() above, the only two possible
             * cases right now are d == NULL and d == dom_cow. To protect
             * against relaxation of that if() condition without updating the
             * check here, don't check d != dom_cow for now.
             */
            ASSERT(!d || !order);
            drop_dom_ref = 0;
            scrub = 1;
        }

        free_heap_pages(pg, order, scrub);
    }

    if ( drop_dom_ref )
        put_domain(d);
}

unsigned long avail_domheap_pages_region(
    unsigned int node, unsigned int min_width, unsigned int max_width)
{
    int zone_lo, zone_hi;

    zone_lo = min_width ? bits_to_zone(min_width) : (MEMZONE_XEN + 1);
    zone_lo = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_lo));

    zone_hi = max_width ? bits_to_zone(max_width) : (NR_ZONES - 1);
    zone_hi = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_hi));

    return avail_heap_pages(zone_lo, zone_hi, node);
}

unsigned long avail_domheap_pages(void)
{
    return avail_heap_pages(MEMZONE_XEN + 1,
                            NR_ZONES - 1,
                            -1);
}

unsigned long avail_node_heap_pages(unsigned int nodeid)
{
    return avail_heap_pages(MEMZONE_XEN, NR_ZONES - 1, nodeid);
}


static void pagealloc_info(unsigned char key)
{
    unsigned int zone = MEMZONE_XEN;
    unsigned long n, total = 0;

    printk("Physical memory information:\n");
    printk(" Xen heap: %lukB free\n",
           avail_heap_pages(zone, zone, -1) << (PAGE_SHIFT-10));

    while ( ++zone < NR_ZONES )
    {
        if ( (zone + PAGE_SHIFT) == dma_bitsize )
        {
            printk(" DMA heap: %lukB free\n", total << (PAGE_SHIFT-10));
            total = 0;
        }

        if ( (n = avail_heap_pages(zone, zone, -1)) != 0 )
        {
            total += n;
            printk(" heap[%02u]: %lukB free\n", zone, n << (PAGE_SHIFT-10));
        }
    }

    printk(" Dom heap: %lukB free\n", total << (PAGE_SHIFT-10));
}

static __init int pagealloc_keyhandler_init(void)
{
    register_keyhandler('m', pagealloc_info, "memory info", 1);
    return 0;
}
__initcall(pagealloc_keyhandler_init);


void scrub_one_page(struct page_info *pg)
{
    if ( unlikely(pg->count_info & PGC_broken) )
        return;

#ifndef NDEBUG
    /* Avoid callers relying on allocations returning zeroed pages. */
    unmap_domain_page(memset(__map_domain_page(pg),
                             SCRUB_BYTE_PATTERN, PAGE_SIZE));
#else
    /* For a production build, clear_page() is the fastest way to scrub. */
    clear_domain_page(_mfn(page_to_mfn(pg)));
#endif
}

static void dump_heap(unsigned char key)
{
    s_time_t now = NOW();
    int i, j;

    printk("'%c' pressed -> dumping heap info (now-0x%X:%08X)\n", key,
           (u32)(now>>32), (u32)now);

    for ( i = 0; i < MAX_NUMNODES; i++ )
    {
        if ( !avail[i] )
            continue;
        for ( j = 0; j < NR_ZONES; j++ )
            printk("heap[node=%d][zone=%d] -> %lu pages\n",
                   i, j, avail[i][j]);
    }

    for ( i = 0; i < MAX_NUMNODES; i++ )
    {
        if ( !node_need_scrub[i] )
            continue;
        printk("Node %d has %lu unscrubbed pages\n", i, node_need_scrub[i]);
    }
}

static __init int register_heap_trigger(void)
{
    register_keyhandler('H', dump_heap, "dump heap info", 1);
    return 0;
}
__initcall(register_heap_trigger);

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */