1 /******************************************************************************
2  * page_alloc.c
3  *
4  * Simple buddy heap allocator for Xen.
5  *
6  * Copyright (c) 2002-2004 K A Fraser
7  * Copyright (c) 2006 IBM Ryan Harper <ryanh@us.ibm.com>
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of the GNU General Public License as published by
11  * the Free Software Foundation; either version 2 of the License, or
12  * (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; If not, see <http://www.gnu.org/licenses/>.
21  */
22 
23 /*
24  * In general Xen maintains two pools of memory:
25  *
26  * - Xen heap: Memory which is always mapped (i.e. accessible by
27  *             virtual address), via a permanent and contiguous
28  *             "direct mapping". Macros like va() and pa() are valid
29  *             for such memory and it is always permissible to stash
30  *             pointers to Xen heap memory in data structures etc.
31  *
32  *             Xen heap pages are always anonymous (that is, not tied
33  *             or accounted to any particular domain).
34  *
35  * - Dom heap: Memory which must be explicitly mapped, usually
36  *             transiently with map_domain_page(), in order to be
37  *             used. va() and pa() are not valid for such memory. Care
38  *             should be taken when stashing pointers to dom heap
39  *             pages that those mappings are permanent (e.g. vmap() or
40  *             map_domain_page_global()); it is not safe to stash
41  *             transient mappings such as those from map_domain_page().
42  *
43  *             Dom heap pages are often tied to a particular domain,
44  *             but need not be (passing domain==NULL results in an
45  *             anonymous dom heap allocation).
46  *
47  * The exact nature of this split is a (sub)arch decision which can
48  * select one of three main variants:
49  *
50  * CONFIG_SEPARATE_XENHEAP=y
51  *
52  *   The xen heap is maintained as an entirely separate heap.
53  *
54  *   Arch code arranges for some (perhaps small) amount of physical
55  *   memory to be covered by a direct mapping and registers that
56  *   memory as the Xen heap (via init_xenheap_pages()) and the
57  *   remainder as the dom heap.
58  *
59  *   This mode of operation is most commonly used by 32-bit arches
60  *   where the virtual address space is insufficient to map all RAM.
61  *
62  * CONFIG_SEPARATE_XENHEAP=n W/ DIRECT MAP OF ALL RAM
63  *
64  *   All of RAM is covered by a permanent contiguous mapping and there
65  *   is only a single heap.
66  *
67  *   Memory allocated from the Xen heap is flagged (in
68  *   page_info.count_info) with PGC_xen_heap. Memory allocated from
69  *   the Dom heap must still be explicitly mapped before use
70  *   (e.g. with map_domain_page) in particular in common code.
71  *
72  *   xenheap_max_mfn() should not be called by arch code.
73  *
74  *   This mode of operation is most commonly used by 64-bit arches
75  *   which have sufficient free virtual address space to permanently
76  *   map the largest practical amount of RAM currently expected on that
77  *   arch.
78  *
79  * CONFIG_SEPARATE_XENHEAP=n W/ DIRECT MAP OF ONLY PARTIAL RAM
80  *
81  *   There is a single heap, but only the beginning (up to some
82  *   threshold) is covered by a permanent contiguous mapping.
83  *
84  *   Memory allocated from the Xen heap is allocated from below the
85  *   threshold and flagged with PGC_xen_heap. Memory allocated from
86  *   the dom heap is allocated from anywhere in the heap (although it
87  *   will prefer to allocate from as high as possible to try and keep
88  *   Xen heap suitable memory available).
89  *
90  *   Arch code must call xenheap_max_mfn() to signal the limit of the
91  *   direct mapping.
92  *
93  *   This mode of operation is most commonly used by 64-bit arches
94  *   which have a restricted amount of virtual address space available
95  *   for a direct map (due to e.g. reservations for other purposes)
96  *   such that it is not possible to map all of RAM on systems with
97  *   the largest practical amount of RAM currently expected on that
98  *   arch.
99  *
100  * Boot Allocator
101  *
102  *   In addition to the two primary pools (xen heap and dom heap) a
103  *   third "boot allocator" is used at start of day. This is a
104  *   simplified allocator which can be used before the main heaps are ready.
105  *
106  *   Typically all memory which is destined to be dom heap memory
107  *   (which is everything in the CONFIG_SEPARATE_XENHEAP=n
108  *   configurations) is first allocated to the boot allocator (with
109  *   init_boot_pages()) and is then handed over to the main dom heap in
110  *   end_boot_allocator().
111  *
112  * "Contiguous" mappings
113  *
114  *   Note that although the above talks about "contiguous" mappings
115  *   some architectures implement a scheme ("PDX compression") to
116  *   compress unused portions of the machine address space (i.e. large
117  *   gaps between distinct banks of memory) in order to avoid creating
118  *   enormous frame tables and direct maps which mostly map
119  *   nothing. Thus a contiguous mapping may still have distinct
120  *   regions within it.
121  */
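/*
 * Illustrative sketch (not part of the build): how the two heaps are
 * typically consumed by callers. Exact signatures can differ between trees,
 * so treat this as a hedged example rather than a reference:
 *
 *     // Xen heap: returns a directly usable virtual address.
 *     void *x = alloc_xenheap_pages(0, 0);
 *     ...
 *     free_xenheap_pages(x, 0);
 *
 *     // Dom heap: returns a struct page_info; map it (transiently) to use it.
 *     struct page_info *pg = alloc_domheap_pages(d, 0, 0);
 *     void *p = __map_domain_page(pg);
 *     ...
 *     unmap_domain_page(p);
 *     free_domheap_pages(pg, 0);
 */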
122 
123 #include <xen/init.h>
124 #include <xen/types.h>
125 #include <xen/lib.h>
126 #include <xen/sched.h>
127 #include <xen/spinlock.h>
128 #include <xen/mm.h>
129 #include <xen/irq.h>
130 #include <xen/softirq.h>
131 #include <xen/domain_page.h>
132 #include <xen/keyhandler.h>
133 #include <xen/perfc.h>
134 #include <xen/pfn.h>
135 #include <xen/numa.h>
136 #include <xen/nodemask.h>
137 #include <xen/event.h>
138 #include <xen/tmem.h>
139 #include <xen/tmem_xen.h>
140 #include <public/sysctl.h>
141 #include <public/sched.h>
142 #include <asm/page.h>
143 #include <asm/numa.h>
144 #include <asm/flushtlb.h>
145 #ifdef CONFIG_X86
146 #include <asm/guest.h>
147 #include <asm/p2m.h>
148 #include <asm/setup.h> /* for highmem_start only */
149 #else
150 #define p2m_pod_offline_or_broken_hit(pg) 0
151 #define p2m_pod_offline_or_broken_replace(pg) BUG_ON(pg != NULL)
152 #endif
153 
154 /*
155  * Comma-separated list of hexadecimal page numbers containing bad bytes.
156  * e.g. 'badpage=0x3f45,0x8a321'.
157  */
158 static char __initdata opt_badpage[100] = "";
159 string_param("badpage", opt_badpage);
160 
161 /*
162  * no-bootscrub -> Free pages are not zeroed during boot.
163  */
164 static bool_t opt_bootscrub __initdata = 1;
165 boolean_param("bootscrub", opt_bootscrub);
166 
167 /*
168  * bootscrub_chunk -> Amount of bytes to scrub lockstep on non-SMT CPUs
169  * on all NUMA nodes.
170  */
171 static unsigned long __initdata opt_bootscrub_chunk = MB(128);
172 size_param("bootscrub_chunk", opt_bootscrub_chunk);
173 
174 #ifdef CONFIG_SCRUB_DEBUG
175 static bool __read_mostly scrub_debug;
176 #else
177 #define scrub_debug    false
178 #endif
179 
180 /*
181  * Bit width of the DMA heap -- used to override the NUMA-node-first
182  * allocation strategy, which can otherwise exhaust low memory.
183  */
184 static unsigned int dma_bitsize;
185 integer_param("dma_bits", dma_bitsize);
186 
187 /* Offlined page list, protected by heap_lock. */
188 PAGE_LIST_HEAD(page_offlined_list);
189 /* Broken page list, protected by heap_lock. */
190 PAGE_LIST_HEAD(page_broken_list);
191 
192 /*************************
193  * BOOT-TIME ALLOCATOR
194  */
195 
196 /*
197  * first_valid_mfn is exported because it is used in ARM-specific NUMA
198  * helpers. See comment in asm-arm/numa.h.
199  */
200 unsigned long first_valid_mfn = ~0UL;
201 
202 static struct bootmem_region {
203     unsigned long s, e; /* MFNs @s through @e-1 inclusive are free */
204 } *__initdata bootmem_region_list;
205 static unsigned int __initdata nr_bootmem_regions;
206 
207 struct scrub_region {
208     unsigned long offset;
209     unsigned long start;
210     unsigned long per_cpu_sz;
211     unsigned long rem;
212     cpumask_t cpus;
213 };
214 static struct scrub_region __initdata region[MAX_NUMNODES];
215 static unsigned long __initdata chunk_size;
216 
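/*
 * Record the free MFN range [s, e) with the boot allocator, keeping
 * bootmem_region_list sorted by start address and non-overlapping. The very
 * first invocation also steals page @s to hold the region array itself.
 */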
217 static void __init bootmem_region_add(unsigned long s, unsigned long e)
218 {
219     unsigned int i;
220 
221     if ( (bootmem_region_list == NULL) && (s < e) )
222         bootmem_region_list = mfn_to_virt(s++);
223 
224     if ( s >= e )
225         return;
226 
227     for ( i = 0; i < nr_bootmem_regions; i++ )
228         if ( s < bootmem_region_list[i].e )
229             break;
230 
231     BUG_ON((i < nr_bootmem_regions) && (e > bootmem_region_list[i].s));
232     BUG_ON(nr_bootmem_regions == (PAGE_SIZE / sizeof(struct bootmem_region)));
233 
234     memmove(&bootmem_region_list[i+1], &bootmem_region_list[i],
235             (nr_bootmem_regions - i) * sizeof(*bootmem_region_list));
236     bootmem_region_list[i] = (struct bootmem_region) { s, e };
237     nr_bootmem_regions++;
238 }
239 
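/*
 * Remove the MFN range [s, e) from the boot allocator's free regions,
 * trimming or splitting any region it intersects.
 */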
240 static void __init bootmem_region_zap(unsigned long s, unsigned long e)
241 {
242     unsigned int i;
243 
244     for ( i = 0; i < nr_bootmem_regions; i++ )
245     {
246         struct bootmem_region *r = &bootmem_region_list[i];
247         if ( e <= r->s )
248             break;
249         if ( s >= r->e )
250             continue;
251         if ( s <= r->s )
252         {
253             r->s = min(e, r->e);
254         }
255         else if ( e >= r->e )
256         {
257             r->e = s;
258         }
259         else
260         {
261             unsigned long _e = r->e;
262             r->e = s;
263             bootmem_region_add(e, _e);
264         }
265     }
266 }
267 
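/*
 * Hand the physical range [ps, pe) to the boot allocator: page-align it,
 * update first_valid_mfn, register it as a free region, then punch out any
 * platform-specific or command-line ("badpage=") bad pages.
 */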
268 void __init init_boot_pages(paddr_t ps, paddr_t pe)
269 {
270     unsigned long bad_spfn, bad_epfn;
271     const char *p;
272 #ifdef CONFIG_X86
273     const unsigned long *badpage = NULL;
274     unsigned int i, array_size;
275 
276     BUILD_BUG_ON(8 * sizeof(frame_table->u.free.first_dirty) <
277                  MAX_ORDER + 1);
278 #endif
279     BUILD_BUG_ON(sizeof(frame_table->u) != sizeof(unsigned long));
280 
281     ps = round_pgup(ps);
282     pe = round_pgdown(pe);
283     if ( pe <= ps )
284         return;
285 
286     first_valid_mfn = min_t(unsigned long, ps >> PAGE_SHIFT, first_valid_mfn);
287 
288     bootmem_region_add(ps >> PAGE_SHIFT, pe >> PAGE_SHIFT);
289 
290 #ifdef CONFIG_X86
291     /*
292      * Here we put platform-specific memory range workarounds, i.e.
293      * memory known to be corrupt or otherwise in need of being reserved
294      * on specific platforms.
295      * We take those pages and remove them from the memory region list.
296      */
297     badpage = get_platform_badpages(&array_size);
298     if ( badpage )
299     {
300         for ( i = 0; i < array_size; i++ )
301         {
302             bootmem_region_zap(*badpage >> PAGE_SHIFT,
303                                (*badpage >> PAGE_SHIFT) + 1);
304             badpage++;
305         }
306     }
307 
308     if ( xen_guest )
309     {
310         badpage = hypervisor_reserved_pages(&array_size);
311         if ( badpage )
312         {
313             for ( i = 0; i < array_size; i++ )
314             {
315                 bootmem_region_zap(*badpage >> PAGE_SHIFT,
316                                    (*badpage >> PAGE_SHIFT) + 1);
317                 badpage++;
318             }
319         }
320     }
321 #endif
322 
323     /* Check new pages against the bad-page list. */
324     p = opt_badpage;
325     while ( *p != '\0' )
326     {
327         bad_spfn = simple_strtoul(p, &p, 0);
328         bad_epfn = bad_spfn;
329 
330         if ( *p == '-' )
331         {
332             p++;
333             bad_epfn = simple_strtoul(p, &p, 0);
334             if ( bad_epfn < bad_spfn )
335                 bad_epfn = bad_spfn;
336         }
337 
338         if ( *p == ',' )
339             p++;
340         else if ( *p != '\0' )
341             break;
342 
343         bootmem_region_zap(bad_spfn, bad_epfn+1);
344     }
345 }
346 
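/*
 * Allocate @nr_pfns contiguous page frames, aligned to @pfn_align, from the
 * boot allocator. Regions are tried from the highest address downwards, and
 * failure is fatal (BUG()): boot-time callers cannot recover from it.
 */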
347 mfn_t __init alloc_boot_pages(unsigned long nr_pfns, unsigned long pfn_align)
348 {
349     unsigned long pg, _e;
350     unsigned int i = nr_bootmem_regions;
351 
352     BUG_ON(!nr_bootmem_regions);
353 
354     while ( i-- )
355     {
356         struct bootmem_region *r = &bootmem_region_list[i];
357 
358         pg = (r->e - nr_pfns) & ~(pfn_align - 1);
359         if ( pg >= r->e || pg < r->s )
360             continue;
361 
362 #if defined(CONFIG_X86) && !defined(NDEBUG)
363         /*
364          * Filtering pfn_align == 1 since the only allocations using a bigger
365          * alignment are the ones used for setting up the frame table chunks.
366          * Those allocations get remapped anyway, i.e. it is not a problem
367          * that they don't have always-accessible 1:1 mappings.
368          */
369         if ( highmem_start && pfn_align == 1 &&
370              r->e > PFN_DOWN(highmem_start) )
371         {
372             pg = r->s;
373             if ( pg + nr_pfns > PFN_DOWN(highmem_start) )
374                 continue;
375             r->s = pg + nr_pfns;
376             return _mfn(pg);
377         }
378 #endif
379 
380         _e = r->e;
381         r->e = pg;
382         bootmem_region_add(pg + nr_pfns, _e);
383         return _mfn(pg);
384     }
385 
386     BUG();
387 }
388 
389 
390 
391 /*************************
392  * BINARY BUDDY ALLOCATOR
393  */
394 
395 #define MEMZONE_XEN 0
396 #define NR_ZONES    (PADDR_BITS - PAGE_SHIFT + 1)
397 
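/*
 * Zone 0 (MEMZONE_XEN) holds Xen heap pages; zone n (n >= 1) holds MFNs in
 * [2^(n-1), 2^n), so each zone caps the machine address width of its pages.
 * Worked example (assuming 4 KiB pages, i.e. PAGE_SHIFT == 12): a 32-bit DMA
 * limit maps to bits_to_zone(32) == 20, and zones 1..20 together hold MFNs
 * below 2^20, i.e. addresses below 4 GiB.
 */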
398 #define bits_to_zone(b) (((b) < (PAGE_SHIFT + 1)) ? 1 : ((b) - PAGE_SHIFT))
399 #define page_to_zone(pg) (is_xen_heap_page(pg) ? MEMZONE_XEN :  \
400                           (flsl(page_to_mfn(pg)) ? : 1))
401 
402 typedef struct page_list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1];
403 static heap_by_zone_and_order_t *_heap[MAX_NUMNODES];
404 #define heap(node, zone, order) ((*_heap[node])[zone][order])
405 
406 static unsigned long node_need_scrub[MAX_NUMNODES];
407 
408 static unsigned long *avail[MAX_NUMNODES];
409 static long total_avail_pages;
410 
411 /* TMEM: Reserve a fraction of memory for mid-size (0<order<9) allocations.*/
412 static long midsize_alloc_zone_pages;
413 #define MIDSIZE_ALLOC_FRAC 128
414 
415 static DEFINE_SPINLOCK(heap_lock);
416 static long outstanding_claims; /* total outstanding claims by all domains */
417 
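/*
 * Adjust d->tot_pages by @pages (which may be negative) and, if the domain
 * holds an outstanding claim, consume the claim by the same amount (not below
 * zero), shrinking the global outstanding_claims accordingly. Returns the new
 * d->tot_pages. The caller must hold d->page_alloc_lock.
 */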
418 unsigned long domain_adjust_tot_pages(struct domain *d, long pages)
419 {
420     long dom_before, dom_after, dom_claimed, sys_before, sys_after;
421 
422     ASSERT(spin_is_locked(&d->page_alloc_lock));
423     d->tot_pages += pages;
424 
425     /*
426      * We can test d->outstanding_pages race-free because it can only
427      * change if d->page_alloc_lock and heap_lock are both held; see also
428      * domain_set_outstanding_pages() below.
429      */
430     if ( !d->outstanding_pages )
431         goto out;
432 
433     spin_lock(&heap_lock);
434     /* adjust domain outstanding pages; may not go negative */
435     dom_before = d->outstanding_pages;
436     dom_after = dom_before - pages;
437     BUG_ON(dom_before < 0);
438     dom_claimed = dom_after < 0 ? 0 : dom_after;
439     d->outstanding_pages = dom_claimed;
440     /* flag accounting bug if system outstanding_claims would go negative */
441     sys_before = outstanding_claims;
442     sys_after = sys_before - (dom_before - dom_claimed);
443     BUG_ON(sys_after < 0);
444     outstanding_claims = sys_after;
445     spin_unlock(&heap_lock);
446 
447 out:
448     return d->tot_pages;
449 }
450 
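/*
 * Stake a claim allowing the domain to grow to @pages total pages, or cancel
 * an existing claim if @pages is zero. Fails with -EINVAL for a duplicate or
 * out-of-range claim, and with -ENOMEM if insufficient unclaimed memory is
 * available to back it.
 */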
451 int domain_set_outstanding_pages(struct domain *d, unsigned long pages)
452 {
453     int ret = -ENOMEM;
454     unsigned long claim, avail_pages;
455 
456     /*
457      * Take the domain's page_alloc_lock; otherwise all d->tot_pages
458      * adjustments would always have to take the global heap_lock rather
459      * than only in the much rarer case that d->outstanding_pages is non-zero.
460      */
461     spin_lock(&d->page_alloc_lock);
462     spin_lock(&heap_lock);
463 
464     /* pages==0 means "unset" the claim. */
465     if ( pages == 0 )
466     {
467         outstanding_claims -= d->outstanding_pages;
468         d->outstanding_pages = 0;
469         ret = 0;
470         goto out;
471     }
472 
473     /* only one active claim per domain please */
474     if ( d->outstanding_pages )
475     {
476         ret = -EINVAL;
477         goto out;
478     }
479 
480     /* disallow a claim not exceeding current tot_pages or above max_pages */
481     if ( (pages <= d->tot_pages) || (pages > d->max_pages) )
482     {
483         ret = -EINVAL;
484         goto out;
485     }
486 
487     /* how much memory is available? */
488     avail_pages = total_avail_pages;
489 
490     /* Note: The usage of claim means that allocation from a guest *might*
491      * have to come from freeable memory. Using free memory is always better, if
492      * it is available, than using freeable memory.
493      *
494      * But that is OK as once the claim has been made, it still can take minutes
495      * before the claim is fully satisfied. Tmem can make use of the unclaimed
496      * pages during this time (to store ephemeral/freeable pages only,
497      * not persistent pages).
498      */
499     avail_pages += tmem_freeable_pages();
500     avail_pages -= outstanding_claims;
501 
502     /*
503      * Note, if domain has already allocated memory before making a claim
504      * then the claim must take tot_pages into account
505      */
506     claim = pages - d->tot_pages;
507     if ( claim > avail_pages )
508         goto out;
509 
510     /* yay, claim fits in available memory, stake the claim, success! */
511     d->outstanding_pages = claim;
512     outstanding_claims += d->outstanding_pages;
513     ret = 0;
514 
515 out:
516     spin_unlock(&heap_lock);
517     spin_unlock(&d->page_alloc_lock);
518     return ret;
519 }
520 
521 void get_outstanding_claims(uint64_t *free_pages, uint64_t *outstanding_pages)
522 {
523     spin_lock(&heap_lock);
524     *outstanding_pages = outstanding_claims;
525     *free_pages =  avail_domheap_pages();
526     spin_unlock(&heap_lock);
527 }
528 
529 static bool_t __read_mostly first_node_initialised;
530 #ifndef CONFIG_SEPARATE_XENHEAP
531 static unsigned int __read_mostly xenheap_bits;
532 #else
533 #define xenheap_bits 0
534 #endif
535 
536 static unsigned long init_node_heap(int node, unsigned long mfn,
537                                     unsigned long nr, bool_t *use_tail)
538 {
539     /* First node to be discovered has its heap metadata statically allocated. */
540     static heap_by_zone_and_order_t _heap_static;
541     static unsigned long avail_static[NR_ZONES];
542     unsigned long needed = (sizeof(**_heap) +
543                             sizeof(**avail) * NR_ZONES +
544                             PAGE_SIZE - 1) >> PAGE_SHIFT;
545     int i, j;
546 
547     if ( !first_node_initialised )
548     {
549         _heap[node] = &_heap_static;
550         avail[node] = avail_static;
551         first_node_initialised = 1;
552         needed = 0;
553     }
554     else if ( *use_tail && nr >= needed &&
555               arch_mfn_in_directmap(mfn + nr) &&
556               (!xenheap_bits ||
557                !((mfn + nr - 1) >> (xenheap_bits - PAGE_SHIFT))) )
558     {
559         _heap[node] = mfn_to_virt(mfn + nr - needed);
560         avail[node] = mfn_to_virt(mfn + nr - 1) +
561                       PAGE_SIZE - sizeof(**avail) * NR_ZONES;
562     }
563     else if ( nr >= needed &&
564               arch_mfn_in_directmap(mfn + needed) &&
565               (!xenheap_bits ||
566                !((mfn + needed - 1) >> (xenheap_bits - PAGE_SHIFT))) )
567     {
568         _heap[node] = mfn_to_virt(mfn);
569         avail[node] = mfn_to_virt(mfn + needed - 1) +
570                       PAGE_SIZE - sizeof(**avail) * NR_ZONES;
571         *use_tail = 0;
572     }
573     else if ( get_order_from_bytes(sizeof(**_heap)) ==
574               get_order_from_pages(needed) )
575     {
576         _heap[node] = alloc_xenheap_pages(get_order_from_pages(needed), 0);
577         BUG_ON(!_heap[node]);
578         avail[node] = (void *)_heap[node] + (needed << PAGE_SHIFT) -
579                       sizeof(**avail) * NR_ZONES;
580         needed = 0;
581     }
582     else
583     {
584         _heap[node] = xmalloc(heap_by_zone_and_order_t);
585         avail[node] = xmalloc_array(unsigned long, NR_ZONES);
586         BUG_ON(!_heap[node] || !avail[node]);
587         needed = 0;
588     }
589 
590     memset(avail[node], 0, NR_ZONES * sizeof(long));
591 
592     for ( i = 0; i < NR_ZONES; i++ )
593         for ( j = 0; j <= MAX_ORDER; j++ )
594             INIT_PAGE_LIST_HEAD(&heap(node, i, j));
595 
596     return needed;
597 }
598 
599 /* Default to 64 MiB */
600 #define DEFAULT_LOW_MEM_VIRQ    (((paddr_t) 64)   << 20)
601 #define MAX_LOW_MEM_VIRQ        (((paddr_t) 1024) << 20)
602 
603 static paddr_t __read_mostly opt_low_mem_virq = ((paddr_t) -1);
604 size_param("low_mem_virq_limit", opt_low_mem_virq);
605 
606 /* Thresholds to control hysteresis. In pages */
607 /* When memory grows above this threshold, reset hysteresis.
608  * -1 initially to not reset until at least one virq issued. */
609 static unsigned long low_mem_virq_high      = -1UL;
610 /* Threshold at which we issue virq */
611 static unsigned long low_mem_virq_th        = 0;
612 /* Original threshold after all checks completed */
613 static unsigned long low_mem_virq_orig      = 0;
614 /* Order for current threshold */
615 static unsigned int  low_mem_virq_th_order  = 0;
616 
617 /* Perform bootstrapping checks and set bounds */
618 static void __init setup_low_mem_virq(void)
619 {
620     unsigned int order;
621     paddr_t threshold;
622     bool_t halve;
623 
624     /* If the user specifies zero, then he/she doesn't want this virq
625      * to ever trigger. */
626     if ( opt_low_mem_virq == 0 )
627     {
628         low_mem_virq_th = -1UL;
629         return;
630     }
631 
632     /* If the user did not specify a knob, remember that */
633     halve = (opt_low_mem_virq == ((paddr_t) -1));
634     threshold = halve ? DEFAULT_LOW_MEM_VIRQ : opt_low_mem_virq;
635 
636     /* Dom0 has already been allocated by now. So check we won't be
637      * complaining immediately with whatever's left of the heap. */
638     threshold = min(threshold,
639                     ((paddr_t) total_avail_pages) << PAGE_SHIFT);
640 
641     /* Then, cap to some predefined maximum */
642     threshold = min(threshold, MAX_LOW_MEM_VIRQ);
643 
644     /* If the user specified no knob, and we are at the current available
645      * level, halve the threshold. */
646     if ( halve &&
647          (threshold == (((paddr_t) total_avail_pages) << PAGE_SHIFT)) )
648         threshold >>= 1;
649 
650     /* Zero? Have to fire immediately */
651     threshold = max(threshold, (paddr_t) PAGE_SIZE);
652 
653     /* Threshold bytes -> pages */
654     low_mem_virq_th = threshold >> PAGE_SHIFT;
655 
656     /* Next, round the threshold down to the next order */
657     order = get_order_from_pages(low_mem_virq_th);
658     if ( (1UL << order) > low_mem_virq_th )
659         order--;
660 
661     /* Set bounds, ready to go */
662     low_mem_virq_th = low_mem_virq_orig = 1UL << order;
663     low_mem_virq_th_order = order;
664 
665     printk("Initial low memory virq threshold set at %#lx pages.\n",
666             low_mem_virq_th);
667 }
668 
669 static void check_low_mem_virq(void)
670 {
671     unsigned long avail_pages = total_avail_pages +
672         tmem_freeable_pages() - outstanding_claims;
673 
674     if ( unlikely(avail_pages <= low_mem_virq_th) )
675     {
676         send_global_virq(VIRQ_ENOMEM);
677 
678         /* Update thresholds. Next warning will be when we drop below
679          * next order. However, we wait until we grow beyond one
680          * order above us to complain again at the current order */
681         low_mem_virq_high   = 1UL << (low_mem_virq_th_order + 1);
682         if ( low_mem_virq_th_order > 0 )
683             low_mem_virq_th_order--;
684         low_mem_virq_th     = 1UL << low_mem_virq_th_order;
685         return;
686     }
687 
688     if ( unlikely(avail_pages >= low_mem_virq_high) )
689     {
690         /* Reset hysteresis. Bring threshold up one order.
691          * If we are back where originally set, set high
692          * threshold to -1 to avoid further growth of
693          * virq threshold. */
694         low_mem_virq_th_order++;
695         low_mem_virq_th = 1UL << low_mem_virq_th_order;
696         if ( low_mem_virq_th == low_mem_virq_orig )
697             low_mem_virq_high = -1UL;
698         else
699             low_mem_virq_high = 1UL << (low_mem_virq_th_order + 2);
700     }
701 }
702 
703 /* Pages that need a scrub are added to tail, otherwise to head. */
704 static void page_list_add_scrub(struct page_info *pg, unsigned int node,
705                                 unsigned int zone, unsigned int order,
706                                 unsigned int first_dirty)
707 {
708     PFN_ORDER(pg) = order;
709     pg->u.free.first_dirty = first_dirty;
710     pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING;
711 
712     if ( first_dirty != INVALID_DIRTY_IDX )
713     {
714         ASSERT(first_dirty < (1U << order));
715         page_list_add_tail(pg, &heap(node, zone, order));
716     }
717     else
718         page_list_add(pg, &heap(node, zone, order));
719 }
720 
721 /* SCRUB_PATTERN needs to be a repeating series of bytes. */
722 #ifndef NDEBUG
723 #define SCRUB_PATTERN        0xc2c2c2c2c2c2c2c2ULL
724 #else
725 #define SCRUB_PATTERN        0ULL
726 #endif
727 #define SCRUB_BYTE_PATTERN   (SCRUB_PATTERN & 0xff)
728 
729 static void poison_one_page(struct page_info *pg)
730 {
731 #ifdef CONFIG_SCRUB_DEBUG
732     mfn_t mfn = _mfn(page_to_mfn(pg));
733     uint64_t *ptr;
734 
735     if ( !scrub_debug )
736         return;
737 
738     ptr = map_domain_page(mfn);
739     *ptr = ~SCRUB_PATTERN;
740     unmap_domain_page(ptr);
741 #endif
742 }
743 
744 static void check_one_page(struct page_info *pg)
745 {
746 #ifdef CONFIG_SCRUB_DEBUG
747     mfn_t mfn = _mfn(page_to_mfn(pg));
748     const uint64_t *ptr;
749     unsigned int i;
750 
751     if ( !scrub_debug )
752         return;
753 
754     ptr = map_domain_page(mfn);
755     for ( i = 0; i < PAGE_SIZE / sizeof (*ptr); i++ )
756         BUG_ON(ptr[i] != SCRUB_PATTERN);
757     unmap_domain_page(ptr);
758 #endif
759 }
760 
761 static void check_and_stop_scrub(struct page_info *head)
762 {
763     if ( head->u.free.scrub_state == BUDDY_SCRUBBING )
764     {
765         typeof(head->u.free) pgfree;
766 
767         head->u.free.scrub_state = BUDDY_SCRUB_ABORT;
768         spin_lock_kick();
769         for ( ; ; )
770         {
771             /* Can't ACCESS_ONCE() a bitfield. */
772             pgfree.val = ACCESS_ONCE(head->u.free.val);
773             if ( pgfree.scrub_state != BUDDY_SCRUB_ABORT )
774                 break;
775             cpu_relax();
776         }
777     }
778 }
779 
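/*
 * Find and remove from the free lists a buddy of at least 2^@order pages in
 * zones [zone_lo, zone_hi], preferring the node requested via @memflags (else
 * one derived from @d or the current CPU), then other online nodes unless
 * MEMF_exact_node is set. Unscrubbed buddies are only accepted for order-0 or
 * MEMF_no_scrub requests. Returns NULL if nothing suitable is found.
 * Caller holds heap_lock.
 */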
780 static struct page_info *get_free_buddy(unsigned int zone_lo,
781                                         unsigned int zone_hi,
782                                         unsigned int order, unsigned int memflags,
783                                         const struct domain *d)
784 {
785     nodeid_t first_node, node = MEMF_get_node(memflags), req_node = node;
786     nodemask_t nodemask = d ? d->node_affinity : node_online_map;
787     unsigned int j, zone, nodemask_retry = 0;
788     struct page_info *pg;
789     bool use_unscrubbed = (memflags & MEMF_no_scrub);
790 
791     if ( node == NUMA_NO_NODE )
792     {
793         if ( d != NULL )
794         {
795             node = next_node(d->last_alloc_node, nodemask);
796             if ( node >= MAX_NUMNODES )
797                 node = first_node(nodemask);
798         }
799         if ( node >= MAX_NUMNODES )
800             node = cpu_to_node(smp_processor_id());
801     }
802     else if ( unlikely(node >= MAX_NUMNODES) )
803     {
804         ASSERT_UNREACHABLE();
805         return NULL;
806     }
807     first_node = node;
808 
809     /*
810      * Start with the requested node, but exhaust all of its memory in the
811      * requested zone before failing; only compute a new node if we fail to
812      * find memory there. This avoids needless computation on the fast path.
813      */
814     for ( ; ; )
815     {
816         zone = zone_hi;
817         do {
818             /* Check if target node can support the allocation. */
819             if ( !avail[node] || (avail[node][zone] < (1UL << order)) )
820                 continue;
821 
822             /* Find smallest order which can satisfy the request. */
823             for ( j = order; j <= MAX_ORDER; j++ )
824             {
825                 if ( (pg = page_list_remove_head(&heap(node, zone, j))) )
826                 {
827                     if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX )
828                         return pg;
829                     /*
830                      * We grab single pages (order=0) even if they are
831                      * unscrubbed. Given that scrubbing one page is fairly quick
832                      * it is not worth breaking higher orders.
833                      */
834                     if ( (order == 0) || use_unscrubbed )
835                     {
836                         check_and_stop_scrub(pg);
837                         return pg;
838                     }
839 
840                     page_list_add_tail(pg, &heap(node, zone, j));
841                 }
842             }
843         } while ( zone-- > zone_lo ); /* careful: unsigned zone may wrap */
844 
845         if ( (memflags & MEMF_exact_node) && req_node != NUMA_NO_NODE )
846             return NULL;
847 
848         /* Pick next node. */
849         if ( !node_isset(node, nodemask) )
850         {
851             /* Very first node may be caller-specified and outside nodemask. */
852             ASSERT(!nodemask_retry);
853             first_node = node = first_node(nodemask);
854             if ( node < MAX_NUMNODES )
855                 continue;
856         }
857         else if ( (node = next_node(node, nodemask)) >= MAX_NUMNODES )
858             node = first_node(nodemask);
859         if ( node == first_node )
860         {
861             /* When we have tried all in nodemask, we fall back to others. */
862             if ( (memflags & MEMF_exact_node) || nodemask_retry++ )
863                 return NULL;
864             nodes_andnot(nodemask, node_online_map, nodemask);
865             first_node = node = first_node(nodemask);
866             if ( node >= MAX_NUMNODES )
867                 return NULL;
868         }
869     }
870 }
871 
872 /* Allocate 2^@order contiguous pages. */
873 static struct page_info *alloc_heap_pages(
874     unsigned int zone_lo, unsigned int zone_hi,
875     unsigned int order, unsigned int memflags,
876     struct domain *d)
877 {
878     nodeid_t node;
879     unsigned int i, buddy_order, zone, first_dirty;
880     unsigned long request = 1UL << order;
881     struct page_info *pg;
882     bool need_tlbflush = false;
883     uint32_t tlbflush_timestamp = 0;
884     unsigned int dirty_cnt = 0;
885 
886     /* Make sure there are enough bits in memflags for nodeID. */
887     BUILD_BUG_ON((_MEMF_bits - _MEMF_node) < (8 * sizeof(nodeid_t)));
888 
889     ASSERT(zone_lo <= zone_hi);
890     ASSERT(zone_hi < NR_ZONES);
891 
892     if ( unlikely(order > MAX_ORDER) )
893         return NULL;
894 
895     spin_lock(&heap_lock);
896 
897     /*
898      * Claimed memory is considered unavailable unless the request
899      * is made by a domain with sufficient unclaimed pages.
900      */
901     if ( (outstanding_claims + request >
902           total_avail_pages + tmem_freeable_pages()) &&
903           ((memflags & MEMF_no_refcount) ||
904            !d || d->outstanding_pages < request) )
905     {
906         spin_unlock(&heap_lock);
907         return NULL;
908     }
909 
910     /*
911      * TMEM: When available memory is scarce due to tmem absorbing it, allow
912      * only mid-size allocations to avoid worst of fragmentation issues.
913      * Others try tmem pools then fail.  This is a workaround until all
914      * post-dom0-creation-multi-page allocations can be eliminated.
915      */
916     if ( ((order == 0) || (order >= 9)) &&
917          (total_avail_pages <= midsize_alloc_zone_pages) &&
918          tmem_freeable_pages() )
919     {
920         /* Try to free memory from tmem. */
921         pg = tmem_relinquish_pages(order, memflags);
922         spin_unlock(&heap_lock);
923         return pg;
924     }
925 
926     pg = get_free_buddy(zone_lo, zone_hi, order, memflags, d);
927     /* Try getting a dirty buddy if we couldn't get a clean one. */
928     if ( !pg && !(memflags & MEMF_no_scrub) )
929         pg = get_free_buddy(zone_lo, zone_hi, order,
930                             memflags | MEMF_no_scrub, d);
931     if ( !pg )
932     {
933         /* No suitable memory blocks. Fail the request. */
934         spin_unlock(&heap_lock);
935         return NULL;
936     }
937 
938     node = phys_to_nid(page_to_maddr(pg));
939     zone = page_to_zone(pg);
940     buddy_order = PFN_ORDER(pg);
941 
942     first_dirty = pg->u.free.first_dirty;
943 
944     /* We may have to halve the chunk a number of times. */
945     while ( buddy_order != order )
946     {
947         buddy_order--;
948         page_list_add_scrub(pg, node, zone, buddy_order,
949                             (1U << buddy_order) > first_dirty ?
950                             first_dirty : INVALID_DIRTY_IDX);
951         pg += 1U << buddy_order;
952 
953         if ( first_dirty != INVALID_DIRTY_IDX )
954         {
955             /* Adjust first_dirty */
956             if ( first_dirty >= 1U << buddy_order )
957                 first_dirty -= 1U << buddy_order;
958             else
959                 first_dirty = 0; /* We've moved past original first_dirty */
960         }
961     }
962 
963     ASSERT(avail[node][zone] >= request);
964     avail[node][zone] -= request;
965     total_avail_pages -= request;
966     ASSERT(total_avail_pages >= 0);
967 
968     check_low_mem_virq();
969 
970     if ( d != NULL )
971         d->last_alloc_node = node;
972 
973     for ( i = 0; i < (1 << order); i++ )
974     {
975         /* Reference count must continuously be zero for free pages. */
976         BUG_ON((pg[i].count_info & ~PGC_need_scrub) != PGC_state_free);
977 
978         /* PGC_need_scrub can only be set if first_dirty is valid */
979         ASSERT(first_dirty != INVALID_DIRTY_IDX || !(pg[i].count_info & PGC_need_scrub));
980 
981         /* Preserve PGC_need_scrub so we can check it after lock is dropped. */
982         pg[i].count_info = PGC_state_inuse | (pg[i].count_info & PGC_need_scrub);
983 
984         if ( !(memflags & MEMF_no_tlbflush) )
985             accumulate_tlbflush(&need_tlbflush, &pg[i],
986                                 &tlbflush_timestamp);
987 
988         /* Initialise fields which have other uses for free pages. */
989         pg[i].u.inuse.type_info = 0;
990         page_set_owner(&pg[i], NULL);
991 
992         /* Ensure cache and RAM are consistent for platforms where the
993          * guest can control its own visibility of/through the cache.
994          */
995         flush_page_to_ram(page_to_mfn(&pg[i]), !(memflags & MEMF_no_icache_flush));
996     }
997 
998     spin_unlock(&heap_lock);
999 
1000     if ( first_dirty != INVALID_DIRTY_IDX ||
1001          (scrub_debug && !(memflags & MEMF_no_scrub)) )
1002     {
1003         for ( i = 0; i < (1U << order); i++ )
1004         {
1005             if ( test_bit(_PGC_need_scrub, &pg[i].count_info) )
1006             {
1007                 if ( !(memflags & MEMF_no_scrub) )
1008                     scrub_one_page(&pg[i]);
1009 
1010                 dirty_cnt++;
1011 
1012                 spin_lock(&heap_lock);
1013                 pg[i].count_info &= ~PGC_need_scrub;
1014                 spin_unlock(&heap_lock);
1015             }
1016             else if ( !(memflags & MEMF_no_scrub) )
1017                 check_one_page(&pg[i]);
1018         }
1019 
1020         if ( dirty_cnt )
1021         {
1022             spin_lock(&heap_lock);
1023             node_need_scrub[node] -= dirty_cnt;
1024             spin_unlock(&heap_lock);
1025         }
1026     }
1027 
1028     if ( need_tlbflush )
1029         filtered_flush_tlb_mask(tlbflush_timestamp);
1030 
1031     return pg;
1032 }
1033 
1034 /* Remove any offlined page in the buddy pointed to by head. */
1035 static int reserve_offlined_page(struct page_info *head)
1036 {
1037     unsigned int node = phys_to_nid(page_to_maddr(head));
1038     int zone = page_to_zone(head), i, head_order = PFN_ORDER(head), count = 0;
1039     struct page_info *cur_head;
1040     unsigned int cur_order, first_dirty;
1041 
1042     ASSERT(spin_is_locked(&heap_lock));
1043 
1044     cur_head = head;
1045 
1046     check_and_stop_scrub(head);
1047     /*
1048      * We may break the buddy so let's mark the head as clean. Then, when
1049      * merging chunks back into the heap, we will see whether the chunk has
1050      * unscrubbed pages and set its first_dirty properly.
1051      */
1052     first_dirty = head->u.free.first_dirty;
1053     head->u.free.first_dirty = INVALID_DIRTY_IDX;
1054 
1055     page_list_del(head, &heap(node, zone, head_order));
1056 
1057     while ( cur_head < (head + (1 << head_order)) )
1058     {
1059         struct page_info *pg;
1060         int next_order;
1061 
1062         if ( page_state_is(cur_head, offlined) )
1063         {
1064             cur_head++;
1065             if ( first_dirty != INVALID_DIRTY_IDX && first_dirty )
1066                 first_dirty--;
1067             continue;
1068         }
1069 
1070         next_order = cur_order = 0;
1071 
1072         while ( cur_order < head_order )
1073         {
1074             next_order = cur_order + 1;
1075 
1076             if ( (cur_head + (1 << next_order)) >= (head + ( 1 << head_order)) )
1077                 goto merge;
1078 
1079             for ( i = (1 << cur_order), pg = cur_head + (1 << cur_order );
1080                   i < (1 << next_order);
1081                   i++, pg++ )
1082                 if ( page_state_is(pg, offlined) )
1083                     break;
1084             if ( i == ( 1 << next_order) )
1085             {
1086                 cur_order = next_order;
1087                 continue;
1088             }
1089             else
1090             {
1091             merge:
1092                 /* We don't consider merging outside the head_order. */
1093                 page_list_add_scrub(cur_head, node, zone, cur_order,
1094                                     (1U << cur_order) > first_dirty ?
1095                                     first_dirty : INVALID_DIRTY_IDX);
1096                 cur_head += (1 << cur_order);
1097 
1098                 /* Adjust first_dirty if needed. */
1099                 if ( first_dirty != INVALID_DIRTY_IDX )
1100                 {
1101                     if ( first_dirty >=  1U << cur_order )
1102                         first_dirty -= 1U << cur_order;
1103                     else
1104                         first_dirty = 0;
1105                 }
1106 
1107                 break;
1108             }
1109         }
1110     }
1111 
1112     for ( cur_head = head; cur_head < head + ( 1UL << head_order); cur_head++ )
1113     {
1114         if ( !page_state_is(cur_head, offlined) )
1115             continue;
1116 
1117         avail[node][zone]--;
1118         total_avail_pages--;
1119         ASSERT(total_avail_pages >= 0);
1120 
1121         page_list_add_tail(cur_head,
1122                            test_bit(_PGC_broken, &cur_head->count_info) ?
1123                            &page_broken_list : &page_offlined_list);
1124 
1125         count++;
1126     }
1127 
1128     return count;
1129 }
1130 
1131 static nodemask_t node_scrubbing;
1132 
1133 /*
1134  * If get_node is true this will return the closest node that needs to be
1135  * scrubbed, with the appropriate bit in node_scrubbing set.
1136  * If get_node is not set, this will return *a* node that needs to be scrubbed;
1137  * the node_scrubbing bitmask will not be updated.
1138  * If no node needs scrubbing then NUMA_NO_NODE is returned.
1139  */
1140 static unsigned int node_to_scrub(bool get_node)
1141 {
1142     nodeid_t node = cpu_to_node(smp_processor_id()), local_node;
1143     nodeid_t closest = NUMA_NO_NODE;
1144     u8 dist, shortest = 0xff;
1145 
1146     if ( node == NUMA_NO_NODE )
1147         node = 0;
1148 
1149     if ( node_need_scrub[node] &&
1150          (!get_node || !node_test_and_set(node, node_scrubbing)) )
1151         return node;
1152 
1153     /*
1154      * See if there are memory-only nodes that need scrubbing and choose
1155      * the closest one.
1156      */
1157     local_node = node;
1158     for ( ; ; )
1159     {
1160         do {
1161             node = cycle_node(node, node_online_map);
1162         } while ( !cpumask_empty(&node_to_cpumask(node)) &&
1163                   (node != local_node) );
1164 
1165         if ( node == local_node )
1166             break;
1167 
1168         if ( node_need_scrub[node] )
1169         {
1170             if ( !get_node )
1171                 return node;
1172 
1173             dist = __node_distance(local_node, node);
1174 
1175             /*
1176              * Grab the node right away. If we find a closer node later we will
1177              * release this one. While there is a chance that another CPU
1178              * searching for scrub work at the same time will not be able to
1179              * scrub that node, it will be able to do so next time it wakes up.
1180              * The alternative would be to perform this search under a lock but
1181              * then we'd need to take this lock every time we come in here.
1182              */
1183             if ( (dist < shortest || closest == NUMA_NO_NODE) &&
1184                  !node_test_and_set(node, node_scrubbing) )
1185             {
1186                 if ( closest != NUMA_NO_NODE )
1187                     node_clear(closest, node_scrubbing);
1188                 shortest = dist;
1189                 closest = node;
1190             }
1191         }
1192     }
1193 
1194     return closest;
1195 }
1196 
1197 struct scrub_wait_state {
1198     struct page_info *pg;
1199     unsigned int first_dirty;
1200     bool drop;
1201 };
1202 
1203 static void scrub_continue(void *data)
1204 {
1205     struct scrub_wait_state *st = data;
1206 
1207     if ( st->drop )
1208         return;
1209 
1210     if ( st->pg->u.free.scrub_state == BUDDY_SCRUB_ABORT )
1211     {
1212         /* There is a waiter for this buddy. Release it. */
1213         st->drop = true;
1214         st->pg->u.free.first_dirty = st->first_dirty;
1215         smp_wmb();
1216         st->pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING;
1217     }
1218 }
1219 
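/*
 * Background scrubbing entry point (called e.g. from the idle loop): claim a
 * node via node_to_scrub(), scrub the dirty pages at the tail of its free
 * lists, and back off when a softirq is pending or an allocator wants the
 * buddy currently being scrubbed (BUDDY_SCRUB_ABORT). Returns true if any
 * node still needs scrubbing.
 */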
1220 bool scrub_free_pages(void)
1221 {
1222     struct page_info *pg;
1223     unsigned int zone;
1224     unsigned int cpu = smp_processor_id();
1225     bool preempt = false;
1226     nodeid_t node;
1227     unsigned int cnt = 0;
1228 
1229     node = node_to_scrub(true);
1230     if ( node == NUMA_NO_NODE )
1231         return false;
1232 
1233     spin_lock(&heap_lock);
1234 
1235     for ( zone = 0; zone < NR_ZONES; zone++ )
1236     {
1237         unsigned int order = MAX_ORDER;
1238 
1239         do {
1240             while ( !page_list_empty(&heap(node, zone, order)) )
1241             {
1242                 unsigned int i, dirty_cnt;
1243                 struct scrub_wait_state st;
1244 
1245                 /* Unscrubbed pages are always at the end of the list. */
1246                 pg = page_list_last(&heap(node, zone, order));
1247                 if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX )
1248                     break;
1249 
1250                 ASSERT(pg->u.free.scrub_state == BUDDY_NOT_SCRUBBING);
1251                 pg->u.free.scrub_state = BUDDY_SCRUBBING;
1252 
1253                 spin_unlock(&heap_lock);
1254 
1255                 dirty_cnt = 0;
1256 
1257                 for ( i = pg->u.free.first_dirty; i < (1U << order); i++)
1258                 {
1259                     if ( test_bit(_PGC_need_scrub, &pg[i].count_info) )
1260                     {
1261                         scrub_one_page(&pg[i]);
1262                         /*
1263                          * We can modify count_info without holding heap
1264                          * lock since we effectively locked this buddy by
1265                          * setting its scrub_state.
1266                          */
1267                         pg[i].count_info &= ~PGC_need_scrub;
1268                         dirty_cnt++;
1269                         cnt += 100; /* scrubbed pages add heavier weight. */
1270                     }
1271                     else
1272                         cnt++;
1273 
1274                     if ( pg->u.free.scrub_state == BUDDY_SCRUB_ABORT )
1275                     {
1276                         /* Someone wants this chunk. Drop everything. */
1277 
1278                         pg->u.free.first_dirty = (i == (1U << order) - 1) ?
1279                             INVALID_DIRTY_IDX : i + 1;
1280                         smp_wmb();
1281                         pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING;
1282 
1283                         spin_lock(&heap_lock);
1284                         node_need_scrub[node] -= dirty_cnt;
1285                         spin_unlock(&heap_lock);
1286                         goto out_nolock;
1287                     }
1288 
1289                     /*
1290                      * Scrub a few (8) pages before becoming eligible for
1291                      * preemption. But also count non-scrubbing loop iterations
1292                      * so that we don't get stuck here with an almost clean
1293                      * heap.
1294                      */
1295                     if ( cnt > 800 && softirq_pending(cpu) )
1296                     {
1297                         preempt = true;
1298                         break;
1299                     }
1300                 }
1301 
1302                 st.pg = pg;
1303                 /*
1304                  * get_free_buddy() grabs a buddy with first_dirty set to
1305                  * INVALID_DIRTY_IDX so we can't set pg's first_dirty here.
1306                  * It will be set either below or in the lock callback (in
1307                  * scrub_continue()).
1308                  */
1309                 st.first_dirty = (i >= (1U << order) - 1) ?
1310                     INVALID_DIRTY_IDX : i + 1;
1311                 st.drop = false;
1312                 spin_lock_cb(&heap_lock, scrub_continue, &st);
1313 
1314                 node_need_scrub[node] -= dirty_cnt;
1315 
1316                 if ( st.drop )
1317                     goto out;
1318 
1319                 if ( i >= (1U << order) - 1 )
1320                 {
1321                     page_list_del(pg, &heap(node, zone, order));
1322                     page_list_add_scrub(pg, node, zone, order, INVALID_DIRTY_IDX);
1323                 }
1324                 else
1325                     pg->u.free.first_dirty = i + 1;
1326 
1327                 pg->u.free.scrub_state = BUDDY_NOT_SCRUBBING;
1328 
1329                 if ( preempt || (node_need_scrub[node] == 0) )
1330                     goto out;
1331             }
1332         } while ( order-- != 0 );
1333     }
1334 
1335  out:
1336     spin_unlock(&heap_lock);
1337 
1338  out_nolock:
1339     node_clear(node, node_scrubbing);
1340     return node_to_scrub(false) != NUMA_NO_NODE;
1341 }
1342 
1343 /* Free 2^@order set of pages. */
1344 static void free_heap_pages(
1345     struct page_info *pg, unsigned int order, bool need_scrub)
1346 {
1347     unsigned long mask, mfn = page_to_mfn(pg);
1348     unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
1349     unsigned int zone = page_to_zone(pg);
1350 
1351     ASSERT(order <= MAX_ORDER);
1352     ASSERT(node >= 0);
1353 
1354     spin_lock(&heap_lock);
1355 
1356     for ( i = 0; i < (1 << order); i++ )
1357     {
1358         /*
1359          * Cannot assume that count_info == 0, as there are some corner cases
1360          * where it isn't the case and yet it isn't a bug:
1361          *  1. page_get_owner() is NULL
1362          *  2. page_get_owner() is a domain that was never accessible by
1363          *     its domid (e.g., failed to fully construct the domain).
1364          *  3. page was never addressable by the guest (e.g., it's an
1365          *     auto-translate-physmap guest and the page was never included
1366          *     in its pseudophysical address space).
1367          * In all the above cases there can be no guest mappings of this page.
1368          */
1369         ASSERT(!page_state_is(&pg[i], offlined));
1370         pg[i].count_info =
1371             ((pg[i].count_info & PGC_broken) |
1372              (page_state_is(&pg[i], offlining)
1373               ? PGC_state_offlined : PGC_state_free));
1374         if ( page_state_is(&pg[i], offlined) )
1375             tainted = 1;
1376 
1377         /* If a page has no owner it will need no safety TLB flush. */
1378         pg[i].u.free.need_tlbflush = (page_get_owner(&pg[i]) != NULL);
1379         if ( pg[i].u.free.need_tlbflush )
1380             page_set_tlbflush_timestamp(&pg[i]);
1381 
1382         /* This page is not a guest frame any more. */
1383         page_set_owner(&pg[i], NULL); /* set_gpfn_from_mfn snoops pg owner */
1384         set_gpfn_from_mfn(mfn + i, INVALID_M2P_ENTRY);
1385 
1386         if ( need_scrub )
1387         {
1388             pg[i].count_info |= PGC_need_scrub;
1389             poison_one_page(&pg[i]);
1390         }
1391     }
1392 
1393     avail[node][zone] += 1 << order;
1394     total_avail_pages += 1 << order;
1395     if ( need_scrub )
1396     {
1397         node_need_scrub[node] += 1 << order;
1398         pg->u.free.first_dirty = 0;
1399     }
1400     else
1401         pg->u.free.first_dirty = INVALID_DIRTY_IDX;
1402 
1403     if ( tmem_enabled() )
1404         midsize_alloc_zone_pages = max(
1405             midsize_alloc_zone_pages, total_avail_pages / MIDSIZE_ALLOC_FRAC);
1406 
1407     /* Merge chunks as far as possible. */
1408     while ( order < MAX_ORDER )
1409     {
1410         mask = 1UL << order;
1411 
1412         if ( (page_to_mfn(pg) & mask) )
1413         {
1414             struct page_info *predecessor = pg - mask;
1415 
1416             /* Merge with predecessor block? */
1417             if ( !mfn_valid(_mfn(page_to_mfn(predecessor))) ||
1418                  !page_state_is(predecessor, free) ||
1419                  (PFN_ORDER(predecessor) != order) ||
1420                  (phys_to_nid(page_to_maddr(predecessor)) != node) )
1421                 break;
1422 
1423             check_and_stop_scrub(predecessor);
1424 
1425             page_list_del(predecessor, &heap(node, zone, order));
1426 
1427             /* Keep predecessor's first_dirty if it is already set. */
1428             if ( predecessor->u.free.first_dirty == INVALID_DIRTY_IDX &&
1429                  pg->u.free.first_dirty != INVALID_DIRTY_IDX )
1430                 predecessor->u.free.first_dirty = (1U << order) +
1431                                                   pg->u.free.first_dirty;
1432 
1433             pg = predecessor;
1434         }
1435         else
1436         {
1437             struct page_info *successor = pg + mask;
1438 
1439             /* Merge with successor block? */
1440             if ( !mfn_valid(_mfn(page_to_mfn(successor))) ||
1441                  !page_state_is(successor, free) ||
1442                  (PFN_ORDER(successor) != order) ||
1443                  (phys_to_nid(page_to_maddr(successor)) != node) )
1444                 break;
1445 
1446             check_and_stop_scrub(successor);
1447 
1448             page_list_del(successor, &heap(node, zone, order));
1449         }
1450 
1451         order++;
1452     }
1453 
1454     page_list_add_scrub(pg, node, zone, order, pg->u.free.first_dirty);
1455 
1456     if ( tainted )
1457         reserve_offlined_page(pg);
1458 
1459     spin_unlock(&heap_lock);
1460 }
1461 
1462 
1463 /*
1464  * The following rules apply to page offlining:
1465  * - Once a page is broken, it can't be assigned any more.
1466  * - A page will be offlined only if it is free.
1467  * Returns the original count_info.
1468  */
1469 static unsigned long mark_page_offline(struct page_info *pg, int broken)
1470 {
1471     unsigned long nx, x, y = pg->count_info;
1472 
1473     ASSERT(page_is_ram_type(page_to_mfn(pg), RAM_TYPE_CONVENTIONAL));
1474     ASSERT(spin_is_locked(&heap_lock));
1475 
1476     do {
1477         nx = x = y;
1478 
1479         if ( ((x & PGC_state) != PGC_state_offlined) &&
1480              ((x & PGC_state) != PGC_state_offlining) )
1481         {
1482             nx &= ~PGC_state;
1483             nx |= (((x & PGC_state) == PGC_state_free)
1484                    ? PGC_state_offlined : PGC_state_offlining);
1485         }
1486 
1487         if ( broken )
1488             nx |= PGC_broken;
1489 
1490         if ( x == nx )
1491             break;
1492     } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
1493 
1494     return y;
1495 }
1496 
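/*
 * Locate the free buddy containing @pg and have reserve_offlined_page() pull
 * its offlined pages off the free lists. Returns -EINVAL if @pg is not part
 * of any free buddy. Caller holds heap_lock.
 */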
1497 static int reserve_heap_page(struct page_info *pg)
1498 {
1499     struct page_info *head = NULL;
1500     unsigned int i, node = phys_to_nid(page_to_maddr(pg));
1501     unsigned int zone = page_to_zone(pg);
1502 
1503     for ( i = 0; i <= MAX_ORDER; i++ )
1504     {
1505         struct page_info *tmp;
1506 
1507         if ( page_list_empty(&heap(node, zone, i)) )
1508             continue;
1509 
1510         page_list_for_each_safe ( head, tmp, &heap(node, zone, i) )
1511         {
1512             if ( (head <= pg) &&
1513                  (head + (1UL << i) > pg) )
1514                 return reserve_offlined_page(head);
1515         }
1516     }
1517 
1518     return -EINVAL;
1519 
1520 }
1521 
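/*
 * Try to take page @mfn offline, optionally marking it broken. A free page is
 * offlined immediately; an owned page is marked as offlining and reported as
 * pending so that it is offlined once freed. The outcome is encoded in
 * *status (PG_OFFLINE_* flags). Returns 0 on success or a -errno value.
 */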
1522 int offline_page(unsigned long mfn, int broken, uint32_t *status)
1523 {
1524     unsigned long old_info = 0;
1525     struct domain *owner;
1526     struct page_info *pg;
1527 
1528     if ( !mfn_valid(_mfn(mfn)) )
1529     {
1530         dprintk(XENLOG_WARNING,
1531                 "try to offline page out of range %lx\n", mfn);
1532         return -EINVAL;
1533     }
1534 
1535     *status = 0;
1536     pg = mfn_to_page(mfn);
1537 
1538     if ( is_xen_fixed_mfn(mfn) )
1539     {
1540         *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_FAILED |
1541           (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
1542         return -EPERM;
1543     }
1544 
1545     /*
1546      * N.B. Xen's txt on x86_64 is marked reserved and handled already.
1547      * The kexec range is also reserved.
1548      */
1549     if ( !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
1550     {
1551         *status = PG_OFFLINE_FAILED | PG_OFFLINE_NOT_CONV_RAM;
1552         return -EINVAL;
1553     }
1554 
1555     /*
1556      * NB. When a broken page belongs to a guest, the hypervisor usually
1557      * notifies the guest so it can handle the broken page.  However, the
1558      * hypervisor must prevent a malicious guest from accessing the broken
1559      * page again, so in that case the guest is shut down to avoid recursive MCEs.
1560      */
1561     if ( (pg->count_info & PGC_broken) && (owner = page_get_owner(pg)) )
1562     {
1563         *status = PG_OFFLINE_AGAIN;
1564         domain_shutdown(owner, SHUTDOWN_crash);
1565         return 0;
1566     }
1567 
1568     spin_lock(&heap_lock);
1569 
1570     old_info = mark_page_offline(pg, broken);
1571 
1572     if ( page_state_is(pg, offlined) )
1573     {
1574         reserve_heap_page(pg);
1575 
1576         spin_unlock(&heap_lock);
1577 
1578         *status = broken ? PG_OFFLINE_OFFLINED | PG_OFFLINE_BROKEN
1579                          : PG_OFFLINE_OFFLINED;
1580         return 0;
1581     }
1582 
1583     spin_unlock(&heap_lock);
1584 
1585     if ( (owner = page_get_owner_and_reference(pg)) )
1586     {
1587         if ( p2m_pod_offline_or_broken_hit(pg) )
1588         {
1589             put_page(pg);
1590             p2m_pod_offline_or_broken_replace(pg);
1591             *status = PG_OFFLINE_OFFLINED;
1592         }
1593         else
1594         {
1595             *status = PG_OFFLINE_OWNED | PG_OFFLINE_PENDING |
1596                       (owner->domain_id << PG_OFFLINE_OWNER_SHIFT);
1597             /* Release the reference since it will not be allocated anymore */
1598             put_page(pg);
1599         }
1600     }
1601     else if ( old_info & PGC_xen_heap )
1602     {
1603         *status = PG_OFFLINE_XENPAGE | PG_OFFLINE_PENDING |
1604                   (DOMID_XEN << PG_OFFLINE_OWNER_SHIFT);
1605     }
1606     else
1607     {
1608         /*
1609          * assign_pages() does not hold the heap_lock, so there is a small
1610          * window in which an owner may still be set.  Note that the owner
1611          * can only change from NULL to non-NULL, not the other way round,
1612          * since the page is being offlined.  There is no such window when
1613          * called from the #MC handler, since all CPUs are in softirq context.
1614          * When called from user space (e.g. CE handling), tools can simply retry later.
1615          */
1616         *status = PG_OFFLINE_ANONYMOUS | PG_OFFLINE_FAILED |
1617                   (DOMID_INVALID << PG_OFFLINE_OWNER_SHIFT );
1618     }
1619 
1620     if ( broken )
1621         *status |= PG_OFFLINE_BROKEN;
1622 
1623     return 0;
1624 }
1625 
1626 /*
1627  * Online the memory.
1628  *   The caller should make sure end_pfn <= max_page;
1629  *   if not, expand_pages() should be called prior to online_page().
1630  */
1631 unsigned int online_page(unsigned long mfn, uint32_t *status)
1632 {
1633     unsigned long x, nx, y;
1634     struct page_info *pg;
1635     int ret;
1636 
1637     if ( !mfn_valid(_mfn(mfn)) )
1638     {
1639         dprintk(XENLOG_WARNING, "call expand_pages() first\n");
1640         return -EINVAL;
1641     }
1642 
1643     pg = mfn_to_page(mfn);
1644 
1645     spin_lock(&heap_lock);
1646 
1647     y = pg->count_info;
1648     do {
1649         ret = *status = 0;
1650 
1651         if ( y & PGC_broken )
1652         {
1653             ret = -EINVAL;
1654             *status = PG_ONLINE_FAILED | PG_ONLINE_BROKEN;
1655             break;
1656         }
1657 
1658         if ( (y & PGC_state) == PGC_state_offlined )
1659         {
1660             page_list_del(pg, &page_offlined_list);
1661             *status = PG_ONLINE_ONLINED;
1662         }
1663         else if ( (y & PGC_state) == PGC_state_offlining )
1664         {
1665             *status = PG_ONLINE_ONLINED;
1666         }
1667         else
1668         {
1669             break;
1670         }
1671 
1672         x = y;
1673         nx = (x & ~PGC_state) | PGC_state_inuse;
1674     } while ( (y = cmpxchg(&pg->count_info, x, nx)) != x );
1675 
1676     spin_unlock(&heap_lock);
1677 
1678     if ( (y & PGC_state) == PGC_state_offlined )
1679         free_heap_pages(pg, 0, false);
1680 
1681     return ret;
1682 }
1683 
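/* Report the offline state of @mfn (pending, offlined and/or broken) in *status. */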
1684 int query_page_offline(unsigned long mfn, uint32_t *status)
1685 {
1686     struct page_info *pg;
1687 
1688     if ( !mfn_valid(_mfn(mfn)) || !page_is_ram_type(mfn, RAM_TYPE_CONVENTIONAL) )
1689     {
1690         dprintk(XENLOG_WARNING, "call expand_pages() first\n");
1691         return -EINVAL;
1692     }
1693 
1694     *status = 0;
1695     spin_lock(&heap_lock);
1696 
1697     pg = mfn_to_page(mfn);
1698 
1699     if ( page_state_is(pg, offlining) )
1700         *status |= PG_OFFLINE_STATUS_OFFLINE_PENDING;
1701     if ( pg->count_info & PGC_broken )
1702         *status |= PG_OFFLINE_STATUS_BROKEN;
1703     if ( page_state_is(pg, offlined) )
1704         *status |= PG_OFFLINE_STATUS_OFFLINED;
1705 
1706     spin_unlock(&heap_lock);
1707 
1708     return 0;
1709 }
1710 
1711 /*
1712  * Hand the specified arbitrary page range over to the specified heap zone,
1713  * checking the node_id of the previous page.  If they differ and the
1714  * latter is not on a MAX_ORDER boundary, then we reserve the page by
1715  * not freeing it to the buddy allocator.
1716  */
1717 static void init_heap_pages(
1718     struct page_info *pg, unsigned long nr_pages)
1719 {
1720     unsigned long i;
1721 
1722     /*
1723      * Some pages may not go through the boot allocator (e.g. memory
1724      * reserved at boot but released just afterwards, such as the kernel,
1725      * initramfs, etc.).
1726      * Update first_valid_mfn to ensure those regions are covered.
1727      */
1728     spin_lock(&heap_lock);
1729     first_valid_mfn = min_t(unsigned long, page_to_mfn(pg), first_valid_mfn);
1730     spin_unlock(&heap_lock);
1731 
1732     for ( i = 0; i < nr_pages; i++ )
1733     {
1734         unsigned int nid = phys_to_nid(page_to_maddr(pg+i));
1735 
1736         if ( unlikely(!avail[nid]) )
1737         {
1738             unsigned long s = page_to_mfn(pg + i);
1739             unsigned long e = page_to_mfn(pg + nr_pages - 1) + 1;
1740             bool_t use_tail = (nid == phys_to_nid(pfn_to_paddr(e - 1))) &&
1741                               !(s & ((1UL << MAX_ORDER) - 1)) &&
1742                               (find_first_set_bit(e) <= find_first_set_bit(s));
1743             unsigned long n;
1744 
1745             n = init_node_heap(nid, page_to_mfn(pg+i), nr_pages - i,
1746                                &use_tail);
1747             BUG_ON(i + n > nr_pages);
1748             if ( n && !use_tail )
1749             {
1750                 i += n - 1;
1751                 continue;
1752             }
1753             if ( i + n == nr_pages )
1754                 break;
1755             nr_pages -= n;
1756         }
1757 
1758         free_heap_pages(pg + i, 0, scrub_debug);
1759     }
1760 }
1761 
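/*
 * Count the free pages in zones [zone_lo, zone_hi], either summed over all
 * online nodes or, if @node != -1, for that node only.
 */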
1762 static unsigned long avail_heap_pages(
1763     unsigned int zone_lo, unsigned int zone_hi, unsigned int node)
1764 {
1765     unsigned int i, zone;
1766     unsigned long free_pages = 0;
1767 
1768     if ( zone_hi >= NR_ZONES )
1769         zone_hi = NR_ZONES - 1;
1770 
1771     for_each_online_node(i)
1772     {
1773         if ( !avail[i] )
1774             continue;
1775         for ( zone = zone_lo; zone <= zone_hi; zone++ )
1776             if ( (node == -1) || (node == i) )
1777                 free_pages += avail[i][zone];
1778     }
1779 
1780     return free_pages;
1781 }
1782 
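/*
 * Free pages available for general use, i.e. total_avail_pages minus the
 * pages accounted in midsize_alloc_zone_pages.
 */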
1783 unsigned long total_free_pages(void)
1784 {
1785     return total_avail_pages - midsize_alloc_zone_pages;
1786 }
1787 
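/*
 * Hand all remaining bootmem regions over to the heap allocator: first a
 * region local to the boot CPU's node (presumably so that node's heap
 * structures can be set up from local memory), then the rest in reverse
 * order, and finally the page holding bootmem_region_list itself.
 */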
1788 void __init end_boot_allocator(void)
1789 {
1790     unsigned int i;
1791 
1792     /* Pages that are free now go to the domain sub-allocator. */
1793     for ( i = 0; i < nr_bootmem_regions; i++ )
1794     {
1795         struct bootmem_region *r = &bootmem_region_list[i];
1796         if ( (r->s < r->e) &&
1797              (phys_to_nid(pfn_to_paddr(r->s)) == cpu_to_node(0)) )
1798         {
1799             init_heap_pages(mfn_to_page(r->s), r->e - r->s);
1800             r->e = r->s;
1801             break;
1802         }
1803     }
1804     for ( i = nr_bootmem_regions; i-- > 0; )
1805     {
1806         struct bootmem_region *r = &bootmem_region_list[i];
1807         if ( r->s < r->e )
1808             init_heap_pages(mfn_to_page(r->s), r->e - r->s);
1809     }
1810     nr_bootmem_regions = 0;
1811     init_heap_pages(virt_to_page(bootmem_region_list), 1);
1812 
1813     if ( !dma_bitsize && (num_online_nodes() > 1) )
1814         dma_bitsize = arch_get_dma_bitsize();
1815 
1816     printk("Domain heap initialised");
1817     if ( dma_bitsize )
1818         printk(" DMA width %u bits", dma_bitsize);
1819     printk("\n");
1820 }
1821 
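/*
 * Per-CPU scrub worker.  Each participating CPU works out its index within
 * the node's worker mask and scrubs a chunk_size window, at offset r->offset,
 * of its own slice of the node's memory; the last CPU also picks up any
 * remainder.  Invoked repeatedly by scrub_heap_pages() with growing offsets.
 */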
1822 static void __init smp_scrub_heap_pages(void *data)
1823 {
1824     unsigned long mfn, start, end;
1825     struct page_info *pg;
1826     struct scrub_region *r;
1827     unsigned int temp_cpu, cpu_idx = 0;
1828     nodeid_t node;
1829     unsigned int cpu = smp_processor_id();
1830 
1831     if ( data )
1832         r = data;
1833     else
1834     {
1835         node = cpu_to_node(cpu);
1836         if ( node == NUMA_NO_NODE )
1837             return;
1838         r = &region[node];
1839     }
1840 
1841     /* Determine the current CPU's index among the CPUs linked to this node. */
1842     for_each_cpu ( temp_cpu, &r->cpus )
1843     {
1844         if ( cpu == temp_cpu )
1845             break;
1846         cpu_idx++;
1847     }
1848 
1849     /* Calculate the starting mfn for this CPU's memory block. */
1850     start = r->start + (r->per_cpu_sz * cpu_idx) + r->offset;
1851 
1852     /* Calculate the end mfn into this CPU's memory block for this iteration. */
1853     if ( r->offset + chunk_size >= r->per_cpu_sz )
1854     {
1855         end = r->start + (r->per_cpu_sz * cpu_idx) + r->per_cpu_sz;
1856 
1857         if ( r->rem && (cpumask_weight(&r->cpus) - 1 == cpu_idx) )
1858             end += r->rem;
1859     }
1860     else
1861         end = start + chunk_size;
1862 
1863     for ( mfn = start; mfn < end; mfn++ )
1864     {
1865         pg = mfn_to_page(mfn);
1866 
1867         /* Check the mfn is valid and page is free. */
1868         if ( !mfn_valid(_mfn(mfn)) || !page_state_is(pg, free) )
1869             continue;
1870 
1871         scrub_one_page(pg);
1872     }
1873 }
1874 
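/*
 * Select one online CPU per sibling (SMT) group on @node into @dest and
 * return the number of CPUs selected.
 */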
1875 static int __init find_non_smt(unsigned int node, cpumask_t *dest)
1876 {
1877     cpumask_t node_cpus;
1878     unsigned int i, cpu;
1879 
1880     cpumask_and(&node_cpus, &node_to_cpumask(node), &cpu_online_map);
1881     cpumask_clear(dest);
1882     for_each_cpu ( i, &node_cpus )
1883     {
1884         if ( cpumask_intersects(dest, per_cpu(cpu_sibling_mask, i)) )
1885             continue;
1886         cpu = cpumask_first(per_cpu(cpu_sibling_mask, i));
1887         __cpumask_set_cpu(cpu, dest);
1888     }
1889     return cpumask_weight(dest);
1890 }
1891 
1892 /*
1893  * Scrub all unallocated pages in all heap zones.  This function uses all
1894  * online CPUs to scrub the memory in parallel.
1895  */
1896 static void __init scrub_heap_pages(void)
1897 {
1898     cpumask_t node_cpus, all_worker_cpus;
1899     unsigned int i, j;
1900     unsigned long offset, max_per_cpu_sz = 0;
1901     unsigned long start, end;
1902     unsigned long rem = 0;
1903     int last_distance, best_node;
1904     int cpus;
1905 
1906     cpumask_clear(&all_worker_cpus);
1907     /* Scrub block size. */
1908     chunk_size = opt_bootscrub_chunk >> PAGE_SHIFT;
1909     if ( chunk_size == 0 )
1910         chunk_size = MB(128) >> PAGE_SHIFT;
1911 
1912     /* Round #0 - figure out amounts and which CPUs to use. */
1913     for_each_online_node ( i )
1914     {
1915         if ( !node_spanned_pages(i) )
1916             continue;
1917         /* Calculate Node memory start and end address. */
1918         start = max(node_start_pfn(i), first_valid_mfn);
1919         end = min(node_start_pfn(i) + node_spanned_pages(i), max_page);
1920         /* Just in case the node has one page and starts below first_valid_mfn. */
1921         end = max(end, start);
1922         /* CPUs that are online and on this node (it is OK if there are none). */
1923         cpus = find_non_smt(i, &node_cpus);
1924         cpumask_or(&all_worker_cpus, &all_worker_cpus, &node_cpus);
1925         if ( cpus <= 0 )
1926         {
1927             /* No CPUs on this node.  Round #2 will take care of it. */
1928             rem = 0;
1929             region[i].per_cpu_sz = (end - start);
1930         }
1931         else
1932         {
1933             rem = (end - start) % cpus;
1934             region[i].per_cpu_sz = (end - start) / cpus;
1935             if ( region[i].per_cpu_sz > max_per_cpu_sz )
1936                 max_per_cpu_sz = region[i].per_cpu_sz;
1937         }
1938         region[i].start = start;
1939         region[i].rem = rem;
1940         cpumask_copy(&region[i].cpus, &node_cpus);
1941     }
1942 
1943     printk("Scrubbing Free RAM on %d nodes using %d CPUs\n", num_online_nodes(),
1944            cpumask_weight(&all_worker_cpus));
1945 
1946     /* Round #1 - do NUMA nodes with CPUs. */
1947     for ( offset = 0; offset < max_per_cpu_sz; offset += chunk_size )
1948     {
1949         for_each_online_node ( i )
1950             region[i].offset = offset;
1951 
1952         process_pending_softirqs();
1953 
1954         spin_lock(&heap_lock);
1955         on_selected_cpus(&all_worker_cpus, smp_scrub_heap_pages, NULL, 1);
1956         spin_unlock(&heap_lock);
1957 
1958         printk(".");
1959     }
1960 
1961     /*
1962      * Round #2: NUMA nodes with no CPUs get scrubbed using CPUs from the
1963      * closest node that does have CPUs.
1964      */
1965     for_each_online_node ( i )
1966     {
1967         node_cpus = node_to_cpumask(i);
1968 
1969         if ( !cpumask_empty(&node_cpus) )
1970             continue;
1971 
1972         last_distance = INT_MAX;
1973         best_node = first_node(node_online_map);
1974         /* Figure out which NODE CPUs are close. */
1975         for_each_online_node ( j )
1976         {
1977             u8 distance;
1978 
1979             if ( cpumask_empty(&node_to_cpumask(j)) )
1980                 continue;
1981 
1982             distance = __node_distance(i, j);
1983             if ( (distance < last_distance) && (distance != NUMA_NO_DISTANCE) )
1984             {
1985                 last_distance = distance;
1986                 best_node = j;
1987             }
1988         }
1989         /*
1990          * Use CPUs from the best node; if that node (which defaults to the
1991          * first online node) has no usable CPUs, fall back to the BSP.
1992          */
1993         cpus = find_non_smt(best_node, &node_cpus);
1994         if ( cpus == 0 )
1995         {
1996             __cpumask_set_cpu(smp_processor_id(), &node_cpus);
1997             cpus = 1;
1998         }
1999         /* We already have the node information from round #0. */
2000         region[i].rem = region[i].per_cpu_sz % cpus;
2001         region[i].per_cpu_sz /= cpus;
2002         max_per_cpu_sz = region[i].per_cpu_sz;
2003         cpumask_copy(&region[i].cpus, &node_cpus);
2004 
2005         for ( offset = 0; offset < max_per_cpu_sz; offset += chunk_size )
2006         {
2007             region[i].offset = offset;
2008 
2009             process_pending_softirqs();
2010 
2011             spin_lock(&heap_lock);
2012             on_selected_cpus(&node_cpus, smp_scrub_heap_pages, &region[i], 1);
2013             spin_unlock(&heap_lock);
2014 
2015             printk(".");
2016         }
2017     }
2018 
2019     printk("done.\n");
2020 
2021 #ifdef CONFIG_SCRUB_DEBUG
2022     scrub_debug = true;
2023 #endif
2024 }
2025 
2026 void __init heap_init_late(void)
2027 {
2028     /*
2029      * Now that the heap is initialized, set the bounds
2030      * for the low-memory virq algorithm.
2031      */
2032     setup_low_mem_virq();
2033 
2034     if ( opt_bootscrub )
2035         scrub_heap_pages();
2036 }
2037 
2038 
2039 /*************************
2040  * XEN-HEAP SUB-ALLOCATOR
2041  */
2042 
2043 #if defined(CONFIG_SEPARATE_XENHEAP)
2044 
2045 void init_xenheap_pages(paddr_t ps, paddr_t pe)
2046 {
2047     ps = round_pgup(ps);
2048     pe = round_pgdown(pe);
2049     if ( pe <= ps )
2050         return;
2051 
2052     /*
2053      * Yuk! Ensure there is a one-page buffer between Xen and Dom zones, to
2054      * prevent merging of power-of-two blocks across the zone boundary.
2055      */
2056     if ( ps && !is_xen_heap_mfn(paddr_to_pfn(ps)-1) )
2057         ps += PAGE_SIZE;
2058     if ( !is_xen_heap_mfn(paddr_to_pfn(pe)) )
2059         pe -= PAGE_SIZE;
2060 
2061     memguard_guard_range(maddr_to_virt(ps), pe - ps);
2062 
2063     init_heap_pages(maddr_to_page(ps), (pe - ps) >> PAGE_SHIFT);
2064 }
2065 
2066 
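/*
 * Illustrative usage (not taken from this file): allocate and release a
 * single Xen-heap page.
 *
 *     void *p = alloc_xenheap_pages(0, 0);
 *     if ( p )
 *         free_xenheap_pages(p, 0);
 */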
2067 void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
2068 {
2069     struct page_info *pg;
2070 
2071     ASSERT(!in_irq());
2072 
2073     pg = alloc_heap_pages(MEMZONE_XEN, MEMZONE_XEN,
2074                           order, memflags | MEMF_no_scrub, NULL);
2075     if ( unlikely(pg == NULL) )
2076         return NULL;
2077 
2078     memguard_unguard_range(page_to_virt(pg), 1 << (order + PAGE_SHIFT));
2079 
2080     return page_to_virt(pg);
2081 }
2082 
2083 
2084 void free_xenheap_pages(void *v, unsigned int order)
2085 {
2086     ASSERT(!in_irq());
2087 
2088     if ( v == NULL )
2089         return;
2090 
2091     memguard_guard_range(v, 1 << (order + PAGE_SHIFT));
2092 
2093     free_heap_pages(virt_to_page(v), order, false);
2094 }
2095 
2096 #else
2097 
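/*
 * Record the maximum address width the Xen heap may span.  Subsequent
 * alloc_xenheap_pages() calls clamp (or default) their MEMF_bits() request
 * to this width.
 */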
2098 void __init xenheap_max_mfn(unsigned long mfn)
2099 {
2100     ASSERT(!first_node_initialised);
2101     ASSERT(!xenheap_bits);
2102     BUILD_BUG_ON(PADDR_BITS >= BITS_PER_LONG);
2103     xenheap_bits = min(flsl(mfn + 1) - 1 + PAGE_SHIFT, PADDR_BITS);
2104     printk(XENLOG_INFO "Xen heap: %u bits\n", xenheap_bits);
2105 }
2106 
2107 void init_xenheap_pages(paddr_t ps, paddr_t pe)
2108 {
2109     init_domheap_pages(ps, pe);
2110 }
2111 
2112 void *alloc_xenheap_pages(unsigned int order, unsigned int memflags)
2113 {
2114     struct page_info *pg;
2115     unsigned int i;
2116 
2117     ASSERT(!in_irq());
2118 
2119     if ( xenheap_bits && (memflags >> _MEMF_bits) > xenheap_bits )
2120         memflags &= ~MEMF_bits(~0U);
2121     if ( !(memflags >> _MEMF_bits) )
2122         memflags |= MEMF_bits(xenheap_bits);
2123 
2124     pg = alloc_domheap_pages(NULL, order, memflags | MEMF_no_scrub);
2125     if ( unlikely(pg == NULL) )
2126         return NULL;
2127 
2128     for ( i = 0; i < (1u << order); i++ )
2129         pg[i].count_info |= PGC_xen_heap;
2130 
2131     return page_to_virt(pg);
2132 }
2133 
2134 void free_xenheap_pages(void *v, unsigned int order)
2135 {
2136     struct page_info *pg;
2137     unsigned int i;
2138 
2139     ASSERT(!in_irq());
2140 
2141     if ( v == NULL )
2142         return;
2143 
2144     pg = virt_to_page(v);
2145 
2146     for ( i = 0; i < (1u << order); i++ )
2147         pg[i].count_info &= ~PGC_xen_heap;
2148 
2149     free_heap_pages(pg, order, true);
2150 }
2151 
2152 #endif
2153 
2154 
2155 
2156 /*************************
2157  * DOMAIN-HEAP SUB-ALLOCATOR
2158  */
2159 
2160 void init_domheap_pages(paddr_t ps, paddr_t pe)
2161 {
2162     unsigned long smfn, emfn;
2163 
2164     ASSERT(!in_irq());
2165 
2166     smfn = round_pgup(ps) >> PAGE_SHIFT;
2167     emfn = round_pgdown(pe) >> PAGE_SHIFT;
2168 
2169     if ( emfn <= smfn )
2170         return;
2171 
2172     init_heap_pages(mfn_to_page(smfn), emfn - smfn);
2173 }
2174 
2175 
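/*
 * Assign the 2^@order pages starting at @pg to domain @d: refuse if the
 * domain is dying or (unless MEMF_no_refcount) the allocation would exceed
 * d->max_pages; otherwise set the owner and initial reference count of each
 * page and link it onto d->page_list.
 */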
2176 int assign_pages(
2177     struct domain *d,
2178     struct page_info *pg,
2179     unsigned int order,
2180     unsigned int memflags)
2181 {
2182     int rc = 0;
2183     unsigned long i;
2184 
2185     spin_lock(&d->page_alloc_lock);
2186 
2187     if ( unlikely(d->is_dying) )
2188     {
2189         gdprintk(XENLOG_INFO, "Cannot assign page to domain%d -- dying.\n",
2190                 d->domain_id);
2191         rc = -EINVAL;
2192         goto out;
2193     }
2194 
2195     if ( !(memflags & MEMF_no_refcount) )
2196     {
2197         if ( unlikely((d->tot_pages + (1 << order)) > d->max_pages) )
2198         {
2199             if ( !tmem_enabled() || order != 0 || d->tot_pages != d->max_pages )
2200                 gprintk(XENLOG_INFO, "Over-allocation for domain %u: "
2201                         "%u > %u\n", d->domain_id,
2202                         d->tot_pages + (1 << order), d->max_pages);
2203             rc = -E2BIG;
2204             goto out;
2205         }
2206 
2207         if ( unlikely(d->tot_pages == 0) )
2208             get_knownalive_domain(d);
2209 
2210         domain_adjust_tot_pages(d, 1 << order);
2211     }
2212 
2213     for ( i = 0; i < (1 << order); i++ )
2214     {
2215         ASSERT(page_get_owner(&pg[i]) == NULL);
2216         ASSERT((pg[i].count_info & ~(PGC_allocated | 1)) == 0);
2217         page_set_owner(&pg[i], d);
2218         smp_wmb(); /* Domain pointer must be visible before updating refcnt. */
2219         pg[i].count_info = PGC_allocated | 1;
2220         page_list_add_tail(&pg[i], &d->page_list);
2221     }
2222 
2223  out:
2224     spin_unlock(&d->page_alloc_lock);
2225     return rc;
2226 }
2227 
2228 
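/*
 * Allocate 2^@order dom-heap pages, optionally for domain @d.  Zones above
 * dma_bitsize are tried first; only if that fails (and MEMF_no_dma is not
 * set) are the DMA-capable zones used as well.  Unless MEMF_no_owner is set,
 * the pages are assigned to @d on success.
 */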
2229 struct page_info *alloc_domheap_pages(
2230     struct domain *d, unsigned int order, unsigned int memflags)
2231 {
2232     struct page_info *pg = NULL;
2233     unsigned int bits = memflags >> _MEMF_bits, zone_hi = NR_ZONES - 1;
2234     unsigned int dma_zone;
2235 
2236     ASSERT(!in_irq());
2237 
2238     bits = domain_clamp_alloc_bitsize(memflags & MEMF_no_owner ? NULL : d,
2239                                       bits ? : (BITS_PER_LONG+PAGE_SHIFT));
2240     if ( (zone_hi = min_t(unsigned int, bits_to_zone(bits), zone_hi)) == 0 )
2241         return NULL;
2242 
2243     if ( memflags & MEMF_no_owner )
2244         memflags |= MEMF_no_refcount;
2245 
2246     if ( dma_bitsize && ((dma_zone = bits_to_zone(dma_bitsize)) < zone_hi) )
2247         pg = alloc_heap_pages(dma_zone + 1, zone_hi, order, memflags, d);
2248 
2249     if ( (pg == NULL) &&
2250          ((memflags & MEMF_no_dma) ||
2251           ((pg = alloc_heap_pages(MEMZONE_XEN + 1, zone_hi, order,
2252                                   memflags, d)) == NULL)) )
2253          return NULL;
2254 
2255     if ( d && !(memflags & MEMF_no_owner) &&
2256          assign_pages(d, pg, order, memflags) )
2257     {
2258         free_heap_pages(pg, order, memflags & MEMF_no_scrub);
2259         return NULL;
2260     }
2261 
2262     return pg;
2263 }
2264 
2265 void free_domheap_pages(struct page_info *pg, unsigned int order)
2266 {
2267     struct domain *d = page_get_owner(pg);
2268     unsigned int i;
2269     bool_t drop_dom_ref;
2270 
2271     ASSERT(!in_irq());
2272 
2273     if ( unlikely(is_xen_heap_page(pg)) )
2274     {
2275         /* NB. May recursively lock from relinquish_memory(). */
2276         spin_lock_recursive(&d->page_alloc_lock);
2277 
2278         for ( i = 0; i < (1 << order); i++ )
2279             arch_free_heap_page(d, &pg[i]);
2280 
2281         d->xenheap_pages -= 1 << order;
2282         drop_dom_ref = (d->xenheap_pages == 0);
2283 
2284         spin_unlock_recursive(&d->page_alloc_lock);
2285     }
2286     else
2287     {
2288         bool_t scrub;
2289 
2290         if ( likely(d) && likely(d != dom_cow) )
2291         {
2292             /* NB. May recursively lock from relinquish_memory(). */
2293             spin_lock_recursive(&d->page_alloc_lock);
2294 
2295             for ( i = 0; i < (1 << order); i++ )
2296             {
2297                 BUG_ON((pg[i].u.inuse.type_info & PGT_count_mask) != 0);
2298                 arch_free_heap_page(d, &pg[i]);
2299             }
2300 
2301             drop_dom_ref = !domain_adjust_tot_pages(d, -(1 << order));
2302 
2303             spin_unlock_recursive(&d->page_alloc_lock);
2304 
2305             /*
2306              * Normally we expect a domain to clear pages before freeing them,
2307              * if it cares about the secrecy of their contents. However, after
2308              * a domain has died we assume responsibility for erasure.
2309              */
2310             scrub = d->is_dying || scrub_debug;
2311         }
2312         else
2313         {
2314             /*
2315              * All we need to check is that on dom_cow only order-0 chunks
2316              * make it here. Due to the if() above, the only two possible
2317              * cases right now are d == NULL and d == dom_cow. To protect
2318              * against relaxation of that if() condition without updating the
2319              * check here, don't check d != dom_cow for now.
2320              */
2321             ASSERT(!d || !order);
2322             drop_dom_ref = 0;
2323             scrub = 1;
2324         }
2325 
2326         free_heap_pages(pg, order, scrub);
2327     }
2328 
2329     if ( drop_dom_ref )
2330         put_domain(d);
2331 }
2332 
2333 unsigned long avail_domheap_pages_region(
2334     unsigned int node, unsigned int min_width, unsigned int max_width)
2335 {
2336     int zone_lo, zone_hi;
2337 
2338     zone_lo = min_width ? bits_to_zone(min_width) : (MEMZONE_XEN + 1);
2339     zone_lo = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_lo));
2340 
2341     zone_hi = max_width ? bits_to_zone(max_width) : (NR_ZONES - 1);
2342     zone_hi = max_t(int, MEMZONE_XEN + 1, min_t(int, NR_ZONES - 1, zone_hi));
2343 
2344     return avail_heap_pages(zone_lo, zone_hi, node);
2345 }
2346 
2347 unsigned long avail_domheap_pages(void)
2348 {
2349     return avail_heap_pages(MEMZONE_XEN + 1,
2350                             NR_ZONES - 1,
2351                             -1);
2352 }
2353 
2354 unsigned long avail_node_heap_pages(unsigned int nodeid)
2355 {
2356     return avail_heap_pages(MEMZONE_XEN, NR_ZONES - 1, nodeid);
2357 }
2358 
2359 
2360 static void pagealloc_info(unsigned char key)
2361 {
2362     unsigned int zone = MEMZONE_XEN;
2363     unsigned long n, total = 0;
2364 
2365     printk("Physical memory information:\n");
2366     printk("    Xen heap: %lukB free\n",
2367            avail_heap_pages(zone, zone, -1) << (PAGE_SHIFT-10));
2368 
2369     while ( ++zone < NR_ZONES )
2370     {
2371         if ( (zone + PAGE_SHIFT) == dma_bitsize )
2372         {
2373             printk("    DMA heap: %lukB free\n", total << (PAGE_SHIFT-10));
2374             total = 0;
2375         }
2376 
2377         if ( (n = avail_heap_pages(zone, zone, -1)) != 0 )
2378         {
2379             total += n;
2380             printk("    heap[%02u]: %lukB free\n", zone, n << (PAGE_SHIFT-10));
2381         }
2382     }
2383 
2384     printk("    Dom heap: %lukB free\n", total << (PAGE_SHIFT-10));
2385 }
2386 
2387 static __init int pagealloc_keyhandler_init(void)
2388 {
2389     register_keyhandler('m', pagealloc_info, "memory info", 1);
2390     return 0;
2391 }
2392 __initcall(pagealloc_keyhandler_init);
2393 
2394 
2395 void scrub_one_page(struct page_info *pg)
2396 {
2397     if ( unlikely(pg->count_info & PGC_broken) )
2398         return;
2399 
2400 #ifndef NDEBUG
2401     /* Avoid callers relying on allocations returning zeroed pages. */
2402     unmap_domain_page(memset(__map_domain_page(pg),
2403                              SCRUB_BYTE_PATTERN, PAGE_SIZE));
2404 #else
2405     /* For a production build, clear_page() is the fastest way to scrub. */
2406     clear_domain_page(_mfn(page_to_mfn(pg)));
2407 #endif
2408 }
2409 
2410 static void dump_heap(unsigned char key)
2411 {
2412     s_time_t      now = NOW();
2413     int           i, j;
2414 
2415     printk("'%c' pressed -> dumping heap info (now-0x%X:%08X)\n", key,
2416            (u32)(now>>32), (u32)now);
2417 
2418     for ( i = 0; i < MAX_NUMNODES; i++ )
2419     {
2420         if ( !avail[i] )
2421             continue;
2422         for ( j = 0; j < NR_ZONES; j++ )
2423             printk("heap[node=%d][zone=%d] -> %lu pages\n",
2424                    i, j, avail[i][j]);
2425     }
2426 
2427     for ( i = 0; i < MAX_NUMNODES; i++ )
2428     {
2429         if ( !node_need_scrub[i] )
2430             continue;
2431         printk("Node %d has %lu unscrubbed pages\n", i, node_need_scrub[i]);
2432     }
2433 }
2434 
2435 static __init int register_heap_trigger(void)
2436 {
2437     register_keyhandler('H', dump_heap, "dump heap info", 1);
2438     return 0;
2439 }
2440 __initcall(register_heap_trigger);
2441 
2442 /*
2443  * Local variables:
2444  * mode: C
2445  * c-file-style: "BSD"
2446  * c-basic-offset: 4
2447  * tab-width: 4
2448  * indent-tabs-mode: nil
2449  * End:
2450  */
2451