/******************************************************************************
 * arch/x86/x86_64/mm.c
 *
 * Modifications to Linux original are copyright (c) 2004, K A Fraser
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; If not, see <http://www.gnu.org/licenses/>.
 */

asm(".file \"" __FILE__ "\"");

#include <xen/lib.h>
#include <xen/init.h>
#include <xen/mm.h>
#include <xen/sched.h>
#include <xen/numa.h>
#include <xen/nodemask.h>
#include <xen/guest_access.h>
#include <xen/hypercall.h>
#include <xen/mem_access.h>
#include <asm/current.h>
#include <asm/asm_defns.h>
#include <asm/page.h>
#include <asm/flushtlb.h>
#include <asm/fixmap.h>
#include <asm/hypercall.h>
#include <asm/msr.h>
#include <asm/setup.h>
#include <asm/numa.h>
#include <asm/mem_paging.h>
#include <asm/mem_sharing.h>
#include <public/memory.h>

unsigned int __read_mostly m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START;

l2_pgentry_t *compat_idle_pg_table_l2;

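/*
 * Walk the guest page tables of a PV vcpu for a given linear address and
 * return a mapping of the target page (at the right offset within it), or
 * NULL if the address is not mapped.  The caller is responsible for
 * unmapping the returned page with unmap_domain_page().
 */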
void *do_page_walk(struct vcpu *v, unsigned long addr)
{
    unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
    l4_pgentry_t l4e, *l4t;
    l3_pgentry_t l3e, *l3t;
    l2_pgentry_t l2e, *l2t;
    l1_pgentry_t l1e, *l1t;

    if ( !is_pv_vcpu(v) || !is_canonical_address(addr) )
        return NULL;

    l4t = map_domain_page(_mfn(mfn));
    l4e = l4t[l4_table_offset(addr)];
    unmap_domain_page(l4t);
    if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
        return NULL;

    l3t = map_l3t_from_l4e(l4e);
    l3e = l3t[l3_table_offset(addr)];
    unmap_domain_page(l3t);
    mfn = l3e_get_pfn(l3e);
    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
        return NULL;
    if ( (l3e_get_flags(l3e) & _PAGE_PSE) )
    {
        mfn += PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1));
        goto ret;
    }

    l2t = map_domain_page(_mfn(mfn));
    l2e = l2t[l2_table_offset(addr)];
    unmap_domain_page(l2t);
    mfn = l2e_get_pfn(l2e);
    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
        return NULL;
    if ( (l2e_get_flags(l2e) & _PAGE_PSE) )
    {
        mfn += PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1));
        goto ret;
    }

    l1t = map_domain_page(_mfn(mfn));
    l1e = l1t[l1_table_offset(addr)];
    unmap_domain_page(l1t);
    mfn = l1e_get_pfn(l1e);
    if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
        return NULL;

 ret:
    return map_domain_page(_mfn(mfn)) + (addr & ~PAGE_MASK);
}

/*
 * Allocate page table pages for m2p table
 */
struct mem_hotadd_info
{
    unsigned long spfn;
    unsigned long epfn;
    unsigned long cur;
};

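/* Does @pfn fall inside the range currently being hot-added? */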
static int hotadd_mem_valid(unsigned long pfn, struct mem_hotadd_info *info)
{
    return (pfn < info->epfn && pfn >= info->spfn);
}

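/*
 * Hand out the next chunk of (1UL << PAGETABLE_ORDER) frames from the
 * hot-added range, for backing page tables and mappings.
 */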
static unsigned long alloc_hotadd_mfn(struct mem_hotadd_info *info)
{
    unsigned mfn;

    ASSERT((info->cur + ( 1UL << PAGETABLE_ORDER) < info->epfn) &&
            info->cur >= info->spfn);

    mfn = info->cur;
    info->cur += (1UL << PAGETABLE_ORDER);
    return mfn;
}

#define M2P_NO_MAPPED   0
#define M2P_2M_MAPPED   1
#define M2P_1G_MAPPED   2
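/* Report how (if at all) the M2P slot covering @spfn is currently mapped. */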
static int m2p_mapped(unsigned long spfn)
{
    unsigned long va;
    l3_pgentry_t *l3_ro_mpt;
    l2_pgentry_t *l2_ro_mpt;

    va = RO_MPT_VIRT_START + spfn * sizeof(*machine_to_phys_mapping);
    l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(va)]);

    switch ( l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) &
             (_PAGE_PRESENT |_PAGE_PSE))
    {
        case _PAGE_PSE|_PAGE_PRESENT:
            return M2P_1G_MAPPED;
        /* Check for next level */
        case _PAGE_PRESENT:
            break;
        default:
            return M2P_NO_MAPPED;
    }
    l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);

    if (l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT)
        return M2P_2M_MAPPED;

    return M2P_NO_MAPPED;
}

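/*
 * Share the M2P pages backing the hot-added range read-only with privileged
 * guests, mirroring what subarch_init_memory() does for boot-time memory.
 */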
static int share_hotadd_m2p_table(struct mem_hotadd_info *info)
{
    unsigned long i, n, v, m2p_start_mfn = 0;
    l3_pgentry_t l3e;
    l2_pgentry_t l2e;

    /* M2P table is mappable read-only by privileged domains. */
    for ( v  = RDWR_MPT_VIRT_START;
          v != RDWR_MPT_VIRT_END;
          v += n << PAGE_SHIFT )
    {
        n = L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES;
        l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
            l3_table_offset(v)];
        if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
            continue;
        if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
        {
            n = L1_PAGETABLE_ENTRIES;
            l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
            if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
                continue;
            m2p_start_mfn = l2e_get_pfn(l2e);
        }
        else
            continue;

        for ( i = 0; i < n; i++ )
        {
            struct page_info *page = mfn_to_page(m2p_start_mfn + i);
            if (hotadd_mem_valid(m2p_start_mfn + i, info))
                share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
        }
    }

    for ( v  = RDWR_COMPAT_MPT_VIRT_START;
          v != RDWR_COMPAT_MPT_VIRT_END;
          v += 1 << L2_PAGETABLE_SHIFT )
    {
        l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
            l3_table_offset(v)];
        if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
            continue;
        l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
        if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
            continue;
        m2p_start_mfn = l2e_get_pfn(l2e);

        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
        {
            struct page_info *page = mfn_to_page(m2p_start_mfn + i);
            if (hotadd_mem_valid(m2p_start_mfn + i, info))
                share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
        }
    }
    return 0;
}

static void destroy_compat_m2p_mapping(struct mem_hotadd_info *info)
{
    unsigned long i, va, rwva, pt_pfn;
    unsigned long smap = info->spfn, emap = info->spfn;

    l3_pgentry_t *l3_ro_mpt;
    l2_pgentry_t *l2_ro_mpt;

    if ( smap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) )
        return;

    if ( emap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) )
        emap = (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2;

    l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(HIRO_COMPAT_MPT_VIRT_START)]);

    ASSERT(l3e_get_flags(l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)]) & _PAGE_PRESENT);

    l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)]);

    for ( i = smap; i < emap; )
    {
        va = HIRO_COMPAT_MPT_VIRT_START +
              i * sizeof(*compat_machine_to_phys_mapping);
        rwva = RDWR_COMPAT_MPT_VIRT_START +
             i * sizeof(*compat_machine_to_phys_mapping);
        if ( l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT )
        {
            pt_pfn = l2e_get_pfn(l2_ro_mpt[l2_table_offset(va)]);
            if ( hotadd_mem_valid(pt_pfn, info) )
            {
                destroy_xen_mappings(rwva, rwva +
                        (1UL << L2_PAGETABLE_SHIFT));
                l2e_write(&l2_ro_mpt[l2_table_offset(va)], l2e_empty());
            }
        }

        i += 1UL << (L2_PAGETABLE_SHIFT - 2);
    }

    return;
}

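/*
 * Tear down the M2P (and compat M2P) mappings that were created for a
 * hot-added pfn range.
 */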
static void destroy_m2p_mapping(struct mem_hotadd_info *info)
{
    l3_pgentry_t *l3_ro_mpt;
    unsigned long i, va, rwva;
    unsigned long smap = info->spfn, emap = info->epfn;

    l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]);

    /*
     * No need to clean up M2P structures that existed before the hotplug.
     */
    for (i = smap; i < emap;)
    {
        unsigned long pt_pfn;
        l2_pgentry_t *l2_ro_mpt;

        va = RO_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);
        rwva = RDWR_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);

        /* 1G mappings should not be created by mem hotadd */
        if (!(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PRESENT) ||
            (l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PSE))
        {
            i = ( i & ~((1UL << (L3_PAGETABLE_SHIFT - 3)) - 1)) +
                (1UL << (L3_PAGETABLE_SHIFT - 3) );
            continue;
        }

        l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
        if (!(l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT))
        {
            i = ( i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
                    (1UL << (L2_PAGETABLE_SHIFT - 3)) ;
            continue;
        }

        pt_pfn = l2e_get_pfn(l2_ro_mpt[l2_table_offset(va)]);
        if ( hotadd_mem_valid(pt_pfn, info) )
        {
            destroy_xen_mappings(rwva, rwva + (1UL << L2_PAGETABLE_SHIFT));

            l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
            l2e_write(&l2_ro_mpt[l2_table_offset(va)], l2e_empty());
        }
        i = ( i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
              (1UL << (L2_PAGETABLE_SHIFT - 3));
    }

    destroy_compat_m2p_mapping(info);

    /* Brute-force: flush all TLBs */
    flush_tlb_all();
    return;
}

/*
 * Allocate and map the compatibility mode machine-to-phys table.
 * spfn/epfn: the pfn range to be set up
 * free_s/free_e: the pfn ranges that are still free
 */
static int setup_compat_m2p_table(struct mem_hotadd_info *info)
{
    unsigned long i, va, smap, emap, rwva, epfn = info->epfn, mfn;
    unsigned int n;
    l3_pgentry_t *l3_ro_mpt = NULL;
    l2_pgentry_t *l2_ro_mpt = NULL;
    int err = 0;

    smap = info->spfn & (~((1UL << (L2_PAGETABLE_SHIFT - 2)) -1));

    /*
     * Notice: For hot-added memory, only the range below m2p_compat_vstart
     * will be filled up (assuming memory is discontiguous when booting).
     */
    if ( smap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) )
        return 0;

    if ( epfn > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) )
        epfn = (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2;

    emap = ( (epfn + ((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1 )) &
                ~((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1) );

    va = HIRO_COMPAT_MPT_VIRT_START +
         smap * sizeof(*compat_machine_to_phys_mapping);
    l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(va)]);

    ASSERT(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PRESENT);

    l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);

#define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned int))
#define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
             sizeof(*compat_machine_to_phys_mapping))
    BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
                 sizeof(*compat_machine_to_phys_mapping));

    for ( i = smap; i < emap; i += (1UL << (L2_PAGETABLE_SHIFT - 2)) )
    {
        va = HIRO_COMPAT_MPT_VIRT_START +
              i * sizeof(*compat_machine_to_phys_mapping);

        rwva = RDWR_COMPAT_MPT_VIRT_START +
                i * sizeof(*compat_machine_to_phys_mapping);

        if (l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT)
            continue;

        for ( n = 0; n < CNT; ++n)
            if ( mfn_valid(_mfn(i + n * PDX_GROUP_COUNT)) )
                break;
        if ( n == CNT )
            continue;

        mfn = alloc_hotadd_mfn(info);
        err = map_pages_to_xen(rwva, mfn, 1UL << PAGETABLE_ORDER,
                               PAGE_HYPERVISOR);
        if ( err )
            break;
        /* Fill with INVALID_M2P_ENTRY. */
        memset((void *)rwva, 0xFF, 1UL << L2_PAGETABLE_SHIFT);
        /* NB. Cannot be GLOBAL as the ptes get copied into per-VM space. */
        l2e_write(&l2_ro_mpt[l2_table_offset(va)],
                  l2e_from_pfn(mfn, _PAGE_PSE|_PAGE_PRESENT));
    }
#undef CNT
#undef MFN
    return err;
}

/*
 * Allocate and map the machine-to-phys table.
 * The L3 for the RO/RW MPT and the L2 for the compat MPT should already be
 * set up.
 */
static int setup_m2p_table(struct mem_hotadd_info *info)
{
    unsigned long i, va, smap, emap;
    unsigned int n;
    l2_pgentry_t *l2_ro_mpt = NULL;
    l3_pgentry_t *l3_ro_mpt = NULL;
    int ret = 0;

    ASSERT(l4e_get_flags(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)])
            & _PAGE_PRESENT);
    l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]);

    smap = (info->spfn & (~((1UL << (L2_PAGETABLE_SHIFT - 3)) -1)));
    emap = ((info->epfn + ((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1 )) &
                ~((1UL << (L2_PAGETABLE_SHIFT - 3)) -1));

    va = RO_MPT_VIRT_START + smap * sizeof(*machine_to_phys_mapping);

#define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned long))
#define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
             sizeof(*machine_to_phys_mapping))

    BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
                 sizeof(*machine_to_phys_mapping));

    i = smap;
    while ( i < emap )
    {
        switch ( m2p_mapped(i) )
        {
        case M2P_1G_MAPPED:
            i = ( i & ~((1UL << (L3_PAGETABLE_SHIFT - 3)) - 1)) +
                (1UL << (L3_PAGETABLE_SHIFT - 3));
            continue;
        case M2P_2M_MAPPED:
            i = (i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
                (1UL << (L2_PAGETABLE_SHIFT - 3));
            continue;
        default:
            break;
        }

        va = RO_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);

        for ( n = 0; n < CNT; ++n)
            if ( mfn_valid(_mfn(i + n * PDX_GROUP_COUNT)) )
                break;
        if ( n < CNT )
        {
            unsigned long mfn = alloc_hotadd_mfn(info);

            ret = map_pages_to_xen(
                        RDWR_MPT_VIRT_START + i * sizeof(unsigned long),
                        mfn, 1UL << PAGETABLE_ORDER,
                        PAGE_HYPERVISOR);
            if ( ret )
                goto error;
            /* Fill with INVALID_M2P_ENTRY. */
            memset((void *)(RDWR_MPT_VIRT_START + i * sizeof(unsigned long)),
                   0xFF, 1UL << L2_PAGETABLE_SHIFT);

            ASSERT(!(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) &
                  _PAGE_PSE));
            if ( l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) &
              _PAGE_PRESENT )
                l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]) +
                  l2_table_offset(va);
            else
            {
                l2_ro_mpt = alloc_xen_pagetable();
                if ( !l2_ro_mpt )
                {
                    ret = -ENOMEM;
                    goto error;
                }

                clear_page(l2_ro_mpt);
                l3e_write(&l3_ro_mpt[l3_table_offset(va)],
                          l3e_from_paddr(__pa(l2_ro_mpt),
                                         __PAGE_HYPERVISOR_RO | _PAGE_USER));
                l2_ro_mpt += l2_table_offset(va);
            }

            /* NB. Cannot be GLOBAL: guest user mode should not see it. */
            l2e_write(l2_ro_mpt, l2e_from_pfn(mfn,
                   /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
        }
        if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) )
            l2_ro_mpt = NULL;
        i += ( 1UL << (L2_PAGETABLE_SHIFT - 3));
    }
#undef CNT
#undef MFN

    ret = setup_compat_m2p_table(info);
error:
    return ret;
}

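/*
 * Boot-time construction of the machine-to-phys tables (native and compat)
 * and of the linear page table mapping.
 */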
void __init paging_init(void)
{
    unsigned long i, mpt_size, va;
    unsigned int n, memflags;
    l3_pgentry_t *l3_ro_mpt;
    l2_pgentry_t *l2_ro_mpt = NULL;
    struct page_info *l1_pg;

    /*
     * We set up the L3s for the 1:1 mapping if the host supports memory
     * hotplug, to avoid having to sync the 1:1 mapping in the page fault
     * handler.
     */
    for ( va = DIRECTMAP_VIRT_START;
          va < DIRECTMAP_VIRT_END && (void *)va < __va(mem_hotplug);
          va += (1UL << L4_PAGETABLE_SHIFT) )
    {
        if ( !(l4e_get_flags(idle_pg_table[l4_table_offset(va)]) &
              _PAGE_PRESENT) )
        {
            l3_pgentry_t *pl3t = alloc_xen_pagetable();

            if ( !pl3t )
                goto nomem;
            clear_page(pl3t);
            l4e_write(&idle_pg_table[l4_table_offset(va)],
                      l4e_from_paddr(__pa(pl3t), __PAGE_HYPERVISOR_RW));
        }
    }

    /* Create user-accessible L2 directory to map the MPT for guests. */
    if ( (l3_ro_mpt = alloc_xen_pagetable()) == NULL )
        goto nomem;
    clear_page(l3_ro_mpt);
    l4e_write(&idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)],
              l4e_from_paddr(__pa(l3_ro_mpt), __PAGE_HYPERVISOR_RO | _PAGE_USER));

    /*
     * Allocate and map the machine-to-phys table.
     * This also ensures L3 is present for fixmaps.
     */
    mpt_size  = (max_page * BYTES_PER_LONG) + (1UL << L2_PAGETABLE_SHIFT) - 1;
    mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
#define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned long))
#define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
             sizeof(*machine_to_phys_mapping))
    BUILD_BUG_ON((sizeof(*frame_table) & ~sizeof(*frame_table)) % \
                 sizeof(*machine_to_phys_mapping));
    for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
    {
        BUILD_BUG_ON(RO_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
        va = RO_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT);
        memflags = MEMF_node(phys_to_nid(i <<
            (L2_PAGETABLE_SHIFT - 3 + PAGE_SHIFT)));

        if ( cpu_has_page1gb &&
             !((unsigned long)l2_ro_mpt & ~PAGE_MASK) &&
             (mpt_size >> L3_PAGETABLE_SHIFT) > (i >> PAGETABLE_ORDER) )
        {
            unsigned int k, holes;

            for ( holes = k = 0; k < 1 << PAGETABLE_ORDER; ++k)
            {
                for ( n = 0; n < CNT; ++n)
                    if ( mfn_valid(_mfn(MFN(i + k) + n * PDX_GROUP_COUNT)) )
                        break;
                if ( n == CNT )
                    ++holes;
            }
            if ( k == holes )
            {
                i += (1UL << PAGETABLE_ORDER) - 1;
                continue;
            }
            if ( holes == 0 &&
                 (l1_pg = alloc_domheap_pages(NULL, 2 * PAGETABLE_ORDER,
                                              memflags)) != NULL )
            {
                map_pages_to_xen(
                    RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
                    page_to_mfn(l1_pg),
                    1UL << (2 * PAGETABLE_ORDER),
                    PAGE_HYPERVISOR);
                memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)),
                       0x77, 1UL << L3_PAGETABLE_SHIFT);

                ASSERT(!l2_table_offset(va));
                /* NB. Cannot be GLOBAL: guest user mode should not see it. */
                l3e_write(&l3_ro_mpt[l3_table_offset(va)],
                    l3e_from_page(l1_pg,
                        /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
                i += (1UL << PAGETABLE_ORDER) - 1;
                continue;
            }
        }

        for ( n = 0; n < CNT; ++n)
            if ( mfn_valid(_mfn(MFN(i) + n * PDX_GROUP_COUNT)) )
                break;
        if ( n == CNT )
            l1_pg = NULL;
        else if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
                                               memflags)) == NULL )
            goto nomem;
        else
        {
            map_pages_to_xen(
                RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
                page_to_mfn(l1_pg),
                1UL << PAGETABLE_ORDER,
                PAGE_HYPERVISOR);
            /* Fill with INVALID_M2P_ENTRY. */
            memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)),
                   0xFF, 1UL << L2_PAGETABLE_SHIFT);
        }
        if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) )
        {
            if ( (l2_ro_mpt = alloc_xen_pagetable()) == NULL )
                goto nomem;
            clear_page(l2_ro_mpt);
            l3e_write(&l3_ro_mpt[l3_table_offset(va)],
                      l3e_from_paddr(__pa(l2_ro_mpt),
                                     __PAGE_HYPERVISOR_RO | _PAGE_USER));
            ASSERT(!l2_table_offset(va));
        }
        /* NB. Cannot be GLOBAL: guest user mode should not see it. */
        if ( l1_pg )
            l2e_write(l2_ro_mpt, l2e_from_page(
                l1_pg, /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
        l2_ro_mpt++;
    }
#undef CNT
#undef MFN

    /* Create user-accessible L2 directory to map the MPT for compat guests. */
    BUILD_BUG_ON(l4_table_offset(RDWR_MPT_VIRT_START) !=
                 l4_table_offset(HIRO_COMPAT_MPT_VIRT_START));
    l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(
        HIRO_COMPAT_MPT_VIRT_START)]);
    if ( (l2_ro_mpt = alloc_xen_pagetable()) == NULL )
        goto nomem;
    compat_idle_pg_table_l2 = l2_ro_mpt;
    clear_page(l2_ro_mpt);
    l3e_write(&l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
              l3e_from_paddr(__pa(l2_ro_mpt), __PAGE_HYPERVISOR_RO));
    l2_ro_mpt += l2_table_offset(HIRO_COMPAT_MPT_VIRT_START);
    /* Allocate and map the compatibility mode machine-to-phys table. */
    mpt_size = (mpt_size >> 1) + (1UL << (L2_PAGETABLE_SHIFT - 1));
    if ( mpt_size > RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START )
        mpt_size = RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START;
    mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
    if ( (m2p_compat_vstart + mpt_size) < MACH2PHYS_COMPAT_VIRT_END )
        m2p_compat_vstart = MACH2PHYS_COMPAT_VIRT_END - mpt_size;
#define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned int))
#define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
             sizeof(*compat_machine_to_phys_mapping))
    BUILD_BUG_ON((sizeof(*frame_table) & ~sizeof(*frame_table)) % \
                 sizeof(*compat_machine_to_phys_mapping));
    for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++, l2_ro_mpt++ )
    {
        memflags = MEMF_node(phys_to_nid(i <<
            (L2_PAGETABLE_SHIFT - 2 + PAGE_SHIFT)));
        for ( n = 0; n < CNT; ++n)
            if ( mfn_valid(_mfn(MFN(i) + n * PDX_GROUP_COUNT)) )
                break;
        if ( n == CNT )
            continue;
        if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
                                               memflags)) == NULL )
            goto nomem;
        map_pages_to_xen(
            RDWR_COMPAT_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
            page_to_mfn(l1_pg),
            1UL << PAGETABLE_ORDER,
            PAGE_HYPERVISOR);
        memset((void *)(RDWR_COMPAT_MPT_VIRT_START +
                        (i << L2_PAGETABLE_SHIFT)),
               0x55,
               1UL << L2_PAGETABLE_SHIFT);
        /* NB. Cannot be GLOBAL as the ptes get copied into per-VM space. */
        l2e_write(l2_ro_mpt, l2e_from_page(l1_pg, _PAGE_PSE|_PAGE_PRESENT));
    }
#undef CNT
#undef MFN

    machine_to_phys_mapping_valid = 1;

    /* Set up linear page table mapping. */
    l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
              l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR_RW));
    return;

 nomem:
    panic("Not enough memory for m2p table");
}

void __init zap_low_mappings(void)
{
    BUG_ON(num_online_cpus() != 1);

    /* Remove aliased mapping of first 1:1 PML4 entry. */
    l4e_write(&idle_pg_table[0], l4e_empty());
    flush_local(FLUSH_TLB_GLOBAL);

    /* Replace with mapping of the boot trampoline only. */
    map_pages_to_xen(trampoline_phys, trampoline_phys >> PAGE_SHIFT,
                     PFN_UP(trampoline_end - trampoline_start),
                     __PAGE_HYPERVISOR);
}

int setup_compat_arg_xlat(struct vcpu *v)
{
    return create_perdomain_mapping(v->domain, ARG_XLAT_START(v),
                                    PFN_UP(COMPAT_ARG_XLAT_SIZE),
                                    NULL, NIL(struct page_info *));
}

void free_compat_arg_xlat(struct vcpu *v)
{
    destroy_perdomain_mapping(v->domain, ARG_XLAT_START(v),
                              PFN_UP(COMPAT_ARG_XLAT_SIZE));
}

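/*
 * Undo the frame table mappings set up for a hot-add range, poisoning the
 * corresponding struct page_info entries first.
 */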
static void cleanup_frame_table(struct mem_hotadd_info *info)
{
    unsigned long sva, eva;
    l3_pgentry_t l3e;
    l2_pgentry_t l2e;
    unsigned long spfn, epfn;

    spfn = info->spfn;
    epfn = info->epfn;

    sva = (unsigned long)pdx_to_page(pfn_to_pdx(spfn));
    eva = (unsigned long)pdx_to_page(pfn_to_pdx(epfn));

    /* Initialize all affected pages. */
    memset(mfn_to_page(spfn), -1,
           (unsigned long)mfn_to_page(epfn) - (unsigned long)mfn_to_page(spfn));

    while (sva < eva)
    {
        l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(sva)])[
          l3_table_offset(sva)];
        if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ||
             (l3e_get_flags(l3e) & _PAGE_PSE) )
        {
            sva = (sva & ~((1UL << L3_PAGETABLE_SHIFT) - 1)) +
                    (1UL << L3_PAGETABLE_SHIFT);
            continue;
        }

        l2e = l3e_to_l2e(l3e)[l2_table_offset(sva)];
        ASSERT(l2e_get_flags(l2e) & _PAGE_PRESENT);

        if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) ==
              (_PAGE_PSE | _PAGE_PRESENT) )
        {
            if (hotadd_mem_valid(l2e_get_pfn(l2e), info))
                destroy_xen_mappings(sva & ~((1UL << L2_PAGETABLE_SHIFT) - 1),
                         ((sva & ~((1UL << L2_PAGETABLE_SHIFT) -1 )) +
                            (1UL << L2_PAGETABLE_SHIFT) - 1));

            sva = (sva & ~((1UL << L2_PAGETABLE_SHIFT) -1 )) +
                  (1UL << L2_PAGETABLE_SHIFT);
            continue;
        }

        ASSERT(l1e_get_flags(l2e_to_l1e(l2e)[l1_table_offset(sva)]) &
                _PAGE_PRESENT);
         sva = (sva & ~((1UL << PAGE_SHIFT) - 1)) +
                    (1UL << PAGE_SHIFT);
    }

    /* Brute-force: flush all TLBs */
    flush_tlb_all();
}

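/*
 * Map the frame table range [start, end) with 2M pages taken from the
 * hot-added memory, then poison the newly mapped entries.
 */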
static int setup_frametable_chunk(void *start, void *end,
                                  struct mem_hotadd_info *info)
{
    unsigned long s = (unsigned long)start;
    unsigned long e = (unsigned long)end;
    unsigned long mfn;
    int err;

    ASSERT(!(s & ((1 << L2_PAGETABLE_SHIFT) - 1)));
    ASSERT(!(e & ((1 << L2_PAGETABLE_SHIFT) - 1)));

    for ( ; s < e; s += (1UL << L2_PAGETABLE_SHIFT))
    {
        mfn = alloc_hotadd_mfn(info);
        err = map_pages_to_xen(s, mfn, 1UL << PAGETABLE_ORDER,
                               PAGE_HYPERVISOR);
        if ( err )
            return err;
    }
    memset(start, -1, s - (unsigned long)start);

    return 0;
}

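/*
 * Extend the frame table to cover the hot-added pfn range, skipping any
 * PDX groups that are already backed.
 */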
static int extend_frame_table(struct mem_hotadd_info *info)
{
    unsigned long cidx, nidx, eidx, spfn, epfn;

    spfn = info->spfn;
    epfn = info->epfn;

    eidx = (pfn_to_pdx(epfn) + PDX_GROUP_COUNT - 1) / PDX_GROUP_COUNT;
    nidx = cidx = pfn_to_pdx(spfn)/PDX_GROUP_COUNT;

    ASSERT( pfn_to_pdx(epfn) <= (DIRECTMAP_SIZE >> PAGE_SHIFT) &&
            pfn_to_pdx(epfn) <= FRAMETABLE_NR );

    if ( test_bit(cidx, pdx_group_valid) )
        cidx = find_next_zero_bit(pdx_group_valid, eidx, cidx);

    if ( cidx >= eidx )
        return 0;

    while ( cidx < eidx )
    {
        int err;

        nidx = find_next_bit(pdx_group_valid, eidx, cidx);
        if ( nidx >= eidx )
            nidx = eidx;
        err = setup_frametable_chunk(pdx_to_page(cidx * PDX_GROUP_COUNT ),
                                     pdx_to_page(nidx * PDX_GROUP_COUNT),
                                     info);
        if ( err )
            return err;

        cidx = find_next_zero_bit(pdx_group_valid, eidx, nidx);
    }

    memset(mfn_to_page(spfn), 0,
           (unsigned long)mfn_to_page(epfn) - (unsigned long)mfn_to_page(spfn));
    return 0;
}

void __init subarch_init_memory(void)
{
    unsigned long i, n, v, m2p_start_mfn;
    l3_pgentry_t l3e;
    l2_pgentry_t l2e;

    BUILD_BUG_ON(RDWR_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
    BUILD_BUG_ON(RDWR_MPT_VIRT_END   & ((1UL << L3_PAGETABLE_SHIFT) - 1));
    /* M2P table is mappable read-only by privileged domains. */
    for ( v  = RDWR_MPT_VIRT_START;
          v != RDWR_MPT_VIRT_END;
          v += n << PAGE_SHIFT )
    {
        n = L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES;
        l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
            l3_table_offset(v)];
        if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
            continue;
        if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
        {
            n = L1_PAGETABLE_ENTRIES;
            l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
            if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
                continue;
            m2p_start_mfn = l2e_get_pfn(l2e);
        }
        else
        {
            m2p_start_mfn = l3e_get_pfn(l3e);
        }

        for ( i = 0; i < n; i++ )
        {
            struct page_info *page = mfn_to_page(m2p_start_mfn + i);
            share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
        }
    }

    for ( v  = RDWR_COMPAT_MPT_VIRT_START;
          v != RDWR_COMPAT_MPT_VIRT_END;
          v += 1 << L2_PAGETABLE_SHIFT )
    {
        l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
            l3_table_offset(v)];
        if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
            continue;
        l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
        if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
            continue;
        m2p_start_mfn = l2e_get_pfn(l2e);

        for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
        {
            struct page_info *page = mfn_to_page(m2p_start_mfn + i);
            share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
        }
    }

    /* Mark all of the direct map NX if hardware supports it. */
    if ( !cpu_has_nx )
        return;

    for ( i = l4_table_offset(DIRECTMAP_VIRT_START);
          i < l4_table_offset(DIRECTMAP_VIRT_END); ++i )
    {
        l4_pgentry_t l4e = idle_pg_table[i];

        if ( l4e_get_flags(l4e) & _PAGE_PRESENT )
        {
            l4e_add_flags(l4e, _PAGE_NX_BIT);
            idle_pg_table[i] = l4e;
        }
    }
}

long subarch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
{
    struct xen_machphys_mfn_list xmml;
    l3_pgentry_t l3e;
    l2_pgentry_t l2e;
    unsigned long v, limit;
    xen_pfn_t mfn, last_mfn;
    unsigned int i;
    long rc = 0;

    switch ( cmd )
    {
    case XENMEM_machphys_mfn_list:
        if ( copy_from_guest(&xmml, arg, 1) )
            return -EFAULT;

        BUILD_BUG_ON(RDWR_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
        BUILD_BUG_ON(RDWR_MPT_VIRT_END   & ((1UL << L3_PAGETABLE_SHIFT) - 1));
        for ( i = 0, v = RDWR_MPT_VIRT_START, last_mfn = 0;
              (i != xmml.max_extents) &&
              (v < (unsigned long)(machine_to_phys_mapping + max_page));
              i++, v += 1UL << L2_PAGETABLE_SHIFT )
        {
            l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
                l3_table_offset(v)];
            if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
                mfn = last_mfn;
            else if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
            {
                l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
                if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
                    mfn = l2e_get_pfn(l2e);
                else
                    mfn = last_mfn;
            }
            else
            {
                mfn = l3e_get_pfn(l3e)
                    + (l2_table_offset(v) << PAGETABLE_ORDER);
            }
            ASSERT(mfn);
            if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
                return -EFAULT;
            last_mfn = mfn;
        }

        xmml.nr_extents = i;
        if ( __copy_to_guest(arg, &xmml, 1) )
            return -EFAULT;

        break;

    case XENMEM_machphys_compat_mfn_list:
        if ( copy_from_guest(&xmml, arg, 1) )
            return -EFAULT;

        limit = (unsigned long)(compat_machine_to_phys_mapping + max_page);
        if ( limit > RDWR_COMPAT_MPT_VIRT_END )
            limit = RDWR_COMPAT_MPT_VIRT_END;
        for ( i = 0, v = RDWR_COMPAT_MPT_VIRT_START, last_mfn = 0;
              (i != xmml.max_extents) && (v < limit);
              i++, v += 1 << L2_PAGETABLE_SHIFT )
        {
            l2e = compat_idle_pg_table_l2[l2_table_offset(v)];
            if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
                mfn = l2e_get_pfn(l2e);
            else
                mfn = last_mfn;
            ASSERT(mfn);
            if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
                return -EFAULT;
            last_mfn = mfn;
        }

        xmml.nr_extents = i;
        if ( __copy_to_guest(arg, &xmml, 1) )
            rc = -EFAULT;

        break;

    case XENMEM_get_sharing_freed_pages:
        return mem_sharing_get_nr_saved_mfns();

    case XENMEM_get_sharing_shared_pages:
        return mem_sharing_get_nr_shared_mfns();

    case XENMEM_paging_op:
        return mem_paging_memop(guest_handle_cast(arg, xen_mem_paging_op_t));

    case XENMEM_sharing_op:
        return mem_sharing_memop(guest_handle_cast(arg, xen_mem_sharing_op_t));

    default:
        rc = -ENOSYS;
        break;
    }

    return rc;
}

long do_stack_switch(unsigned long ss, unsigned long esp)
{
    fixup_guest_stack_selector(current->domain, ss);
    current->arch.pv_vcpu.kernel_ss = ss;
    current->arch.pv_vcpu.kernel_sp = esp;
    return 0;
}

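/*
 * Handle a 64-bit PV guest's request to update one of its segment base
 * registers (FS, user GS, kernel GS) or to reload the user GS selector.
 */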
long do_set_segment_base(unsigned int which, unsigned long base)
{
    struct vcpu *v = current;
    long ret = 0;

    if ( is_pv_32bit_vcpu(v) )
        return -ENOSYS; /* x86/64 only. */

    switch ( which )
    {
    case SEGBASE_FS:
        if ( is_canonical_address(base) )
        {
            wrfsbase(base);
            v->arch.pv_vcpu.fs_base = base;
        }
        else
            ret = -EINVAL;
        break;

    case SEGBASE_GS_USER:
        if ( is_canonical_address(base) )
        {
            wrmsrl(MSR_SHADOW_GS_BASE, base);
            v->arch.pv_vcpu.gs_base_user = base;
        }
        else
            ret = -EINVAL;
        break;

    case SEGBASE_GS_KERNEL:
        if ( is_canonical_address(base) )
        {
            wrgsbase(base);
            v->arch.pv_vcpu.gs_base_kernel = base;
        }
        else
            ret = -EINVAL;
        break;

    case SEGBASE_GS_USER_SEL:
        __asm__ __volatile__ (
            "     swapgs              \n"
            "1:   movl %k0,%%gs       \n"
            "    "safe_swapgs"        \n"
            ".section .fixup,\"ax\"   \n"
            "2:   xorl %k0,%k0        \n"
            "     jmp  1b             \n"
            ".previous                \n"
            _ASM_EXTABLE(1b, 2b)
            : "+r" (base) );
        break;

    default:
        ret = -EINVAL;
        break;
    }

    return ret;
}


/* Returns TRUE if given descriptor is valid for GDT or LDT. */
int check_descriptor(const struct domain *dom, struct desc_struct *d)
{
    u32 a = d->a, b = d->b;
    u16 cs;
    unsigned int dpl;

    /* A not-present descriptor will always fault, so is safe. */
    if ( !(b & _SEGMENT_P) )
        return 1;

    /* Check and fix up the DPL. */
    dpl = (b >> 13) & 3;
    __fixup_guest_selector(dom, dpl);
    b = (b & ~_SEGMENT_DPL) | (dpl << 13);

    /* All code and data segments are okay. No base/limit checking. */
    if ( (b & _SEGMENT_S) )
    {
        if ( is_pv_32bit_domain(dom) )
        {
            unsigned long base, limit;

            if ( b & _SEGMENT_L )
                goto bad;

            /*
             * Older PAE Linux guests use segments which are limited to
             * 0xf6800000. Extend these to allow access to the larger read-only
             * M2P table available in 32on64 mode.
             */
            base = (b & (0xff << 24)) | ((b & 0xff) << 16) | (a >> 16);

            limit = (b & 0xf0000) | (a & 0xffff);
            limit++; /* We add one because limit is inclusive. */

            if ( (b & _SEGMENT_G) )
                limit <<= 12;

            if ( (base == 0) && (limit > HYPERVISOR_COMPAT_VIRT_START(dom)) )
            {
                a |= 0x0000ffff;
                b |= 0x000f0000;
            }
        }

        goto good;
    }

    /* Invalid type 0 is harmless. It is used for 2nd half of a call gate. */
    if ( (b & _SEGMENT_TYPE) == 0x000 )
        return 1;

    /* Everything but a call gate is discarded here. */
    if ( (b & _SEGMENT_TYPE) != 0xc00 )
        goto bad;

    /* Validate the target code selector. */
    cs = a >> 16;
    if ( !guest_gate_selector_okay(dom, cs) )
        goto bad;
    /*
     * Force DPL to zero, causing a GP fault with its error code indicating
     * the gate in use, allowing emulation. This is necessary because with
     * native guests (kernel in ring 3) call gates cannot be used directly
     * to transition from user to kernel mode (and whether a gate is used
     * to enter the kernel can only be determined when the gate is being
     * used), and with compat guests call gates cannot be used at all as
     * there are only 64-bit ones.
     * Store the original DPL in the selector's RPL field.
     */
    b &= ~_SEGMENT_DPL;
    cs = (cs & ~3) | dpl;
    a = (a & 0xffffU) | (cs << 16);

    /* Reserved bits must be zero. */
    if ( b & (is_pv_32bit_domain(dom) ? 0xe0 : 0xff) )
        goto bad;

 good:
    d->a = a;
    d->b = b;
    return 1;
 bad:
    return 0;
}

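/*
 * Could this fault be against a compat guest's M2P window that has grown
 * as a result of memory hotplug?
 */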
int pagefault_by_memadd(unsigned long addr, struct cpu_user_regs *regs)
{
    struct domain *d = current->domain;

    return mem_hotplug && guest_mode(regs) && is_pv_32bit_domain(d) &&
           (addr >= HYPERVISOR_COMPAT_VIRT_START(d)) &&
           (addr < MACH2PHYS_COMPAT_VIRT_END);
}

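/*
 * Fix up a compat guest fault on the hot-added M2P range by copying the
 * missing L2 entry from compat_idle_pg_table_l2 into the guest's current
 * page tables.
 */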
int handle_memadd_fault(unsigned long addr, struct cpu_user_regs *regs)
{
    struct domain *d = current->domain;
    l4_pgentry_t *pl4e = NULL;
    l4_pgentry_t l4e;
    l3_pgentry_t  *pl3e = NULL;
    l3_pgentry_t l3e;
    l2_pgentry_t *pl2e = NULL;
    l2_pgentry_t l2e, idle_l2e;
    unsigned long mfn, idle_index;
    int ret = 0;

    if (!is_pv_32bit_domain(d))
        return 0;

    if ( (addr < HYPERVISOR_COMPAT_VIRT_START(d)) ||
         (addr >= MACH2PHYS_COMPAT_VIRT_END) )
        return 0;

    mfn = (read_cr3()) >> PAGE_SHIFT;

    pl4e = map_domain_page(_mfn(mfn));

    l4e = pl4e[0];

    if (!(l4e_get_flags(l4e) & _PAGE_PRESENT))
        goto unmap;

    mfn = l4e_get_pfn(l4e);
    /* We don't need to get the page type here since this is the current CR3 */
    pl3e = map_domain_page(_mfn(mfn));

    l3e = pl3e[3];

    if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
        goto unmap;

    mfn = l3e_get_pfn(l3e);
    pl2e = map_domain_page(_mfn(mfn));

    l2e = pl2e[l2_table_offset(addr)];

    if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT))
        goto unmap;

    idle_index = (l2_table_offset(addr) -
                        COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d))/
                  sizeof(l2_pgentry_t);
    idle_l2e = compat_idle_pg_table_l2[idle_index];
    if (!(l2e_get_flags(idle_l2e) & _PAGE_PRESENT))
        goto unmap;

    memcpy(&pl2e[l2_table_offset(addr)],
            &compat_idle_pg_table_l2[idle_index],
            sizeof(l2_pgentry_t));

    ret = EXCRET_fault_fixed;

unmap:
    if ( pl4e )
        unmap_domain_page(pl4e);
    if ( pl3e )
        unmap_domain_page(pl3e);
    if ( pl2e )
        unmap_domain_page(pl2e);

    return ret;
}

void domain_set_alloc_bitsize(struct domain *d)
{
    if ( !is_pv_32bit_domain(d) ||
         (MACH2PHYS_COMPAT_NR_ENTRIES(d) >= max_page) ||
         d->arch.physaddr_bitsize > 0 )
        return;
    d->arch.physaddr_bitsize =
        /* 2^n entries can be contained in guest's p2m mapping space */
        fls(MACH2PHYS_COMPAT_NR_ENTRIES(d)) - 1
        /* 2^n pages -> 2^(n+PAGE_SHIFT) bits */
        + PAGE_SHIFT;
}

unsigned int domain_clamp_alloc_bitsize(struct domain *d, unsigned int bits)
{
    if ( (d == NULL) || (d->arch.physaddr_bitsize == 0) )
        return bits;
    return min(d->arch.physaddr_bitsize, bits);
}

static int transfer_pages_to_heap(struct mem_hotadd_info *info)
{
    unsigned long i;
    struct page_info *pg;

    /*
     * Mark the allocated pages as in use before handing the free pages to
     * the buddy allocator, to avoid them being merged in free_heap_pages().
     */
    for (i = info->spfn; i < info->cur; i++)
    {
        pg = mfn_to_page(i);
        pg->count_info = PGC_state_inuse;
    }

    init_domheap_pages(pfn_to_paddr(info->cur), pfn_to_paddr(info->epfn));

    return 0;
}

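/*
 * Sanity-check a candidate hot-add range: ordering, alignment, absence of
 * overlap with existing memory, and that the range is large enough to hold
 * its own M2P / compat M2P / frame table pages.
 */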
static int mem_hotadd_check(unsigned long spfn, unsigned long epfn)
{
    unsigned long s, e, length, sidx, eidx;

    if ( (spfn >= epfn) )
        return 0;

    if (pfn_to_pdx(epfn) > FRAMETABLE_NR)
        return 0;

    if ( (spfn | epfn) & ((1UL << PAGETABLE_ORDER) - 1) )
        return 0;

    if ( (spfn | epfn) & pfn_hole_mask )
        return 0;

    /* Make sure the new range is not present now */
    sidx = ((pfn_to_pdx(spfn) + PDX_GROUP_COUNT - 1)  & ~(PDX_GROUP_COUNT - 1))
            / PDX_GROUP_COUNT;
    eidx = (pfn_to_pdx(epfn - 1) & ~(PDX_GROUP_COUNT - 1)) / PDX_GROUP_COUNT;
    if (sidx >= eidx)
        return 0;

    s = find_next_zero_bit(pdx_group_valid, eidx, sidx);
    if ( s > eidx )
        return 0;
    e = find_next_bit(pdx_group_valid, eidx, s);
    if ( e < eidx )
        return 0;

    /* Calculate the maximum number of m2p/compat m2p/frametable pages required */
    s = (spfn & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1));
    e = (epfn + (1UL << (L2_PAGETABLE_SHIFT - 3)) - 1) &
            ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1);

    length = (e - s) * sizeof(unsigned long);

    s = (spfn & ~((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1));
    e = (epfn + (1UL << (L2_PAGETABLE_SHIFT - 2)) - 1) &
            ~((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1);

    e = min_t(unsigned long, e,
            (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2);

    if ( e > s )
        length += (e -s) * sizeof(unsigned int);

    s = pfn_to_pdx(spfn) & ~(PDX_GROUP_COUNT - 1);
    e = ( pfn_to_pdx(epfn) + (PDX_GROUP_COUNT - 1) ) & ~(PDX_GROUP_COUNT - 1);

    length += (e - s) * sizeof(struct page_info);

    if ((length >> PAGE_SHIFT) > (epfn - spfn))
        return 0;

    return 1;
}

/*
 * Be somewhat paranoid about memory allocation failures here, since running
 * short of memory may well be the reason for the memory add in the first
 * place.
 */
int memory_add(unsigned long spfn, unsigned long epfn, unsigned int pxm)
{
    struct mem_hotadd_info info;
    int ret;
    nodeid_t node;
    unsigned long old_max = max_page, old_total = total_pages;
    unsigned long old_node_start, old_node_span, orig_online;
    unsigned long i;

    dprintk(XENLOG_INFO, "memory_add %lx ~ %lx with pxm %x\n", spfn, epfn, pxm);

    if ( !mem_hotadd_check(spfn, epfn) )
        return -EINVAL;

    if ( (node = setup_node(pxm)) == NUMA_NO_NODE )
        return -EINVAL;

    if ( !valid_numa_range(spfn << PAGE_SHIFT, epfn << PAGE_SHIFT, node) )
    {
        printk(XENLOG_WARNING
               "pfn range %lx..%lx PXM %x node %x is not NUMA-valid\n",
               spfn, epfn, pxm, node);
        return -EINVAL;
    }

    i = virt_to_mfn(HYPERVISOR_VIRT_END - 1) + 1;
    if ( spfn < i )
    {
        ret = map_pages_to_xen((unsigned long)mfn_to_virt(spfn), spfn,
                               min(epfn, i) - spfn, PAGE_HYPERVISOR);
        if ( ret )
            goto destroy_directmap;
    }
    if ( i < epfn )
    {
        if ( i < spfn )
            i = spfn;
        ret = map_pages_to_xen((unsigned long)mfn_to_virt(i), i,
                               epfn - i, __PAGE_HYPERVISOR_RW);
        if ( ret )
            goto destroy_directmap;
    }

    old_node_start = node_start_pfn(node);
    old_node_span = node_spanned_pages(node);
    orig_online = node_online(node);

    if ( !orig_online )
    {
        dprintk(XENLOG_WARNING, "node %x pxm %x is not online\n",node, pxm);
        NODE_DATA(node)->node_start_pfn = spfn;
        NODE_DATA(node)->node_spanned_pages =
                epfn - node_start_pfn(node);
        node_set_online(node);
    }
    else
    {
        if (node_start_pfn(node) > spfn)
            NODE_DATA(node)->node_start_pfn = spfn;
        if (node_end_pfn(node) < epfn)
            NODE_DATA(node)->node_spanned_pages = epfn - node_start_pfn(node);
    }

    info.spfn = spfn;
    info.epfn = epfn;
    info.cur = spfn;

    ret = extend_frame_table(&info);
    if (ret)
        goto destroy_frametable;

    /* Set max_page, as setup_m2p_table() will use it. */
    if (max_page < epfn)
    {
        max_page = epfn;
        max_pdx = pfn_to_pdx(max_page - 1) + 1;
    }
    total_pages += epfn - spfn;

    set_pdx_range(spfn, epfn);
    ret = setup_m2p_table(&info);

    if ( ret )
        goto destroy_m2p;

    if ( iommu_enabled && !iommu_passthrough && !need_iommu(hardware_domain) )
    {
        for ( i = spfn; i < epfn; i++ )
            if ( iommu_map_page(hardware_domain, i, i, IOMMUF_readable|IOMMUF_writable) )
                break;
        if ( i != epfn )
        {
            while (i-- > old_max)
                /* If statement to satisfy __must_check. */
                if ( iommu_unmap_page(hardware_domain, i) )
                    continue;

            goto destroy_m2p;
        }
    }

    /* We cannot revert past this point. */
    share_hotadd_m2p_table(&info);
    transfer_pages_to_heap(&info);

    return 0;

destroy_m2p:
    destroy_m2p_mapping(&info);
    max_page = old_max;
    total_pages = old_total;
    max_pdx = pfn_to_pdx(max_page - 1) + 1;
destroy_frametable:
    cleanup_frame_table(&info);
    if ( !orig_online )
        node_set_offline(node);
    NODE_DATA(node)->node_start_pfn = old_node_start;
    NODE_DATA(node)->node_spanned_pages = old_node_span;
 destroy_directmap:
    destroy_xen_mappings((unsigned long)mfn_to_virt(spfn),
                         (unsigned long)mfn_to_virt(epfn));

    return ret;
}

#include "compat/mm.c"

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */