1 /******************************************************************************
2 * arch/x86/x86_64/mm.c
3 *
4 * Modifications to Linux original are copyright (c) 2004, K A Fraser. This
5 * program is free software; you can redistribute it and/or modify it under
6 * the terms of the GNU General Public License as published by the Free
7 * Software Foundation; either version 2 of the License, or (at your option)
8 * any later version.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 asm(".file \"" __FILE__ "\"");
20
21 #include <xen/lib.h>
22 #include <xen/init.h>
23 #include <xen/mm.h>
24 #include <xen/sched.h>
25 #include <xen/numa.h>
26 #include <xen/nodemask.h>
27 #include <xen/guest_access.h>
28 #include <xen/hypercall.h>
29 #include <xen/mem_access.h>
30 #include <asm/current.h>
31 #include <asm/asm_defns.h>
32 #include <asm/page.h>
33 #include <asm/flushtlb.h>
34 #include <asm/fixmap.h>
35 #include <asm/hypercall.h>
36 #include <asm/msr.h>
37 #include <asm/setup.h>
38 #include <asm/numa.h>
39 #include <asm/mem_paging.h>
40 #include <asm/mem_sharing.h>
41 #include <public/memory.h>
42
43 unsigned int __read_mostly m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START;
44
45 l2_pgentry_t *compat_idle_pg_table_l2;
46
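/*
 * Walk a PV guest's page tables in software for the given linear address and,
 * on success, return a domain-page mapping of the underlying frame, offset to
 * the requested address; the caller must unmap_domain_page() the result.
 * 2M/1G superpage mappings are handled by adding the sub-page offset to the
 * superpage's first MFN.
 *
 * Hypothetical usage sketch:
 *
 *     void *p = do_page_walk(v, addr);
 *     if ( p )
 *     {
 *         ... access the mapped bytes ...
 *         unmap_domain_page(p);
 *     }
 */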
47 void *do_page_walk(struct vcpu *v, unsigned long addr)
48 {
49 unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
50 l4_pgentry_t l4e, *l4t;
51 l3_pgentry_t l3e, *l3t;
52 l2_pgentry_t l2e, *l2t;
53 l1_pgentry_t l1e, *l1t;
54
55 if ( !is_pv_vcpu(v) || !is_canonical_address(addr) )
56 return NULL;
57
58 l4t = map_domain_page(_mfn(mfn));
59 l4e = l4t[l4_table_offset(addr)];
60 unmap_domain_page(l4t);
61 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
62 return NULL;
63
64 l3t = map_l3t_from_l4e(l4e);
65 l3e = l3t[l3_table_offset(addr)];
66 unmap_domain_page(l3t);
67 mfn = l3e_get_pfn(l3e);
68 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
69 return NULL;
70 if ( (l3e_get_flags(l3e) & _PAGE_PSE) )
71 {
72 mfn += PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1));
73 goto ret;
74 }
75
76 l2t = map_domain_page(_mfn(mfn));
77 l2e = l2t[l2_table_offset(addr)];
78 unmap_domain_page(l2t);
79 mfn = l2e_get_pfn(l2e);
80 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
81 return NULL;
82 if ( (l2e_get_flags(l2e) & _PAGE_PSE) )
83 {
84 mfn += PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1));
85 goto ret;
86 }
87
88 l1t = map_domain_page(_mfn(mfn));
89 l1e = l1t[l1_table_offset(addr)];
90 unmap_domain_page(l1t);
91 mfn = l1e_get_pfn(l1e);
92 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
93 return NULL;
94
95 ret:
96 return map_domain_page(_mfn(mfn)) + (addr & ~PAGE_MASK);
97 }
98
99 /*
100 * Allocate page table pages for m2p table
101 */
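/*
 * spfn/epfn delimit the hot-added pfn range; cur is an allocation cursor:
 * alloc_hotadd_mfn() hands out 2^PAGETABLE_ORDER-page (2M) chunks from
 * [spfn, epfn) so that the new range itself provides the page-table, M2P and
 * frame-table pages it requires.
 */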
102 struct mem_hotadd_info
103 {
104 unsigned long spfn;
105 unsigned long epfn;
106 unsigned long cur;
107 };
108
109 static int hotadd_mem_valid(unsigned long pfn, struct mem_hotadd_info *info)
110 {
111 return (pfn < info->epfn && pfn >= info->spfn);
112 }
113
114 static unsigned long alloc_hotadd_mfn(struct mem_hotadd_info *info)
115 {
116 unsigned mfn;
117
118 ASSERT((info->cur + ( 1UL << PAGETABLE_ORDER) < info->epfn) &&
119 info->cur >= info->spfn);
120
121 mfn = info->cur;
122 info->cur += (1UL << PAGETABLE_ORDER);
123 return mfn;
124 }
125
126 #define M2P_NO_MAPPED 0
127 #define M2P_2M_MAPPED 1
128 #define M2P_1G_MAPPED 2
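/*
 * Report how the read-only M2P virtual address covering @spfn is currently
 * mapped: by a 1G superpage, by a 2M superpage, or not at all.
 */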
129 static int m2p_mapped(unsigned long spfn)
130 {
131 unsigned long va;
132 l3_pgentry_t *l3_ro_mpt;
133 l2_pgentry_t *l2_ro_mpt;
134
135 va = RO_MPT_VIRT_START + spfn * sizeof(*machine_to_phys_mapping);
136 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(va)]);
137
138 switch ( l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) &
139 (_PAGE_PRESENT |_PAGE_PSE))
140 {
141 case _PAGE_PSE|_PAGE_PRESENT:
142 return M2P_1G_MAPPED;
143 /* Check for next level */
144 case _PAGE_PRESENT:
145 break;
146 default:
147 return M2P_NO_MAPPED;
148 }
149 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
150
151 if (l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT)
152 return M2P_2M_MAPPED;
153
154 return M2P_NO_MAPPED;
155 }
156
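/*
 * Grant privileged guests read-only access to the M2P frames that lie inside
 * the hot-added range; frames outside the range were already shared at boot
 * by subarch_init_memory().
 */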
157 static int share_hotadd_m2p_table(struct mem_hotadd_info *info)
158 {
159 unsigned long i, n, v, m2p_start_mfn = 0;
160 l3_pgentry_t l3e;
161 l2_pgentry_t l2e;
162
163 /* M2P table is mappable read-only by privileged domains. */
164 for ( v = RDWR_MPT_VIRT_START;
165 v != RDWR_MPT_VIRT_END;
166 v += n << PAGE_SHIFT )
167 {
168 n = L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES;
169 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
170 l3_table_offset(v)];
171 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
172 continue;
173 if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
174 {
175 n = L1_PAGETABLE_ENTRIES;
176 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
177 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
178 continue;
179 m2p_start_mfn = l2e_get_pfn(l2e);
180 }
181 else
182 continue;
183
184 for ( i = 0; i < n; i++ )
185 {
186 struct page_info *page = mfn_to_page(m2p_start_mfn + i);
187 if (hotadd_mem_valid(m2p_start_mfn + i, info))
188 share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
189 }
190 }
191
192 for ( v = RDWR_COMPAT_MPT_VIRT_START;
193 v != RDWR_COMPAT_MPT_VIRT_END;
194 v += 1 << L2_PAGETABLE_SHIFT )
195 {
196 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
197 l3_table_offset(v)];
198 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
199 continue;
200 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
201 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
202 continue;
203 m2p_start_mfn = l2e_get_pfn(l2e);
204
205 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
206 {
207 struct page_info *page = mfn_to_page(m2p_start_mfn + i);
208 if (hotadd_mem_valid(m2p_start_mfn + i, info))
209 share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
210 }
211 }
212 return 0;
213 }
214
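/*
 * Tear down compat (32-bit) M2P mappings backed by pages from the hot-added
 * range.  The ">> 2" below converts the compat M2P area size in bytes into a
 * pfn count, since each compat M2P entry is 4 bytes wide.
 */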
215 static void destroy_compat_m2p_mapping(struct mem_hotadd_info *info)
216 {
217 unsigned long i, va, rwva, pt_pfn;
218 unsigned long smap = info->spfn, emap = info->spfn;
219
220 l3_pgentry_t *l3_ro_mpt;
221 l2_pgentry_t *l2_ro_mpt;
222
223 if ( smap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) )
224 return;
225
226 if ( emap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) )
227 emap = (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2;
228
229 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(HIRO_COMPAT_MPT_VIRT_START)]);
230
231 ASSERT(l3e_get_flags(l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)]) & _PAGE_PRESENT);
232
233 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)]);
234
235 for ( i = smap; i < emap; )
236 {
237 va = HIRO_COMPAT_MPT_VIRT_START +
238 i * sizeof(*compat_machine_to_phys_mapping);
239 rwva = RDWR_COMPAT_MPT_VIRT_START +
240 i * sizeof(*compat_machine_to_phys_mapping);
241 if ( l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT )
242 {
243 pt_pfn = l2e_get_pfn(l2_ro_mpt[l2_table_offset(va)]);
244 if ( hotadd_mem_valid(pt_pfn, info) )
245 {
246 destroy_xen_mappings(rwva, rwva +
247 (1UL << L2_PAGETABLE_SHIFT));
248 l2e_write(&l2_ro_mpt[l2_table_offset(va)], l2e_empty());
249 }
250 }
251
252 i += 1UL << (L2_PAGETABLE_SHIFT - 2);
253 }
254
255 return;
256 }
257
258 static void destroy_m2p_mapping(struct mem_hotadd_info *info)
259 {
260 l3_pgentry_t *l3_ro_mpt;
261 unsigned long i, va, rwva;
262 unsigned long smap = info->spfn, emap = info->epfn;
263
264 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]);
265
266 /*
267 * No need to clean up M2P structures that existed before the hotplug.
268 */
269 for (i = smap; i < emap;)
270 {
271 unsigned long pt_pfn;
272 l2_pgentry_t *l2_ro_mpt;
273
274 va = RO_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);
275 rwva = RDWR_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);
276
277 /* 1G mapping should not be created by mem hotadd */
278 if (!(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PRESENT) ||
279 (l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PSE))
280 {
281 i = ( i & ~((1UL << (L3_PAGETABLE_SHIFT - 3)) - 1)) +
282 (1UL << (L3_PAGETABLE_SHIFT - 3) );
283 continue;
284 }
285
286 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
287 if (!(l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT))
288 {
289 i = ( i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
290 (1UL << (L2_PAGETABLE_SHIFT - 3)) ;
291 continue;
292 }
293
294 pt_pfn = l2e_get_pfn(l2_ro_mpt[l2_table_offset(va)]);
295 if ( hotadd_mem_valid(pt_pfn, info) )
296 {
297 destroy_xen_mappings(rwva, rwva + (1UL << L2_PAGETABLE_SHIFT));
298
299 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
300 l2e_write(&l2_ro_mpt[l2_table_offset(va)], l2e_empty());
301 }
302 i = ( i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
303 (1UL << (L2_PAGETABLE_SHIFT - 3));
304 }
305
306 destroy_compat_m2p_mapping(info);
307
308 /* Brute-force flush of all TLBs. */
309 flush_tlb_all();
310 return;
311 }
312
313 /*
314 * Allocate and map the compatibility mode machine-to-phys table.
315 * spfn/epfn: the pfn ranges to be set up
316 * free_s/free_e: the pfn ranges that are still free
317 */
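/*
 * Stride note: one 2M superpage of the compat M2P holds
 * 2M / sizeof(unsigned int) = 512Ki entries, hence the
 * 1UL << (L2_PAGETABLE_SHIFT - 2) stepping used below.
 */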
318 static int setup_compat_m2p_table(struct mem_hotadd_info *info)
319 {
320 unsigned long i, va, smap, emap, rwva, epfn = info->epfn, mfn;
321 unsigned int n;
322 l3_pgentry_t *l3_ro_mpt = NULL;
323 l2_pgentry_t *l2_ro_mpt = NULL;
324 int err = 0;
325
326 smap = info->spfn & (~((1UL << (L2_PAGETABLE_SHIFT - 2)) -1));
327
328 /*
329 * Notice: for hot-added memory, only the range below m2p_compat_vstart
330 * will be filled in (assuming memory is discontinuous when booting).
331 */
332 if ((smap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2)) )
333 return 0;
334
335 if ( epfn > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) )
336 epfn = (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2;
337
338 emap = ( (epfn + ((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1 )) &
339 ~((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1) );
340
341 va = HIRO_COMPAT_MPT_VIRT_START +
342 smap * sizeof(*compat_machine_to_phys_mapping);
343 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(va)]);
344
345 ASSERT(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PRESENT);
346
347 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]);
348
349 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned int))
350 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
351 sizeof(*compat_machine_to_phys_mapping))
352 BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
353 sizeof(*compat_machine_to_phys_mapping));
354
355 for ( i = smap; i < emap; i += (1UL << (L2_PAGETABLE_SHIFT - 2)) )
356 {
357 va = HIRO_COMPAT_MPT_VIRT_START +
358 i * sizeof(*compat_machine_to_phys_mapping);
359
360 rwva = RDWR_COMPAT_MPT_VIRT_START +
361 i * sizeof(*compat_machine_to_phys_mapping);
362
363 if (l2e_get_flags(l2_ro_mpt[l2_table_offset(va)]) & _PAGE_PRESENT)
364 continue;
365
366 for ( n = 0; n < CNT; ++n)
367 if ( mfn_valid(_mfn(i + n * PDX_GROUP_COUNT)) )
368 break;
369 if ( n == CNT )
370 continue;
371
372 mfn = alloc_hotadd_mfn(info);
373 err = map_pages_to_xen(rwva, mfn, 1UL << PAGETABLE_ORDER,
374 PAGE_HYPERVISOR);
375 if ( err )
376 break;
377 /* Fill with INVALID_M2P_ENTRY. */
378 memset((void *)rwva, 0xFF, 1UL << L2_PAGETABLE_SHIFT);
379 /* NB. Cannot be GLOBAL as the ptes get copied into per-VM space. */
380 l2e_write(&l2_ro_mpt[l2_table_offset(va)],
381 l2e_from_pfn(mfn, _PAGE_PSE|_PAGE_PRESENT));
382 }
383 #undef CNT
384 #undef MFN
385 return err;
386 }
387
388 /*
389 * Allocate and map the machine-to-phys table.
390 * The L3 entries for the RO/RW MPT and the L2 for the compat MPT should already be set up.
391 */
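/*
 * Stride note: one 2M superpage of the 64-bit M2P holds
 * 2M / sizeof(unsigned long) = 256Ki entries, hence the
 * 1UL << (L2_PAGETABLE_SHIFT - 3) stepping; a 1G superpage covers
 * 1UL << (L3_PAGETABLE_SHIFT - 3) entries.
 */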
392 static int setup_m2p_table(struct mem_hotadd_info *info)
393 {
394 unsigned long i, va, smap, emap;
395 unsigned int n;
396 l2_pgentry_t *l2_ro_mpt = NULL;
397 l3_pgentry_t *l3_ro_mpt = NULL;
398 int ret = 0;
399
400 ASSERT(l4e_get_flags(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)])
401 & _PAGE_PRESENT);
402 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]);
403
404 smap = (info->spfn & (~((1UL << (L2_PAGETABLE_SHIFT - 3)) -1)));
405 emap = ((info->epfn + ((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1 )) &
406 ~((1UL << (L2_PAGETABLE_SHIFT - 3)) -1));
407
408 va = RO_MPT_VIRT_START + smap * sizeof(*machine_to_phys_mapping);
409
410 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned long))
411 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
412 sizeof(*machine_to_phys_mapping))
413
414 BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
415 sizeof(*machine_to_phys_mapping));
416
417 i = smap;
418 while ( i < emap )
419 {
420 switch ( m2p_mapped(i) )
421 {
422 case M2P_1G_MAPPED:
423 i = ( i & ~((1UL << (L3_PAGETABLE_SHIFT - 3)) - 1)) +
424 (1UL << (L3_PAGETABLE_SHIFT - 3));
425 continue;
426 case M2P_2M_MAPPED:
427 i = (i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
428 (1UL << (L2_PAGETABLE_SHIFT - 3));
429 continue;
430 default:
431 break;
432 }
433
434 va = RO_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);
435
436 for ( n = 0; n < CNT; ++n)
437 if ( mfn_valid(_mfn(i + n * PDX_GROUP_COUNT)) )
438 break;
439 if ( n < CNT )
440 {
441 unsigned long mfn = alloc_hotadd_mfn(info);
442
443 ret = map_pages_to_xen(
444 RDWR_MPT_VIRT_START + i * sizeof(unsigned long),
445 mfn, 1UL << PAGETABLE_ORDER,
446 PAGE_HYPERVISOR);
447 if ( ret )
448 goto error;
449 /* Fill with INVALID_M2P_ENTRY. */
450 memset((void *)(RDWR_MPT_VIRT_START + i * sizeof(unsigned long)),
451 0xFF, 1UL << L2_PAGETABLE_SHIFT);
452
453 ASSERT(!(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) &
454 _PAGE_PSE));
455 if ( l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) &
456 _PAGE_PRESENT )
457 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]) +
458 l2_table_offset(va);
459 else
460 {
461 l2_ro_mpt = alloc_xen_pagetable();
462 if ( !l2_ro_mpt )
463 {
464 ret = -ENOMEM;
465 goto error;
466 }
467
468 clear_page(l2_ro_mpt);
469 l3e_write(&l3_ro_mpt[l3_table_offset(va)],
470 l3e_from_paddr(__pa(l2_ro_mpt),
471 __PAGE_HYPERVISOR_RO | _PAGE_USER));
472 l2_ro_mpt += l2_table_offset(va);
473 }
474
475 /* NB. Cannot be GLOBAL: guest user mode should not see it. */
476 l2e_write(l2_ro_mpt, l2e_from_pfn(mfn,
477 /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
478 }
479 if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) )
480 l2_ro_mpt = NULL;
481 i += ( 1UL << (L2_PAGETABLE_SHIFT - 3));
482 }
483 #undef CNT
484 #undef MFN
485
486 ret = setup_compat_m2p_table(info);
487 error:
488 return ret;
489 }
490
491 void __init paging_init(void)
492 {
493 unsigned long i, mpt_size, va;
494 unsigned int n, memflags;
495 l3_pgentry_t *l3_ro_mpt;
496 l2_pgentry_t *l2_ro_mpt = NULL;
497 struct page_info *l1_pg;
498
499 /*
500 * We set up the L3s for the 1:1 mapping if the host supports memory hotplug,
501 * to avoid having to sync the 1:1 mapping in the page fault handler.
502 */
503 for ( va = DIRECTMAP_VIRT_START;
504 va < DIRECTMAP_VIRT_END && (void *)va < __va(mem_hotplug);
505 va += (1UL << L4_PAGETABLE_SHIFT) )
506 {
507 if ( !(l4e_get_flags(idle_pg_table[l4_table_offset(va)]) &
508 _PAGE_PRESENT) )
509 {
510 l3_pgentry_t *pl3t = alloc_xen_pagetable();
511
512 if ( !pl3t )
513 goto nomem;
514 clear_page(pl3t);
515 l4e_write(&idle_pg_table[l4_table_offset(va)],
516 l4e_from_paddr(__pa(pl3t), __PAGE_HYPERVISOR_RW));
517 }
518 }
519
520 /* Create user-accessible L2 directory to map the MPT for guests. */
521 if ( (l3_ro_mpt = alloc_xen_pagetable()) == NULL )
522 goto nomem;
523 clear_page(l3_ro_mpt);
524 l4e_write(&idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)],
525 l4e_from_paddr(__pa(l3_ro_mpt), __PAGE_HYPERVISOR_RO | _PAGE_USER));
526
527 /*
528 * Allocate and map the machine-to-phys table.
529 * This also ensures L3 is present for fixmaps.
530 */
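/*
 * mpt_size: one unsigned long (BYTES_PER_LONG bytes) per machine frame,
 * rounded up to a whole number of 2M superpages.
 */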
531 mpt_size = (max_page * BYTES_PER_LONG) + (1UL << L2_PAGETABLE_SHIFT) - 1;
532 mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
533 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned long))
534 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
535 sizeof(*machine_to_phys_mapping))
536 BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
537 sizeof(*machine_to_phys_mapping));
538 for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
539 {
540 BUILD_BUG_ON(RO_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
541 va = RO_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT);
542 memflags = MEMF_node(phys_to_nid(i <<
543 (L2_PAGETABLE_SHIFT - 3 + PAGE_SHIFT)));
544
545 if ( cpu_has_page1gb &&
546 !((unsigned long)l2_ro_mpt & ~PAGE_MASK) &&
547 (mpt_size >> L3_PAGETABLE_SHIFT) > (i >> PAGETABLE_ORDER) )
548 {
549 unsigned int k, holes;
550
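/*
 * With 1GB pages available, try to back a whole 1G-aligned chunk of the M2P
 * with a single superpage: count how many of its 512 constituent 2M slots are
 * entirely holes; skip the chunk if all of them are, and fall back to the 2M
 * path below if only some are.
 */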
551 for ( holes = k = 0; k < 1 << PAGETABLE_ORDER; ++k)
552 {
553 for ( n = 0; n < CNT; ++n)
554 if ( mfn_valid(_mfn(MFN(i + k) + n * PDX_GROUP_COUNT)) )
555 break;
556 if ( n == CNT )
557 ++holes;
558 }
559 if ( k == holes )
560 {
561 i += (1UL << PAGETABLE_ORDER) - 1;
562 continue;
563 }
564 if ( holes == 0 &&
565 (l1_pg = alloc_domheap_pages(NULL, 2 * PAGETABLE_ORDER,
566 memflags)) != NULL )
567 {
568 map_pages_to_xen(
569 RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
570 page_to_mfn(l1_pg),
571 1UL << (2 * PAGETABLE_ORDER),
572 PAGE_HYPERVISOR);
573 memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)),
574 0x77, 1UL << L3_PAGETABLE_SHIFT);
575
576 ASSERT(!l2_table_offset(va));
577 /* NB. Cannot be GLOBAL: guest user mode should not see it. */
578 l3e_write(&l3_ro_mpt[l3_table_offset(va)],
579 l3e_from_page(l1_pg,
580 /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
581 i += (1UL << PAGETABLE_ORDER) - 1;
582 continue;
583 }
584 }
585
586 for ( n = 0; n < CNT; ++n)
587 if ( mfn_valid(_mfn(MFN(i) + n * PDX_GROUP_COUNT)) )
588 break;
589 if ( n == CNT )
590 l1_pg = NULL;
591 else if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
592 memflags)) == NULL )
593 goto nomem;
594 else
595 {
596 map_pages_to_xen(
597 RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
598 page_to_mfn(l1_pg),
599 1UL << PAGETABLE_ORDER,
600 PAGE_HYPERVISOR);
601 /* Fill with INVALID_M2P_ENTRY. */
602 memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)),
603 0xFF, 1UL << L2_PAGETABLE_SHIFT);
604 }
605 if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) )
606 {
607 if ( (l2_ro_mpt = alloc_xen_pagetable()) == NULL )
608 goto nomem;
609 clear_page(l2_ro_mpt);
610 l3e_write(&l3_ro_mpt[l3_table_offset(va)],
611 l3e_from_paddr(__pa(l2_ro_mpt),
612 __PAGE_HYPERVISOR_RO | _PAGE_USER));
613 ASSERT(!l2_table_offset(va));
614 }
615 /* NB. Cannot be GLOBAL: guest user mode should not see it. */
616 if ( l1_pg )
617 l2e_write(l2_ro_mpt, l2e_from_page(
618 l1_pg, /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
619 l2_ro_mpt++;
620 }
621 #undef CNT
622 #undef MFN
623
624 /* Create user-accessible L2 directory to map the MPT for compat guests. */
625 BUILD_BUG_ON(l4_table_offset(RDWR_MPT_VIRT_START) !=
626 l4_table_offset(HIRO_COMPAT_MPT_VIRT_START));
627 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(
628 HIRO_COMPAT_MPT_VIRT_START)]);
629 if ( (l2_ro_mpt = alloc_xen_pagetable()) == NULL )
630 goto nomem;
631 compat_idle_pg_table_l2 = l2_ro_mpt;
632 clear_page(l2_ro_mpt);
633 l3e_write(&l3_ro_mpt[l3_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
634 l3e_from_paddr(__pa(l2_ro_mpt), __PAGE_HYPERVISOR_RO));
635 l2_ro_mpt += l2_table_offset(HIRO_COMPAT_MPT_VIRT_START);
636 /* Allocate and map the compatibility mode machine-to-phys table. */
637 mpt_size = (mpt_size >> 1) + (1UL << (L2_PAGETABLE_SHIFT - 1));
638 if ( mpt_size > RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START )
639 mpt_size = RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START;
640 mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
641 if ( (m2p_compat_vstart + mpt_size) < MACH2PHYS_COMPAT_VIRT_END )
642 m2p_compat_vstart = MACH2PHYS_COMPAT_VIRT_END - mpt_size;
643 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned int))
644 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
645 sizeof(*compat_machine_to_phys_mapping))
646 BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
647 sizeof(*compat_machine_to_phys_mapping));
648 for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++, l2_ro_mpt++ )
649 {
650 memflags = MEMF_node(phys_to_nid(i <<
651 (L2_PAGETABLE_SHIFT - 2 + PAGE_SHIFT)));
652 for ( n = 0; n < CNT; ++n)
653 if ( mfn_valid(_mfn(MFN(i) + n * PDX_GROUP_COUNT)) )
654 break;
655 if ( n == CNT )
656 continue;
657 if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
658 memflags)) == NULL )
659 goto nomem;
660 map_pages_to_xen(
661 RDWR_COMPAT_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
662 page_to_mfn(l1_pg),
663 1UL << PAGETABLE_ORDER,
664 PAGE_HYPERVISOR);
665 memset((void *)(RDWR_COMPAT_MPT_VIRT_START +
666 (i << L2_PAGETABLE_SHIFT)),
667 0x55,
668 1UL << L2_PAGETABLE_SHIFT);
669 /* NB. Cannot be GLOBAL as the ptes get copied into per-VM space. */
670 l2e_write(l2_ro_mpt, l2e_from_page(l1_pg, _PAGE_PSE|_PAGE_PRESENT));
671 }
672 #undef CNT
673 #undef MFN
674
675 machine_to_phys_mapping_valid = 1;
676
677 /* Set up linear page table mapping. */
678 l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
679 l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR_RW));
680 return;
681
682 nomem:
683 panic("Not enough memory for m2p table");
684 }
685
686 void __init zap_low_mappings(void)
687 {
688 BUG_ON(num_online_cpus() != 1);
689
690 /* Remove aliased mapping of first 1:1 PML4 entry. */
691 l4e_write(&idle_pg_table[0], l4e_empty());
692 flush_local(FLUSH_TLB_GLOBAL);
693
694 /* Replace with mapping of the boot trampoline only. */
695 map_pages_to_xen(trampoline_phys, trampoline_phys >> PAGE_SHIFT,
696 PFN_UP(trampoline_end - trampoline_start),
697 __PAGE_HYPERVISOR);
698 }
699
700 int setup_compat_arg_xlat(struct vcpu *v)
701 {
702 return create_perdomain_mapping(v->domain, ARG_XLAT_START(v),
703 PFN_UP(COMPAT_ARG_XLAT_SIZE),
704 NULL, NIL(struct page_info *));
705 }
706
707 void free_compat_arg_xlat(struct vcpu *v)
708 {
709 destroy_perdomain_mapping(v->domain, ARG_XLAT_START(v),
710 PFN_UP(COMPAT_ARG_XLAT_SIZE));
711 }
712
713 static void cleanup_frame_table(struct mem_hotadd_info *info)
714 {
715 unsigned long sva, eva;
716 l3_pgentry_t l3e;
717 l2_pgentry_t l2e;
718 unsigned long spfn, epfn;
719
720 spfn = info->spfn;
721 epfn = info->epfn;
722
723 sva = (unsigned long)pdx_to_page(pfn_to_pdx(spfn));
724 eva = (unsigned long)pdx_to_page(pfn_to_pdx(epfn));
725
726 /* Initialize all page structs in the range. */
727 memset(mfn_to_page(spfn), -1,
728 (unsigned long)mfn_to_page(epfn) - (unsigned long)mfn_to_page(spfn));
729
730 while (sva < eva)
731 {
732 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(sva)])[
733 l3_table_offset(sva)];
734 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ||
735 (l3e_get_flags(l3e) & _PAGE_PSE) )
736 {
737 sva = (sva & ~((1UL << L3_PAGETABLE_SHIFT) - 1)) +
738 (1UL << L3_PAGETABLE_SHIFT);
739 continue;
740 }
741
742 l2e = l3e_to_l2e(l3e)[l2_table_offset(sva)];
743 ASSERT(l2e_get_flags(l2e) & _PAGE_PRESENT);
744
745 if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) ==
746 (_PAGE_PSE | _PAGE_PRESENT) )
747 {
748 if (hotadd_mem_valid(l2e_get_pfn(l2e), info))
749 destroy_xen_mappings(sva & ~((1UL << L2_PAGETABLE_SHIFT) - 1),
750 ((sva & ~((1UL << L2_PAGETABLE_SHIFT) -1 )) +
751 (1UL << L2_PAGETABLE_SHIFT) - 1));
752
753 sva = (sva & ~((1UL << L2_PAGETABLE_SHIFT) -1 )) +
754 (1UL << L2_PAGETABLE_SHIFT);
755 continue;
756 }
757
758 ASSERT(l1e_get_flags(l2e_to_l1e(l2e)[l1_table_offset(sva)]) &
759 _PAGE_PRESENT);
760 sva = (sva & ~((1UL << PAGE_SHIFT) - 1)) +
761 (1UL << PAGE_SHIFT);
762 }
763
764 /* Brute-force flush of all TLBs. */
765 flush_tlb_all();
766 }
767
768 static int setup_frametable_chunk(void *start, void *end,
769 struct mem_hotadd_info *info)
770 {
771 unsigned long s = (unsigned long)start;
772 unsigned long e = (unsigned long)end;
773 unsigned long mfn;
774 int err;
775
776 ASSERT(!(s & ((1 << L2_PAGETABLE_SHIFT) - 1)));
777 ASSERT(!(e & ((1 << L2_PAGETABLE_SHIFT) - 1)));
778
779 for ( ; s < e; s += (1UL << L2_PAGETABLE_SHIFT))
780 {
781 mfn = alloc_hotadd_mfn(info);
782 err = map_pages_to_xen(s, mfn, 1UL << PAGETABLE_ORDER,
783 PAGE_HYPERVISOR);
784 if ( err )
785 return err;
786 }
787 memset(start, -1, s - (unsigned long)start);
788
789 return 0;
790 }
791
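/*
 * Populate the frame table for the hot-added range.  The frame table is
 * managed in PDX_GROUP_COUNT-sized groups; only groups not yet marked valid
 * in pdx_group_valid receive freshly allocated backing pages.
 */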
792 static int extend_frame_table(struct mem_hotadd_info *info)
793 {
794 unsigned long cidx, nidx, eidx, spfn, epfn;
795
796 spfn = info->spfn;
797 epfn = info->epfn;
798
799 eidx = (pfn_to_pdx(epfn) + PDX_GROUP_COUNT - 1) / PDX_GROUP_COUNT;
800 nidx = cidx = pfn_to_pdx(spfn)/PDX_GROUP_COUNT;
801
802 ASSERT( pfn_to_pdx(epfn) <= (DIRECTMAP_SIZE >> PAGE_SHIFT) &&
803 pfn_to_pdx(epfn) <= FRAMETABLE_NR );
804
805 if ( test_bit(cidx, pdx_group_valid) )
806 cidx = find_next_zero_bit(pdx_group_valid, eidx, cidx);
807
808 if ( cidx >= eidx )
809 return 0;
810
811 while ( cidx < eidx )
812 {
813 int err;
814
815 nidx = find_next_bit(pdx_group_valid, eidx, cidx);
816 if ( nidx >= eidx )
817 nidx = eidx;
818 err = setup_frametable_chunk(pdx_to_page(cidx * PDX_GROUP_COUNT ),
819 pdx_to_page(nidx * PDX_GROUP_COUNT),
820 info);
821 if ( err )
822 return err;
823
824 cidx = find_next_zero_bit(pdx_group_valid, eidx, nidx);
825 }
826
827 memset(mfn_to_page(spfn), 0,
828 (unsigned long)mfn_to_page(epfn) - (unsigned long)mfn_to_page(spfn));
829 return 0;
830 }
831
832 void __init subarch_init_memory(void)
833 {
834 unsigned long i, n, v, m2p_start_mfn;
835 l3_pgentry_t l3e;
836 l2_pgentry_t l2e;
837
838 BUILD_BUG_ON(RDWR_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
839 BUILD_BUG_ON(RDWR_MPT_VIRT_END & ((1UL << L3_PAGETABLE_SHIFT) - 1));
840 /* M2P table is mappable read-only by privileged domains. */
841 for ( v = RDWR_MPT_VIRT_START;
842 v != RDWR_MPT_VIRT_END;
843 v += n << PAGE_SHIFT )
844 {
845 n = L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES;
846 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
847 l3_table_offset(v)];
848 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
849 continue;
850 if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
851 {
852 n = L1_PAGETABLE_ENTRIES;
853 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
854 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
855 continue;
856 m2p_start_mfn = l2e_get_pfn(l2e);
857 }
858 else
859 {
860 m2p_start_mfn = l3e_get_pfn(l3e);
861 }
862
863 for ( i = 0; i < n; i++ )
864 {
865 struct page_info *page = mfn_to_page(m2p_start_mfn + i);
866 share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
867 }
868 }
869
870 for ( v = RDWR_COMPAT_MPT_VIRT_START;
871 v != RDWR_COMPAT_MPT_VIRT_END;
872 v += 1 << L2_PAGETABLE_SHIFT )
873 {
874 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
875 l3_table_offset(v)];
876 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
877 continue;
878 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
879 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
880 continue;
881 m2p_start_mfn = l2e_get_pfn(l2e);
882
883 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
884 {
885 struct page_info *page = mfn_to_page(m2p_start_mfn + i);
886 share_xen_page_with_privileged_guests(page, XENSHARE_readonly);
887 }
888 }
889
890 /* Mark all of direct map NX if hardware supports it. */
891 if ( !cpu_has_nx )
892 return;
893
894 for ( i = l4_table_offset(DIRECTMAP_VIRT_START);
895 i < l4_table_offset(DIRECTMAP_VIRT_END); ++i )
896 {
897 l4_pgentry_t l4e = idle_pg_table[i];
898
899 if ( l4e_get_flags(l4e) & _PAGE_PRESENT )
900 {
901 l4e_add_flags(l4e, _PAGE_NX_BIT);
902 idle_pg_table[i] = l4e;
903 }
904 }
905 }
906
907 long subarch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
908 {
909 struct xen_machphys_mfn_list xmml;
910 l3_pgentry_t l3e;
911 l2_pgentry_t l2e;
912 unsigned long v, limit;
913 xen_pfn_t mfn, last_mfn;
914 unsigned int i;
915 long rc = 0;
916
917 switch ( cmd )
918 {
919 case XENMEM_machphys_mfn_list:
920 if ( copy_from_guest(&xmml, arg, 1) )
921 return -EFAULT;
922
923 BUILD_BUG_ON(RDWR_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
924 BUILD_BUG_ON(RDWR_MPT_VIRT_END & ((1UL << L3_PAGETABLE_SHIFT) - 1));
925 for ( i = 0, v = RDWR_MPT_VIRT_START, last_mfn = 0;
926 (i != xmml.max_extents) &&
927 (v < (unsigned long)(machine_to_phys_mapping + max_page));
928 i++, v += 1UL << L2_PAGETABLE_SHIFT )
929 {
930 l3e = l4e_to_l3e(idle_pg_table[l4_table_offset(v)])[
931 l3_table_offset(v)];
932 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
933 mfn = last_mfn;
934 else if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
935 {
936 l2e = l3e_to_l2e(l3e)[l2_table_offset(v)];
937 if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
938 mfn = l2e_get_pfn(l2e);
939 else
940 mfn = last_mfn;
941 }
942 else
943 {
944 mfn = l3e_get_pfn(l3e)
945 + (l2_table_offset(v) << PAGETABLE_ORDER);
946 }
947 ASSERT(mfn);
948 if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
949 return -EFAULT;
950 last_mfn = mfn;
951 }
952
953 xmml.nr_extents = i;
954 if ( __copy_to_guest(arg, &xmml, 1) )
955 return -EFAULT;
956
957 break;
958
959 case XENMEM_machphys_compat_mfn_list:
960 if ( copy_from_guest(&xmml, arg, 1) )
961 return -EFAULT;
962
963 limit = (unsigned long)(compat_machine_to_phys_mapping + max_page);
964 if ( limit > RDWR_COMPAT_MPT_VIRT_END )
965 limit = RDWR_COMPAT_MPT_VIRT_END;
966 for ( i = 0, v = RDWR_COMPAT_MPT_VIRT_START, last_mfn = 0;
967 (i != xmml.max_extents) && (v < limit);
968 i++, v += 1 << L2_PAGETABLE_SHIFT )
969 {
970 l2e = compat_idle_pg_table_l2[l2_table_offset(v)];
971 if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
972 mfn = l2e_get_pfn(l2e);
973 else
974 mfn = last_mfn;
975 ASSERT(mfn);
976 if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
977 return -EFAULT;
978 last_mfn = mfn;
979 }
980
981 xmml.nr_extents = i;
982 if ( __copy_to_guest(arg, &xmml, 1) )
983 rc = -EFAULT;
984
985 break;
986
987 case XENMEM_get_sharing_freed_pages:
988 return mem_sharing_get_nr_saved_mfns();
989
990 case XENMEM_get_sharing_shared_pages:
991 return mem_sharing_get_nr_shared_mfns();
992
993 case XENMEM_paging_op:
994 return mem_paging_memop(guest_handle_cast(arg, xen_mem_paging_op_t));
995
996 case XENMEM_sharing_op:
997 return mem_sharing_memop(guest_handle_cast(arg, xen_mem_sharing_op_t));
998
999 default:
1000 rc = -ENOSYS;
1001 break;
1002 }
1003
1004 return rc;
1005 }
1006
1007 long do_stack_switch(unsigned long ss, unsigned long esp)
1008 {
1009 fixup_guest_stack_selector(current->domain, ss);
1010 current->arch.pv_vcpu.kernel_ss = ss;
1011 current->arch.pv_vcpu.kernel_sp = esp;
1012 return 0;
1013 }
1014
1015 long do_set_segment_base(unsigned int which, unsigned long base)
1016 {
1017 struct vcpu *v = current;
1018 long ret = 0;
1019
1020 if ( is_pv_32bit_vcpu(v) )
1021 return -ENOSYS; /* x86/64 only. */
1022
1023 switch ( which )
1024 {
1025 case SEGBASE_FS:
1026 if ( is_canonical_address(base) )
1027 {
1028 wrfsbase(base);
1029 v->arch.pv_vcpu.fs_base = base;
1030 }
1031 else
1032 ret = -EINVAL;
1033 break;
1034
1035 case SEGBASE_GS_USER:
1036 if ( is_canonical_address(base) )
1037 {
1038 wrmsrl(MSR_SHADOW_GS_BASE, base);
1039 v->arch.pv_vcpu.gs_base_user = base;
1040 }
1041 else
1042 ret = -EINVAL;
1043 break;
1044
1045 case SEGBASE_GS_KERNEL:
1046 if ( is_canonical_address(base) )
1047 {
1048 wrgsbase(base);
1049 v->arch.pv_vcpu.gs_base_kernel = base;
1050 }
1051 else
1052 ret = -EINVAL;
1053 break;
1054
1055 case SEGBASE_GS_USER_SEL:
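/*
 * Temporarily switch to the user GS context with swapgs, load the new %gs
 * selector there (the fixup path zeroes the selector if the load faults),
 * then switch back.
 */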
1056 __asm__ __volatile__ (
1057 " swapgs \n"
1058 "1: movl %k0,%%gs \n"
1059 " "safe_swapgs" \n"
1060 ".section .fixup,\"ax\" \n"
1061 "2: xorl %k0,%k0 \n"
1062 " jmp 1b \n"
1063 ".previous \n"
1064 _ASM_EXTABLE(1b, 2b)
1065 : "+r" (base) );
1066 break;
1067
1068 default:
1069 ret = -EINVAL;
1070 break;
1071 }
1072
1073 return ret;
1074 }
1075
1076
1077 /* Returns TRUE if given descriptor is valid for GDT or LDT. */
1078 int check_descriptor(const struct domain *dom, struct desc_struct *d)
1079 {
1080 u32 a = d->a, b = d->b;
1081 u16 cs;
1082 unsigned int dpl;
1083
1084 /* A not-present descriptor will always fault, so is safe. */
1085 if ( !(b & _SEGMENT_P) )
1086 return 1;
1087
1088 /* Check and fix up the DPL. */
1089 dpl = (b >> 13) & 3;
1090 __fixup_guest_selector(dom, dpl);
1091 b = (b & ~_SEGMENT_DPL) | (dpl << 13);
1092
1093 /* All code and data segments are okay. No base/limit checking. */
1094 if ( (b & _SEGMENT_S) )
1095 {
1096 if ( is_pv_32bit_domain(dom) )
1097 {
1098 unsigned long base, limit;
1099
1100 if ( b & _SEGMENT_L )
1101 goto bad;
1102
1103 /*
1104 * Older PAE Linux guests use segments which are limited to
1105 * 0xf6800000. Extend these to allow access to the larger read-only
1106 * M2P table available in 32on64 mode.
1107 */
1108 base = (b & (0xff << 24)) | ((b & 0xff) << 16) | (a >> 16);
1109
1110 limit = (b & 0xf0000) | (a & 0xffff);
1111 limit++; /* We add one because limit is inclusive. */
1112
1113 if ( (b & _SEGMENT_G) )
1114 limit <<= 12;
1115
1116 if ( (base == 0) && (limit > HYPERVISOR_COMPAT_VIRT_START(dom)) )
1117 {
1118 a |= 0x0000ffff;
1119 b |= 0x000f0000;
1120 }
1121 }
1122
1123 goto good;
1124 }
1125
1126 /* Invalid type 0 is harmless. It is used for 2nd half of a call gate. */
1127 if ( (b & _SEGMENT_TYPE) == 0x000 )
1128 return 1;
1129
1130 /* Everything but a call gate is discarded here. */
1131 if ( (b & _SEGMENT_TYPE) != 0xc00 )
1132 goto bad;
1133
1134 /* Validate the target code selector. */
1135 cs = a >> 16;
1136 if ( !guest_gate_selector_okay(dom, cs) )
1137 goto bad;
1138 /*
1139 * Force DPL to zero, causing a GP fault with its error code indicating
1140 * the gate in use, allowing emulation. This is necessary because with
1141 * native guests (kernel in ring 3) call gates cannot be used directly
1142 * to transition from user to kernel mode (and whether a gate is used
1143 * to enter the kernel can only be determined when the gate is being
1144 * used), and with compat guests call gates cannot be used at all as
1145 * there are only 64-bit ones.
1146 * Store the original DPL in the selector's RPL field.
1147 */
1148 b &= ~_SEGMENT_DPL;
1149 cs = (cs & ~3) | dpl;
1150 a = (a & 0xffffU) | (cs << 16);
1151
1152 /* Reserved bits must be zero. */
1153 if ( b & (is_pv_32bit_domain(dom) ? 0xe0 : 0xff) )
1154 goto bad;
1155
1156 good:
1157 d->a = a;
1158 d->b = b;
1159 return 1;
1160 bad:
1161 return 0;
1162 }
1163
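/*
 * Quick check: can this guest fault plausibly be caused by a compat M2P
 * mapping that memory hotplug added but that has not yet been propagated
 * into the guest's page tables?
 */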
1164 int pagefault_by_memadd(unsigned long addr, struct cpu_user_regs *regs)
1165 {
1166 struct domain *d = current->domain;
1167
1168 return mem_hotplug && guest_mode(regs) && is_pv_32bit_domain(d) &&
1169 (addr >= HYPERVISOR_COMPAT_VIRT_START(d)) &&
1170 (addr < MACH2PHYS_COMPAT_VIRT_END);
1171 }
1172
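/*
 * Lazy fixup for 32-bit PV guests after memory hotplug: compat M2P L2 entries
 * live in compat_idle_pg_table_l2 and are only copied into a guest's page
 * tables on demand.  On a fault in the compat M2P range, propagate the
 * matching idle-table L2 entry into the guest's current page tables.
 */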
1173 int handle_memadd_fault(unsigned long addr, struct cpu_user_regs *regs)
1174 {
1175 struct domain *d = current->domain;
1176 l4_pgentry_t *pl4e = NULL;
1177 l4_pgentry_t l4e;
1178 l3_pgentry_t *pl3e = NULL;
1179 l3_pgentry_t l3e;
1180 l2_pgentry_t *pl2e = NULL;
1181 l2_pgentry_t l2e, idle_l2e;
1182 unsigned long mfn, idle_index;
1183 int ret = 0;
1184
1185 if (!is_pv_32bit_domain(d))
1186 return 0;
1187
1188 if ( (addr < HYPERVISOR_COMPAT_VIRT_START(d)) ||
1189 (addr >= MACH2PHYS_COMPAT_VIRT_END) )
1190 return 0;
1191
1192 mfn = (read_cr3()) >> PAGE_SHIFT;
1193
1194 pl4e = map_domain_page(_mfn(mfn));
1195
1196 l4e = pl4e[0];
1197
1198 if (!(l4e_get_flags(l4e) & _PAGE_PRESENT))
1199 goto unmap;
1200
1201 mfn = l4e_get_pfn(l4e);
1202 /* We don't need to get the page type here, since this is the current CR3. */
1203 pl3e = map_domain_page(_mfn(mfn));
1204
1205 l3e = pl3e[3];
1206
1207 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1208 goto unmap;
1209
1210 mfn = l3e_get_pfn(l3e);
1211 pl2e = map_domain_page(_mfn(mfn));
1212
1213 l2e = pl2e[l2_table_offset(addr)];
1214
1215 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT))
1216 goto unmap;
1217
1218 idle_index = (l2_table_offset(addr) -
1219 COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d))/
1220 sizeof(l2_pgentry_t);
1221 idle_l2e = compat_idle_pg_table_l2[idle_index];
1222 if (!(l2e_get_flags(idle_l2e) & _PAGE_PRESENT))
1223 goto unmap;
1224
1225 memcpy(&pl2e[l2_table_offset(addr)],
1226 &compat_idle_pg_table_l2[idle_index],
1227 sizeof(l2_pgentry_t));
1228
1229 ret = EXCRET_fault_fixed;
1230
1231 unmap:
1232 if ( pl4e )
1233 unmap_domain_page(pl4e);
1234 if ( pl3e )
1235 unmap_domain_page(pl3e);
1236 if ( pl2e )
1237 unmap_domain_page(pl2e);
1238
1239 return ret;
1240 }
1241
1242 void domain_set_alloc_bitsize(struct domain *d)
1243 {
1244 if ( !is_pv_32bit_domain(d) ||
1245 (MACH2PHYS_COMPAT_NR_ENTRIES(d) >= max_page) ||
1246 d->arch.physaddr_bitsize > 0 )
1247 return;
1248 d->arch.physaddr_bitsize =
1249 /* 2^n entries can be contained in guest's p2m mapping space */
1250 fls(MACH2PHYS_COMPAT_NR_ENTRIES(d)) - 1
1251 /* 2^n pages -> 2^(n+PAGE_SHIFT) bits */
1252 + PAGE_SHIFT;
1253 }
1254
1255 unsigned int domain_clamp_alloc_bitsize(struct domain *d, unsigned int bits)
1256 {
1257 if ( (d == NULL) || (d->arch.physaddr_bitsize == 0) )
1258 return bits;
1259 return min(d->arch.physaddr_bitsize, bits);
1260 }
1261
1262 static int transfer_pages_to_heap(struct mem_hotadd_info *info)
1263 {
1264 unsigned long i;
1265 struct page_info *pg;
1266
1267 /*
1268 * Mark the pages already allocated from this range as in use before handing
1269 * the remaining free pages to the buddy allocator, to avoid merging in free_heap_pages().
1270 */
1271 for (i = info->spfn; i < info->cur; i++)
1272 {
1273 pg = mfn_to_page(i);
1274 pg->count_info = PGC_state_inuse;
1275 }
1276
1277 init_domheap_pages(pfn_to_paddr(info->cur), pfn_to_paddr(info->epfn));
1278
1279 return 0;
1280 }
1281
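/*
 * Sanity-check a candidate hotplug range: non-empty, within FRAMETABLE_NR,
 * aligned to 2^PAGETABLE_ORDER pages, clear of pfn holes, not yet covered by
 * the frame table, and large enough to host its own M2P, compat M2P and
 * frame-table pages.
 */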
1282 static int mem_hotadd_check(unsigned long spfn, unsigned long epfn)
1283 {
1284 unsigned long s, e, length, sidx, eidx;
1285
1286 if ( (spfn >= epfn) )
1287 return 0;
1288
1289 if (pfn_to_pdx(epfn) > FRAMETABLE_NR)
1290 return 0;
1291
1292 if ( (spfn | epfn) & ((1UL << PAGETABLE_ORDER) - 1) )
1293 return 0;
1294
1295 if ( (spfn | epfn) & pfn_hole_mask )
1296 return 0;
1297
1298 /* Make sure the new range is not present now */
1299 sidx = ((pfn_to_pdx(spfn) + PDX_GROUP_COUNT - 1) & ~(PDX_GROUP_COUNT - 1))
1300 / PDX_GROUP_COUNT;
1301 eidx = (pfn_to_pdx(epfn - 1) & ~(PDX_GROUP_COUNT - 1)) / PDX_GROUP_COUNT;
1302 if (sidx >= eidx)
1303 return 0;
1304
1305 s = find_next_zero_bit(pdx_group_valid, eidx, sidx);
1306 if ( s > eidx )
1307 return 0;
1308 e = find_next_bit(pdx_group_valid, eidx, s);
1309 if ( e < eidx )
1310 return 0;
1311
1312 /* Calculate the maximum number of m2p/compat m2p/frametable pages required. */
1313 s = (spfn & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1));
1314 e = (epfn + (1UL << (L2_PAGETABLE_SHIFT - 3)) - 1) &
1315 ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1);
1316
1317 length = (e - s) * sizeof(unsigned long);
1318
1319 s = (spfn & ~((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1));
1320 e = (epfn + (1UL << (L2_PAGETABLE_SHIFT - 2)) - 1) &
1321 ~((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1);
1322
1323 e = min_t(unsigned long, e,
1324 (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2);
1325
1326 if ( e > s )
1327 length += (e -s) * sizeof(unsigned int);
1328
1329 s = pfn_to_pdx(spfn) & ~(PDX_GROUP_COUNT - 1);
1330 e = ( pfn_to_pdx(epfn) + (PDX_GROUP_COUNT - 1) ) & ~(PDX_GROUP_COUNT - 1);
1331
1332 length += (e - s) * sizeof(struct page_info);
1333
1334 if ((length >> PAGE_SHIFT) > (epfn - spfn))
1335 return 0;
1336
1337 return 1;
1338 }
1339
1340 /*
1341 * Be a bit paranoid about memory allocation failures here, since running
1342 * short of memory may be the very reason for the memory add in the first place.
1343 */
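/*
 * Rough sequence: map the new range in the directmap, grow the node span,
 * extend the frame table, bump max_page/total_pages, build the M2P, create
 * IOMMU mappings for the hardware domain, then share the new M2P pages and
 * hand the remaining pages to the heap.  Failures unwind in reverse order.
 */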
1344 int memory_add(unsigned long spfn, unsigned long epfn, unsigned int pxm)
1345 {
1346 struct mem_hotadd_info info;
1347 int ret;
1348 nodeid_t node;
1349 unsigned long old_max = max_page, old_total = total_pages;
1350 unsigned long old_node_start, old_node_span, orig_online;
1351 unsigned long i;
1352
1353 dprintk(XENLOG_INFO, "memory_add %lx ~ %lx with pxm %x\n", spfn, epfn, pxm);
1354
1355 if ( !mem_hotadd_check(spfn, epfn) )
1356 return -EINVAL;
1357
1358 if ( (node = setup_node(pxm)) == NUMA_NO_NODE )
1359 return -EINVAL;
1360
1361 if ( !valid_numa_range(spfn << PAGE_SHIFT, epfn << PAGE_SHIFT, node) )
1362 {
1363 printk(XENLOG_WARNING
1364 "pfn range %lx..%lx PXM %x node %x is not NUMA-valid\n",
1365 spfn, epfn, pxm, node);
1366 return -EINVAL;
1367 }
1368
1369 i = virt_to_mfn(HYPERVISOR_VIRT_END - 1) + 1;
1370 if ( spfn < i )
1371 {
1372 ret = map_pages_to_xen((unsigned long)mfn_to_virt(spfn), spfn,
1373 min(epfn, i) - spfn, PAGE_HYPERVISOR);
1374 if ( ret )
1375 goto destroy_directmap;
1376 }
1377 if ( i < epfn )
1378 {
1379 if ( i < spfn )
1380 i = spfn;
1381 ret = map_pages_to_xen((unsigned long)mfn_to_virt(i), i,
1382 epfn - i, __PAGE_HYPERVISOR_RW);
1383 if ( ret )
1384 goto destroy_directmap;
1385 }
1386
1387 old_node_start = node_start_pfn(node);
1388 old_node_span = node_spanned_pages(node);
1389 orig_online = node_online(node);
1390
1391 if ( !orig_online )
1392 {
1393 dprintk(XENLOG_WARNING, "node %x pxm %x is not online\n",node, pxm);
1394 NODE_DATA(node)->node_start_pfn = spfn;
1395 NODE_DATA(node)->node_spanned_pages =
1396 epfn - node_start_pfn(node);
1397 node_set_online(node);
1398 }
1399 else
1400 {
1401 if (node_start_pfn(node) > spfn)
1402 NODE_DATA(node)->node_start_pfn = spfn;
1403 if (node_end_pfn(node) < epfn)
1404 NODE_DATA(node)->node_spanned_pages = epfn - node_start_pfn(node);
1405 }
1406
1407 info.spfn = spfn;
1408 info.epfn = epfn;
1409 info.cur = spfn;
1410
1411 ret = extend_frame_table(&info);
1412 if (ret)
1413 goto destroy_frametable;
1414
1415 /* Set max_page, as setup_m2p_table() will use it. */
1416 if (max_page < epfn)
1417 {
1418 max_page = epfn;
1419 max_pdx = pfn_to_pdx(max_page - 1) + 1;
1420 }
1421 total_pages += epfn - spfn;
1422
1423 set_pdx_range(spfn, epfn);
1424 ret = setup_m2p_table(&info);
1425
1426 if ( ret )
1427 goto destroy_m2p;
1428
1429 if ( iommu_enabled && !iommu_passthrough && !need_iommu(hardware_domain) )
1430 {
1431 for ( i = spfn; i < epfn; i++ )
1432 if ( iommu_map_page(hardware_domain, i, i, IOMMUF_readable|IOMMUF_writable) )
1433 break;
1434 if ( i != epfn )
1435 {
1436 while (i-- > old_max)
1437 /* If statement to satisfy __must_check. */
1438 if ( iommu_unmap_page(hardware_domain, i) )
1439 continue;
1440
1441 goto destroy_m2p;
1442 }
1443 }
1444
1445 /* We can't revert any more */
1446 share_hotadd_m2p_table(&info);
1447 transfer_pages_to_heap(&info);
1448
1449 return 0;
1450
1451 destroy_m2p:
1452 destroy_m2p_mapping(&info);
1453 max_page = old_max;
1454 total_pages = old_total;
1455 max_pdx = pfn_to_pdx(max_page - 1) + 1;
1456 destroy_frametable:
1457 cleanup_frame_table(&info);
1458 if ( !orig_online )
1459 node_set_offline(node);
1460 NODE_DATA(node)->node_start_pfn = old_node_start;
1461 NODE_DATA(node)->node_spanned_pages = old_node_span;
1462 destroy_directmap:
1463 destroy_xen_mappings((unsigned long)mfn_to_virt(spfn),
1464 (unsigned long)mfn_to_virt(epfn));
1465
1466 return ret;
1467 }
1468
1469 #include "compat/mm.c"
1470
1471 /*
1472 * Local variables:
1473 * mode: C
1474 * c-file-style: "BSD"
1475 * c-basic-offset: 4
1476 * tab-width: 4
1477 * indent-tabs-mode: nil
1478 * End:
1479 */
1480