#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <asm/cpuid.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/nestedhvm.h>
#include <asm/hvm/svm/svm.h>
#include <asm/hvm/vmx/vmcs.h>
#include <asm/paging.h>
#include <asm/processor.h>
#include <asm/xstate.h>

const uint32_t known_features[] = INIT_KNOWN_FEATURES;
const uint32_t special_features[] = INIT_SPECIAL_FEATURES;

static const uint32_t pv_featuremask[] = INIT_PV_FEATURES;
static const uint32_t hvm_shadow_featuremask[] = INIT_HVM_SHADOW_FEATURES;
static const uint32_t hvm_hap_featuremask[] = INIT_HVM_HAP_FEATURES;
static const uint32_t deep_features[] = INIT_DEEP_FEATURES;

#define EMPTY_LEAF ((struct cpuid_leaf){})
static void zero_leaves(struct cpuid_leaf *l,
                        unsigned int first, unsigned int last)
{
    memset(&l[first], 0, sizeof(*l) * (last - first + 1));
}

struct cpuid_policy __read_mostly raw_cpuid_policy,
                    __read_mostly host_cpuid_policy,
                    __read_mostly pv_max_cpuid_policy,
                    __read_mostly hvm_max_cpuid_policy;

static void cpuid_leaf(uint32_t leaf, struct cpuid_leaf *data)
{
    cpuid(leaf, &data->a, &data->b, &data->c, &data->d);
}

static void sanitise_featureset(uint32_t *fs)
{
    /* for_each_set_bit() uses unsigned longs.  Extend with zeroes. */
    uint32_t disabled_features[
        ROUNDUP(FSCAPINTS, sizeof(unsigned long)/sizeof(uint32_t))] = {};
    unsigned int i;

    for ( i = 0; i < FSCAPINTS; ++i )
    {
        /* Clamp to known mask. */
        fs[i] &= known_features[i];

        /*
         * Identify which features with deep dependencies have been
         * disabled.
         */
        disabled_features[i] = ~fs[i] & deep_features[i];
    }

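    /*
     * Each bit still set in disabled_features[] names a disabled feature
     * whose dependents must also be disabled.  For example, clearing XSAVE
     * must also clear AVX, and in turn AVX2.  The masks returned by
     * lookup_deep_deps() are expected to cover transitive dependents, so a
     * single pass over the disabled features suffices, and bits cleared
     * from disabled_features[] below need no reprocessing.
     */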
    for_each_set_bit(i, (void *)disabled_features,
                     sizeof(disabled_features) * 8)
    {
        const uint32_t *dfs = lookup_deep_deps(i);
        unsigned int j;

        ASSERT(dfs); /* deep_features[] should guarantee this. */

        for ( j = 0; j < FSCAPINTS; ++j )
        {
            fs[j] &= ~dfs[j];
            disabled_features[j] &= ~dfs[j];
        }
    }
}

static void recalculate_xstate(struct cpuid_policy *p)
{
    uint64_t xstates = XSTATE_FP_SSE;
    uint32_t xstate_size = XSTATE_AREA_MIN_SIZE;
    unsigned int i, Da1 = p->xstate.Da1;

    /*
     * The Da1 leaf is the only piece of information preserved in the common
     * case.  Everything else is derived from other feature state.
     */
    memset(&p->xstate, 0, sizeof(p->xstate));

    if ( !p->basic.xsave )
        return;

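    /*
     * Accumulate the xstate components implied by the enabled features,
     * growing the maximum save area size to cover the highest enabled
     * component (its offset plus its size, as reported by hardware).
     */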
    if ( p->basic.avx )
    {
        xstates |= XSTATE_YMM;
        xstate_size = max(xstate_size,
                          xstate_offsets[_XSTATE_YMM] +
                          xstate_sizes[_XSTATE_YMM]);
    }

    if ( p->feat.mpx )
    {
        xstates |= XSTATE_BNDREGS | XSTATE_BNDCSR;
        xstate_size = max(xstate_size,
                          xstate_offsets[_XSTATE_BNDCSR] +
                          xstate_sizes[_XSTATE_BNDCSR]);
    }

    if ( p->feat.avx512f )
    {
        xstates |= XSTATE_OPMASK | XSTATE_ZMM | XSTATE_HI_ZMM;
        xstate_size = max(xstate_size,
                          xstate_offsets[_XSTATE_HI_ZMM] +
                          xstate_sizes[_XSTATE_HI_ZMM]);
    }

    if ( p->feat.pku )
    {
        xstates |= XSTATE_PKRU;
        xstate_size = max(xstate_size,
                          xstate_offsets[_XSTATE_PKRU] +
                          xstate_sizes[_XSTATE_PKRU]);
    }

    if ( p->extd.lwp )
    {
        xstates |= XSTATE_LWP;
        xstate_size = max(xstate_size,
                          xstate_offsets[_XSTATE_LWP] +
                          xstate_sizes[_XSTATE_LWP]);
    }

    p->xstate.max_size = xstate_size;
    p->xstate.xcr0_low = xstates & ~XSTATE_XSAVES_ONLY;
    p->xstate.xcr0_high = (xstates & ~XSTATE_XSAVES_ONLY) >> 32;

    p->xstate.Da1 = Da1;
    if ( p->xstate.xsaves )
    {
        p->xstate.xss_low = xstates & XSTATE_XSAVES_ONLY;
        p->xstate.xss_high = (xstates & XSTATE_XSAVES_ONLY) >> 32;
    }
    else
        xstates &= ~XSTATE_XSAVES_ONLY;

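    /*
     * Fill in the per-component subleaves (2 and up) for each enabled
     * state: its size and offset within the uncompacted area, whether it
     * is XSS-managed, and whether it needs 64-byte alignment in the
     * compacted format.
     */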
    for ( i = 2; i < min(63ul, ARRAY_SIZE(p->xstate.comp)); ++i )
    {
        uint64_t curr_xstate = 1ul << i;

        if ( !(xstates & curr_xstate) )
            continue;

        p->xstate.comp[i].size = xstate_sizes[i];
        p->xstate.comp[i].offset = xstate_offsets[i];
        p->xstate.comp[i].xss = curr_xstate & XSTATE_XSAVES_ONLY;
        p->xstate.comp[i].align = curr_xstate & xstate_align;
    }
}

/*
 * Misc adjustments to the policy.  Mostly clobbering reserved fields and
 * duplicating shared fields.  Intentionally hidden fields are annotated.
 */
static void recalculate_misc(struct cpuid_policy *p)
{
    p->basic.raw_fms &= 0x0fff0fff; /* Clobber Processor Type on Intel. */
    p->basic.apic_id = 0; /* Dynamic. */

    p->basic.raw[0x5] = EMPTY_LEAF; /* MONITOR not exposed to guests. */
    p->basic.raw[0x6] = EMPTY_LEAF; /* Therm/Power not exposed to guests. */

    p->basic.raw[0x8] = EMPTY_LEAF;
    p->basic.raw[0xb] = EMPTY_LEAF; /* TODO: Rework topology logic. */
    p->basic.raw[0xc] = EMPTY_LEAF;

    p->extd.e1d &= ~CPUID_COMMON_1D_FEATURES;

    /* Most of Power/RAS hidden from guests. */
    p->extd.raw[0x7].a = p->extd.raw[0x7].b = p->extd.raw[0x7].c = 0;

    p->extd.raw[0x8].d = 0;

    switch ( p->x86_vendor )
    {
    case X86_VENDOR_INTEL:
        p->basic.l2_nr_queries = 1; /* Fixed to 1 query. */
        p->basic.raw[0x3] = EMPTY_LEAF; /* PSN - always hidden. */
        p->basic.raw[0x9] = EMPTY_LEAF; /* DCA - always hidden. */

        p->extd.vendor_ebx = 0;
        p->extd.vendor_ecx = 0;
        p->extd.vendor_edx = 0;

        p->extd.raw[0x1].a = p->extd.raw[0x1].b = 0;

        p->extd.raw[0x5] = EMPTY_LEAF;
        p->extd.raw[0x6].a = p->extd.raw[0x6].b = p->extd.raw[0x6].d = 0;

        p->extd.raw[0x8].a &= 0x0000ffff;
        p->extd.raw[0x8].c = 0;
        break;

    case X86_VENDOR_AMD:
        zero_leaves(p->basic.raw, 0x2, 0x3);
        memset(p->cache.raw, 0, sizeof(p->cache.raw));
        zero_leaves(p->basic.raw, 0x9, 0xa);

        p->extd.vendor_ebx = p->basic.vendor_ebx;
        p->extd.vendor_ecx = p->basic.vendor_ecx;
        p->extd.vendor_edx = p->basic.vendor_edx;

        p->extd.raw_fms = p->basic.raw_fms;
        p->extd.raw[0x1].b &= 0xff00ffff;
        p->extd.e1d |= p->basic._1d & CPUID_COMMON_1D_FEATURES;

        p->extd.raw[0x8].a &= 0x0000ffff; /* GuestMaxPhysAddr hidden. */
        p->extd.raw[0x8].c &= 0x0003f0ff;

        p->extd.raw[0x9] = EMPTY_LEAF;

        zero_leaves(p->extd.raw, 0xb, 0x18);

        p->extd.raw[0x1b] = EMPTY_LEAF; /* IBS - not supported. */

        p->extd.raw[0x1c].a = 0; /* LWP.a entirely dynamic. */
        break;
    }
}

static void __init calculate_raw_policy(void)
{
    struct cpuid_policy *p = &raw_cpuid_policy;
    unsigned int i;

    cpuid_leaf(0, &p->basic.raw[0]);
    for ( i = 1; i < min(ARRAY_SIZE(p->basic.raw),
                         p->basic.max_leaf + 1ul); ++i )
    {
        switch ( i )
        {
        case 0x4: case 0x7: case 0xd:
            /* Multi-invocation leaves.  Deferred. */
            continue;
        }

        cpuid_leaf(i, &p->basic.raw[i]);
    }

    if ( p->basic.max_leaf >= 4 )
    {
        for ( i = 0; i < ARRAY_SIZE(p->cache.raw); ++i )
        {
            union {
                struct cpuid_leaf l;
                struct cpuid_cache_leaf c;
            } u;

            cpuid_count_leaf(4, i, &u.l);

            if ( u.c.type == 0 )
                break;

            p->cache.subleaf[i] = u.c;
        }

        /*
         * The choice of CPUID_GUEST_NR_CACHE is arbitrary.  It is expected
         * that it will eventually need increasing for future hardware.
         */
        if ( i == ARRAY_SIZE(p->cache.raw) )
            printk(XENLOG_WARNING
                   "CPUID: Insufficient Leaf 4 space for this hardware\n");
    }

    if ( p->basic.max_leaf >= 7 )
    {
        cpuid_count_leaf(7, 0, &p->feat.raw[0]);

        for ( i = 1; i < min(ARRAY_SIZE(p->feat.raw),
                             p->feat.max_subleaf + 1ul); ++i )
            cpuid_count_leaf(7, i, &p->feat.raw[i]);
    }

    if ( p->basic.max_leaf >= XSTATE_CPUID )
    {
        uint64_t xstates;

        cpuid_count_leaf(XSTATE_CPUID, 0, &p->xstate.raw[0]);
        cpuid_count_leaf(XSTATE_CPUID, 1, &p->xstate.raw[1]);

        xstates = ((uint64_t)(p->xstate.xcr0_high | p->xstate.xss_high) << 32) |
                  (p->xstate.xcr0_low | p->xstate.xss_low);

        for ( i = 2; i < min(63ul, ARRAY_SIZE(p->xstate.raw)); ++i )
        {
            if ( xstates & (1ul << i) )
                cpuid_count_leaf(XSTATE_CPUID, i, &p->xstate.raw[i]);
        }
    }

    /* Extended leaves. */
    cpuid_leaf(0x80000000, &p->extd.raw[0]);
    for ( i = 1; i < min(ARRAY_SIZE(p->extd.raw),
                         p->extd.max_leaf + 1 - 0x80000000ul); ++i )
        cpuid_leaf(0x80000000 + i, &p->extd.raw[i]);

    p->x86_vendor = boot_cpu_data.x86_vendor;
}

static void __init calculate_host_policy(void)
{
    struct cpuid_policy *p = &host_cpuid_policy;

    *p = raw_cpuid_policy;

    p->basic.max_leaf =
        min_t(uint32_t, p->basic.max_leaf, ARRAY_SIZE(p->basic.raw) - 1);
    p->feat.max_subleaf =
        min_t(uint32_t, p->feat.max_subleaf, ARRAY_SIZE(p->feat.raw) - 1);
    p->extd.max_leaf = 0x80000000 | min_t(uint32_t, p->extd.max_leaf & 0xffff,
                                          ARRAY_SIZE(p->extd.raw) - 1);

    cpuid_featureset_to_policy(boot_cpu_data.x86_capability, p);
    recalculate_xstate(p);
    recalculate_misc(p);

    if ( p->extd.svm )
    {
        /* Clamp to implemented features which require hardware support. */
        p->extd.raw[0xa].d &= ((1u << SVM_FEATURE_NPT) |
                               (1u << SVM_FEATURE_LBRV) |
                               (1u << SVM_FEATURE_NRIPS) |
                               (1u << SVM_FEATURE_PAUSEFILTER) |
                               (1u << SVM_FEATURE_DECODEASSISTS));
        /* Enable features which are always emulated. */
        p->extd.raw[0xa].d |= ((1u << SVM_FEATURE_VMCBCLEAN) |
                               (1u << SVM_FEATURE_TSCRATEMSR));
    }
}

static void __init calculate_pv_max_policy(void)
{
    struct cpuid_policy *p = &pv_max_cpuid_policy;
    uint32_t pv_featureset[FSCAPINTS];
    unsigned int i;

    *p = host_cpuid_policy;
    cpuid_policy_to_featureset(p, pv_featureset);

    for ( i = 0; i < ARRAY_SIZE(pv_featureset); ++i )
        pv_featureset[i] &= pv_featuremask[i];

    /* Unconditionally claim to be able to set the hypervisor bit. */
    __set_bit(X86_FEATURE_HYPERVISOR, pv_featureset);

    sanitise_featureset(pv_featureset);
    cpuid_featureset_to_policy(pv_featureset, p);
    recalculate_xstate(p);

    p->extd.raw[0xa] = EMPTY_LEAF; /* No SVM for PV guests. */
}

static void __init calculate_hvm_max_policy(void)
{
    struct cpuid_policy *p = &hvm_max_cpuid_policy;
    uint32_t hvm_featureset[FSCAPINTS];
    unsigned int i;
    const uint32_t *hvm_featuremask;

    if ( !hvm_enabled )
        return;

    *p = host_cpuid_policy;
    cpuid_policy_to_featureset(p, hvm_featureset);

    hvm_featuremask = hvm_funcs.hap_supported ?
        hvm_hap_featuremask : hvm_shadow_featuremask;

    for ( i = 0; i < ARRAY_SIZE(hvm_featureset); ++i )
        hvm_featureset[i] &= hvm_featuremask[i];

    /* Unconditionally claim to be able to set the hypervisor bit. */
    __set_bit(X86_FEATURE_HYPERVISOR, hvm_featureset);

    /*
     * Xen can provide an APIC emulation to HVM guests even if the host's
     * APIC isn't enabled.
     */
    __set_bit(X86_FEATURE_APIC, hvm_featureset);

    /*
     * On AMD, PV guests are entirely unable to use SYSENTER as Xen runs in
     * long mode (and init_amd() has cleared it out of host capabilities),
     * but HVM guests can use it if running in protected mode.
     */
    if ( (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
         raw_cpuid_policy.basic.sep )
        __set_bit(X86_FEATURE_SEP, hvm_featureset);

    /*
     * With VT-x, some features are only supported by Xen if dedicated
     * hardware support is also available.
     */
    if ( cpu_has_vmx )
    {
        if ( !cpu_has_vmx_mpx )
            __clear_bit(X86_FEATURE_MPX, hvm_featureset);

        if ( !cpu_has_vmx_xsaves )
            __clear_bit(X86_FEATURE_XSAVES, hvm_featureset);
    }

    sanitise_featureset(hvm_featureset);
    cpuid_featureset_to_policy(hvm_featureset, p);
    recalculate_xstate(p);
}

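/*
 * The policy objects are derived in sequence: the raw policy is read
 * straight from hardware, the host policy trims it to what Xen itself can
 * use, and the PV/HVM maximum policies trim further to what each type of
 * guest may ever be offered.
 */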
void __init init_guest_cpuid(void)
{
    calculate_raw_policy();
    calculate_host_policy();
    calculate_pv_max_policy();
    calculate_hvm_max_policy();
}

const uint32_t *lookup_deep_deps(uint32_t feature)
{
    static const struct {
        uint32_t feature;
        uint32_t fs[FSCAPINTS];
    } deep_deps[] = INIT_DEEP_DEPS;
    unsigned int start = 0, end = ARRAY_SIZE(deep_deps);

    BUILD_BUG_ON(ARRAY_SIZE(deep_deps) != NR_DEEP_DEPS);

    /* Fast early exit. */
    if ( !test_bit(feature, deep_features) )
        return NULL;

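    /*
     * Illustrative use, a sketch mirroring sanitise_featureset(): having
     * cleared e.g. X86_FEATURE_XSAVE from a featureset fs[], a caller also
     * clears everything which depends on it:
     *
     *     const uint32_t *dfs = lookup_deep_deps(X86_FEATURE_XSAVE);
     *     unsigned int i;
     *
     *     for ( i = 0; dfs && i < FSCAPINTS; ++i )
     *         fs[i] &= ~dfs[i];
     */
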
    /* deep_deps[] is sorted.  Perform a binary search. */
    while ( start < end )
    {
        unsigned int mid = start + ((end - start) / 2);

        if ( deep_deps[mid].feature > feature )
            end = mid;
        else if ( deep_deps[mid].feature < feature )
            start = mid + 1;
        else
            return deep_deps[mid].fs;
    }

    return NULL;
}

void recalculate_cpuid_policy(struct domain *d)
{
    struct cpuid_policy *p = d->arch.cpuid;
    const struct cpuid_policy *max =
        is_pv_domain(d) ? &pv_max_cpuid_policy : &hvm_max_cpuid_policy;
    uint32_t fs[FSCAPINTS], max_fs[FSCAPINTS];
    unsigned int i;

    p->x86_vendor = get_cpu_vendor(p->basic.vendor_ebx, p->basic.vendor_ecx,
                                   p->basic.vendor_edx, gcv_guest);

    p->basic.max_leaf = min(p->basic.max_leaf, max->basic.max_leaf);
    p->feat.max_subleaf = min(p->feat.max_subleaf, max->feat.max_subleaf);
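
    /*
     * AMD and Intel guests are offered different numbers of extended
     * leaves (CPUID_GUEST_NR_EXTD_{AMD,INTEL}), so clamp against the limit
     * for the vendor this guest claims to be.
     */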
    p->extd.max_leaf = 0x80000000 | min(p->extd.max_leaf & 0xffff,
                                        (p->x86_vendor == X86_VENDOR_AMD
                                         ? CPUID_GUEST_NR_EXTD_AMD
                                         : CPUID_GUEST_NR_EXTD_INTEL) - 1);

    cpuid_policy_to_featureset(p, fs);
    cpuid_policy_to_featureset(max, max_fs);

    if ( is_hvm_domain(d) )
    {
        /*
         * HVM domains using Shadow paging have further restrictions on
         * their available paging features.
         */
        if ( !hap_enabled(d) )
        {
            for ( i = 0; i < ARRAY_SIZE(max_fs); i++ )
                max_fs[i] &= hvm_shadow_featuremask[i];
        }

        /* Hide nested-virt if it hasn't been explicitly configured. */
        if ( !nestedhvm_enabled(d) )
        {
            __clear_bit(X86_FEATURE_VMX, max_fs);
            __clear_bit(X86_FEATURE_SVM, max_fs);
        }
    }

    /*
     * Allow the toolstack to set HTT, X2APIC and CMP_LEGACY.  These bits
     * affect how to interpret topology information in other cpuid leaves.
     */
    __set_bit(X86_FEATURE_HTT, max_fs);
    __set_bit(X86_FEATURE_X2APIC, max_fs);
    __set_bit(X86_FEATURE_CMP_LEGACY, max_fs);

    /*
     * 32bit PV domains can't use any Long Mode features, and cannot use
     * SYSCALL on non-AMD hardware.
     */
    if ( is_pv_32bit_domain(d) )
    {
        __clear_bit(X86_FEATURE_LM, max_fs);
        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
            __clear_bit(X86_FEATURE_SYSCALL, max_fs);
    }

    /*
     * ITSC is masked by default (so domains are safe to migrate), but a
     * toolstack which has configured disable_migrate or vTSC for a domain
     * may safely select it, and needs a way of doing so.
     */
    if ( cpu_has_itsc && (d->disable_migrate || d->arch.vtsc) )
        __set_bit(X86_FEATURE_ITSC, max_fs);

    /* Clamp the toolstack's choices to reality. */
    for ( i = 0; i < ARRAY_SIZE(fs); i++ )
        fs[i] &= max_fs[i];

    if ( p->basic.max_leaf < XSTATE_CPUID )
        __clear_bit(X86_FEATURE_XSAVE, fs);

    sanitise_featureset(fs);

    /* Fold host's FDP_EXCP_ONLY and NO_FPU_SEL into guest's view. */
    fs[FEATURESET_7b0] &= ~special_features[FEATURESET_7b0];
    fs[FEATURESET_7b0] |= (host_cpuid_policy.feat._7b0 &
                           special_features[FEATURESET_7b0]);

    cpuid_featureset_to_policy(fs, p);

    /* Pass host cacheline size through to guests. */
    p->basic.clflush_size = max->basic.clflush_size;

    p->extd.maxphysaddr = min(p->extd.maxphysaddr, max->extd.maxphysaddr);
    p->extd.maxphysaddr = min_t(uint8_t, p->extd.maxphysaddr,
                                paging_max_paddr_bits(d));
    p->extd.maxphysaddr = max_t(uint8_t, p->extd.maxphysaddr,
                                (p->basic.pae || p->basic.pse36) ? 36 : 32);

    p->extd.maxlinaddr = p->extd.lm ? 48 : 32;

    recalculate_xstate(p);
    recalculate_misc(p);

    for ( i = 0; i < ARRAY_SIZE(p->cache.raw); ++i )
    {
        if ( p->cache.subleaf[i].type >= 1 &&
             p->cache.subleaf[i].type <= 3 )
        {
            /* Subleaf has a valid cache type.  Zero reserved fields. */
            p->cache.raw[i].a &= 0xffffc3ffu;
            p->cache.raw[i].d &= 0x00000007u;
        }
        else
        {
            /* Subleaf is not valid.  Zero the rest of the union. */
            zero_leaves(p->cache.raw, i, ARRAY_SIZE(p->cache.raw) - 1);
            break;
        }
    }

    if ( !p->extd.svm )
        p->extd.raw[0xa] = EMPTY_LEAF;

    if ( !p->extd.page1gb )
        p->extd.raw[0x19] = EMPTY_LEAF;

    if ( p->extd.lwp )
        p->extd.raw[0x1c].d &= max->extd.raw[0x1c].d;
    else
        p->extd.raw[0x1c] = EMPTY_LEAF;
}

int init_domain_cpuid_policy(struct domain *d)
{
    d->arch.cpuid = xmalloc(struct cpuid_policy);

    if ( !d->arch.cpuid )
        return -ENOMEM;

    *d->arch.cpuid = is_pv_domain(d)
        ? pv_max_cpuid_policy : hvm_max_cpuid_policy;

    if ( d->disable_migrate )
        d->arch.cpuid->extd.itsc = cpu_has_itsc;

    recalculate_cpuid_policy(d);

    return 0;
}

void guest_cpuid(const struct vcpu *v, uint32_t leaf,
                 uint32_t subleaf, struct cpuid_leaf *res)
{
    const struct domain *d = v->domain;
    const struct cpuid_policy *p = d->arch.cpuid;

    *res = EMPTY_LEAF;

    /*
     * First pass:
     * - Perform max_leaf/subleaf calculations.  Out-of-range leaves return
     *   all zeros, following the AMD model.
     * - Fill in *res for leaves no longer handled on the legacy path.
     * - Dispatch the virtualised leaves to their respective handlers.
     */
    switch ( leaf )
    {
    case 0 ... CPUID_GUEST_NR_BASIC - 1:
        ASSERT(p->basic.max_leaf < ARRAY_SIZE(p->basic.raw));
        if ( leaf > min_t(uint32_t, p->basic.max_leaf,
                          ARRAY_SIZE(p->basic.raw) - 1) )
            return;

        switch ( leaf )
        {
        case 0x4:
            if ( subleaf >= ARRAY_SIZE(p->cache.raw) )
                return;

            *res = p->cache.raw[subleaf];
            break;

        case 0x7:
            ASSERT(p->feat.max_subleaf < ARRAY_SIZE(p->feat.raw));
            if ( subleaf > min_t(uint32_t, p->feat.max_subleaf,
                                 ARRAY_SIZE(p->feat.raw) - 1) )
                return;

            *res = p->feat.raw[subleaf];
            break;

        case XSTATE_CPUID:
            if ( !p->basic.xsave || subleaf >= ARRAY_SIZE(p->xstate.raw) )
                return;

            *res = p->xstate.raw[subleaf];
            break;

        default:
            *res = p->basic.raw[leaf];
            break;
        }
        break;

    case 0x40000000 ... 0x400000ff:
        if ( is_viridian_domain(d) )
            return cpuid_viridian_leaves(v, leaf, subleaf, res);

        /*
         * Fallthrough.
         *
         * Intel reserve up until 0x4fffffff for hypervisor use.  AMD
         * reserve only until 0x400000ff, but we already use double that.
         */
    case 0x40000100 ... 0x400001ff:
        return cpuid_hypervisor_leaves(v, leaf, subleaf, res);

    case 0x80000000 ... 0x80000000 + CPUID_GUEST_NR_EXTD - 1:
        ASSERT((p->extd.max_leaf & 0xffff) < ARRAY_SIZE(p->extd.raw));
        if ( (leaf & 0xffff) > min_t(uint32_t, p->extd.max_leaf & 0xffff,
                                     ARRAY_SIZE(p->extd.raw) - 1) )
            return;

        *res = p->extd.raw[leaf & 0xffff];
        break;

    default:
        return;
    }

    /*
     * Skip dynamic adjustments if we are in the wrong context.
     *
     * All dynamic adjustments depend on current register state, which will
     * be stale if the vcpu is running elsewhere.  It is simpler, quicker,
     * and more reliable for the caller to do nothing (consistently) than
     * to hand back stale data which it can't use safely.
     */
    if ( v != current )
        return;

    /*
     * Second pass:
     * - Dynamic adjustments
     */
    switch ( leaf )
    {
        const struct cpu_user_regs *regs;

    case 0x1:
        /* TODO: Rework topology logic. */
        res->b &= 0x00ffffffu;
        if ( is_hvm_domain(d) )
            res->b |= (v->vcpu_id * 2) << 24;

        /* TODO: Rework vPMU control in terms of toolstack choices. */
        if ( vpmu_available(v) &&
             vpmu_is_set(vcpu_vpmu(v), VPMU_CPU_HAS_DS) )
        {
            res->d |= cpufeat_mask(X86_FEATURE_DS);
            if ( cpu_has(&current_cpu_data, X86_FEATURE_DTES64) )
                res->c |= cpufeat_mask(X86_FEATURE_DTES64);
            if ( cpu_has(&current_cpu_data, X86_FEATURE_DSCPL) )
                res->c |= cpufeat_mask(X86_FEATURE_DSCPL);
        }

        if ( is_hvm_domain(d) )
        {
            /* OSXSAVE clear in policy.  Fast-forward CR4 back in. */
            if ( v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_OSXSAVE )
                res->c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
        }
        else /* PV domain */
        {
            regs = guest_cpu_user_regs();

            /*
             * !!! OSXSAVE handling for PV guests is non-architectural !!!
             *
             * Architecturally, the correct code here is simply:
             *
             *   if ( v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE )
             *       c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
             *
             * However, because of bugs in Xen (before c/s bd19080b, Nov
             * 2010, the XSAVE cpuid flag leaked into guests despite the
             * feature not being available for use), buggy workarounds
             * were introduced into Linux (c/s 947ccf9c, also Nov 2010)
             * which relied on the fact that Xen also incorrectly leaked
             * OSXSAVE into the guest.
             *
             * Furthermore, providing architectural OSXSAVE behaviour to
             * many Linux PV guests triggered a further kernel bug when
             * the fpu code observes that XSAVEOPT is available, assumes
             * that xsave state has been set up for the task, and follows
             * a wild pointer.
             *
             * Older Linux PVOPS kernels however do require architectural
             * behaviour.  They observe Xen's leaked OSXSAVE and assume
             * they can already use XSETBV, dying with a #UD because the
             * shadowed CR4.OSXSAVE is clear.  This behaviour has been
             * adjusted in all observed cases via stable backports of the
             * above changeset.
             *
             * Therefore, the leaking of Xen's OSXSAVE setting has become
             * a de facto part of the PV ABI and can't reasonably be
             * corrected.  It can however be restricted to only the
             * enlightened CPUID view, as seen by the guest kernel.
             *
             * The following situations and logic now apply:
             *
             * - Hardware without CPUID faulting support and native CPUID:
             *   There is nothing Xen can do here.  The host's XSAVE flag
             *   will leak through, and Xen's OSXSAVE choice will leak
             *   through.
             *
             *   In the case that the guest kernel has not set up OSXSAVE,
             *   only SSE will be set in xcr0, and guest userspace can't
             *   do too much damage itself.
             *
             * - Enlightened CPUID or CPUID faulting available:
             *   Xen can fully control what is seen here.  Guest kernels
             *   need to see the leaked OSXSAVE via the enlightened path,
             *   but guest userspace and the native case get architectural
             *   behaviour.
             *
             * Emulated vs Faulted CPUID is distinguished based on whether
             * a #UD or #GP is currently being serviced.
             */
            /* OSXSAVE clear in policy.  Fast-forward CR4 back in. */
            if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE) ||
                 (regs->entry_vector == TRAP_invalid_op &&
                  guest_kernel_mode(v, regs) &&
                  (read_cr4() & X86_CR4_OSXSAVE)) )
                res->c |= cpufeat_mask(X86_FEATURE_OSXSAVE);

            /*
             * At the time of writing, a PV domain is the only viable
             * option for Dom0.  Several interactions between dom0 and Xen
             * for real hardware setup have unfortunately been implemented
             * based on state which incorrectly leaked into dom0.
             *
             * These leaks are retained for backwards compatibility, but
             * restricted to the hardware domain's kernel only.
             */
            if ( is_hardware_domain(d) && guest_kernel_mode(v, regs) )
            {
                /*
                 * MONITOR never leaked into PV guests, as PV guests
                 * cannot use the MONITOR/MWAIT instructions.  As such,
                 * they require the feature to not be present in emulated
                 * CPUID.
                 *
                 * Modern PVOPS Linux tries to be cunning and uses native
                 * CPUID to see if the hardware actually supports MONITOR,
                 * and by extension, deep C states.
                 *
                 * If the feature is seen, deep-C state information is
                 * obtained from the DSDT and handed back to Xen via the
                 * XENPF_set_processor_pminfo hypercall.
                 *
                 * This mechanism is incompatible with an HVM-based
                 * hardware domain, and also with CPUID Faulting.
                 *
                 * Luckily, Xen can be just as 'cunning', and distinguish
                 * an emulated CPUID from a faulted CPUID by whether a #UD
                 * or #GP fault is currently being serviced.  Yuck...
                 */
                if ( cpu_has_monitor && regs->entry_vector == TRAP_gp_fault )
                    res->c |= cpufeat_mask(X86_FEATURE_MONITOR);

                /*
                 * While MONITOR never leaked into PV guests, EIST always
                 * used to.
                 *
                 * Modern PVOPS Linux will only parse P state information
                 * from the DSDT and return it to Xen if EIST is seen in
                 * the emulated CPUID information.
                 */
                if ( cpu_has_eist )
                    res->c |= cpufeat_mask(X86_FEATURE_EIST);
            }
        }
        goto common_leaf1_adjustments;

    case 0x5:
        /*
         * Leak the hardware MONITOR leaf under the same conditions that
         * the MONITOR feature flag is leaked.  See above for details.
         */
        regs = guest_cpu_user_regs();
        if ( is_pv_domain(d) && is_hardware_domain(d) &&
             guest_kernel_mode(v, regs) && cpu_has_monitor &&
             regs->entry_vector == TRAP_gp_fault )
            *res = raw_cpuid_policy.basic.raw[leaf];
        break;

    case 0x7:
        switch ( subleaf )
        {
        case 0:
            /* OSPKE clear in policy.  Fast-forward CR4 back in. */
            if ( (is_pv_domain(d)
                  ? v->arch.pv_vcpu.ctrlreg[4]
                  : v->arch.hvm_vcpu.guest_cr[4]) & X86_CR4_PKE )
                res->c |= cpufeat_mask(X86_FEATURE_OSPKE);
            break;
        }
        break;

    case 0xa:
        /* TODO: Rework vPMU control in terms of toolstack choices. */
        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
             !vpmu_available(v) )
            *res = EMPTY_LEAF;
        else
        {
            /* Report at most v3 since that's all we currently emulate. */
            if ( (res->a & 0xff) > 3 )
                res->a = (res->a & ~0xff) | 3;
        }
        break;

    case 0xb:
        /*
         * In principle, this leaf is Intel-only.  In practice, it is
         * tightly coupled with x2apic, and we offer an x2apic-capable
         * APIC emulation to guests on AMD hardware as well.
         *
         * TODO: Rework topology logic.
         */
        if ( p->basic.x2apic )
        {
            *(uint8_t *)&res->c = subleaf;

            /* Fix the x2APIC identifier. */
            res->d = v->vcpu_id * 2;
        }
        break;

    case XSTATE_CPUID:
        switch ( subleaf )
        {
        case 1:
            if ( p->xstate.xsaves )
            {
                /*
                 * TODO: Figure out what to do for XSS state.  VT-x
                 * manages host vs guest MSR_XSS automatically, so as soon
                 * as we start supporting any XSS states, the wrong XSS
                 * will be in context.
                 */
                BUILD_BUG_ON(XSTATE_XSAVES_ONLY != 0);

                /*
                 * Read CPUID[0xD,0/1].EBX from hardware.  They vary with
                 * enabled XSTATE, and the appropriate XCR0|XSS are in
                 * context.
                 */
        case 0:
                res->b = cpuid_count_ebx(leaf, subleaf);
            }
            break;
        }
        break;

    case 0x80000001:
        /* SYSCALL is hidden outside of long mode on Intel. */
        if ( p->x86_vendor == X86_VENDOR_INTEL &&
             is_hvm_domain(d) && !hvm_long_mode_active(v) )
            res->d &= ~cpufeat_mask(X86_FEATURE_SYSCALL);

    common_leaf1_adjustments:
        if ( is_hvm_domain(d) )
        {
            /* Fast-forward MSR_APIC_BASE.EN. */
            if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
                res->d &= ~cpufeat_mask(X86_FEATURE_APIC);

            /*
             * PSE36 is not supported in shadow mode.  This bit should be
             * clear in hvm_shadow_featuremask[].
             *
             * However, an unspecified version of Hyper-V from 2011
             * refuses to start as the "cpu does not provide required hw
             * features" if it can't see PSE36.
             *
             * As a workaround, leak the toolstack-provided PSE36 value
             * into a shadow guest if the guest is already using PAE
             * paging (and won't care about reverting back to PSE paging).
             * Otherwise, nobble it, so a 32bit guest doesn't get the
             * impression that it could try to use PSE36 paging.
             */
            if ( !hap_enabled(d) && !hvm_pae_enabled(v) )
                res->d &= ~cpufeat_mask(X86_FEATURE_PSE36);
        }
        else /* PV domain */
        {
            /*
             * MTRR used to unconditionally leak into PV guests.  They
             * cannot use the MTRR infrastructure at all, and shouldn't be
             * able to see the feature.
             *
             * Modern PVOPS Linux self-clobbers the MTRR feature, to avoid
             * trying to use the associated MSRs.  Xenolinux-based PV
             * dom0s however use the MTRR feature as an indication of the
             * presence of the XENPF_{add,del,read}_memtype hypercalls.
             */
            if ( is_hardware_domain(d) && cpu_has_mtrr &&
                 guest_kernel_mode(v, guest_cpu_user_regs()) )
                res->d |= cpufeat_mask(X86_FEATURE_MTRR);
        }
        break;

    case 0x8000001c:
        if ( (v->arch.xcr0 & XSTATE_LWP) && cpu_has_svm )
            /* Turn on available bit and other features specified in lwp_cfg. */
            res->a = (res->d & v->arch.hvm_svm.guest_lwp_cfg) | 1;
        break;
    }
}

static void __init __maybe_unused build_assertions(void)
{
    BUILD_BUG_ON(ARRAY_SIZE(known_features) != FSCAPINTS);
    BUILD_BUG_ON(ARRAY_SIZE(special_features) != FSCAPINTS);
    BUILD_BUG_ON(ARRAY_SIZE(pv_featuremask) != FSCAPINTS);
    BUILD_BUG_ON(ARRAY_SIZE(hvm_shadow_featuremask) != FSCAPINTS);
    BUILD_BUG_ON(ARRAY_SIZE(hvm_hap_featuremask) != FSCAPINTS);
    BUILD_BUG_ON(ARRAY_SIZE(deep_features) != FSCAPINTS);

    /* Find some more clever allocation scheme if this trips. */
    BUILD_BUG_ON(sizeof(struct cpuid_policy) > PAGE_SIZE);

    BUILD_BUG_ON(sizeof(raw_cpuid_policy.basic) !=
                 sizeof(raw_cpuid_policy.basic.raw));
    BUILD_BUG_ON(sizeof(raw_cpuid_policy.feat) !=
                 sizeof(raw_cpuid_policy.feat.raw));
    BUILD_BUG_ON(sizeof(raw_cpuid_policy.xstate) !=
                 sizeof(raw_cpuid_policy.xstate.raw));
    BUILD_BUG_ON(sizeof(raw_cpuid_policy.extd) !=
                 sizeof(raw_cpuid_policy.extd.raw));
}

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */