#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <asm/cpuid.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/nestedhvm.h>
#include <asm/hvm/svm/svm.h>
#include <asm/hvm/vmx/vmcs.h>
#include <asm/paging.h>
#include <asm/processor.h>
#include <asm/xstate.h>

const uint32_t known_features[] = INIT_KNOWN_FEATURES;
const uint32_t special_features[] = INIT_SPECIAL_FEATURES;

static const uint32_t pv_featuremask[] = INIT_PV_FEATURES;
static const uint32_t hvm_shadow_featuremask[] = INIT_HVM_SHADOW_FEATURES;
static const uint32_t hvm_hap_featuremask[] = INIT_HVM_HAP_FEATURES;
static const uint32_t deep_features[] = INIT_DEEP_FEATURES;

#define EMPTY_LEAF ((struct cpuid_leaf){})
static void zero_leaves(struct cpuid_leaf *l,
                        unsigned int first, unsigned int last)
{
    memset(&l[first], 0, sizeof(*l) * (last - first + 1));
}

struct cpuid_policy __read_mostly raw_cpuid_policy,
    __read_mostly host_cpuid_policy,
    __read_mostly pv_max_cpuid_policy,
    __read_mostly hvm_max_cpuid_policy;

static void cpuid_leaf(uint32_t leaf, struct cpuid_leaf *data)
{
    cpuid(leaf, &data->a, &data->b, &data->c, &data->d);
}

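/*
 * Clamp a featureset to the features known to Xen, and clear any features
 * whose (deep) dependencies have been disabled.
 */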
static void sanitise_featureset(uint32_t *fs)
{
    /* for_each_set_bit() uses unsigned longs.  Extend with zeroes. */
    uint32_t disabled_features[
        ROUNDUP(FSCAPINTS, sizeof(unsigned long)/sizeof(uint32_t))] = {};
    unsigned int i;

    for ( i = 0; i < FSCAPINTS; ++i )
    {
        /* Clamp to known mask. */
        fs[i] &= known_features[i];

        /*
         * Identify which features with deep dependencies have been
         * disabled.
         */
        disabled_features[i] = ~fs[i] & deep_features[i];
    }

    for_each_set_bit(i, (void *)disabled_features,
                     sizeof(disabled_features) * 8)
    {
        const uint32_t *dfs = lookup_deep_deps(i);
        unsigned int j;

        ASSERT(dfs); /* deep_features[] should guarantee this. */

        for ( j = 0; j < FSCAPINTS; ++j )
        {
            fs[j] &= ~dfs[j];
            disabled_features[j] &= ~dfs[j];
        }
    }
}

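/*
 * Rebuild the xstate (leaf 0xd) information from the policy's feature flags.
 * Only the Da1 subleaf is preserved; everything else is rederived.
 */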
static void recalculate_xstate(struct cpuid_policy *p)
{
    uint64_t xstates = XSTATE_FP_SSE;
    uint32_t xstate_size = XSTATE_AREA_MIN_SIZE;
    unsigned int i, Da1 = p->xstate.Da1;

    /*
     * The Da1 leaf is the only piece of information preserved in the common
     * case.  Everything else is derived from other feature state.
     */
    memset(&p->xstate, 0, sizeof(p->xstate));

    if ( !p->basic.xsave )
        return;

    if ( p->basic.avx )
    {
        xstates |= XSTATE_YMM;
        xstate_size = max(xstate_size,
                          xstate_offsets[_XSTATE_YMM] +
                          xstate_sizes[_XSTATE_YMM]);
    }

    if ( p->feat.mpx )
    {
        xstates |= XSTATE_BNDREGS | XSTATE_BNDCSR;
        xstate_size = max(xstate_size,
                          xstate_offsets[_XSTATE_BNDCSR] +
                          xstate_sizes[_XSTATE_BNDCSR]);
    }

    if ( p->feat.avx512f )
    {
        xstates |= XSTATE_OPMASK | XSTATE_ZMM | XSTATE_HI_ZMM;
        xstate_size = max(xstate_size,
                          xstate_offsets[_XSTATE_HI_ZMM] +
                          xstate_sizes[_XSTATE_HI_ZMM]);
    }

    if ( p->feat.pku )
    {
        xstates |= XSTATE_PKRU;
        xstate_size = max(xstate_size,
                          xstate_offsets[_XSTATE_PKRU] +
                          xstate_sizes[_XSTATE_PKRU]);
    }

    if ( p->extd.lwp )
    {
        xstates |= XSTATE_LWP;
        xstate_size = max(xstate_size,
                          xstate_offsets[_XSTATE_LWP] +
                          xstate_sizes[_XSTATE_LWP]);
    }

    p->xstate.max_size  =  xstate_size;
    p->xstate.xcr0_low  =  xstates & ~XSTATE_XSAVES_ONLY;
    p->xstate.xcr0_high = (xstates & ~XSTATE_XSAVES_ONLY) >> 32;

    p->xstate.Da1 = Da1;
    if ( p->xstate.xsaves )
    {
        p->xstate.xss_low   =  xstates & XSTATE_XSAVES_ONLY;
        p->xstate.xss_high  = (xstates & XSTATE_XSAVES_ONLY) >> 32;
    }
    else
        xstates &= ~XSTATE_XSAVES_ONLY;

    for ( i = 2; i < min(63ul, ARRAY_SIZE(p->xstate.comp)); ++i )
    {
        uint64_t curr_xstate = 1ul << i;

        if ( !(xstates & curr_xstate) )
            continue;

        p->xstate.comp[i].size   = xstate_sizes[i];
        p->xstate.comp[i].offset = xstate_offsets[i];
        p->xstate.comp[i].xss    = curr_xstate & XSTATE_XSAVES_ONLY;
        p->xstate.comp[i].align  = curr_xstate & xstate_align;
    }
}

/*
 * Misc adjustments to the policy.  Mostly clobbering reserved fields and
 * duplicating shared fields.  Intentionally hidden fields are annotated.
 */
static void recalculate_misc(struct cpuid_policy *p)
{
    p->basic.raw_fms &= 0x0fff0fff; /* Clobber Processor Type on Intel. */
    p->basic.apic_id = 0; /* Dynamic. */

    p->basic.raw[0x5] = EMPTY_LEAF; /* MONITOR not exposed to guests. */
    p->basic.raw[0x6] = EMPTY_LEAF; /* Therm/Power not exposed to guests. */

    p->basic.raw[0x8] = EMPTY_LEAF;
    p->basic.raw[0xb] = EMPTY_LEAF; /* TODO: Rework topology logic. */
    p->basic.raw[0xc] = EMPTY_LEAF;

    p->extd.e1d &= ~CPUID_COMMON_1D_FEATURES;

    /* Most of Power/RAS hidden from guests. */
    p->extd.raw[0x7].a = p->extd.raw[0x7].b = p->extd.raw[0x7].c = 0;

    p->extd.raw[0x8].d = 0;

    switch ( p->x86_vendor )
    {
    case X86_VENDOR_INTEL:
        p->basic.l2_nr_queries = 1; /* Fixed to 1 query. */
        p->basic.raw[0x3] = EMPTY_LEAF; /* PSN - always hidden. */
        p->basic.raw[0x9] = EMPTY_LEAF; /* DCA - always hidden. */

        p->extd.vendor_ebx = 0;
        p->extd.vendor_ecx = 0;
        p->extd.vendor_edx = 0;

        p->extd.raw[0x1].a = p->extd.raw[0x1].b = 0;

        p->extd.raw[0x5] = EMPTY_LEAF;
        p->extd.raw[0x6].a = p->extd.raw[0x6].b = p->extd.raw[0x6].d = 0;

        p->extd.raw[0x8].a &= 0x0000ffff;
        p->extd.raw[0x8].c = 0;
        break;

    case X86_VENDOR_AMD:
        zero_leaves(p->basic.raw, 0x2, 0x3);
        memset(p->cache.raw, 0, sizeof(p->cache.raw));
        zero_leaves(p->basic.raw, 0x9, 0xa);

        p->extd.vendor_ebx = p->basic.vendor_ebx;
        p->extd.vendor_ecx = p->basic.vendor_ecx;
        p->extd.vendor_edx = p->basic.vendor_edx;

        p->extd.raw_fms = p->basic.raw_fms;
        p->extd.raw[0x1].b &= 0xff00ffff;
        p->extd.e1d |= p->basic._1d & CPUID_COMMON_1D_FEATURES;

        p->extd.raw[0x8].a &= 0x0000ffff; /* GuestMaxPhysAddr hidden. */
        p->extd.raw[0x8].c &= 0x0003f0ff;

        p->extd.raw[0x9] = EMPTY_LEAF;

        zero_leaves(p->extd.raw, 0xb, 0x18);

        p->extd.raw[0x1b] = EMPTY_LEAF; /* IBS - not supported. */

        p->extd.raw[0x1c].a = 0; /* LWP.a entirely dynamic. */
        break;
    }
}

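/* Collect the raw CPUID information straight from hardware. */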
static void __init calculate_raw_policy(void)
{
    struct cpuid_policy *p = &raw_cpuid_policy;
    unsigned int i;

    cpuid_leaf(0, &p->basic.raw[0]);
    for ( i = 1; i < min(ARRAY_SIZE(p->basic.raw),
                         p->basic.max_leaf + 1ul); ++i )
    {
        switch ( i )
        {
        case 0x4: case 0x7: case 0xd:
            /* Multi-invocation leaves.  Deferred. */
            continue;
        }

        cpuid_leaf(i, &p->basic.raw[i]);
    }

    if ( p->basic.max_leaf >= 4 )
    {
        for ( i = 0; i < ARRAY_SIZE(p->cache.raw); ++i )
        {
            union {
                struct cpuid_leaf l;
                struct cpuid_cache_leaf c;
            } u;

            cpuid_count_leaf(4, i, &u.l);

            if ( u.c.type == 0 )
                break;

            p->cache.subleaf[i] = u.c;
        }

        /*
         * The choice of CPUID_GUEST_NR_CACHE is arbitrary.  It is expected
         * that it will eventually need increasing for future hardware.
         */
        if ( i == ARRAY_SIZE(p->cache.raw) )
            printk(XENLOG_WARNING
                   "CPUID: Insufficient Leaf 4 space for this hardware\n");
    }

    if ( p->basic.max_leaf >= 7 )
    {
        cpuid_count_leaf(7, 0, &p->feat.raw[0]);

        for ( i = 1; i < min(ARRAY_SIZE(p->feat.raw),
                             p->feat.max_subleaf + 1ul); ++i )
            cpuid_count_leaf(7, i, &p->feat.raw[i]);
    }

    if ( p->basic.max_leaf >= XSTATE_CPUID )
    {
        uint64_t xstates;

        cpuid_count_leaf(XSTATE_CPUID, 0, &p->xstate.raw[0]);
        cpuid_count_leaf(XSTATE_CPUID, 1, &p->xstate.raw[1]);

        xstates = ((uint64_t)(p->xstate.xcr0_high | p->xstate.xss_high) << 32) |
            (p->xstate.xcr0_low | p->xstate.xss_low);

        for ( i = 2; i < min(63ul, ARRAY_SIZE(p->xstate.raw)); ++i )
        {
            if ( xstates & (1ul << i) )
                cpuid_count_leaf(XSTATE_CPUID, i, &p->xstate.raw[i]);
        }
    }

    /* Extended leaves. */
    cpuid_leaf(0x80000000, &p->extd.raw[0]);
    for ( i = 1; i < min(ARRAY_SIZE(p->extd.raw),
                         p->extd.max_leaf + 1 - 0x80000000ul); ++i )
        cpuid_leaf(0x80000000 + i, &p->extd.raw[i]);

    p->x86_vendor = boot_cpu_data.x86_vendor;
}

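/*
 * Derive the host policy from the raw policy: clamp the leaf ranges to what
 * Xen tracks, and substitute the featureset Xen has actually enabled
 * (boot_cpu_data.x86_capability).
 */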
static void __init calculate_host_policy(void)
{
    struct cpuid_policy *p = &host_cpuid_policy;

    *p = raw_cpuid_policy;

    p->basic.max_leaf =
        min_t(uint32_t, p->basic.max_leaf,   ARRAY_SIZE(p->basic.raw) - 1);
    p->feat.max_subleaf =
        min_t(uint32_t, p->feat.max_subleaf, ARRAY_SIZE(p->feat.raw) - 1);
    p->extd.max_leaf = 0x80000000 | min_t(uint32_t, p->extd.max_leaf & 0xffff,
                                          ARRAY_SIZE(p->extd.raw) - 1);

    cpuid_featureset_to_policy(boot_cpu_data.x86_capability, p);
    recalculate_xstate(p);
    recalculate_misc(p);

    if ( p->extd.svm )
    {
        /* Clamp to implemented features which require hardware support. */
        p->extd.raw[0xa].d &= ((1u << SVM_FEATURE_NPT) |
                               (1u << SVM_FEATURE_LBRV) |
                               (1u << SVM_FEATURE_NRIPS) |
                               (1u << SVM_FEATURE_PAUSEFILTER) |
                               (1u << SVM_FEATURE_DECODEASSISTS));
        /* Enable features which are always emulated. */
        p->extd.raw[0xa].d |= ((1u << SVM_FEATURE_VMCBCLEAN) |
                               (1u << SVM_FEATURE_TSCRATEMSR));
    }
}

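/* Derive the maximum policy which can be offered to PV guests. */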
static void __init calculate_pv_max_policy(void)
{
    struct cpuid_policy *p = &pv_max_cpuid_policy;
    uint32_t pv_featureset[FSCAPINTS];
    unsigned int i;

    *p = host_cpuid_policy;
    cpuid_policy_to_featureset(p, pv_featureset);

    for ( i = 0; i < ARRAY_SIZE(pv_featureset); ++i )
        pv_featureset[i] &= pv_featuremask[i];

    /* Unconditionally claim to be able to set the hypervisor bit. */
    __set_bit(X86_FEATURE_HYPERVISOR, pv_featureset);

    sanitise_featureset(pv_featureset);
    cpuid_featureset_to_policy(pv_featureset, p);
    recalculate_xstate(p);

    p->extd.raw[0xa] = EMPTY_LEAF; /* No SVM for PV guests. */
}

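/* Derive the maximum policy which can be offered to HVM guests. */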
static void __init calculate_hvm_max_policy(void)
{
    struct cpuid_policy *p = &hvm_max_cpuid_policy;
    uint32_t hvm_featureset[FSCAPINTS];
    unsigned int i;
    const uint32_t *hvm_featuremask;

    if ( !hvm_enabled )
        return;

    *p = host_cpuid_policy;
    cpuid_policy_to_featureset(p, hvm_featureset);

    hvm_featuremask = hvm_funcs.hap_supported ?
        hvm_hap_featuremask : hvm_shadow_featuremask;

    for ( i = 0; i < ARRAY_SIZE(hvm_featureset); ++i )
        hvm_featureset[i] &= hvm_featuremask[i];

    /* Unconditionally claim to be able to set the hypervisor bit. */
    __set_bit(X86_FEATURE_HYPERVISOR, hvm_featureset);

    /*
     * Xen can provide an APIC emulation to HVM guests even if the host's APIC
     * isn't enabled.
     */
    __set_bit(X86_FEATURE_APIC, hvm_featureset);

    /*
     * On AMD, PV guests are entirely unable to use SYSENTER as Xen runs in
     * long mode (and init_amd() has cleared it out of host capabilities), but
     * HVM guests are able if running in protected mode.
     */
    if ( (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
         raw_cpuid_policy.basic.sep )
        __set_bit(X86_FEATURE_SEP, hvm_featureset);

    /*
     * With VT-x, some features are only supported by Xen if dedicated
     * hardware support is also available.
     */
    if ( cpu_has_vmx )
    {
        if ( !cpu_has_vmx_mpx )
            __clear_bit(X86_FEATURE_MPX, hvm_featureset);

        if ( !cpu_has_vmx_xsaves )
            __clear_bit(X86_FEATURE_XSAVES, hvm_featureset);
    }

    sanitise_featureset(hvm_featureset);
    cpuid_featureset_to_policy(hvm_featureset, p);
    recalculate_xstate(p);
}

void __init init_guest_cpuid(void)
{
    calculate_raw_policy();
    calculate_host_policy();
    calculate_pv_max_policy();
    calculate_hvm_max_policy();
}

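/*
 * Look up the featureset of features which depend on 'feature'.  Returns
 * NULL if nothing depends on it.
 */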
const uint32_t *lookup_deep_deps(uint32_t feature)
{
    static const struct {
        uint32_t feature;
        uint32_t fs[FSCAPINTS];
    } deep_deps[] = INIT_DEEP_DEPS;
    unsigned int start = 0, end = ARRAY_SIZE(deep_deps);

    BUILD_BUG_ON(ARRAY_SIZE(deep_deps) != NR_DEEP_DEPS);

    /* Fast early exit. */
    if ( !test_bit(feature, deep_features) )
        return NULL;

    /* deep_deps[] is sorted.  Perform a binary search. */
    while ( start < end )
    {
        unsigned int mid = start + ((end - start) / 2);

        if ( deep_deps[mid].feature > feature )
            end = mid;
        else if ( deep_deps[mid].feature < feature )
            start = mid + 1;
        else
            return deep_deps[mid].fs;
    }

    return NULL;
}

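/*
 * Re-derive a domain's CPUID policy after the toolstack has made changes,
 * clamping the result to the appropriate PV/HVM maximum policy and to the
 * domain's configuration (paging mode, nested virt, etc.).
 */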
void recalculate_cpuid_policy(struct domain *d)
{
    struct cpuid_policy *p = d->arch.cpuid;
    const struct cpuid_policy *max =
        is_pv_domain(d) ? &pv_max_cpuid_policy : &hvm_max_cpuid_policy;
    uint32_t fs[FSCAPINTS], max_fs[FSCAPINTS];
    unsigned int i;

    p->x86_vendor = get_cpu_vendor(p->basic.vendor_ebx, p->basic.vendor_ecx,
                                   p->basic.vendor_edx, gcv_guest);

    p->basic.max_leaf   = min(p->basic.max_leaf,   max->basic.max_leaf);
    p->feat.max_subleaf = min(p->feat.max_subleaf, max->feat.max_subleaf);
    p->extd.max_leaf    = 0x80000000 | min(p->extd.max_leaf & 0xffff,
                                           (p->x86_vendor == X86_VENDOR_AMD
                                            ? CPUID_GUEST_NR_EXTD_AMD
                                            : CPUID_GUEST_NR_EXTD_INTEL) - 1);

    cpuid_policy_to_featureset(p, fs);
    cpuid_policy_to_featureset(max, max_fs);

    if ( is_hvm_domain(d) )
    {
        /*
         * HVM domains using Shadow paging have further restrictions on their
         * available paging features.
         */
        if ( !hap_enabled(d) )
        {
            for ( i = 0; i < ARRAY_SIZE(max_fs); i++ )
                max_fs[i] &= hvm_shadow_featuremask[i];
        }

        /* Hide nested-virt if it hasn't been explicitly configured. */
        if ( !nestedhvm_enabled(d) )
        {
            __clear_bit(X86_FEATURE_VMX, max_fs);
            __clear_bit(X86_FEATURE_SVM, max_fs);
        }
    }

    /*
     * Allow the toolstack to set HTT, X2APIC and CMP_LEGACY.  These bits
     * affect how to interpret topology information in other cpuid leaves.
     */
    __set_bit(X86_FEATURE_HTT, max_fs);
    __set_bit(X86_FEATURE_X2APIC, max_fs);
    __set_bit(X86_FEATURE_CMP_LEGACY, max_fs);

    /*
     * 32bit PV domains can't use any Long Mode features, and cannot use
     * SYSCALL on non-AMD hardware.
     */
    if ( is_pv_32bit_domain(d) )
    {
        __clear_bit(X86_FEATURE_LM, max_fs);
        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
            __clear_bit(X86_FEATURE_SYSCALL, max_fs);
    }

    /*
     * ITSC is masked by default (so domains are safe to migrate), but a
     * toolstack which has configured disable_migrate or vTSC for a domain may
     * safely select it, and needs a way of doing so.
     */
    if ( cpu_has_itsc && (d->disable_migrate || d->arch.vtsc) )
        __set_bit(X86_FEATURE_ITSC, max_fs);

    /* Clamp the toolstack's choices to reality. */
    for ( i = 0; i < ARRAY_SIZE(fs); i++ )
        fs[i] &= max_fs[i];

    if ( p->basic.max_leaf < XSTATE_CPUID )
        __clear_bit(X86_FEATURE_XSAVE, fs);

    sanitise_featureset(fs);

    /* Fold host's FDP_EXCP_ONLY and NO_FPU_SEL into guest's view. */
    fs[FEATURESET_7b0] &= ~special_features[FEATURESET_7b0];
    fs[FEATURESET_7b0] |= (host_cpuid_policy.feat._7b0 &
                           special_features[FEATURESET_7b0]);

    cpuid_featureset_to_policy(fs, p);

    /* Pass host cacheline size through to guests. */
    p->basic.clflush_size = max->basic.clflush_size;

    p->extd.maxphysaddr = min(p->extd.maxphysaddr, max->extd.maxphysaddr);
    p->extd.maxphysaddr = min_t(uint8_t, p->extd.maxphysaddr,
                                paging_max_paddr_bits(d));
    p->extd.maxphysaddr = max_t(uint8_t, p->extd.maxphysaddr,
                                (p->basic.pae || p->basic.pse36) ? 36 : 32);

    p->extd.maxlinaddr = p->extd.lm ? 48 : 32;

    recalculate_xstate(p);
    recalculate_misc(p);

    for ( i = 0; i < ARRAY_SIZE(p->cache.raw); ++i )
    {
        if ( p->cache.subleaf[i].type >= 1 &&
             p->cache.subleaf[i].type <= 3 )
        {
            /* Subleaf has a valid cache type.  Zero reserved fields. */
            p->cache.raw[i].a &= 0xffffc3ffu;
            p->cache.raw[i].d &= 0x00000007u;
        }
        else
        {
            /* Subleaf is not valid.  Zero the rest of the union. */
            zero_leaves(p->cache.raw, i, ARRAY_SIZE(p->cache.raw) - 1);
            break;
        }
    }

    if ( !p->extd.svm )
        p->extd.raw[0xa] = EMPTY_LEAF;

    if ( !p->extd.page1gb )
        p->extd.raw[0x19] = EMPTY_LEAF;

    if ( p->extd.lwp )
        p->extd.raw[0x1c].d &= max->extd.raw[0x1c].d;
    else
        p->extd.raw[0x1c] = EMPTY_LEAF;
}

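/* Allocate and seed a domain's CPUID policy from the appropriate max. */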
int init_domain_cpuid_policy(struct domain *d)
{
    d->arch.cpuid = xmalloc(struct cpuid_policy);

    if ( !d->arch.cpuid )
        return -ENOMEM;

    *d->arch.cpuid = is_pv_domain(d)
        ? pv_max_cpuid_policy : hvm_max_cpuid_policy;

    if ( d->disable_migrate )
        d->arch.cpuid->extd.itsc = cpu_has_itsc;

    recalculate_cpuid_policy(d);

    return 0;
}

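/*
 * The main guest CPUID handler.  Fill *res from the domain's policy, then
 * apply dynamic adjustments which depend on current vcpu state.
 */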
void guest_cpuid(const struct vcpu *v, uint32_t leaf,
                 uint32_t subleaf, struct cpuid_leaf *res)
{
    const struct domain *d = v->domain;
    const struct cpuid_policy *p = d->arch.cpuid;

    *res = EMPTY_LEAF;

    /*
     * First pass:
     * - Perform max_leaf/subleaf calculations.  Out-of-range leaves return
     *   all zeros, following the AMD model.
     * - Fill in *res for leaves no longer handled on the legacy path.
     * - Dispatch the virtualised leaves to their respective handlers.
     */
    switch ( leaf )
    {
    case 0 ... CPUID_GUEST_NR_BASIC - 1:
        ASSERT(p->basic.max_leaf < ARRAY_SIZE(p->basic.raw));
        if ( leaf > min_t(uint32_t, p->basic.max_leaf,
                          ARRAY_SIZE(p->basic.raw) - 1) )
            return;

        switch ( leaf )
        {
        case 0x4:
            if ( subleaf >= ARRAY_SIZE(p->cache.raw) )
                return;

            *res = p->cache.raw[subleaf];
            break;

        case 0x7:
            ASSERT(p->feat.max_subleaf < ARRAY_SIZE(p->feat.raw));
            if ( subleaf > min_t(uint32_t, p->feat.max_subleaf,
                                 ARRAY_SIZE(p->feat.raw) - 1) )
                return;

            *res = p->feat.raw[subleaf];
            break;

        case XSTATE_CPUID:
            if ( !p->basic.xsave || subleaf >= ARRAY_SIZE(p->xstate.raw) )
                return;

            *res = p->xstate.raw[subleaf];
            break;

        default:
            *res = p->basic.raw[leaf];
            break;
        }
        break;

    case 0x40000000 ... 0x400000ff:
        if ( is_viridian_domain(d) )
            return cpuid_viridian_leaves(v, leaf, subleaf, res);

        /*
         * Fallthrough.
         *
         * Intel reserve up until 0x4fffffff for hypervisor use.  AMD reserve
         * only until 0x400000ff, but we already use double that.
         */
    case 0x40000100 ... 0x400001ff:
        return cpuid_hypervisor_leaves(v, leaf, subleaf, res);

    case 0x80000000 ... 0x80000000 + CPUID_GUEST_NR_EXTD - 1:
        ASSERT((p->extd.max_leaf & 0xffff) < ARRAY_SIZE(p->extd.raw));
        if ( (leaf & 0xffff) > min_t(uint32_t, p->extd.max_leaf & 0xffff,
                                     ARRAY_SIZE(p->extd.raw) - 1) )
            return;

        *res = p->extd.raw[leaf & 0xffff];
        break;

    default:
        return;
    }

    /*
     * Skip dynamic adjustments if we are in the wrong context.
     *
     * All dynamic adjustments depend on current register state, which will
     * be stale if the vcpu is running elsewhere.  It is simpler, quicker, and
     * more reliable for the caller to do nothing (consistently) than to hand
     * back stale data which it can't use safely.
     */
    if ( v != current )
        return;

    /*
     * Second pass:
     * - Dynamic adjustments
     */
    switch ( leaf )
    {
        const struct cpu_user_regs *regs;

    case 0x1:
        /* TODO: Rework topology logic. */
        res->b &= 0x00ffffffu;
        if ( is_hvm_domain(d) )
            res->b |= (v->vcpu_id * 2) << 24;

        /* TODO: Rework vPMU control in terms of toolstack choices. */
        if ( vpmu_available(v) &&
             vpmu_is_set(vcpu_vpmu(v), VPMU_CPU_HAS_DS) )
        {
            res->d |= cpufeat_mask(X86_FEATURE_DS);
            if ( cpu_has(&current_cpu_data, X86_FEATURE_DTES64) )
                res->c |= cpufeat_mask(X86_FEATURE_DTES64);
            if ( cpu_has(&current_cpu_data, X86_FEATURE_DSCPL) )
                res->c |= cpufeat_mask(X86_FEATURE_DSCPL);
        }

        if ( is_hvm_domain(d) )
        {
            /* OSXSAVE clear in policy.  Fast-forward CR4 back in. */
            if ( v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_OSXSAVE )
                res->c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
        }
        else /* PV domain */
        {
            regs = guest_cpu_user_regs();

            /*
             * !!! OSXSAVE handling for PV guests is non-architectural !!!
             *
             * Architecturally, the correct code here is simply:
             *
             *   if ( v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE )
             *       c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
             *
             * However because of bugs in Xen (before c/s bd19080b, Nov 2010,
             * the XSAVE cpuid flag leaked into guests despite the feature not
             * being available for use), buggy workarounds were introduced to
             * Linux (c/s 947ccf9c, also Nov 2010) which relied on the fact
             * that Xen also incorrectly leaked OSXSAVE into the guest.
             *
             * Furthermore, providing architectural OSXSAVE behaviour to
             * many Linux PV guests triggered a further kernel bug when the
             * fpu code observes that XSAVEOPT is available, assumes that
             * xsave state had been set up for the task, and follows a wild
             * pointer.
             *
             * Older Linux PVOPS kernels however do require architectural
             * behaviour.  They observe Xen's leaked OSXSAVE and assume they
             * can already use XSETBV, dying with a #UD because the shadowed
             * CR4.OSXSAVE is clear.  This behaviour has been adjusted in all
             * observed cases via stable backports of the above changeset.
             *
             * Therefore, the leaking of Xen's OSXSAVE setting has become a
             * de facto part of the PV ABI and can't reasonably be corrected.
             * It can however be restricted to only the enlightened CPUID
             * view, as seen by the guest kernel.
             *
             * The following situations and logic now apply:
             *
             * - Hardware without CPUID faulting support and native CPUID:
             *    There is nothing Xen can do here.  The host's XSAVE flag will
             *    leak through and Xen's OSXSAVE choice will leak through.
             *
             *    In the case that the guest kernel has not set up OSXSAVE, only
             *    SSE will be set in xcr0, and guest userspace can't do too much
             *    damage itself.
             *
             * - Enlightened CPUID or CPUID faulting available:
             *    Xen can fully control what is seen here.  Guest kernels need
             *    to see the leaked OSXSAVE via the enlightened path, but
             *    guest userspace and the native case are given architectural
             *    behaviour.
             *
             *    Emulated vs Faulted CPUID is distinguished based on whether a
             *    #UD or #GP is currently being serviced.
             */
            /* OSXSAVE clear in policy.  Fast-forward CR4 back in. */
            if ( (v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_OSXSAVE) ||
                 (regs->entry_vector == TRAP_invalid_op &&
                  guest_kernel_mode(v, regs) &&
                  (read_cr4() & X86_CR4_OSXSAVE)) )
                res->c |= cpufeat_mask(X86_FEATURE_OSXSAVE);

            /*
             * At the time of writing, a PV domain is the only viable option
             * for Dom0.  Several interactions between dom0 and Xen for real
             * hardware setup have unfortunately been implemented based on
             * state which incorrectly leaked into dom0.
             *
             * These leaks are retained for backwards compatibility, but
             * restricted to the hardware domain's kernel only.
             */
            if ( is_hardware_domain(d) && guest_kernel_mode(v, regs) )
            {
                /*
                 * MONITOR never leaked into PV guests, as PV guests cannot
                 * use the MONITOR/MWAIT instructions.  As such, they require
                 * the feature not to be present in emulated CPUID.
                 *
                 * Modern PVOPS Linux try to be cunning and use native CPUID
                 * to see if the hardware actually supports MONITOR, and by
                 * extension, deep C states.
                 *
                 * If the feature is seen, deep-C state information is
                 * obtained from the DSDT and handed back to Xen via the
                 * XENPF_set_processor_pminfo hypercall.
                 *
                 * This mechanism is incompatible with an HVM-based hardware
                 * domain, and also with CPUID Faulting.
                 *
                 * Luckily, Xen can be just as 'cunning', and distinguish an
                 * emulated CPUID from a faulted CPUID by whether a #UD or #GP
                 * fault is currently being serviced.  Yuck...
                 */
                if ( cpu_has_monitor && regs->entry_vector == TRAP_gp_fault )
                    res->c |= cpufeat_mask(X86_FEATURE_MONITOR);

                /*
                 * While MONITOR never leaked into PV guests, EIST always used
                 * to.
                 *
                 * Modern PVOPS Linux will only parse P state information from
                 * the DSDT and return it to Xen if EIST is seen in the
                 * emulated CPUID information.
                 */
                if ( cpu_has_eist )
                    res->c |= cpufeat_mask(X86_FEATURE_EIST);
            }
        }
        goto common_leaf1_adjustments;

    case 0x5:
        /*
         * Leak the hardware MONITOR leaf under the same conditions that the
         * MONITOR feature flag is leaked.  See above for details.
         */
        regs = guest_cpu_user_regs();
        if ( is_pv_domain(d) && is_hardware_domain(d) &&
             guest_kernel_mode(v, regs) && cpu_has_monitor &&
             regs->entry_vector == TRAP_gp_fault )
            *res = raw_cpuid_policy.basic.raw[leaf];
        break;

    case 0x7:
        switch ( subleaf )
        {
        case 0:
            /* OSPKE clear in policy.  Fast-forward CR4 back in. */
            if ( (is_pv_domain(d)
                  ? v->arch.pv_vcpu.ctrlreg[4]
                  : v->arch.hvm_vcpu.guest_cr[4]) & X86_CR4_PKE )
                res->c |= cpufeat_mask(X86_FEATURE_OSPKE);
            break;
        }
        break;

    case 0xa:
        /* TODO: Rework vPMU control in terms of toolstack choices. */
        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
             !vpmu_available(v) )
            *res = EMPTY_LEAF;
        else
        {
            /* Report at most v3 since that's all we currently emulate. */
            if ( (res->a & 0xff) > 3 )
                res->a = (res->a & ~0xff) | 3;
        }
        break;

    case 0xb:
        /*
         * In principle, this leaf is Intel-only.  In practice, it is tightly
         * coupled with x2apic, and we offer an x2apic-capable APIC emulation
         * to guests on AMD hardware as well.
         *
         * TODO: Rework topology logic.
         */
        if ( p->basic.x2apic )
        {
            *(uint8_t *)&res->c = subleaf;

            /* Fix the x2APIC identifier. */
            res->d = v->vcpu_id * 2;
        }
        break;

    case XSTATE_CPUID:
        switch ( subleaf )
        {
        case 1:
            if ( p->xstate.xsaves )
            {
                /*
                 * TODO: Figure out what to do for XSS state.  VT-x manages
                 * host vs guest MSR_XSS automatically, so as soon as we start
                 * supporting any XSS states, the wrong XSS will be in
                 * context.
                 */
                BUILD_BUG_ON(XSTATE_XSAVES_ONLY != 0);

                /*
                 * Read CPUID[0xD,0/1].EBX from hardware.  They vary with
                 * enabled XSTATE, and appropriate XCR0|XSS are in context.
                 */
        case 0:
                res->b = cpuid_count_ebx(leaf, subleaf);
            }
            break;
        }
        break;

    case 0x80000001:
        /* SYSCALL is hidden outside of long mode on Intel. */
        if ( p->x86_vendor == X86_VENDOR_INTEL &&
             is_hvm_domain(d) && !hvm_long_mode_active(v) )
            res->d &= ~cpufeat_mask(X86_FEATURE_SYSCALL);

    common_leaf1_adjustments:
        if ( is_hvm_domain(d) )
        {
            /* Fast-forward MSR_APIC_BASE.EN. */
            if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
                res->d &= ~cpufeat_bit(X86_FEATURE_APIC);

            /*
             * PSE36 is not supported in shadow mode.  This bit should be
             * clear in hvm_shadow_featuremask[].
             *
             * However, an unspecified version of Hyper-V from 2011 refuses to
             * start as the "cpu does not provide required hw features" if it
             * can't see PSE36.
             *
             * As a workaround, leak the toolstack-provided PSE36 value into a
             * shadow guest if the guest is already using PAE paging (and
             * won't care about reverting back to PSE paging).  Otherwise,
             * nobble it, so a 32bit guest doesn't get the impression that it
             * could try to use PSE36 paging.
             */
            if ( !hap_enabled(d) && !hvm_pae_enabled(v) )
                res->d &= ~cpufeat_mask(X86_FEATURE_PSE36);
        }
        else /* PV domain */
        {
            /*
             * MTRR used to unconditionally leak into PV guests.  They cannot
             * use the MTRR infrastructure at all, and shouldn't be able to
             * see the feature.
             *
             * Modern PVOPS Linux self-clobbers the MTRR feature, to avoid
             * trying to use the associated MSRs.  Xenolinux-based PV dom0's
             * however use the MTRR feature as an indication of the presence
             * of the XENPF_{add,del,read}_memtype hypercalls.
             */
            if ( is_hardware_domain(d) && cpu_has_mtrr &&
                 guest_kernel_mode(v, guest_cpu_user_regs()) )
                res->d |= cpufeat_mask(X86_FEATURE_MTRR);
        }
        break;

    case 0x8000001c:
        if ( (v->arch.xcr0 & XSTATE_LWP) && cpu_has_svm )
            /* Turn on available bit and other features specified in lwp_cfg. */
            res->a = (res->d & v->arch.hvm_svm.guest_lwp_cfg) | 1;
        break;
    }
}

static void __init __maybe_unused build_assertions(void)
{
    BUILD_BUG_ON(ARRAY_SIZE(known_features) != FSCAPINTS);
    BUILD_BUG_ON(ARRAY_SIZE(special_features) != FSCAPINTS);
    BUILD_BUG_ON(ARRAY_SIZE(pv_featuremask) != FSCAPINTS);
    BUILD_BUG_ON(ARRAY_SIZE(hvm_shadow_featuremask) != FSCAPINTS);
    BUILD_BUG_ON(ARRAY_SIZE(hvm_hap_featuremask) != FSCAPINTS);
    BUILD_BUG_ON(ARRAY_SIZE(deep_features) != FSCAPINTS);

    /* Find some more clever allocation scheme if this trips. */
    BUILD_BUG_ON(sizeof(struct cpuid_policy) > PAGE_SIZE);

    BUILD_BUG_ON(sizeof(raw_cpuid_policy.basic) !=
                 sizeof(raw_cpuid_policy.basic.raw));
    BUILD_BUG_ON(sizeof(raw_cpuid_policy.feat) !=
                 sizeof(raw_cpuid_policy.feat.raw));
    BUILD_BUG_ON(sizeof(raw_cpuid_policy.xstate) !=
                 sizeof(raw_cpuid_policy.xstate.raw));
    BUILD_BUG_ON(sizeof(raw_cpuid_policy.extd) !=
                 sizeof(raw_cpuid_policy.extd.raw));
}

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */