#include <xen/sched.h>
#include <xen/types.h>

#include <public/hvm/params.h>

#include <asm/cpu-policy.h>
#include <asm/cpuid.h>
#include <asm/hvm/viridian.h>
#include <asm/xstate.h>

#define EMPTY_LEAF ((struct cpuid_leaf){})

bool recheck_cpu_features(unsigned int cpu)
{
    bool okay = true;
    struct cpuinfo_x86 c = {0};
    const struct cpuinfo_x86 *bsp = &boot_cpu_data;
    unsigned int i;

    identify_cpu(&c);

    for ( i = 0; i < NCAPINTS; ++i )
    {
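        /*
         * Skip capability words where this CPU has every feature the boot
         * CPU has; only features missing relative to the BSP are a problem.
         */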
        if ( !(~c.x86_capability[i] & bsp->x86_capability[i]) )
            continue;

        printk(XENLOG_ERR "CPU%u: cap[%2u] is %08x (expected %08x)\n",
               cpu, i, c.x86_capability[i], bsp->x86_capability[i]);
        okay = false;
    }

    return okay;
}

void guest_cpuid(const struct vcpu *v, uint32_t leaf,
                 uint32_t subleaf, struct cpuid_leaf *res)
{
    const struct domain *d = v->domain;
    const struct cpu_policy *p = d->arch.cpu_policy;

    *res = EMPTY_LEAF;

    /*
     * First pass:
     * - Perform max_leaf/subleaf calculations.  Out-of-range leaves return
     *   all zeros, following the AMD model.
     * - Fill in *res with static data.
     * - Dispatch the virtualised leaves to their respective handlers.
     */
    switch ( leaf )
    {
    case 0 ... CPUID_GUEST_NR_BASIC - 1:
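        /* Clamp against both the policy's max_leaf and the raw array bound. */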
        ASSERT(p->basic.max_leaf < ARRAY_SIZE(p->basic.raw));
        if ( leaf > min_t(uint32_t, p->basic.max_leaf,
                          ARRAY_SIZE(p->basic.raw) - 1) )
            return;

        switch ( leaf )
        {
        case 0x4:
            if ( subleaf >= ARRAY_SIZE(p->cache.raw) )
                return;

            *res = array_access_nospec(p->cache.raw, subleaf);
            break;

        case 0x7:
            ASSERT(p->feat.max_subleaf < ARRAY_SIZE(p->feat.raw));
            if ( subleaf > min_t(uint32_t, p->feat.max_subleaf,
                                 ARRAY_SIZE(p->feat.raw) - 1) )
                return;

            *res = array_access_nospec(p->feat.raw, subleaf);
            break;

        case 0xb:
            if ( subleaf >= ARRAY_SIZE(p->topo.raw) )
                return;

            *res = array_access_nospec(p->topo.raw, subleaf);
            break;

        case XSTATE_CPUID:
            if ( !p->basic.xsave || subleaf >= ARRAY_SIZE(p->xstate.raw) )
                return;

            *res = array_access_nospec(p->xstate.raw, subleaf);
            break;

        default:
            *res = array_access_nospec(p->basic.raw, leaf);
            break;
        }
        break;

    case 0x40000000U ... 0x400000ffU:
        if ( is_viridian_domain(d) )
            return cpuid_viridian_leaves(v, leaf, subleaf, res);

        /*
         * Fallthrough.
         *
         * Intel reserve up until 0x4fffffff for hypervisor use.  AMD reserve
         * only until 0x400000ff, but we already use double that.
         */
    case 0x40000100U ... 0x400001ffU:
        return cpuid_hypervisor_leaves(v, leaf, subleaf, res);

    case 0x80000000U ... 0x80000000U + CPUID_GUEST_NR_EXTD - 1:
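        /* Extended leaves are indexed by their low 16 bits. */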
        ASSERT((p->extd.max_leaf & 0xffff) < ARRAY_SIZE(p->extd.raw));
        if ( (leaf & 0xffff) > min_t(uint32_t, p->extd.max_leaf & 0xffff,
                                     ARRAY_SIZE(p->extd.raw) - 1) )
            return;

        *res = array_access_nospec(p->extd.raw, leaf & 0xffff);
        break;

    default:
        return;
    }

    /*
     * Skip dynamic adjustments if we are in the wrong context.
     *
     * All dynamic adjustments depend on current register state, which will
     * be stale if the vcpu is running elsewhere.  It is simpler, quicker, and
     * more reliable for the caller to do nothing (consistently) than to hand
     * back stale data which it can't use safely.
     */
    if ( v != current )
        return;

    /*
     * Second pass:
     * - Dynamic adjustments
     */
    switch ( leaf )
    {
        const struct cpu_user_regs *regs;

    case 0x1:
        /* TODO: Rework topology logic. */
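        /* EBX[31:24] is the initial APIC ID; HVM guests get vcpu_id * 2. */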
        res->b &= 0x00ffffffu;
        if ( is_hvm_domain(d) )
            res->b |= (v->vcpu_id * 2) << 24;

        /* TODO: Rework vPMU control in terms of toolstack choices. */
        if ( vpmu_available(v) &&
             vpmu_is_set(vcpu_vpmu(v), VPMU_CPU_HAS_DS) )
        {
            res->d |= cpufeat_mask(X86_FEATURE_DS);
            if ( cpu_has(&current_cpu_data, X86_FEATURE_DTES64) )
                res->c |= cpufeat_mask(X86_FEATURE_DTES64);
            if ( cpu_has(&current_cpu_data, X86_FEATURE_DSCPL) )
                res->c |= cpufeat_mask(X86_FEATURE_DSCPL);
        }

        if ( is_hvm_domain(d) )
        {
            /* OSXSAVE clear in policy.  Fast-forward CR4 back in. */
            if ( v->arch.hvm.guest_cr[4] & X86_CR4_OSXSAVE )
                res->c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
        }
        else /* PV domain */
        {
            regs = guest_cpu_user_regs();

            /*
             * !!! OSXSAVE handling for PV guests is non-architectural !!!
             *
             * Architecturally, the correct code here is simply:
             *
             *   if ( v->arch.pv.ctrlreg[4] & X86_CR4_OSXSAVE )
             *       c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
             *
             * However, because of bugs in Xen (before c/s bd19080b, Nov 2010,
             * the XSAVE cpuid flag leaked into guests despite the feature not
             * being available for use), buggy workarounds were introduced to
             * Linux (c/s 947ccf9c, also Nov 2010) which relied on the fact
             * that Xen also incorrectly leaked OSXSAVE into the guest.
             *
             * Furthermore, providing architectural OSXSAVE behaviour to many
             * Linux PV guests triggered a further kernel bug when the fpu
             * code observes that XSAVEOPT is available, assumes that xsave
             * state had been set up for the task, and follows a wild pointer.
             *
             * Older Linux PVOPS kernels however do require architectural
             * behaviour.  They observe Xen's leaked OSXSAVE and assume they
             * can already use XSETBV, dying with a #UD because the shadowed
             * CR4.OSXSAVE is clear.  This behaviour has been adjusted in all
             * observed cases via stable backports of the above changeset.
             *
             * Therefore, the leaking of Xen's OSXSAVE setting has become a
             * defacto part of the PV ABI and can't reasonably be corrected.
             * It can however be restricted to only the enlightened CPUID
             * view, as seen by the guest kernel.
             *
             * The following situations and logic now apply:
             *
             * - Hardware without CPUID faulting support and native CPUID:
             *    There is nothing Xen can do here.  The host's XSAVE flag
             *    will leak through, and Xen's OSXSAVE choice will leak
             *    through.
             *
             *    In the case that the guest kernel has not set up OSXSAVE,
             *    only SSE will be set in xcr0, and guest userspace can't do
             *    too much damage itself.
             *
             * - Enlightened CPUID or CPUID faulting available:
             *    Xen can fully control what is seen here.  When the guest has
             *    been configured to have XSAVE available, guest kernels need
             *    to see the leaked OSXSAVE via the enlightened path, but
             *    guest userspace and native CPUID are given architectural
             *    behaviour.
             *
             *    Emulated vs Faulted CPUID is distinguished based on whether
             *    a #UD or #GP is currently being serviced.
             */
            /* OSXSAVE clear in policy.  Fast-forward CR4 back in. */
            if ( (v->arch.pv.ctrlreg[4] & X86_CR4_OSXSAVE) ||
                 (p->basic.xsave &&
                  regs->entry_vector == X86_EXC_UD &&
                  guest_kernel_mode(v, regs) &&
                  (read_cr4() & X86_CR4_OSXSAVE)) )
                res->c |= cpufeat_mask(X86_FEATURE_OSXSAVE);

            /*
             * At the time of writing, a PV domain is the only viable option
             * for Dom0.  Several interactions between dom0 and Xen for real
             * hardware setup have unfortunately been implemented based on
             * state which incorrectly leaked into dom0.
             *
             * These leaks are retained for backwards compatibility, but
             * restricted to the hardware domain's kernel only.
             */
            if ( is_hardware_domain(d) && guest_kernel_mode(v, regs) )
            {
                /*
                 * MONITOR never leaked into PV guests, as PV guests cannot
                 * use the MONITOR/MWAIT instructions.  As such, they require
                 * the feature to not be present in emulated CPUID.
                 *
                 * Modern PVOPS Linux try to be cunning and use native CPUID
                 * to see if the hardware actually supports MONITOR, and by
                 * extension, deep C states.
                 *
                 * If the feature is seen, deep-C state information is
                 * obtained from the DSDT and handed back to Xen via the
                 * XENPF_set_processor_pminfo hypercall.
                 *
                 * This mechanism is incompatible with an HVM-based hardware
                 * domain, and also with CPUID Faulting.
                 *
                 * Luckily, Xen can be just as 'cunning', and distinguish an
                 * emulated CPUID from a faulted CPUID by whether a #UD or #GP
                 * fault is currently being serviced.  Yuck...
                 */
                if ( cpu_has_monitor && regs->entry_vector == X86_EXC_GP )
                    res->c |= cpufeat_mask(X86_FEATURE_MONITOR);

                /*
                 * While MONITOR never leaked into PV guests, EIST always used
                 * to.
                 *
                 * Modern PVOPS Linux will only parse P state information from
                 * the DSDT and return it to Xen if EIST is seen in the
                 * emulated CPUID information.
                 */
                if ( cpu_has_eist )
                    res->c |= cpufeat_mask(X86_FEATURE_EIST);
            }
        }
        goto common_leaf1_adjustments;

    case 0x5:
        /*
         * Leak the hardware MONITOR leaf under the same conditions that the
         * MONITOR feature flag is leaked.  See above for details.
         */
        regs = guest_cpu_user_regs();
        if ( is_pv_domain(d) && is_hardware_domain(d) &&
             guest_kernel_mode(v, regs) && cpu_has_monitor &&
             regs->entry_vector == X86_EXC_GP )
            *res = raw_cpu_policy.basic.raw[5];
        break;

    case 0x7:
        switch ( subleaf )
        {
        case 0:
            /* OSPKE clear in policy.  Fast-forward CR4 back in. */
            if ( (is_pv_domain(d)
                  ? v->arch.pv.ctrlreg[4]
                  : v->arch.hvm.guest_cr[4]) & X86_CR4_PKE )
                res->c |= cpufeat_mask(X86_FEATURE_OSPKE);
            break;
        }
        break;

    case 0xa:
        /* TODO: Rework vPMU control in terms of toolstack choices. */
        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
             !vpmu_available(v) )
            *res = EMPTY_LEAF;
        else
        {
            /* Report at most v3 since that's all we currently emulate. */
            if ( (res->a & 0xff) > 3 )
                res->a = (res->a & ~0xff) | 3;
        }
        break;

    case 0xb:
        /*
         * In principle, this leaf is Intel-only.  In practice, it is tightly
         * coupled with x2apic, and we offer an x2apic-capable APIC emulation
         * to guests on AMD hardware as well.
         *
         * TODO: Rework topology logic.
         */
        if ( p->basic.x2apic )
        {
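            /*
             * ECX[7:0] is an echo of the requested subleaf; the level type
             * in ECX[15:8] is retained from the policy.
             */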
            *(uint8_t *)&res->c = subleaf;

            /* Fix the x2APIC identifier. */
            res->d = v->vcpu_id * 2;
        }
        break;

    case XSTATE_CPUID:
        switch ( subleaf )
        {
        case 0:
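            /* EBX: size of the uncompressed XSAVE area for the current XCR0. */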
            if ( p->basic.xsave )
                res->b = xstate_uncompressed_size(v->arch.xcr0);
            break;

        case 1:
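            /* EBX: size of the compacted XSAVE area for XCR0 | IA32_XSS. */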
            if ( p->xstate.xsavec )
                res->b = xstate_compressed_size(v->arch.xcr0 |
                                                v->arch.msrs->xss.raw);
            break;
        }
        break;

    case 0x80000001U:
        /* SYSCALL is hidden outside of long mode on Intel. */
        if ( p->x86_vendor == X86_VENDOR_INTEL &&
             is_hvm_domain(d) && !hvm_long_mode_active(v) )
            res->d &= ~cpufeat_mask(X86_FEATURE_SYSCALL);

    common_leaf1_adjustments:
        if ( is_hvm_domain(d) )
        {
            /* Fast-forward MSR_APIC_BASE.EN. */
            if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
                res->d &= ~cpufeat_mask(X86_FEATURE_APIC);

            /*
             * PSE36 is not supported in shadow mode.  This bit should be
             * clear in hvm_shadow_max_featuremask[].
             *
             * However, an unspecified version of Hyper-V from 2011 refuses
             * to start with "cpu does not provide required hw features" if
             * it can't see PSE36.
             *
             * As a workaround, leak the toolstack-provided PSE36 value into a
             * shadow guest if the guest is already using PAE paging (and
             * won't care about reverting back to PSE paging).  Otherwise,
             * nobble it, so a 32bit guest doesn't get the impression that it
             * could try to use PSE36 paging.
             */
            if ( !hap_enabled(d) && !hvm_pae_enabled(v) )
                res->d &= ~cpufeat_mask(X86_FEATURE_PSE36);
        }
        else /* PV domain */
        {
            /*
             * MTRR used to unconditionally leak into PV guests.  They cannot
             * use the MTRR infrastructure at all, and shouldn't be able to
             * see the feature.
             *
             * Modern PVOPS Linux self-clobbers the MTRR feature, to avoid
             * trying to use the associated MSRs.  Xenolinux-based PV dom0's
             * however use the MTRR feature as an indication of the presence
             * of the XENPF_{add,del,read}_memtype hypercalls.
             */
            if ( is_hardware_domain(d) && cpu_has_mtrr &&
                 guest_kernel_mode(v, guest_cpu_user_regs()) )
                res->d |= cpufeat_mask(X86_FEATURE_MTRR);
        }
        break;
    }
}

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */