#include <xen/sched.h>
#include <xen/types.h>

#include <public/hvm/params.h>

#include <asm/cpu-policy.h>
#include <asm/cpuid.h>
#include <asm/hvm/viridian.h>
#include <asm/xstate.h>

#define EMPTY_LEAF ((struct cpuid_leaf){})

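/* Check this CPU's feature flags against the boot CPU's, reporting anything missing. */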
bool recheck_cpu_features(unsigned int cpu)
{
    bool okay = true;
    struct cpuinfo_x86 c = {0};
    const struct cpuinfo_x86 *bsp = &boot_cpu_data;
    unsigned int i;

    identify_cpu(&c);

    for ( i = 0; i < NCAPINTS; ++i )
    {
        if ( !(~c.x86_capability[i] & bsp->x86_capability[i]) )
            continue;

        printk(XENLOG_ERR "CPU%u: cap[%2u] is %08x (expected %08x)\n",
               cpu, i, c.x86_capability[i], bsp->x86_capability[i]);
        okay = false;
    }

    return okay;
}

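/*
 * Compute the CPUID data visible to a guest for the given leaf/subleaf,
 * based on the domain's CPUID policy plus dynamic per-vCPU state.
 */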
void guest_cpuid(const struct vcpu *v, uint32_t leaf,
                 uint32_t subleaf, struct cpuid_leaf *res)
{
    const struct domain *d = v->domain;
    const struct cpu_policy *p = d->arch.cpu_policy;

    *res = EMPTY_LEAF;

    /*
     * First pass:
     * - Perform max_leaf/subleaf calculations.  Out-of-range leaves return
     *   all zeros, following the AMD model.
     * - Fill in *res with static data.
     * - Dispatch the virtualised leaves to their respective handlers.
     */
    switch ( leaf )
    {
    case 0 ... CPUID_GUEST_NR_BASIC - 1:
        ASSERT(p->basic.max_leaf < ARRAY_SIZE(p->basic.raw));
        if ( leaf > min_t(uint32_t, p->basic.max_leaf,
                          ARRAY_SIZE(p->basic.raw) - 1) )
            return;

        switch ( leaf )
        {
        case 0x4:
            if ( subleaf >= ARRAY_SIZE(p->cache.raw) )
                return;

            *res = array_access_nospec(p->cache.raw, subleaf);
            break;

        case 0x7:
            ASSERT(p->feat.max_subleaf < ARRAY_SIZE(p->feat.raw));
            if ( subleaf > min_t(uint32_t, p->feat.max_subleaf,
                                 ARRAY_SIZE(p->feat.raw) - 1) )
                return;

            *res = array_access_nospec(p->feat.raw, subleaf);
            break;

        case 0xb:
            if ( subleaf >= ARRAY_SIZE(p->topo.raw) )
                return;

            *res = array_access_nospec(p->topo.raw, subleaf);
            break;

        case XSTATE_CPUID:
            if ( !p->basic.xsave || subleaf >= ARRAY_SIZE(p->xstate.raw) )
                return;

            *res = array_access_nospec(p->xstate.raw, subleaf);
            break;

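        /* Remaining basic leaves: return the static policy data. */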
        default:
            *res = array_access_nospec(p->basic.raw, leaf);
            break;
        }
        break;

    case 0x40000000U ... 0x400000ffU:
        if ( is_viridian_domain(d) )
            return cpuid_viridian_leaves(v, leaf, subleaf, res);

        /*
         * Fallthrough.
         *
         * Intel reserve up until 0x4fffffff for hypervisor use.  AMD reserve
         * only until 0x400000ff, but we already use double that.
         */
    case 0x40000100U ... 0x400001ffU:
        return cpuid_hypervisor_leaves(v, leaf, subleaf, res);

    case 0x80000000U ... 0x80000000U + CPUID_GUEST_NR_EXTD - 1:
        ASSERT((p->extd.max_leaf & 0xffff) < ARRAY_SIZE(p->extd.raw));
        if ( (leaf & 0xffff) > min_t(uint32_t, p->extd.max_leaf & 0xffff,
                                     ARRAY_SIZE(p->extd.raw) - 1) )
            return;

        *res = array_access_nospec(p->extd.raw, leaf & 0xffff);
        break;

    default:
        return;
    }

    /*
     * Skip dynamic adjustments if we are in the wrong context.
     *
     * All dynamic adjustments depend on current register state, which will
     * be stale if the vcpu is running elsewhere.  It is simpler, quicker, and
     * more reliable for the caller to do nothing (consistently) than to hand
     * back stale data which it can't use safely.
     */
    if ( v != current )
        return;

    /*
     * Second pass:
     * - Dynamic adjustments
     */
    switch ( leaf )
    {
        const struct cpu_user_regs *regs;

    case 0x1:
        /* TODO: Rework topology logic. */
        res->b &= 0x00ffffffu;
        if ( is_hvm_domain(d) )
            res->b |= (v->vcpu_id * 2) << 24;

        /* TODO: Rework vPMU control in terms of toolstack choices. */
        if ( vpmu_available(v) &&
             vpmu_is_set(vcpu_vpmu(v), VPMU_CPU_HAS_DS) )
        {
            res->d |= cpufeat_mask(X86_FEATURE_DS);
            if ( cpu_has(&current_cpu_data, X86_FEATURE_DTES64) )
                res->c |= cpufeat_mask(X86_FEATURE_DTES64);
            if ( cpu_has(&current_cpu_data, X86_FEATURE_DSCPL) )
                res->c |= cpufeat_mask(X86_FEATURE_DSCPL);
        }

        if ( is_hvm_domain(d) )
        {
            /* OSXSAVE clear in policy.  Fast-forward CR4 back in. */
            if ( v->arch.hvm.guest_cr[4] & X86_CR4_OSXSAVE )
                res->c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
        }
        else /* PV domain */
        {
            regs = guest_cpu_user_regs();

            /*
             * !!! OSXSAVE handling for PV guests is non-architectural !!!
             *
             * Architecturally, the correct code here is simply:
             *
             *   if ( v->arch.pv.ctrlreg[4] & X86_CR4_OSXSAVE )
             *       c |= cpufeat_mask(X86_FEATURE_OSXSAVE);
             *
             * However, because of bugs in Xen (before c/s bd19080b, Nov 2010,
             * the XSAVE cpuid flag leaked into guests despite the feature not
             * being available for use), buggy workarounds were introduced to
             * Linux (c/s 947ccf9c, also Nov 2010) which relied on the fact
             * that Xen also incorrectly leaked OSXSAVE into the guest.
             *
             * Furthermore, providing architectural OSXSAVE behaviour to many
             * Linux PV guests triggered a further kernel bug when the fpu
             * code observes that XSAVEOPT is available, assumes that xsave
             * state had been set up for the task, and follows a wild pointer.
             *
             * Older Linux PVOPS kernels however do require architectural
             * behaviour.  They observe Xen's leaked OSXSAVE and assume they
             * can already use XSETBV, dying with a #UD because the shadowed
             * CR4.OSXSAVE is clear.  This behaviour has been adjusted in all
             * observed cases via stable backports of the above changeset.
             *
             * Therefore, the leaking of Xen's OSXSAVE setting has become a
             * de facto part of the PV ABI and can't reasonably be corrected.
             * It can however be restricted to only the enlightened CPUID
             * view, as seen by the guest kernel.
             *
             * The following situations and logic now apply:
             *
             * - Hardware without CPUID faulting support and native CPUID:
             *    There is nothing Xen can do here.  The host's XSAVE flag
             *    will leak through, and Xen's OSXSAVE choice will leak
             *    through.
             *
             *    In the case that the guest kernel has not set up OSXSAVE,
             *    only SSE will be set in xcr0, and guest userspace can't do
             *    too much damage itself.
             *
             * - Enlightened CPUID or CPUID faulting available:
             *    Xen can fully control what is seen here.  When the guest has
             *    been configured to have XSAVE available, guest kernels need
             *    to see the leaked OSXSAVE via the enlightened path, but
             *    guest userspace and the native view are given architectural
             *    behaviour.
             *
             *    Emulated vs Faulted CPUID is distinguished based on whether
             *    a #UD or #GP is currently being serviced.
             */
            /* OSXSAVE clear in policy.  Fast-forward CR4 back in. */
            if ( (v->arch.pv.ctrlreg[4] & X86_CR4_OSXSAVE) ||
                 (p->basic.xsave &&
                  regs->entry_vector == X86_EXC_UD &&
                  guest_kernel_mode(v, regs) &&
                  (read_cr4() & X86_CR4_OSXSAVE)) )
                res->c |= cpufeat_mask(X86_FEATURE_OSXSAVE);

            /*
             * At the time of writing, a PV domain is the only viable option
             * for Dom0.  Several interactions between dom0 and Xen for real
             * hardware setup have unfortunately been implemented based on
             * state which incorrectly leaked into dom0.
             *
             * These leaks are retained for backwards compatibility, but
             * restricted to the hardware domain's kernel only.
             */
            if ( is_hardware_domain(d) && guest_kernel_mode(v, regs) )
            {
                /*
                 * MONITOR never leaked into PV guests, as PV guests cannot
                 * use the MONITOR/MWAIT instructions.  As such, they require
                 * the feature to not be present in emulated CPUID.
                 *
                 * Modern PVOPS Linux try to be cunning and use native CPUID
                 * to see if the hardware actually supports MONITOR, and by
                 * extension, deep C states.
                 *
                 * If the feature is seen, deep-C state information is
                 * obtained from the DSDT and handed back to Xen via the
                 * XENPF_set_processor_pminfo hypercall.
                 *
                 * This mechanism is incompatible with an HVM-based hardware
                 * domain, and also with CPUID Faulting.
                 *
                 * Luckily, Xen can be just as 'cunning', and distinguish an
                 * emulated CPUID from a faulted CPUID by whether a #UD or #GP
                 * fault is currently being serviced.  Yuck...
                 */
                if ( cpu_has_monitor && regs->entry_vector == X86_EXC_GP )
                    res->c |= cpufeat_mask(X86_FEATURE_MONITOR);

                /*
                 * While MONITOR never leaked into PV guests, EIST always used
                 * to.
                 *
                 * Modern PVOPS Linux will only parse P state information from
                 * the DSDT and return it to Xen if EIST is seen in the
                 * emulated CPUID information.
                 */
                if ( cpu_has_eist )
                    res->c |= cpufeat_mask(X86_FEATURE_EIST);
            }
        }
        goto common_leaf1_adjustments;

    case 0x5:
        /*
         * Leak the hardware MONITOR leaf under the same conditions that the
         * MONITOR feature flag is leaked.  See above for details.
         */
        regs = guest_cpu_user_regs();
        if ( is_pv_domain(d) && is_hardware_domain(d) &&
             guest_kernel_mode(v, regs) && cpu_has_monitor &&
             regs->entry_vector == X86_EXC_GP )
            *res = raw_cpu_policy.basic.raw[5];
        break;

    case 0x7:
        switch ( subleaf )
        {
        case 0:
            /* OSPKE clear in policy.  Fast-forward CR4 back in. */
            if ( (is_pv_domain(d)
                  ? v->arch.pv.ctrlreg[4]
                  : v->arch.hvm.guest_cr[4]) & X86_CR4_PKE )
                res->c |= cpufeat_mask(X86_FEATURE_OSPKE);
            break;
        }
        break;

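    /* Leaf 0xa (architectural PMU) is Intel-only, and only when vPMU is available. */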
    case 0xa:
        /* TODO: Rework vPMU control in terms of toolstack choices. */
        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
             !vpmu_available(v) )
            *res = EMPTY_LEAF;
        else
        {
            /* Report at most v3 since that's all we currently emulate. */
            if ( (res->a & 0xff) > 3 )
                res->a = (res->a & ~0xff) | 3;
        }
        break;

    case 0xb:
        /*
         * In principle, this leaf is Intel-only.  In practice, it is tightly
         * coupled with x2apic, and we offer an x2apic-capable APIC emulation
         * to guests on AMD hardware as well.
         *
         * TODO: Rework topology logic.
         */
        if ( p->basic.x2apic )
        {
            *(uint8_t *)&res->c = subleaf;

            /* Fix the x2APIC identifier. */
            res->d = v->vcpu_id * 2;
        }
        break;

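    /*
     * Leaf 0xd: recalculate the xsave area sizes to reflect the features
     * currently enabled in this vCPU's XCR0 (and XSS for the compressed
     * size).
     */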
    case XSTATE_CPUID:
        switch ( subleaf )
        {
        case 0:
            if ( p->basic.xsave )
                res->b = xstate_uncompressed_size(v->arch.xcr0);
            break;

        case 1:
            if ( p->xstate.xsavec )
                res->b = xstate_compressed_size(v->arch.xcr0 |
                                                v->arch.msrs->xss.raw);
            break;
        }
        break;

    case 0x80000001U:
        /* SYSCALL is hidden outside of long mode on Intel. */
        if ( p->x86_vendor == X86_VENDOR_INTEL &&
             is_hvm_domain(d) && !hvm_long_mode_active(v) )
            res->d &= ~cpufeat_mask(X86_FEATURE_SYSCALL);

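        /* Adjustments below apply to both leaf 0x1 and leaf 0x80000001. */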
    common_leaf1_adjustments:
        if ( is_hvm_domain(d) )
        {
            /* Fast-forward MSR_APIC_BASE.EN. */
            if ( vlapic_hw_disabled(vcpu_vlapic(v)) )
                res->d &= ~cpufeat_mask(X86_FEATURE_APIC);

            /*
             * PSE36 is not supported in shadow mode.  This bit should be
             * clear in hvm_shadow_max_featuremask[].
             *
             * However, an unspecified version of Hyper-V from 2011 refuses
             * to start, complaining that the "cpu does not provide required
             * hw features", if it can't see PSE36.
             *
             * As a workaround, leak the toolstack-provided PSE36 value into a
             * shadow guest if the guest is already using PAE paging (and
             * won't care about reverting back to PSE paging).  Otherwise,
             * knobble it, so a 32bit guest doesn't get the impression that it
             * could try to use PSE36 paging.
             */
            if ( !hap_enabled(d) && !hvm_pae_enabled(v) )
                res->d &= ~cpufeat_mask(X86_FEATURE_PSE36);
        }
        else /* PV domain */
        {
            /*
             * MTRR used to unconditionally leak into PV guests.  They cannot
             * use the MTRR infrastructure at all, and shouldn't be able to
             * see the feature.
             *
             * Modern PVOPS Linux self-clobbers the MTRR feature, to avoid
             * trying to use the associated MSRs.  Xenolinux-based PV dom0s
             * however use the MTRR feature as an indication of the presence
             * of the XENPF_{add,del,read}_memtype hypercalls.
             */
            if ( is_hardware_domain(d) && cpu_has_mtrr &&
                 guest_kernel_mode(v, guest_cpu_user_regs()) )
                res->d |= cpufeat_mask(X86_FEATURE_MTRR);
        }
        break;
    }
}

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */