1 #include <xen/types.h>
2 #include <xen/sched.h>
3 #include "mcaction.h"
4 #include "vmce.h"
5 #include "mce.h"
6 
7 static struct mcinfo_recovery *
mci_action_add_pageoffline(int bank,struct mc_info * mi,uint64_t mfn,uint32_t status)8 mci_action_add_pageoffline(int bank, struct mc_info *mi,
9                            uint64_t mfn, uint32_t status)
10 {
11     struct mcinfo_recovery *rec;
12 
13     if ( !mi )
14         return NULL;
15 
16     rec = x86_mcinfo_reserve(mi, sizeof(*rec), MC_TYPE_RECOVERY);
17     if ( !rec )
18     {
19         mi->flags |= MCINFO_FLAGS_UNCOMPLETE;
20         return NULL;
21     }
22 
23     rec->mc_bank = bank;
24     rec->action_types = MC_ACTION_PAGE_OFFLINE;
25     rec->action_info.page_retire.mfn = mfn;
26     rec->action_info.page_retire.status = status;
27     return rec;
28 }
29 
30 mce_check_addr_t mc_check_addr = NULL;
31 
mce_register_addrcheck(mce_check_addr_t cbfunc)32 void mce_register_addrcheck(mce_check_addr_t cbfunc)
33 {
34     mc_check_addr = cbfunc;
35 }
36 
37 void
mc_memerr_dhandler(struct mca_binfo * binfo,enum mce_result * result,const struct cpu_user_regs * regs)38 mc_memerr_dhandler(struct mca_binfo *binfo,
39                    enum mce_result *result,
40                    const struct cpu_user_regs *regs)
41 {
42     struct mcinfo_bank *bank = binfo->mib;
43     struct mcinfo_global *global = binfo->mig;
44     struct domain *d;
45     unsigned long mfn, gfn;
46     uint32_t status;
47     int vmce_vcpuid;
48     unsigned int mc_vcpuid;
49 
50     if ( !mc_check_addr(bank->mc_status, bank->mc_misc, MC_ADDR_PHYSICAL) )
51     {
52         dprintk(XENLOG_WARNING,
53                 "No physical address provided for memory error\n");
54         return;
55     }
56 
57     mfn = bank->mc_addr >> PAGE_SHIFT;
58     if ( offline_page(mfn, 1, &status) )
59     {
60         dprintk(XENLOG_WARNING,
61                 "Failed to offline page %lx for MCE error\n", mfn);
62         return;
63     }
64 
65     mci_action_add_pageoffline(binfo->bank, binfo->mi, mfn, status);
66 
67     /* This is free page */
68     if ( status & PG_OFFLINE_OFFLINED )
69         *result = MCER_RECOVERED;
70     else if ( status & PG_OFFLINE_AGAIN )
71         *result = MCER_CONTINUE;
72     else if ( status & PG_OFFLINE_PENDING )
73     {
74         /* This page has owner */
75         if ( status & PG_OFFLINE_OWNED )
76         {
77             bank->mc_domid = status >> PG_OFFLINE_OWNER_SHIFT;
78             mce_printk(MCE_QUIET, "MCE: This error page is ownded"
79                        " by DOM %d\n", bank->mc_domid);
80             /*
81              * XXX: Cannot handle shared pages yet
82              * (this should identify all domains and gfn mapping to
83              *  the mfn in question)
84              */
85             BUG_ON( bank->mc_domid == DOMID_COW );
86             if ( bank->mc_domid != DOMID_XEN )
87             {
88                 d = get_domain_by_id(bank->mc_domid);
89                 ASSERT(d);
90                 gfn = get_gpfn_from_mfn((bank->mc_addr) >> PAGE_SHIFT);
91 
92                 if ( unmmap_broken_page(d, _mfn(mfn), gfn) )
93                 {
94                     printk("Unmap broken memory %lx for DOM%d failed\n",
95                            mfn, d->domain_id);
96                     goto vmce_failed;
97                 }
98 
99                 mc_vcpuid = global->mc_vcpuid;
100                 if ( mc_vcpuid == XEN_MC_VCPUID_INVALID ||
101                      /*
102                       * Because MC# may happen asynchronously with the actual
103                       * operation that triggers the error, the domain ID as
104                       * well as the vCPU ID collected in 'global' at MC# are
105                       * not always precise. In that case, fallback to broadcast.
106                       */
107                      global->mc_domid != bank->mc_domid ||
108                      (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
109                       (!(global->mc_gstatus & MCG_STATUS_LMCE) ||
110                        !(d->vcpu[mc_vcpuid]->arch.vmce.mcg_ext_ctl &
111                          MCG_EXT_CTL_LMCE_EN))) )
112                     vmce_vcpuid = VMCE_INJECT_BROADCAST;
113                 else
114                     vmce_vcpuid = mc_vcpuid;
115 
116                 bank->mc_addr = gfn << PAGE_SHIFT |
117                                 (bank->mc_addr & (PAGE_SIZE - 1));
118                 if ( fill_vmsr_data(bank, d, global->mc_gstatus, vmce_vcpuid) )
119                 {
120                     mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d "
121                                "failed\n", bank->mc_domid);
122                     goto vmce_failed;
123                 }
124 
125                 /* We will inject vMCE to DOMU */
126                 if ( inject_vmce(d, vmce_vcpuid) < 0 )
127                 {
128                     mce_printk(MCE_QUIET, "inject vMCE to DOM%d"
129                                " failed\n", d->domain_id);
130                     goto vmce_failed;
131                 }
132 
133                 /*
134                  * Impacted domain go on with domain's recovery job
135                  * if the domain has its own MCA handler.
136                  * For xen, it has contained the error and finished
137                  * its own recovery job.
138                  */
139                 *result = MCER_RECOVERED;
140                 put_domain(d);
141 
142                 return;
143 vmce_failed:
144                 put_domain(d);
145                 domain_crash(d);
146             }
147         }
148     }
149 }
150