1 #ifndef __COMMON__H
2 #define __COMMON__H
3 
4 #include <stdbool.h>
5 
6 #include "xg_private.h"
7 #include "xg_save_restore.h"
8 #include "xc_bitops.h"
9 
10 #include "xg_sr_stream_format.h"
11 
12 /* String representation of Domain Header types. */
13 const char *dhdr_type_to_str(uint32_t type);
14 
15 /* String representation of Record types. */
16 const char *rec_type_to_str(uint32_t type);
17 
18 struct xc_sr_context;
19 struct xc_sr_record;
20 
21 /**
22  * Save operations.  To be implemented for each type of guest, for use by the
23  * common save algorithm.
24  *
25  * Every function must be implemented, even if only with a no-op stub.
26  */
struct xc_sr_save_ops
{
    /* Convert a PFN to GFN.  May return ~0UL for an invalid mapping. */
    xen_pfn_t (*pfn_to_gfn)(const struct xc_sr_context *ctx, xen_pfn_t pfn);

    /**
     * Optionally transform the contents of a page from being specific to the
     * sending environment, to being generic for the stream.
     *
     * The page of data at the end of 'page' may be a read-only mapping of a
     * running guest; it must not be modified.  If no transformation is
     * required, the callee should leave '*page' untouched.
     *
     * If a transformation is required, the callee should allocate themselves
     * a local page using malloc() and return it via '*page'.
     *
     * The caller shall free() '*page' in all cases.  In the case that the
     * callee encounters an error, it should *NOT* free() the memory it
     * allocated for '*page'.
     *
     * It is valid to fail with EAGAIN if the transformation is not able to be
     * completed at this point.  The page shall be retried later.
     *
     * @returns 0 for success, -1 for failure, with errno appropriately set.
     */
    int (*normalise_page)(struct xc_sr_context *ctx, xen_pfn_t type,
                          void **page);

    /**
     * Set up local environment to save a domain. (Typically querying
     * running domain state, setting up mappings etc.)
     *
     * This is called once before any common setup has occurred, allowing for
     * guest-specific adjustments to be made to common state.
     */
    int (*setup)(struct xc_sr_context *ctx);

    /**
     * Send static records at the head of the stream.  This is called once,
     * after the Image and Domain headers are written.
     */
    int (*static_data)(struct xc_sr_context *ctx);

    /**
     * Send dynamic records which need to be at the start of the stream.  This
     * is called after the STATIC_DATA_END record is written.
     */
    int (*start_of_stream)(struct xc_sr_context *ctx);

    /**
     * Send records which need to be at the start of a checkpoint.  This is
     * called once, or once per checkpoint in a checkpointed stream, and is
     * ahead of memory data.
     */
    int (*start_of_checkpoint)(struct xc_sr_context *ctx);

    /**
     * Send records which need to be at the end of the checkpoint.  This is
     * called once, or once per checkpoint in a checkpointed stream, and is
     * after the memory data.
     */
    int (*end_of_checkpoint)(struct xc_sr_context *ctx);

    /**
     * Check state of guest to decide whether it makes sense to continue
     * migration.  This is called in each iteration or checkpoint to check
     * whether all criteria for the migration are still met.  If that's not
     * the case either migration is cancelled via a bad rc or the situation
     * is handled, e.g. by sending appropriate records.
     */
    int (*check_vm_state)(struct xc_sr_context *ctx);

    /**
     * Clean up the local environment.  Will be called exactly once, either
     * after a successful save, or upon encountering an error.
     */
    int (*cleanup)(struct xc_sr_context *ctx);
};
105 
106 
107 /**
108  * Restore operations.  To be implemented for each type of guest, for use by
109  * the common restore algorithm.
110  *
111  * Every function must be implemented, even if only with a no-op stub.
112  */
struct xc_sr_restore_ops
{
    /* Convert a PFN to GFN.  May return ~0UL for an invalid mapping. */
    xen_pfn_t (*pfn_to_gfn)(const struct xc_sr_context *ctx, xen_pfn_t pfn);

    /* Check to see whether a PFN is valid. */
    bool (*pfn_is_valid)(const struct xc_sr_context *ctx, xen_pfn_t pfn);

    /* Set the GFN of a PFN. */
    void (*set_gfn)(struct xc_sr_context *ctx, xen_pfn_t pfn, xen_pfn_t gfn);

    /* Set the type of a PFN. */
    void (*set_page_type)(struct xc_sr_context *ctx, xen_pfn_t pfn,
                          xen_pfn_t type);

    /**
     * Optionally transform the contents of a page from being generic in the
     * stream, to being specific to the restoring environment.
     *
     * 'page' is expected to be modified in-place if a transformation is
     * required.
     *
     * @returns 0 for success, -1 for failure, with errno appropriately set.
     */
    int (*localise_page)(struct xc_sr_context *ctx, uint32_t type, void *page);

    /**
     * Set up local environment to restore a domain.
     *
     * This is called once before any common setup has occurred, allowing for
     * guest-specific adjustments to be made to common state.
     */
    int (*setup)(struct xc_sr_context *ctx);

    /**
     * Process an individual record from the stream.  The caller shall take
     * care of processing common records (e.g. END, PAGE_DATA).
     *
     * @return 0 for success, -1 for failure, or the following sentinels:
     *  - RECORD_NOT_PROCESSED
     *  - BROKEN_CHANNEL: under Remus/COLO, this means master may be dead, and
     *    a failover is needed.
     */
#define RECORD_NOT_PROCESSED 1
#define BROKEN_CHANNEL 2
    int (*process_record)(struct xc_sr_context *ctx, struct xc_sr_record *rec);

    /**
     * Perform any actions required after the static data has arrived.  Called
     * when the STATIC_DATA_END record has been received/inferred.
     * 'missing' should be filled in for any data item the higher level
     * toolstack needs to provide compatibility for.
     */
    int (*static_data_complete)(struct xc_sr_context *ctx,
                                unsigned int *missing);

    /**
     * Perform any actions required after the stream has been finished. Called
     * after the END record has been received.
     */
    int (*stream_complete)(struct xc_sr_context *ctx);

    /**
     * Clean up the local environment.  Will be called exactly once, either
     * after a successful restore, or upon encountering an error.
     */
    int (*cleanup)(struct xc_sr_context *ctx);
};
181 
/* Wrapper for blobs of data heading Xen-wards. */
struct xc_sr_blob
{
    void *ptr;
    size_t size;
};

/*
 * Update a blob.  Duplicate src/size, freeing the old blob if necessary.  May
 * fail due to memory allocation, in which case the previous contents of the
 * blob are left intact.
 */
static inline int update_blob(struct xc_sr_blob *blob,
                              const void *src, size_t size)
{
    void *copy;

    /* An empty or absent source is invalid input. */
    if ( src == NULL || size == 0 )
    {
        errno = EINVAL;
        return -1;
    }

    copy = malloc(size);
    if ( copy == NULL )
        return -1;

    memcpy(copy, src, size);

    /* Only discard the old data once the replacement is in hand. */
    free(blob->ptr);
    blob->ptr = copy;
    blob->size = size;

    return 0;
}
213 
struct xc_sr_context
{
    xc_interface *xch;
    uint32_t domid;
    int fd;

    /* Plain VM, or checkpoints over time. */
    xc_stream_type_t stream_type;

    xc_domaininfo_t dominfo;

    union /* Common save or restore data. */
    {
        struct /* Save data. */
        {
            int recv_fd;

            struct xc_sr_save_ops ops;
            struct save_callbacks *callbacks;

            /* Live migrate vs non live suspend. */
            bool live;

            /* Further debugging information in the stream. */
            bool debug;

            unsigned long p2m_size;

            struct precopy_stats stats;

            xen_pfn_t *batch_pfns;
            unsigned int nr_batch_pfns;
            unsigned long *deferred_pages;
            unsigned long nr_deferred_pages;
            xc_hypercall_buffer_t dirty_bitmap_hbuf;
        } save;

        struct /* Restore data. */
        {
            struct xc_sr_restore_ops ops;
            struct restore_callbacks *callbacks;

            int send_back_fd;
            unsigned long p2m_size;
            xc_hypercall_buffer_t dirty_bitmap_hbuf;

            /* From Image Header. */
            uint32_t format_version;

            /* From Domain Header. */
            uint32_t guest_type;
            uint32_t guest_page_size;

            /* Whether we are currently buffering records between checkpoints. */
            bool buffer_all_records;

            /* Whether a STATIC_DATA_END record has been seen/inferred. */
            bool seen_static_data_end;

/*
 * With Remus/COLO, the records sent by the primary at each checkpoint are
 * buffered, so that if the primary fails we can recover from the last
 * complete checkpoint state.
 * This should be enough for most cases, because the primary only sends
 * dirty pages at a checkpoint.
 */
#define DEFAULT_BUF_RECORDS 1024
            struct xc_sr_record *buffered_records;
            unsigned int allocated_rec_num;
            unsigned int buffered_rec_num;

            /*
             * Xenstore and Console parameters.
             * INPUT:  evtchn & domid
             * OUTPUT: gfn
             */
            xen_pfn_t    xenstore_gfn,    console_gfn;
            unsigned int xenstore_evtchn, console_evtchn;
            uint32_t     xenstore_domid,  console_domid;

            /* Bitmap of currently populated PFNs during restore. */
            unsigned long *populated_pfns;
            xen_pfn_t max_populated_pfn;

            /* Sender has invoked verify mode on the stream. */
            bool verify;
        } restore;
    };

    union /* Guest-arch specific data. */
    {
        struct /* x86 */
        {
            /* Common save/restore data. */
            union
            {
                struct
                {
                    /* X86_{CPUID,MSR}_DATA blobs for CPU Policy. */
                    struct xc_sr_blob cpuid, msr;
                } restore;
            };

            struct /* x86 PV guest. */
            {
                /* 4 or 8; 32 or 64 bit domain */
                unsigned int width;
                /* 3 or 4 pagetable levels */
                unsigned int levels;

                /* Maximum Xen frame */
                xen_pfn_t max_mfn;
                /* Read-only machine to phys map */
                xen_pfn_t *m2p;
                /* First mfn of the compat m2p (only set for 32bit PV guests) */
                xen_pfn_t compat_m2p_mfn0;
                /* Number of m2p frames mapped */
                unsigned long nr_m2p_frames;

                /* Maximum guest frame */
                xen_pfn_t max_pfn;

                /* Number of frames making up the p2m */
                unsigned int p2m_frames;
                /* Guest's phys to machine map.  Mapped read-only (save) or
                 * allocated locally (restore).  Uses guest unsigned longs. */
                void *p2m;
                /* The guest pfns containing the p2m leaves */
                xen_pfn_t *p2m_pfns;

                /* Read-only mapping of the guest's shared info page */
                shared_info_any_t *shinfo;

                /* p2m generation count for verifying validity of local p2m. */
                uint64_t p2m_generation;

                union
                {
                    struct
                    {
                        /* State machine for the order of received records. */
                        bool seen_pv_info;

                        /* Types for each page (bounded by max_pfn). */
                        uint32_t *pfn_types;

                        /* x86 PV per-vcpu storage structure for blobs. */
                        struct xc_sr_x86_pv_restore_vcpu
                        {
                            struct xc_sr_blob basic, extd, xsave, msr;
                        } *vcpus;
                        unsigned int nr_vcpus;
                    } restore;
                };
            } pv;

            struct /* x86 HVM guest. */
            {
                union
                {
                    struct
                    {
                        /* Whether qemu enabled logdirty mode, and we should
                         * disable on cleanup. */
                        bool qemu_enabled_logdirty;
                    } save;

                    struct
                    {
                        /* HVM context blob. */
                        struct xc_sr_blob context;
                    } restore;
                };
            } hvm;

        } x86;
    };
};
392 
/* Per-guest-type save operation tables. */
extern struct xc_sr_save_ops save_ops_x86_pv;
extern struct xc_sr_save_ops save_ops_x86_hvm;

/* Per-guest-type restore operation tables. */
extern struct xc_sr_restore_ops restore_ops_x86_pv;
extern struct xc_sr_restore_ops restore_ops_x86_hvm;
398 
/* An in-memory representation of a single stream record. */
struct xc_sr_record
{
    uint32_t type;   /* Record type. */
    uint32_t length; /* Length, in bytes, of 'data'. */
    void *data;      /* Record payload; NULL when length is 0. */
};
405 
406 /*
407  * Writes a split record to the stream, applying correct padding where
408  * appropriate.  It is common when sending records containing blobs from Xen
409  * that the header and blob data are separate.  This function accepts a second
410  * buffer and length, and will merge it with the main record when sending.
411  *
412  * Records with a non-zero length must provide a valid data field; records
413  * with a 0 length shall have their data field ignored.
414  *
415  * Returns 0 on success and non0 on failure.
416  */
417 int write_split_record(struct xc_sr_context *ctx, struct xc_sr_record *rec,
418                        void *buf, size_t sz);
419 
420 /*
421  * Writes a record to the stream, applying correct padding where appropriate.
422  * Records with a non-zero length must provide a valid data field; records
423  * with a 0 length shall have their data field ignored.
424  *
425  * Returns 0 on success and non0 on failure.
426  */
write_record(struct xc_sr_context * ctx,struct xc_sr_record * rec)427 static inline int write_record(struct xc_sr_context *ctx,
428                                struct xc_sr_record *rec)
429 {
430     return write_split_record(ctx, rec, NULL, 0);
431 }
432 
433 /*
434  * Reads a record from the stream, and fills in the record structure.
435  *
436  * Returns 0 on success and non-0 on failure.
437  *
438  * On success, the records type and size shall be valid.
439  * - If size is 0, data shall be NULL.
440  * - If size is non-0, data shall be a buffer allocated by malloc() which must
441  *   be passed to free() by the caller.
442  *
443  * On failure, the contents of the record structure are undefined.
444  */
445 int read_record(struct xc_sr_context *ctx, int fd, struct xc_sr_record *rec);
446 
447 /*
448  * This would ideally be private in restore.c, but is needed by
449  * x86_pv_localise_page() if we receive pagetables frames ahead of the
450  * contents of the frames they point at.
451  */
452 int populate_pfns(struct xc_sr_context *ctx, unsigned int count,
453                   const xen_pfn_t *original_pfns, const uint32_t *types);
454 
455 /* Handle a STATIC_DATA_END record. */
456 int handle_static_data_end(struct xc_sr_context *ctx);
457 
458 /* Page type known to the migration logic? */
/* Page type known to the migration logic? */
static inline bool is_known_page_type(uint32_t type)
{
    switch ( type )
    {
        /* Plain data page. */
    case XEN_DOMCTL_PFINFO_NOTAB:

        /* Pagetable pages, with or without the LPINTAB flag. */
    case XEN_DOMCTL_PFINFO_L1TAB:
    case XEN_DOMCTL_PFINFO_L2TAB:
    case XEN_DOMCTL_PFINFO_L3TAB:
    case XEN_DOMCTL_PFINFO_L4TAB:
    case XEN_DOMCTL_PFINFO_L1TAB | XEN_DOMCTL_PFINFO_LPINTAB:
    case XEN_DOMCTL_PFINFO_L2TAB | XEN_DOMCTL_PFINFO_LPINTAB:
    case XEN_DOMCTL_PFINFO_L3TAB | XEN_DOMCTL_PFINFO_LPINTAB:
    case XEN_DOMCTL_PFINFO_L4TAB | XEN_DOMCTL_PFINFO_LPINTAB:

        /* Other recognised types. */
    case XEN_DOMCTL_PFINFO_XTAB:
    case XEN_DOMCTL_PFINFO_XALLOC: /* Synthetic type in Xen 4.2 - 4.5 */
    case XEN_DOMCTL_PFINFO_BROKEN:
        return true;

    default:
        return false;
    }
}
486 
487 /* Page type backed by RAM in the guest? */
/* Page type backed by RAM in the guest? */
static inline bool page_type_to_populate(uint32_t type)
{
    switch ( type )
    {
        /* Types which are backed by RAM, and hence need populating. */
    case XEN_DOMCTL_PFINFO_NOTAB:
    case XEN_DOMCTL_PFINFO_XALLOC:
    case XEN_DOMCTL_PFINFO_L1TAB:
    case XEN_DOMCTL_PFINFO_L2TAB:
    case XEN_DOMCTL_PFINFO_L3TAB:
    case XEN_DOMCTL_PFINFO_L4TAB:
    case XEN_DOMCTL_PFINFO_L1TAB | XEN_DOMCTL_PFINFO_LPINTAB:
    case XEN_DOMCTL_PFINFO_L2TAB | XEN_DOMCTL_PFINFO_LPINTAB:
    case XEN_DOMCTL_PFINFO_L3TAB | XEN_DOMCTL_PFINFO_LPINTAB:
    case XEN_DOMCTL_PFINFO_L4TAB | XEN_DOMCTL_PFINFO_LPINTAB:
        return true;

        /* XTAB, BROKEN, and anything unrecognised, are not RAM-backed. */
    case XEN_DOMCTL_PFINFO_XTAB:
    case XEN_DOMCTL_PFINFO_BROKEN:
    default:
        return false;
    }
}
515 
/* Is a page of this type accompanied by page contents in the stream? */
static inline bool page_type_has_stream_data(uint32_t type)
{
    switch ( type )
    {
        /* Plain data and pagetable pages carry their contents. */
    case XEN_DOMCTL_PFINFO_NOTAB:
    case XEN_DOMCTL_PFINFO_L1TAB:
    case XEN_DOMCTL_PFINFO_L2TAB:
    case XEN_DOMCTL_PFINFO_L3TAB:
    case XEN_DOMCTL_PFINFO_L4TAB:
    case XEN_DOMCTL_PFINFO_L1TAB | XEN_DOMCTL_PFINFO_LPINTAB:
    case XEN_DOMCTL_PFINFO_L2TAB | XEN_DOMCTL_PFINFO_LPINTAB:
    case XEN_DOMCTL_PFINFO_L3TAB | XEN_DOMCTL_PFINFO_LPINTAB:
    case XEN_DOMCTL_PFINFO_L4TAB | XEN_DOMCTL_PFINFO_LPINTAB:
        return true;

        /* XTAB, BROKEN, XALLOC and unrecognised types carry no contents. */
    case XEN_DOMCTL_PFINFO_XTAB:
    case XEN_DOMCTL_PFINFO_BROKEN:
    case XEN_DOMCTL_PFINFO_XALLOC:
    default:
        return false;
    }
}
542 
543 #endif
544 /*
545  * Local variables:
546  * mode: C
547  * c-file-style: "BSD"
548  * c-basic-offset: 4
549  * tab-width: 4
550  * indent-tabs-mode: nil
551  * End:
552  */
553