1 #ifndef __COMMON__H
2 #define __COMMON__H
3
4 #include <stdbool.h>
5
6 #include "xg_private.h"
7 #include "xg_save_restore.h"
8 #include "xc_bitops.h"
9
10 #include "xg_sr_stream_format.h"
11
12 /* String representation of Domain Header types. */
13 const char *dhdr_type_to_str(uint32_t type);
14
15 /* String representation of Record types. */
16 const char *rec_type_to_str(uint32_t type);
17
18 struct xc_sr_context;
19 struct xc_sr_record;
20
/**
 * Save operations. To be implemented for each type of guest, for use by the
 * common save algorithm.
 *
 * Every function must be implemented, even if only with a no-op stub.
 */
struct xc_sr_save_ops
{
    /* Convert a PFN to GFN. May return ~0UL for an invalid mapping. */
    xen_pfn_t (*pfn_to_gfn)(const struct xc_sr_context *ctx, xen_pfn_t pfn);

    /**
     * Optionally transform the contents of a page from being specific to the
     * sending environment, to being generic for the stream.
     *
     * The page of data at the end of 'page' may be a read-only mapping of a
     * running guest; it must not be modified. If no transformation is
     * required, the callee should leave '*page' untouched.
     *
     * If a transformation is required, the callee should allocate themselves
     * a local page using malloc() and return it via '*page'.
     *
     * The caller shall free() '*page' in all cases. In the case that the
     * callee encounters an error, it should *NOT* free() the memory it
     * allocated for '*page'.
     *
     * It is valid to fail with EAGAIN if the transformation is not able to be
     * completed at this point. The page shall be retried later.
     *
     * @returns 0 for success, -1 for failure, with errno appropriately set.
     */
    int (*normalise_page)(struct xc_sr_context *ctx, xen_pfn_t type,
                          void **page);

    /**
     * Set up local environment to save a domain. (Typically querying
     * running domain state, setting up mappings etc.)
     *
     * This is called once before any common setup has occurred, allowing for
     * guest-specific adjustments to be made to common state.
     */
    int (*setup)(struct xc_sr_context *ctx);

    /**
     * Send static records at the head of the stream. This is called once,
     * after the Image and Domain headers are written.
     */
    int (*static_data)(struct xc_sr_context *ctx);

    /**
     * Send dynamic records which need to be at the start of the stream. This
     * is called after the STATIC_DATA_END record is written.
     */
    int (*start_of_stream)(struct xc_sr_context *ctx);

    /**
     * Send records which need to be at the start of a checkpoint. This is
     * called once, or once per checkpoint in a checkpointed stream, and is
     * ahead of memory data.
     */
    int (*start_of_checkpoint)(struct xc_sr_context *ctx);

    /**
     * Send records which need to be at the end of the checkpoint. This is
     * called once, or once per checkpoint in a checkpointed stream, and is
     * after the memory data.
     */
    int (*end_of_checkpoint)(struct xc_sr_context *ctx);

    /**
     * Check state of guest to decide whether it makes sense to continue
     * migration. This is called in each iteration or checkpoint to check
     * whether all criteria for the migration are still met. If that's not
     * the case either migration is cancelled via a bad rc or the situation
     * is handled, e.g. by sending appropriate records.
     */
    int (*check_vm_state)(struct xc_sr_context *ctx);

    /**
     * Clean up the local environment. Will be called exactly once, either
     * after a successful save, or upon encountering an error.
     */
    int (*cleanup)(struct xc_sr_context *ctx);
};
105
106
/**
 * Restore operations. To be implemented for each type of guest, for use by
 * the common restore algorithm.
 *
 * Every function must be implemented, even if only with a no-op stub.
 */
struct xc_sr_restore_ops
{
    /* Convert a PFN to GFN. May return ~0UL for an invalid mapping. */
    xen_pfn_t (*pfn_to_gfn)(const struct xc_sr_context *ctx, xen_pfn_t pfn);

    /* Check to see whether a PFN is valid. */
    bool (*pfn_is_valid)(const struct xc_sr_context *ctx, xen_pfn_t pfn);

    /* Set the GFN of a PFN. */
    void (*set_gfn)(struct xc_sr_context *ctx, xen_pfn_t pfn, xen_pfn_t gfn);

    /* Set the type of a PFN. */
    void (*set_page_type)(struct xc_sr_context *ctx, xen_pfn_t pfn,
                          xen_pfn_t type);

    /**
     * Optionally transform the contents of a page from being generic in the
     * stream, to being specific to the restoring environment.
     *
     * 'page' is expected to be modified in-place if a transformation is
     * required.
     *
     * @returns 0 for success, -1 for failure, with errno appropriately set.
     */
    int (*localise_page)(struct xc_sr_context *ctx, uint32_t type, void *page);

    /**
     * Set up local environment to restore a domain.
     *
     * This is called once before any common setup has occurred, allowing for
     * guest-specific adjustments to be made to common state.
     */
    int (*setup)(struct xc_sr_context *ctx);

    /**
     * Process an individual record from the stream. The caller shall take
     * care of processing common records (e.g. END, PAGE_DATA).
     *
     * @return 0 for success, -1 for failure, or the following sentinels:
     *  - RECORD_NOT_PROCESSED
     *  - BROKEN_CHANNEL: under Remus/COLO, this means master may be dead, and
     *    a failover is needed.
     */
#define RECORD_NOT_PROCESSED 1
#define BROKEN_CHANNEL 2
    int (*process_record)(struct xc_sr_context *ctx, struct xc_sr_record *rec);

    /**
     * Perform any actions required after the static data has arrived. Called
     * when the STATIC_DATA_END record has been received/inferred.
     * 'missing' should be filled in for any data item the higher level
     * toolstack needs to provide compatibility for.
     */
    int (*static_data_complete)(struct xc_sr_context *ctx,
                                unsigned int *missing);

    /**
     * Perform any actions required after the stream has been finished. Called
     * after the END record has been received.
     */
    int (*stream_complete)(struct xc_sr_context *ctx);

    /**
     * Clean up the local environment. Will be called exactly once, either
     * after a successful restore, or upon encountering an error.
     */
    int (*cleanup)(struct xc_sr_context *ctx);
};
181
/* Wrapper for blobs of data heading Xen-wards. */
struct xc_sr_blob
{
    void *ptr;
    size_t size;
};

/*
 * Update a blob. Duplicate src/size, freeing the old blob if necessary. May
 * fail due to memory allocation (-1/errno), or with EINVAL if src is NULL or
 * size is 0.
 */
static inline int update_blob(struct xc_sr_blob *blob,
                              const void *src, size_t size)
{
    void *dup = NULL;

    /* A blob needs both a source buffer and a non-zero length. */
    if ( src && size )
        dup = malloc(size);
    else
        errno = EINVAL;

    if ( !dup )
        return -1; /* errno is EINVAL from above, or set by malloc(). */

    memcpy(dup, src, size);

    /* Only discard any previous contents once the duplicate is in hand. */
    free(blob->ptr);
    blob->ptr = dup;
    blob->size = size;

    return 0;
}
213
/*
 * Common context for one save or restore operation. The save/restore and
 * arch-specific members live in anonymous unions; exactly one side of each
 * union is in use for any given operation.
 */
struct xc_sr_context
{
    xc_interface *xch;  /* libxc interface handle. */
    uint32_t domid;     /* Domain being saved or restored. */
    int fd;             /* Migration stream file descriptor. */

    /* Plain VM, or checkpoints over time. */
    xc_stream_type_t stream_type;

    xc_domaininfo_t dominfo;

    union /* Common save or restore data. */
    {
        struct /* Save data. */
        {
            /* NOTE(review): presumably the back channel from the receiver
             * for checkpointed streams - confirm against callers. */
            int recv_fd;

            struct xc_sr_save_ops ops;
            struct save_callbacks *callbacks;

            /* Live migrate vs non live suspend. */
            bool live;

            /* Further debugging information in the stream. */
            bool debug;

            unsigned long p2m_size;

            struct precopy_stats stats;

            /* PFNs accumulated for the next batch of page data. */
            xen_pfn_t *batch_pfns;
            unsigned int nr_batch_pfns;
            /* Pages put aside for a later retry (e.g. normalise_page()
             * failing with EAGAIN). */
            unsigned long *deferred_pages;
            unsigned long nr_deferred_pages;
            xc_hypercall_buffer_t dirty_bitmap_hbuf;
        } save;

        struct /* Restore data. */
        {
            struct xc_sr_restore_ops ops;
            struct restore_callbacks *callbacks;

            int send_back_fd;
            unsigned long p2m_size;
            xc_hypercall_buffer_t dirty_bitmap_hbuf;

            /* From Image Header. */
            uint32_t format_version;

            /* From Domain Header. */
            uint32_t guest_type;
            uint32_t guest_page_size;

            /* Currently buffering records between checkpoints. */
            bool buffer_all_records;

            /* Whether a STATIC_DATA_END record has been seen/inferred. */
            bool seen_static_data_end;

            /*
             * With Remus/COLO, the records sent by the primary within a
             * checkpoint are buffered, so that if the primary fails we can
             * recover from the last complete checkpoint's state.  The
             * default below should be enough for most cases, because the
             * primary only sends dirty pages at a checkpoint.
             */
#define DEFAULT_BUF_RECORDS 1024
            struct xc_sr_record *buffered_records;
            unsigned int allocated_rec_num; /* Capacity of buffered_records. */
            unsigned int buffered_rec_num;  /* Records currently buffered. */

            /*
             * Xenstore and Console parameters.
             * INPUT: evtchn & domid
             * OUTPUT: gfn
             */
            xen_pfn_t xenstore_gfn, console_gfn;
            unsigned int xenstore_evtchn, console_evtchn;
            uint32_t xenstore_domid, console_domid;

            /* Bitmap of currently populated PFNs during restore. */
            unsigned long *populated_pfns;
            xen_pfn_t max_populated_pfn;

            /* Sender has invoked verify mode on the stream. */
            bool verify;
        } restore;
    };

    union /* Guest-arch specific data. */
    {
        struct /* x86 */
        {
            /* Common save/restore data. */
            union
            {
                struct
                {
                    /* X86_{CPUID,MSR}_DATA blobs for CPU Policy. */
                    struct xc_sr_blob cpuid, msr;
                } restore;
            };

            struct /* x86 PV guest. */
            {
                /* 4 or 8; 32 or 64 bit domain */
                unsigned int width;
                /* 3 or 4 pagetable levels */
                unsigned int levels;

                /* Maximum Xen frame */
                xen_pfn_t max_mfn;
                /* Read-only machine to phys map */
                xen_pfn_t *m2p;
                /* first mfn of the compat m2p (Only set for 32bit PV guests) */
                xen_pfn_t compat_m2p_mfn0;
                /* Number of m2p frames mapped */
                unsigned long nr_m2p_frames;

                /* Maximum guest frame */
                xen_pfn_t max_pfn;

                /* Number of frames making up the p2m */
                unsigned int p2m_frames;
                /* Guest's phys to machine map. Mapped read-only (save) or
                 * allocated locally (restore). Uses guest unsigned longs. */
                void *p2m;
                /* The guest pfns containing the p2m leaves */
                xen_pfn_t *p2m_pfns;

                /* Read-only mapping of guests shared info page */
                shared_info_any_t *shinfo;

                /* p2m generation count for verifying validity of local p2m. */
                uint64_t p2m_generation;

                union
                {
                    struct
                    {
                        /* State machine for the order of received records. */
                        bool seen_pv_info;

                        /* Types for each page (bounded by max_pfn). */
                        uint32_t *pfn_types;

                        /* x86 PV per-vcpu storage structure for blobs. */
                        struct xc_sr_x86_pv_restore_vcpu
                        {
                            struct xc_sr_blob basic, extd, xsave, msr;
                        } *vcpus;
                        unsigned int nr_vcpus;
                    } restore;
                };
            } pv;

            struct /* x86 HVM guest. */
            {
                union
                {
                    struct
                    {
                        /* Whether qemu enabled logdirty mode, and we should
                         * disable on cleanup. */
                        bool qemu_enabled_logdirty;
                    } save;

                    struct
                    {
                        /* HVM context blob. */
                        struct xc_sr_blob context;
                    } restore;
                };
            } hvm;

        } x86;
    };
};
392
393 extern struct xc_sr_save_ops save_ops_x86_pv;
394 extern struct xc_sr_save_ops save_ops_x86_hvm;
395
396 extern struct xc_sr_restore_ops restore_ops_x86_pv;
397 extern struct xc_sr_restore_ops restore_ops_x86_hvm;
398
/* A single record from the stream. See read_record()/write_record(). */
struct xc_sr_record
{
    uint32_t type;   /* Record type; see rec_type_to_str(). */
    uint32_t length; /* Length of 'data' in bytes. */
    void *data;      /* Payload; NULL when length is 0 (see read_record()). */
};
405
/*
 * Writes a split record to the stream, applying correct padding where
 * appropriate. It is common when sending records containing blobs from Xen
 * that the header and blob data are separate. This function accepts a second
 * buffer and length, and will merge it with the main record when sending.
 *
 * Records with a non-zero length must provide a valid data field; records
 * with a 0 length shall have their data field ignored.
 *
 * Returns 0 on success and non-0 on failure.
 */
417 int write_split_record(struct xc_sr_context *ctx, struct xc_sr_record *rec,
418 void *buf, size_t sz);
419
/*
 * Writes a record to the stream, applying correct padding where appropriate.
 * Records with a non-zero length must provide a valid data field; records
 * with a 0 length shall have their data field ignored.
 *
 * Returns 0 on success and non-0 on failure.
 */
static inline int write_record(struct xc_sr_context *ctx,
                               struct xc_sr_record *rec)
{
    /* A plain record is simply a split record with no secondary buffer. */
    return write_split_record(ctx, rec, NULL, 0);
}
432
433 /*
434 * Reads a record from the stream, and fills in the record structure.
435 *
436 * Returns 0 on success and non-0 on failure.
437 *
438 * On success, the records type and size shall be valid.
439 * - If size is 0, data shall be NULL.
440 * - If size is non-0, data shall be a buffer allocated by malloc() which must
441 * be passed to free() by the caller.
442 *
443 * On failure, the contents of the record structure are undefined.
444 */
445 int read_record(struct xc_sr_context *ctx, int fd, struct xc_sr_record *rec);
446
447 /*
448 * This would ideally be private in restore.c, but is needed by
449 * x86_pv_localise_page() if we receive pagetables frames ahead of the
450 * contents of the frames they point at.
451 */
452 int populate_pfns(struct xc_sr_context *ctx, unsigned int count,
453 const xen_pfn_t *original_pfns, const uint32_t *types);
454
455 /* Handle a STATIC_DATA_END record. */
456 int handle_static_data_end(struct xc_sr_context *ctx);
457
458 /* Page type known to the migration logic? */
is_known_page_type(uint32_t type)459 static inline bool is_known_page_type(uint32_t type)
460 {
461 switch ( type )
462 {
463 case XEN_DOMCTL_PFINFO_NOTAB:
464
465 case XEN_DOMCTL_PFINFO_L1TAB:
466 case XEN_DOMCTL_PFINFO_L1TAB | XEN_DOMCTL_PFINFO_LPINTAB:
467
468 case XEN_DOMCTL_PFINFO_L2TAB:
469 case XEN_DOMCTL_PFINFO_L2TAB | XEN_DOMCTL_PFINFO_LPINTAB:
470
471 case XEN_DOMCTL_PFINFO_L3TAB:
472 case XEN_DOMCTL_PFINFO_L3TAB | XEN_DOMCTL_PFINFO_LPINTAB:
473
474 case XEN_DOMCTL_PFINFO_L4TAB:
475 case XEN_DOMCTL_PFINFO_L4TAB | XEN_DOMCTL_PFINFO_LPINTAB:
476
477 case XEN_DOMCTL_PFINFO_XTAB:
478 case XEN_DOMCTL_PFINFO_XALLOC: /* Synthetic type in Xen 4.2 - 4.5 */
479 case XEN_DOMCTL_PFINFO_BROKEN:
480 return true;
481
482 default:
483 return false;
484 }
485 }
486
487 /* Page type backed by RAM in the guest? */
page_type_to_populate(uint32_t type)488 static inline bool page_type_to_populate(uint32_t type)
489 {
490 switch ( type )
491 {
492 case XEN_DOMCTL_PFINFO_NOTAB:
493
494 case XEN_DOMCTL_PFINFO_L1TAB:
495 case XEN_DOMCTL_PFINFO_L1TAB | XEN_DOMCTL_PFINFO_LPINTAB:
496
497 case XEN_DOMCTL_PFINFO_L2TAB:
498 case XEN_DOMCTL_PFINFO_L2TAB | XEN_DOMCTL_PFINFO_LPINTAB:
499
500 case XEN_DOMCTL_PFINFO_L3TAB:
501 case XEN_DOMCTL_PFINFO_L3TAB | XEN_DOMCTL_PFINFO_LPINTAB:
502
503 case XEN_DOMCTL_PFINFO_L4TAB:
504 case XEN_DOMCTL_PFINFO_L4TAB | XEN_DOMCTL_PFINFO_LPINTAB:
505
506 case XEN_DOMCTL_PFINFO_XALLOC:
507 return true;
508
509 case XEN_DOMCTL_PFINFO_XTAB:
510 case XEN_DOMCTL_PFINFO_BROKEN:
511 default:
512 return false;
513 }
514 }
515
page_type_has_stream_data(uint32_t type)516 static inline bool page_type_has_stream_data(uint32_t type)
517 {
518 switch ( type )
519 {
520 case XEN_DOMCTL_PFINFO_NOTAB:
521
522 case XEN_DOMCTL_PFINFO_L1TAB:
523 case XEN_DOMCTL_PFINFO_L1TAB | XEN_DOMCTL_PFINFO_LPINTAB:
524
525 case XEN_DOMCTL_PFINFO_L2TAB:
526 case XEN_DOMCTL_PFINFO_L2TAB | XEN_DOMCTL_PFINFO_LPINTAB:
527
528 case XEN_DOMCTL_PFINFO_L3TAB:
529 case XEN_DOMCTL_PFINFO_L3TAB | XEN_DOMCTL_PFINFO_LPINTAB:
530
531 case XEN_DOMCTL_PFINFO_L4TAB:
532 case XEN_DOMCTL_PFINFO_L4TAB | XEN_DOMCTL_PFINFO_LPINTAB:
533 return true;
534
535 case XEN_DOMCTL_PFINFO_XTAB:
536 case XEN_DOMCTL_PFINFO_BROKEN:
537 case XEN_DOMCTL_PFINFO_XALLOC:
538 default:
539 return false;
540 }
541 }
542
543 #endif
544 /*
545 * Local variables:
546 * mode: C
547 * c-file-style: "BSD"
548 * c-basic-offset: 4
549 * tab-width: 4
550 * indent-tabs-mode: nil
551 * End:
552 */
553