1 /*
2  * Copyright (c) 2008, XenSource Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of XenSource Inc. nor the names of its contributors
13  *       may be used to endorse or promote products derived from this software
14  *       without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
20  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28 #include <stdio.h>
29 #include <errno.h>
30 #include <fcntl.h>
31 #include <regex.h>
32 #include <unistd.h>
33 #include <stdlib.h>
34 #include <libgen.h>
35 #include <sys/mman.h>
36 #include <sys/ioctl.h>
37 #ifdef MEMSHR
38 #include <memshr.h>
39 #endif
40 
41 #include "tapdisk-image.h"
42 #include "tapdisk-driver.h"
43 #include "tapdisk-server.h"
44 #include "tapdisk-interface.h"
45 #include "tapdisk-disktype.h"
46 #include "tapdisk-vbd.h"
47 #include "blktap2.h"
48 
49 #define DBG(_level, _f, _a...) tlog_write(_level, _f, ##_a)
50 #define ERR(_err, _f, _a...) tlog_error(_err, _f, ##_a)
51 
52 #if 1
53 #define ASSERT(p)							\
54 	do {								\
55 		if (!(p)) {						\
56 			DPRINTF("Assertion '%s' failed, line %d, "	\
57 				"file %s", #p, __LINE__, __FILE__);	\
58 			abort();					\
59 		}							\
60 	} while (0)
61 #else
62 #define ASSERT(p) ((void)0)
63 #endif
64 
65 
66 #define TD_VBD_EIO_RETRIES          10
67 #define TD_VBD_EIO_SLEEP            1
68 #define TD_VBD_WATCHDOG_TIMEOUT     10
69 
70 static void tapdisk_vbd_ring_event(event_id_t, char, void *);
71 static void tapdisk_vbd_callback(void *, blkif_response_t *);
72 
73 /*
74  * initialization
75  */
76 
77 static inline void
tapdisk_vbd_initialize_vreq(td_vbd_request_t * vreq)78 tapdisk_vbd_initialize_vreq(td_vbd_request_t *vreq)
79 {
80 	memset(vreq, 0, sizeof(td_vbd_request_t));
81 	INIT_LIST_HEAD(&vreq->next);
82 }
83 
84 void
tapdisk_vbd_free(td_vbd_t * vbd)85 tapdisk_vbd_free(td_vbd_t *vbd)
86 {
87 	if (vbd) {
88 		tapdisk_vbd_free_stack(vbd);
89 		list_del_init(&vbd->next);
90 		free(vbd->name);
91 		free(vbd);
92 	}
93 }
94 
95 td_vbd_t*
tapdisk_vbd_create(uint16_t uuid)96 tapdisk_vbd_create(uint16_t uuid)
97 {
98 	td_vbd_t *vbd;
99 	int i;
100 
101 	vbd = calloc(1, sizeof(td_vbd_t));
102 	if (!vbd) {
103 		EPRINTF("failed to allocate tapdisk state\n");
104 		return NULL;
105 	}
106 
107 	vbd->uuid     = uuid;
108 	vbd->minor    = -1;
109 	vbd->ring.fd  = -1;
110 
111 	/* default blktap ring completion */
112 	vbd->callback = tapdisk_vbd_callback;
113 	vbd->argument = vbd;
114 
115 #ifdef MEMSHR
116 	memshr_vbd_initialize();
117 #endif
118 
119 	INIT_LIST_HEAD(&vbd->driver_stack);
120 	INIT_LIST_HEAD(&vbd->images);
121 	INIT_LIST_HEAD(&vbd->new_requests);
122 	INIT_LIST_HEAD(&vbd->pending_requests);
123 	INIT_LIST_HEAD(&vbd->failed_requests);
124 	INIT_LIST_HEAD(&vbd->completed_requests);
125 	INIT_LIST_HEAD(&vbd->next);
126 	gettimeofday(&vbd->ts, NULL);
127 
128 	for (i = 0; i < MAX_REQUESTS; i++)
129 		tapdisk_vbd_initialize_vreq(vbd->request_list + i);
130 
131 	return vbd;
132 }
133 
134 int
tapdisk_vbd_initialize(uint16_t uuid)135 tapdisk_vbd_initialize(uint16_t uuid)
136 {
137 	td_vbd_t *vbd;
138 
139 	vbd = tapdisk_server_get_vbd(uuid);
140 	if (vbd) {
141 		EPRINTF("duplicate vbds! %u\n", uuid);
142 		return -EEXIST;
143 	}
144 
145 	vbd = tapdisk_vbd_create(uuid);
146 
147 	tapdisk_server_add_vbd(vbd);
148 
149 	return 0;
150 }
151 
152 void
tapdisk_vbd_set_callback(td_vbd_t * vbd,td_vbd_cb_t callback,void * argument)153 tapdisk_vbd_set_callback(td_vbd_t *vbd, td_vbd_cb_t callback, void *argument)
154 {
155 	vbd->callback = callback;
156 	vbd->argument = argument;
157 }
158 
159 static int
tapdisk_vbd_validate_chain(td_vbd_t * vbd)160 tapdisk_vbd_validate_chain(td_vbd_t *vbd)
161 {
162 	int err;
163 	td_image_t *image, *parent, *tmp;
164 
165 	DPRINTF("VBD CHAIN:\n");
166 
167 	tapdisk_vbd_for_each_image(vbd, image, tmp) {
168 		DPRINTF("%s: %d\n", image->name, image->type);
169 
170 		if (tapdisk_vbd_is_last_image(vbd, image))
171 			break;
172 
173 		parent = tapdisk_vbd_next_image(image);
174 		err    = td_validate_parent(image, parent);
175 		if (err)
176 			return err;
177 	}
178 
179 	return 0;
180 }
181 
182 void
tapdisk_vbd_close_vdi(td_vbd_t * vbd)183 tapdisk_vbd_close_vdi(td_vbd_t *vbd)
184 {
185 	td_image_t *image, *tmp;
186 
187 	tapdisk_vbd_for_each_image(vbd, image, tmp) {
188 		td_close(image);
189 		tapdisk_image_free(image);
190 	}
191 
192 	INIT_LIST_HEAD(&vbd->images);
193 	td_flag_set(vbd->state, TD_VBD_CLOSED);
194 
195 	tapdisk_vbd_free_stack(vbd);
196 }
197 
198 static int
tapdisk_vbd_add_block_cache(td_vbd_t * vbd)199 tapdisk_vbd_add_block_cache(td_vbd_t *vbd)
200 {
201 	int err;
202 	td_driver_t *driver;
203 	td_image_t *cache, *image, *target, *tmp;
204 
205 	target = NULL;
206 
207 	tapdisk_vbd_for_each_image(vbd, image, tmp)
208 		if (td_flag_test(image->flags, TD_OPEN_RDONLY) &&
209 		    td_flag_test(image->flags, TD_OPEN_SHAREABLE)) {
210 			target = image;
211 			break;
212 		}
213 
214 	if (!target)
215 		return 0;
216 
217 	cache = tapdisk_image_allocate(target->name,
218 				       DISK_TYPE_BLOCK_CACHE,
219 				       target->storage,
220 				       target->flags,
221 				       target->private);
222 	if (!cache)
223 		return -ENOMEM;
224 
225 	/* try to load existing cache */
226 	err = td_load(cache);
227 	if (!err)
228 		goto done;
229 
230 	/* hack driver to send open() correct image size */
231 	if (!target->driver) {
232 		err = -ENODEV;
233 		goto fail;
234 	}
235 
236 	cache->driver = tapdisk_driver_allocate(cache->type,
237 						cache->name,
238 						cache->flags,
239 						cache->storage);
240 	if (!cache->driver) {
241 		err = -ENOMEM;
242 		goto fail;
243 	}
244 
245 	cache->driver->info = target->driver->info;
246 
247 	/* try to open new cache */
248 	err = td_open(cache);
249 	if (!err)
250 		goto done;
251 
252 fail:
253 	/* give up */
254 	tapdisk_image_free(target);
255 	return err;
256 
257 done:
258 	/* insert cache before image */
259 	list_add(&cache->next, target->next.prev);
260 	return 0;
261 }
262 
263 static int
tapdisk_vbd_add_dirty_log(td_vbd_t * vbd)264 tapdisk_vbd_add_dirty_log(td_vbd_t *vbd)
265 {
266 	int err;
267 	td_driver_t *driver;
268 	td_image_t *log, *parent;
269 
270 	driver = NULL;
271 	log    = NULL;
272 
273 	parent = tapdisk_vbd_first_image(vbd);
274 
275 	log    = tapdisk_image_allocate(parent->name,
276 					DISK_TYPE_LOG,
277 					parent->storage,
278 					parent->flags,
279 					vbd);
280 	if (!log)
281 		return -ENOMEM;
282 
283 	driver = tapdisk_driver_allocate(log->type,
284 					 log->name,
285 					 log->flags,
286 					 log->storage);
287 	if (!driver) {
288 		err = -ENOMEM;
289 		goto fail;
290 	}
291 
292 	driver->info = parent->driver->info;
293 	log->driver  = driver;
294 
295 	err = td_open(log);
296 	if (err)
297 		goto fail;
298 
299 	list_add(&log->next, &vbd->images);
300 	return 0;
301 
302 fail:
303 	tapdisk_image_free(log);
304 	return err;
305 }
306 
307 static int
tapdisk_vbd_open_level(td_vbd_t * vbd,struct list_head * head,const char * params,int driver_type,td_disk_info_t * driver_info,td_flag_t flags)308 tapdisk_vbd_open_level(td_vbd_t *vbd, struct list_head *head,
309 		       const char *params, int driver_type,
310 		       td_disk_info_t *driver_info, td_flag_t flags)
311 {
312 	const char *name;
313 	int type, err;
314 	td_image_t *image;
315 	td_disk_id_t id;
316 	td_driver_t *driver;
317 
318 	name    = params;
319 	id.name = NULL;
320 	type    = driver_type;
321 	INIT_LIST_HEAD(head);
322 
323 	for (;;) {
324 		err   = -ENOMEM;
325 		image = tapdisk_image_allocate(name, type,
326 					       vbd->storage, flags, vbd);
327 
328 		free(id.name);
329 
330 		if (!image)
331 			goto out;
332 
333 
334 		/* this breaks if a driver modifies its info within a layer */
335 		err = __td_open(image, driver_info);
336 		if (err)
337 			goto out;
338 
339 		/* TODO: non-sink drivers that don't care about their child
340 		 * currently return EINVAL. Could return TD_PARENT_OK or
341 		 * TD_ANY_PARENT */
342 
343 		err = td_get_parent_id(image, &id);
344 		if (err && (err != TD_NO_PARENT && err != -EINVAL)) {
345 			td_close(image);
346 			goto out;
347 		}
348 
349 		/* add this image to the end of the list */
350 		list_add_tail(&image->next, head);
351 		image = NULL;
352 
353 		/* if the image does not have a parent we return the
354 		 * list of images generated by this level of the stack */
355 		if (err == TD_NO_PARENT || err == -EINVAL) {
356 			err = 0;
357 			goto out;
358 		}
359 
360 		name   = id.name;
361 		type   = id.drivertype;
362 
363 		flags |= (TD_OPEN_RDONLY | TD_OPEN_SHAREABLE);
364 	}
365 
366 out:
367 	if (err) {
368 		if (image) {
369 			td_close(image);
370 			tapdisk_image_free(image);
371 		}
372 		while (!list_empty(head)) {
373 			image = list_entry(&head->next, td_image_t, next);
374 			td_close(image);
375 			tapdisk_image_free(image);
376 		}
377 	}
378 
379 	return err;
380 }
381 
382 static int
__tapdisk_vbd_open_vdi(td_vbd_t * vbd,td_flag_t extra_flags)383 __tapdisk_vbd_open_vdi(td_vbd_t *vbd, td_flag_t extra_flags)
384 {
385 	int err;
386 	td_flag_t flags;
387 	td_image_t *tmp;
388 	td_vbd_driver_info_t *driver_info;
389 	struct list_head *images;
390 	td_disk_info_t *parent_info = NULL;
391 
392 	if (list_empty(&vbd->driver_stack))
393 		return -ENOENT;
394 
395 	flags = (vbd->flags & ~TD_OPEN_SHAREABLE) | extra_flags;
396 
397 	/* loop on each user specified driver.
398 	 * NOTE: driver_info is in reverse order. That is, the first
399 	 * item is the 'parent' or 'sink' driver */
400 	list_for_each_entry(driver_info, &vbd->driver_stack, next) {
401 		LIST_HEAD(images);
402 
403 		err = tapdisk_vbd_open_level(vbd, &images,
404 					     driver_info->params,
405 					     driver_info->type,
406 					     parent_info, flags);
407 		if (err)
408 			goto fail;
409 
410 		/* after each loop,
411 		 * append the created stack to the result stack */
412 		list_splice(&images, &vbd->images);
413 
414 		/* set the parent_info to the first diskinfo on the stack */
415 		tmp = tapdisk_vbd_first_image(vbd);
416 		parent_info = &tmp->info;
417 	}
418 
419 	if (td_flag_test(vbd->flags, TD_OPEN_LOG_DIRTY)) {
420 		err = tapdisk_vbd_add_dirty_log(vbd);
421 		if (err)
422 			goto fail;
423 	}
424 
425 	if (td_flag_test(vbd->flags, TD_OPEN_ADD_CACHE)) {
426 		err = tapdisk_vbd_add_block_cache(vbd);
427 		if (err)
428 			goto fail;
429 	}
430 
431 	err = tapdisk_vbd_validate_chain(vbd);
432 	if (err)
433 		goto fail;
434 
435 	td_flag_clear(vbd->state, TD_VBD_CLOSED);
436 
437 	return 0;
438 
439 fail:
440 	tapdisk_vbd_close_vdi(vbd);
441 	return err;
442 }
443 
444 /* this populates a vbd type based on path */
445 int
tapdisk_vbd_parse_stack(td_vbd_t * vbd,const char * path)446 tapdisk_vbd_parse_stack(td_vbd_t *vbd, const char *path)
447 {
448 	int err;
449 	char *params, *driver_str;
450 	td_vbd_driver_info_t *driver;
451 
452 	err = tapdisk_namedup(&params, path);
453 	if (err)
454 		return err;
455 
456 	/* tokenize params based on pipe '|' */
457 	driver_str = strtok(params, "|");
458 	while (driver_str != NULL) {
459 		const char *path;
460 		int type;
461 
462 		/* parse driver info and add to vbd */
463 		driver = calloc(1, sizeof(td_vbd_driver_info_t));
464 		if (!driver) {
465 			PERROR("malloc");
466 			err = -errno;
467 			goto out;
468 		}
469 		INIT_LIST_HEAD(&driver->next);
470 
471 		err = tapdisk_parse_disk_type(driver_str, &path, &type);
472 		if (err) {
473 			free(driver);
474 			goto out;
475 		}
476 
477 		driver->type   = type;
478 		driver->params = strdup(path);
479 		if (!driver->params) {
480 			err = -ENOMEM;
481 			free(driver);
482 			goto out;
483 		}
484 
485 		/* build the list backwards as the last driver will be the
486 		 * first driver to open in the stack */
487 		list_add(&driver->next, &vbd->driver_stack);
488 
489 		/* get next driver string */
490 		driver_str = strtok(NULL, "|");
491 	}
492 
493 out:
494 	free(params);
495 	if (err)
496 		tapdisk_vbd_free_stack(vbd);
497 
498 	return err;
499 }
500 
501 void
tapdisk_vbd_free_stack(td_vbd_t * vbd)502 tapdisk_vbd_free_stack(td_vbd_t *vbd)
503 {
504 	td_vbd_driver_info_t *driver;
505 
506 	while (!list_empty(&vbd->driver_stack)) {
507 		driver = list_entry(vbd->driver_stack.next,
508 				    td_vbd_driver_info_t, next);
509 		list_del(&driver->next);
510 		free(driver->params);
511 		free(driver);
512 	}
513 }
514 
515 /* NOTE: driver type, etc. must be set */
516 int
tapdisk_vbd_open_stack(td_vbd_t * vbd,uint16_t storage,td_flag_t flags)517 tapdisk_vbd_open_stack(td_vbd_t *vbd, uint16_t storage, td_flag_t flags)
518 {
519 	int i, err = 0;
520 
521 	vbd->flags   = flags;
522 	vbd->storage = storage;
523 
524 	for (i = 0; i < TD_VBD_EIO_RETRIES; i++) {
525 		err = __tapdisk_vbd_open_vdi(vbd, 0);
526 		if (err != -EIO)
527 			break;
528 
529 		sleep(TD_VBD_EIO_SLEEP);
530 	}
531 	if (err)
532 		goto fail;
533 
534 	return 0;
535 
536  fail:
537 	return err;
538 }
539 
540 int
tapdisk_vbd_open_vdi(td_vbd_t * vbd,const char * path,uint16_t drivertype,uint16_t storage,td_flag_t flags)541 tapdisk_vbd_open_vdi(td_vbd_t *vbd, const char *path,
542 		     uint16_t drivertype, uint16_t storage, td_flag_t flags)
543 {
544 	int i, err;
545 	const struct tap_disk *ops;
546 
547 	ops = tapdisk_disk_drivers[drivertype];
548 	if (!ops)
549 		return -EINVAL;
550 	DPRINTF("Loaded %s driver for vbd %u %s 0x%08x\n",
551 		ops->disk_type, vbd->uuid, path, flags);
552 
553 	err = tapdisk_namedup(&vbd->name, path);
554 	if (err)
555 		return err;
556 
557 	vbd->flags   = flags;
558 	vbd->storage = storage;
559 
560 	for (i = 0; i < TD_VBD_EIO_RETRIES; i++) {
561 		err = __tapdisk_vbd_open_vdi(vbd, 0);
562 		if (err != -EIO)
563 			break;
564 
565 		sleep(TD_VBD_EIO_SLEEP);
566 	}
567 	if (err)
568 		goto fail;
569 
570 	return 0;
571 
572 fail:
573 	free(vbd->name);
574 	vbd->name = NULL;
575 	return err;
576 }
577 
578 static int
tapdisk_vbd_register_event_watches(td_vbd_t * vbd)579 tapdisk_vbd_register_event_watches(td_vbd_t *vbd)
580 {
581 	event_id_t id;
582 
583 	id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
584 					   vbd->ring.fd, 0,
585 					   tapdisk_vbd_ring_event, vbd);
586 	if (id < 0)
587 		return id;
588 
589 	vbd->ring_event_id = id;
590 
591 	return 0;
592 }
593 
594 static void
tapdisk_vbd_unregister_events(td_vbd_t * vbd)595 tapdisk_vbd_unregister_events(td_vbd_t *vbd)
596 {
597 	if (vbd->ring_event_id)
598 		tapdisk_server_unregister_event(vbd->ring_event_id);
599 }
600 
601 static int
tapdisk_vbd_map_device(td_vbd_t * vbd,const char * devname)602 tapdisk_vbd_map_device(td_vbd_t *vbd, const char *devname)
603 {
604 
605 	int err, psize;
606 	td_ring_t *ring;
607 
608 	ring  = &vbd->ring;
609 	psize = getpagesize();
610 
611 	ring->fd = open(devname, O_RDWR);
612 	if (ring->fd == -1) {
613 		err = -errno;
614 		EPRINTF("failed to open %s: %d\n", devname, err);
615 		goto fail;
616 	}
617 
618 	ring->mem = mmap(0, psize * BLKTAP_MMAP_REGION_SIZE,
619 			 PROT_READ | PROT_WRITE, MAP_SHARED, ring->fd, 0);
620 	if (ring->mem == MAP_FAILED) {
621 		err = -errno;
622 		EPRINTF("failed to mmap %s: %d\n", devname, err);
623 		goto fail;
624 	}
625 
626 	ring->sring = (blkif_sring_t *)((unsigned long)ring->mem);
627 	BACK_RING_INIT(&ring->fe_ring, ring->sring, psize);
628 
629 	ring->vstart =
630 		(unsigned long)ring->mem + (BLKTAP_RING_PAGES * psize);
631 
632 	ioctl(ring->fd, BLKTAP_IOCTL_SETMODE, BLKTAP_MODE_INTERPOSE);
633 
634 	return 0;
635 
636 fail:
637 	if (ring->mem && ring->mem != MAP_FAILED)
638 		munmap(ring->mem, psize * BLKTAP_MMAP_REGION_SIZE);
639 	if (ring->fd != -1)
640 		close(ring->fd);
641 	ring->fd  = -1;
642 	ring->mem = NULL;
643 	return err;
644 }
645 
646 static int
tapdisk_vbd_unmap_device(td_vbd_t * vbd)647 tapdisk_vbd_unmap_device(td_vbd_t *vbd)
648 {
649 	int psize;
650 
651 	psize = getpagesize();
652 
653 	if (vbd->ring.fd != -1)
654 		close(vbd->ring.fd);
655 	if (vbd->ring.mem > 0)
656 		munmap(vbd->ring.mem, psize * BLKTAP_MMAP_REGION_SIZE);
657 
658 	return 0;
659 }
660 
661 void
tapdisk_vbd_detach(td_vbd_t * vbd)662 tapdisk_vbd_detach(td_vbd_t *vbd)
663 {
664 	tapdisk_vbd_unregister_events(vbd);
665 
666 	tapdisk_vbd_unmap_device(vbd);
667 	vbd->minor = -1;
668 }
669 
670 
671 int
tapdisk_vbd_attach(td_vbd_t * vbd,const char * devname,int minor)672 tapdisk_vbd_attach(td_vbd_t *vbd, const char *devname, int minor)
673 {
674 	int err;
675 
676 	err = tapdisk_vbd_map_device(vbd, devname);
677 	if (err)
678 		goto fail;
679 
680 	err = tapdisk_vbd_register_event_watches(vbd);
681 	if (err)
682 		goto fail;
683 
684 	vbd->minor = minor;
685 
686 	return 0;
687 
688 fail:
689 	tapdisk_vbd_detach(vbd);
690 
691 	return err;
692 }
693 
694 int
tapdisk_vbd_open(td_vbd_t * vbd,const char * name,uint16_t type,uint16_t storage,int minor,const char * ring,td_flag_t flags)695 tapdisk_vbd_open(td_vbd_t *vbd, const char *name, uint16_t type,
696 		 uint16_t storage, int minor, const char *ring, td_flag_t flags)
697 {
698 	int err;
699 
700 	err = tapdisk_vbd_open_stack(vbd, storage, flags);
701 	if (err)
702 		goto out;
703 
704 	err = tapdisk_vbd_attach(vbd, ring, minor);
705 	if (err)
706 		goto out;
707 
708 	return 0;
709 
710 out:
711 	tapdisk_vbd_detach(vbd);
712 	tapdisk_vbd_close_vdi(vbd);
713 	free(vbd->name);
714 	vbd->name = NULL;
715 	return err;
716 }
717 
718 static void
tapdisk_vbd_queue_count(td_vbd_t * vbd,int * new,int * pending,int * failed,int * completed)719 tapdisk_vbd_queue_count(td_vbd_t *vbd, int *new,
720 			int *pending, int *failed, int *completed)
721 {
722 	int n, p, f, c;
723 	td_vbd_request_t *vreq, *tvreq;
724 
725 	n = 0;
726 	p = 0;
727 	f = 0;
728 	c = 0;
729 
730 	tapdisk_vbd_for_each_request(vreq, tvreq, &vbd->new_requests)
731 		n++;
732 
733 	tapdisk_vbd_for_each_request(vreq, tvreq, &vbd->pending_requests)
734 		p++;
735 
736 	tapdisk_vbd_for_each_request(vreq, tvreq, &vbd->failed_requests)
737 		f++;
738 
739 	tapdisk_vbd_for_each_request(vreq, tvreq, &vbd->completed_requests)
740 		c++;
741 
742 	*new       = n;
743 	*pending   = p;
744 	*failed    = f;
745 	*completed = c;
746 }
747 
748 static int
tapdisk_vbd_shutdown(td_vbd_t * vbd)749 tapdisk_vbd_shutdown(td_vbd_t *vbd)
750 {
751 	int new, pending, failed, completed;
752 
753 	if (!list_empty(&vbd->pending_requests))
754 		return -EAGAIN;
755 
756 	tapdisk_vbd_kick(vbd);
757 	tapdisk_vbd_queue_count(vbd, &new, &pending, &failed, &completed);
758 
759 	DPRINTF("%s: state: 0x%08x, new: 0x%02x, pending: 0x%02x, "
760 		"failed: 0x%02x, completed: 0x%02x\n",
761 		vbd->name, vbd->state, new, pending, failed, completed);
762 	DPRINTF("last activity: %010ld.%06lld, errors: 0x%04"PRIx64", "
763 		"retries: 0x%04"PRIx64", received: 0x%08"PRIx64", "
764 		"returned: 0x%08"PRIx64", kicked: 0x%08"PRIx64"\n",
765 		vbd->ts.tv_sec, (unsigned long long)vbd->ts.tv_usec,
766 		vbd->errors, vbd->retries, vbd->received, vbd->returned,
767 		vbd->kicked);
768 
769 	tapdisk_vbd_close_vdi(vbd);
770 	tapdisk_vbd_detach(vbd);
771 	tapdisk_server_remove_vbd(vbd);
772 	tapdisk_vbd_free(vbd);
773 
774 	tlog_print_errors();
775 
776 	return 0;
777 }
778 
779 int
tapdisk_vbd_close(td_vbd_t * vbd)780 tapdisk_vbd_close(td_vbd_t *vbd)
781 {
782 	/*
783 	 * don't close if any requests are pending in the aio layer
784 	 */
785 	if (!list_empty(&vbd->pending_requests))
786 		goto fail;
787 
788 	/*
789 	 * if the queue is still active and we have more
790 	 * requests, try to complete them before closing.
791 	 */
792 	if (tapdisk_vbd_queue_ready(vbd) &&
793 	    (!list_empty(&vbd->new_requests) ||
794 	     !list_empty(&vbd->failed_requests) ||
795 	     !list_empty(&vbd->completed_requests)))
796 		goto fail;
797 
798 	return tapdisk_vbd_shutdown(vbd);
799 
800 fail:
801 	td_flag_set(vbd->state, TD_VBD_SHUTDOWN_REQUESTED);
802 	DBG(TLOG_WARN, "%s: requests pending\n", vbd->name);
803 	return -EAGAIN;
804 }
805 
806 /*
807  * control operations
808  */
809 
810 void
tapdisk_vbd_debug(td_vbd_t * vbd)811 tapdisk_vbd_debug(td_vbd_t *vbd)
812 {
813 	td_image_t *image, *tmp;
814 	int new, pending, failed, completed;
815 
816 	tapdisk_vbd_queue_count(vbd, &new, &pending, &failed, &completed);
817 
818 	DBG(TLOG_WARN, "%s: state: 0x%08x, new: 0x%02x, pending: 0x%02x, "
819 	    "failed: 0x%02x, completed: 0x%02x, last activity: %010ld.%06lld, "
820 	    "errors: 0x%04"PRIx64", retries: 0x%04"PRIx64", received: 0x%08"PRIx64", "
821 	    "returned: 0x%08"PRIx64", kicked: 0x%08"PRIx64"\n",
822 	    vbd->name, vbd->state, new, pending, failed, completed,
823 	    vbd->ts.tv_sec, (unsigned long long)vbd->ts.tv_usec,
824 	    vbd->errors, vbd->retries,
825 	    vbd->received, vbd->returned, vbd->kicked);
826 
827 	tapdisk_vbd_for_each_image(vbd, image, tmp)
828 		td_debug(image);
829 }
830 
831 static void
tapdisk_vbd_drop_log(td_vbd_t * vbd)832 tapdisk_vbd_drop_log(td_vbd_t *vbd)
833 {
834 	if (td_flag_test(vbd->state, TD_VBD_LOG_DROPPED))
835 		return;
836 
837 	tapdisk_vbd_debug(vbd);
838 	tlog_flush();
839 	td_flag_set(vbd->state, TD_VBD_LOG_DROPPED);
840 }
841 
842 int
tapdisk_vbd_get_image_info(td_vbd_t * vbd,image_t * img)843 tapdisk_vbd_get_image_info(td_vbd_t *vbd, image_t *img)
844 {
845 	td_image_t *image;
846 
847 	memset(img, 0, sizeof(image_t));
848 
849 	if (list_empty(&vbd->images))
850 		return -EINVAL;
851 
852 	image        = tapdisk_vbd_first_image(vbd);
853 	img->size    = image->info.size;
854 	img->secsize = image->info.sector_size;
855 	img->info    = image->info.info;
856 
857 	return 0;
858 }
859 
860 int
tapdisk_vbd_queue_ready(td_vbd_t * vbd)861 tapdisk_vbd_queue_ready(td_vbd_t *vbd)
862 {
863 	return (!td_flag_test(vbd->state, TD_VBD_DEAD) &&
864 		!td_flag_test(vbd->state, TD_VBD_CLOSED) &&
865 		!td_flag_test(vbd->state, TD_VBD_QUIESCED) &&
866 		!td_flag_test(vbd->state, TD_VBD_QUIESCE_REQUESTED));
867 }
868 
869 int
tapdisk_vbd_retry_needed(td_vbd_t * vbd)870 tapdisk_vbd_retry_needed(td_vbd_t *vbd)
871 {
872 	return td_flag_test(vbd->state, TD_VBD_RETRY_NEEDED);
873 }
874 
875 int
tapdisk_vbd_lock(td_vbd_t * vbd)876 tapdisk_vbd_lock(td_vbd_t *vbd)
877 {
878 	return 0;
879 }
880 
881 int
tapdisk_vbd_quiesce_queue(td_vbd_t * vbd)882 tapdisk_vbd_quiesce_queue(td_vbd_t *vbd)
883 {
884 	if (!list_empty(&vbd->pending_requests)) {
885 		td_flag_set(vbd->state, TD_VBD_QUIESCE_REQUESTED);
886 		return -EAGAIN;
887 	}
888 
889 	td_flag_clear(vbd->state, TD_VBD_QUIESCE_REQUESTED);
890 	td_flag_set(vbd->state, TD_VBD_QUIESCED);
891 	return 0;
892 }
893 
894 int
tapdisk_vbd_start_queue(td_vbd_t * vbd)895 tapdisk_vbd_start_queue(td_vbd_t *vbd)
896 {
897 	td_flag_clear(vbd->state, TD_VBD_QUIESCED);
898 	td_flag_clear(vbd->state, TD_VBD_QUIESCE_REQUESTED);
899 	return 0;
900 }
901 
902 int
tapdisk_vbd_kill_queue(td_vbd_t * vbd)903 tapdisk_vbd_kill_queue(td_vbd_t *vbd)
904 {
905 	tapdisk_vbd_quiesce_queue(vbd);
906 	td_flag_set(vbd->state, TD_VBD_DEAD);
907 	return 0;
908 }
909 
910 static int
tapdisk_vbd_open_image(td_vbd_t * vbd,td_image_t * image)911 tapdisk_vbd_open_image(td_vbd_t *vbd, td_image_t *image)
912 {
913 	int err;
914 	td_image_t *parent;
915 
916 	err = td_open(image);
917 	if (err)
918 		return err;
919 
920 	if (!tapdisk_vbd_is_last_image(vbd, image)) {
921 		parent = tapdisk_vbd_next_image(image);
922 		err    = td_validate_parent(image, parent);
923 		if (err) {
924 			td_close(image);
925 			return err;
926 		}
927 	}
928 
929 	return 0;
930 }
931 
932 static int
tapdisk_vbd_close_and_reopen_image(td_vbd_t * vbd,td_image_t * image)933 tapdisk_vbd_close_and_reopen_image(td_vbd_t *vbd, td_image_t *image)
934 {
935 	int i, err = 0;
936 
937 	td_close(image);
938 
939 	for (i = 0; i < TD_VBD_EIO_RETRIES; i++) {
940 		err = tapdisk_vbd_open_image(vbd, image);
941 		if (err != -EIO)
942 			break;
943 
944 		sleep(TD_VBD_EIO_SLEEP);
945 	}
946 
947 	if (err)
948 		td_flag_set(vbd->state, TD_VBD_CLOSED);
949 
950 	return err;
951 }
952 
953 int
tapdisk_vbd_pause(td_vbd_t * vbd)954 tapdisk_vbd_pause(td_vbd_t *vbd)
955 {
956 	int err;
957 
958 	td_flag_set(vbd->state, TD_VBD_PAUSE_REQUESTED);
959 
960 	err = tapdisk_vbd_quiesce_queue(vbd);
961 	if (err)
962 		return err;
963 
964 	tapdisk_vbd_close_vdi(vbd);
965 
966 	td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED);
967 	td_flag_set(vbd->state, TD_VBD_PAUSED);
968 
969 	return 0;
970 }
971 
972 int
tapdisk_vbd_resume(td_vbd_t * vbd,const char * path,uint16_t drivertype)973 tapdisk_vbd_resume(td_vbd_t *vbd, const char *path, uint16_t drivertype)
974 {
975 	int i, err = 0;
976 
977 	if (!td_flag_test(vbd->state, TD_VBD_PAUSED)) {
978 		EPRINTF("resume request for unpaused vbd %s\n", vbd->name);
979 		return -EINVAL;
980 	}
981 
982 	if (path) {
983 		free(vbd->name);
984 		vbd->name = strdup(path);
985 		if (!vbd->name) {
986 			EPRINTF("copying new vbd %s name failed\n", path);
987 			return -EINVAL;
988 		}
989 	}
990 
991 	for (i = 0; i < TD_VBD_EIO_RETRIES; i++) {
992 		err = __tapdisk_vbd_open_vdi(vbd, TD_OPEN_STRICT);
993 		if (err != -EIO)
994 			break;
995 
996 		sleep(TD_VBD_EIO_SLEEP);
997 	}
998 
999 	if (err)
1000 		return err;
1001 
1002 	tapdisk_vbd_start_queue(vbd);
1003 	td_flag_clear(vbd->state, TD_VBD_PAUSED);
1004 	td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED);
1005 	tapdisk_vbd_check_state(vbd);
1006 
1007 	return 0;
1008 }
1009 
1010 int
tapdisk_vbd_kick(td_vbd_t * vbd)1011 tapdisk_vbd_kick(td_vbd_t *vbd)
1012 {
1013 	int n;
1014 	td_ring_t *ring;
1015 
1016 	tapdisk_vbd_check_state(vbd);
1017 
1018 	ring = &vbd->ring;
1019 	if (!ring->sring)
1020 		return 0;
1021 
1022 	n    = (ring->fe_ring.rsp_prod_pvt - ring->fe_ring.sring->rsp_prod);
1023 	if (!n)
1024 		return 0;
1025 
1026 	vbd->kicked += n;
1027 	RING_PUSH_RESPONSES(&ring->fe_ring);
1028 	ioctl(ring->fd, BLKTAP_IOCTL_KICK_FE, 0);
1029 
1030 	DBG(TLOG_INFO, "kicking %d: rec: 0x%08"PRIx64", ret: 0x%08"PRIx64", kicked: "
1031 	    "0x%08"PRIx64"\n", n, vbd->received, vbd->returned, vbd->kicked);
1032 
1033 	return n;
1034 }
1035 
1036 static inline void
tapdisk_vbd_write_response_to_ring(td_vbd_t * vbd,blkif_response_t * rsp)1037 tapdisk_vbd_write_response_to_ring(td_vbd_t *vbd, blkif_response_t *rsp)
1038 {
1039 	td_ring_t *ring;
1040 	blkif_response_t *rspp;
1041 
1042 	ring = &vbd->ring;
1043 	rspp = RING_GET_RESPONSE(&ring->fe_ring, ring->fe_ring.rsp_prod_pvt);
1044 	memcpy(rspp, rsp, sizeof(blkif_response_t));
1045 	ring->fe_ring.rsp_prod_pvt++;
1046 }
1047 
1048 static void
tapdisk_vbd_callback(void * arg,blkif_response_t * rsp)1049 tapdisk_vbd_callback(void *arg, blkif_response_t *rsp)
1050 {
1051 	td_vbd_t *vbd = (td_vbd_t *)arg;
1052 	tapdisk_vbd_write_response_to_ring(vbd, rsp);
1053 }
1054 
1055 static void
tapdisk_vbd_make_response(td_vbd_t * vbd,td_vbd_request_t * vreq)1056 tapdisk_vbd_make_response(td_vbd_t *vbd, td_vbd_request_t *vreq)
1057 {
1058 	blkif_request_t tmp;
1059 	blkif_response_t *rsp;
1060 
1061 	tmp = vreq->req;
1062 	rsp = (blkif_response_t *)&vreq->req;
1063 
1064 	rsp->id = tmp.id;
1065 	rsp->operation = tmp.operation;
1066 	rsp->status = vreq->status;
1067 
1068 	DBG(TLOG_DBG, "writing req %d, sec 0x%08"PRIx64", res %d to ring\n",
1069 	    (int)tmp.id, tmp.sector_number, vreq->status);
1070 
1071 	if (rsp->status != BLKIF_RSP_OKAY)
1072 		ERR(EIO, "returning BLKIF_RSP %d", rsp->status);
1073 
1074 	vbd->returned++;
1075 	vbd->callback(vbd->argument, rsp);
1076 }
1077 
1078 void
tapdisk_vbd_check_state(td_vbd_t * vbd)1079 tapdisk_vbd_check_state(td_vbd_t *vbd)
1080 {
1081 	td_vbd_request_t *vreq, *tmp;
1082 
1083 	tapdisk_vbd_for_each_request(vreq, tmp, &vbd->failed_requests)
1084 		if (vreq->num_retries >= TD_VBD_MAX_RETRIES)
1085 			tapdisk_vbd_complete_vbd_request(vbd, vreq);
1086 
1087 	if (!list_empty(&vbd->new_requests) ||
1088 	    !list_empty(&vbd->failed_requests))
1089 		tapdisk_vbd_issue_requests(vbd);
1090 
1091 	tapdisk_vbd_for_each_request(vreq, tmp, &vbd->completed_requests) {
1092 		tapdisk_vbd_make_response(vbd, vreq);
1093 		list_del(&vreq->next);
1094 		tapdisk_vbd_initialize_vreq(vreq);
1095 	}
1096 
1097 	if (td_flag_test(vbd->state, TD_VBD_QUIESCE_REQUESTED))
1098 		tapdisk_vbd_quiesce_queue(vbd);
1099 
1100 	if (td_flag_test(vbd->state, TD_VBD_PAUSE_REQUESTED))
1101 		tapdisk_vbd_pause(vbd);
1102 
1103 	if (td_flag_test(vbd->state, TD_VBD_SHUTDOWN_REQUESTED))
1104 		tapdisk_vbd_close(vbd);
1105 }
1106 
1107 void
tapdisk_vbd_check_progress(td_vbd_t * vbd)1108 tapdisk_vbd_check_progress(td_vbd_t *vbd)
1109 {
1110 	int diff;
1111 	struct timeval now;
1112 
1113 	if (list_empty(&vbd->pending_requests))
1114 		return;
1115 
1116 	gettimeofday(&now, NULL);
1117 	diff = now.tv_sec - vbd->ts.tv_sec;
1118 
1119 	if (diff >= TD_VBD_WATCHDOG_TIMEOUT) {
1120 		DBG(TLOG_WARN, "%s: watchdog timeout: pending requests "
1121 		    "idle for %d seconds\n", vbd->name, diff);
1122 		tapdisk_vbd_drop_log(vbd);
1123 		return;
1124 	}
1125 
1126 	tapdisk_server_set_max_timeout(TD_VBD_WATCHDOG_TIMEOUT - diff);
1127 }
1128 
1129 /*
1130  * request submission
1131  */
1132 
1133 static int
tapdisk_vbd_check_queue(td_vbd_t * vbd)1134 tapdisk_vbd_check_queue(td_vbd_t *vbd)
1135 {
1136 	int err;
1137 	td_image_t *image;
1138 
1139 	if (list_empty(&vbd->images))
1140 		return -ENOSYS;
1141 
1142 	if (!tapdisk_vbd_queue_ready(vbd))
1143 		return -EAGAIN;
1144 
1145 	if (!vbd->reopened) {
1146 		if (td_flag_test(vbd->state, TD_VBD_LOCKING)) {
1147 			err = tapdisk_vbd_lock(vbd);
1148 			if (err)
1149 				return err;
1150 		}
1151 
1152 		image = tapdisk_vbd_first_image(vbd);
1153 		td_flag_set(image->flags, TD_OPEN_STRICT);
1154 
1155 		if (tapdisk_vbd_close_and_reopen_image(vbd, image))
1156 			EPRINTF("reopening disks failed\n");
1157 		else {
1158 			DPRINTF("reopening disks succeeded\n");
1159 			vbd->reopened = 1;
1160 		}
1161 	}
1162 
1163 	return 0;
1164 }
1165 
1166 void
tapdisk_vbd_complete_vbd_request(td_vbd_t * vbd,td_vbd_request_t * vreq)1167 tapdisk_vbd_complete_vbd_request(td_vbd_t *vbd, td_vbd_request_t *vreq)
1168 {
1169 	if (!vreq->submitting && !vreq->secs_pending) {
1170 		if (vreq->status == BLKIF_RSP_ERROR &&
1171 		    vreq->num_retries < TD_VBD_MAX_RETRIES &&
1172 		    !td_flag_test(vbd->state, TD_VBD_DEAD) &&
1173 		    !td_flag_test(vbd->state, TD_VBD_SHUTDOWN_REQUESTED))
1174 			tapdisk_vbd_move_request(vreq, &vbd->failed_requests);
1175 		else
1176 			tapdisk_vbd_move_request(vreq, &vbd->completed_requests);
1177 	}
1178 }
1179 
1180 static uint64_t
tapdisk_vbd_breq_get_sector(blkif_request_t * breq,td_request_t treq)1181 tapdisk_vbd_breq_get_sector(blkif_request_t *breq, td_request_t treq)
1182 {
1183     int seg, nsects;
1184     uint64_t sector_nr = breq->sector_number;
1185 
1186     for(seg=0; seg < treq.sidx; seg++) {
1187         nsects = breq->seg[seg].last_sect - breq->seg[seg].first_sect + 1;
1188         sector_nr += nsects;
1189     }
1190 
1191     return sector_nr;
1192 }
1193 
1194 static void
__tapdisk_vbd_complete_td_request(td_vbd_t * vbd,td_vbd_request_t * vreq,td_request_t treq,int res)1195 __tapdisk_vbd_complete_td_request(td_vbd_t *vbd, td_vbd_request_t *vreq,
1196 				  td_request_t treq, int res)
1197 {
1198 	int err;
1199     td_image_t *image = treq.image;
1200 
1201 	err = (res <= 0 ? res : -res);
1202 	vbd->secs_pending  -= treq.secs;
1203 	vreq->secs_pending -= treq.secs;
1204 
1205 	vreq->blocked = treq.blocked;
1206 
1207 	if (err) {
1208 		vreq->status = BLKIF_RSP_ERROR;
1209 		vreq->error  = (vreq->error ? : err);
1210 		if (err != -EBUSY) {
1211 			vbd->errors++;
1212 			ERR(err, "req %"PRIu64": %s 0x%04x secs to "
1213 			    "0x%08"PRIx64, vreq->req.id,
1214 			    (treq.op == TD_OP_WRITE ? "write" : "read"),
1215 			    treq.secs, treq.sec);
1216 		}
1217 	} else {
1218 #ifdef MEMSHR
1219 		if (treq.op == TD_OP_READ
1220 		   && td_flag_test(image->flags, TD_OPEN_RDONLY)) {
1221 			share_tuple_t hnd = treq.memshr_hnd;
1222 			uint16_t uid  = image->memshr_id;
1223 			blkif_request_t *breq = &vreq->req;
1224 			uint64_t sec  = tapdisk_vbd_breq_get_sector(breq, treq);
1225 			int secs = breq->seg[treq.sidx].last_sect -
1226 			    breq->seg[treq.sidx].first_sect + 1;
1227 
1228 			if (hnd.handle != 0)
1229 				memshr_vbd_complete_ro_request(hnd, uid,
1230 								sec, secs);
1231 		}
1232 #endif
1233 	}
1234 
1235 	tapdisk_vbd_complete_vbd_request(vbd, vreq);
1236 }
1237 
1238 static void
__tapdisk_vbd_reissue_td_request(td_vbd_t * vbd,td_image_t * image,td_request_t treq)1239 __tapdisk_vbd_reissue_td_request(td_vbd_t *vbd,
1240 				 td_image_t *image, td_request_t treq)
1241 {
1242 	td_image_t *parent;
1243 	td_vbd_request_t *vreq;
1244 
1245 	vreq = (td_vbd_request_t *)treq.private;
1246 	gettimeofday(&vreq->last_try, NULL);
1247 
1248 	vreq->submitting++;
1249 
1250 	if (tapdisk_vbd_is_last_image(vbd, image)) {
1251 		memset(treq.buf, 0, treq.secs << SECTOR_SHIFT);
1252 		td_complete_request(treq, 0);
1253 		goto done;
1254 	}
1255 
1256 	parent     = tapdisk_vbd_next_image(image);
1257 	treq.image = parent;
1258 
1259 	/* return zeros for requests that extend beyond end of parent image */
1260 	if (treq.sec + treq.secs > parent->info.size) {
1261 		td_request_t clone  = treq;
1262 
1263 		if (parent->info.size > treq.sec) {
1264 			int secs    = parent->info.size - treq.sec;
1265 			clone.sec  += secs;
1266 			clone.secs -= secs;
1267 			clone.buf  += (secs << SECTOR_SHIFT);
1268 			treq.secs   = secs;
1269 		} else
1270 			treq.secs   = 0;
1271 
1272 		memset(clone.buf, 0, clone.secs << SECTOR_SHIFT);
1273 		td_complete_request(clone, 0);
1274 
1275 		if (!treq.secs)
1276 			goto done;
1277 	}
1278 
1279 	switch (treq.op) {
1280 	case TD_OP_WRITE:
1281 		td_queue_write(parent, treq);
1282 		break;
1283 
1284 	case TD_OP_READ:
1285 #ifdef MEMSHR
1286 		if(td_flag_test(parent->flags, TD_OPEN_RDONLY)) {
1287 			int ret, seg = treq.sidx;
1288 			blkif_request_t *breq = &vreq->req;
1289 
1290 			ret = memshr_vbd_issue_ro_request(treq.buf,
1291 			      breq->seg[seg].gref,
1292 			      parent->memshr_id,
1293 			      treq.sec,
1294 			      treq.secs,
1295 			      &treq.memshr_hnd);
1296 			if(ret == 0) {
1297 				/* Reset memshr handle. This'll prevent
1298 				 * memshr_vbd_complete_ro_request being called
1299 				 */
1300 				treq.memshr_hnd.handle = 0;
1301 				td_complete_request(treq, 0);
1302 			} else
1303 				td_queue_read(parent, treq);
1304 		} else
1305 #endif
1306 			td_queue_read(parent, treq);
1307 		break;
1308 	}
1309 
1310 done:
1311 	vreq->submitting--;
1312 	if (!vreq->secs_pending)
1313 		tapdisk_vbd_complete_vbd_request(vbd, vreq);
1314 }
1315 
1316 void
tapdisk_vbd_forward_request(td_request_t treq)1317 tapdisk_vbd_forward_request(td_request_t treq)
1318 {
1319 	td_vbd_t *vbd;
1320 	td_image_t *image;
1321 	td_vbd_request_t *vreq;
1322 
1323 	image = treq.image;
1324 	vbd   = (td_vbd_t *)image->private;
1325 	vreq  = (td_vbd_request_t *)treq.private;
1326 
1327 	gettimeofday(&vbd->ts, NULL);
1328 
1329 	if (tapdisk_vbd_queue_ready(vbd))
1330 		__tapdisk_vbd_reissue_td_request(vbd, image, treq);
1331 	else
1332 		__tapdisk_vbd_complete_td_request(vbd, vreq, treq, -EIO);
1333 }
1334 
1335 static void
tapdisk_vbd_complete_td_request(td_request_t treq,int res)1336 tapdisk_vbd_complete_td_request(td_request_t treq, int res)
1337 {
1338 	td_vbd_t *vbd;
1339 	td_image_t *image;
1340 	td_vbd_request_t *vreq;
1341 
1342 	image = treq.image;
1343 	vbd   = (td_vbd_t *)image->private;
1344 	vreq  = (td_vbd_request_t *)treq.private;
1345 
1346 	gettimeofday(&vbd->ts, NULL);
1347 	DBG(TLOG_DBG, "%s: req %d seg %d sec 0x%08"PRIx64" "
1348 	    "secs 0x%04x buf %p op %d res %d\n", image->name,
1349 	    (int)treq.id, treq.sidx, treq.sec, treq.secs,
1350 	    treq.buf, (int)vreq->req.operation, res);
1351 
1352 	__tapdisk_vbd_complete_td_request(vbd, vreq, treq, res);
1353 }
1354 
1355 static int
tapdisk_vbd_issue_request(td_vbd_t * vbd,td_vbd_request_t * vreq)1356 tapdisk_vbd_issue_request(td_vbd_t *vbd, td_vbd_request_t *vreq)
1357 {
1358 	char *page;
1359 	td_ring_t *ring;
1360 	td_image_t *image;
1361 	td_request_t treq;
1362 	uint64_t sector_nr;
1363 	blkif_request_t *req;
1364 	int i, err, id, nsects;
1365 
1366 	req       = &vreq->req;
1367 	id        = req->id;
1368 	ring      = &vbd->ring;
1369 	sector_nr = req->sector_number;
1370 	image     = tapdisk_vbd_first_image(vbd);
1371 
1372 	vreq->submitting = 1;
1373 	gettimeofday(&vbd->ts, NULL);
1374 	gettimeofday(&vreq->last_try, NULL);
1375 	tapdisk_vbd_move_request(vreq, &vbd->pending_requests);
1376 
1377 #if 0
1378 	err = tapdisk_vbd_check_queue(vbd);
1379 	if (err)
1380 		goto fail;
1381 #endif
1382 
1383 	err = tapdisk_image_check_ring_request(image, req);
1384 	if (err)
1385 		goto fail;
1386 
1387 	for (i = 0; i < req->nr_segments; i++) {
1388 		nsects = req->seg[i].last_sect - req->seg[i].first_sect + 1;
1389 		page   = (char *)MMAP_VADDR(ring->vstart,
1390 					   (unsigned long)req->id, i);
1391 		page  += (req->seg[i].first_sect << SECTOR_SHIFT);
1392 
1393 		treq.id             = id;
1394 		treq.sidx           = i;
1395 		treq.blocked        = 0;
1396 		treq.buf            = page;
1397 		treq.sec            = sector_nr;
1398 		treq.secs           = nsects;
1399 		treq.image          = image;
1400 		treq.cb             = tapdisk_vbd_complete_td_request;
1401 		treq.cb_data        = NULL;
1402 		treq.private        = vreq;
1403 
1404 		DBG(TLOG_DBG, "%s: req %d seg %d sec 0x%08"PRIx64" secs 0x%04x "
1405 		    "buf %p op %d\n", image->name, id, i, treq.sec, treq.secs,
1406 		    treq.buf, (int)req->operation);
1407 
1408 		vreq->secs_pending += nsects;
1409 		vbd->secs_pending  += nsects;
1410 
1411 		switch (req->operation)	{
1412 		case BLKIF_OP_WRITE:
1413 			treq.op = TD_OP_WRITE;
1414 			td_queue_write(image, treq);
1415 			break;
1416 
1417 		case BLKIF_OP_READ:
1418 			treq.op = TD_OP_READ;
1419 			td_queue_read(image, treq);
1420 			break;
1421 		}
1422 
1423 		sector_nr += nsects;
1424 	}
1425 
1426 	err = 0;
1427 
1428 out:
1429 	vreq->submitting--;
1430 	if (!vreq->secs_pending) {
1431 		err = (err ? : vreq->error);
1432 		tapdisk_vbd_complete_vbd_request(vbd, vreq);
1433 	}
1434 
1435 	return err;
1436 
1437 fail:
1438 	vreq->status = BLKIF_RSP_ERROR;
1439 	goto out;
1440 }
1441 
1442 static int
tapdisk_vbd_reissue_failed_requests(td_vbd_t * vbd)1443 tapdisk_vbd_reissue_failed_requests(td_vbd_t *vbd)
1444 {
1445 	int err;
1446 	struct timeval now;
1447 	td_vbd_request_t *vreq, *tmp;
1448 
1449 	err = 0;
1450 	gettimeofday(&now, NULL);
1451 
1452 	tapdisk_vbd_for_each_request(vreq, tmp, &vbd->failed_requests) {
1453 		if (vreq->secs_pending)
1454 			continue;
1455 
1456 		if (td_flag_test(vbd->state, TD_VBD_SHUTDOWN_REQUESTED))
1457 			goto fail;
1458 
1459 		if (vreq->error != -EBUSY &&
1460 		    now.tv_sec - vreq->last_try.tv_sec < TD_VBD_RETRY_INTERVAL)
1461 			continue;
1462 
1463 		if (vreq->num_retries >= TD_VBD_MAX_RETRIES) {
1464 		fail:
1465 			DBG(TLOG_INFO, "req %"PRIu64"retried %d times\n",
1466 			    vreq->req.id, vreq->num_retries);
1467 			tapdisk_vbd_complete_vbd_request(vbd, vreq);
1468 			continue;
1469 		}
1470 
1471 		/*
1472 		 * never fail due to too many retries if we are blocked on a
1473 		 * dependency
1474 		 */
1475 		if (vreq->blocked) {
1476 			vreq->blocked = 0;
1477 		} else {
1478 			vbd->retries++;
1479 			vreq->num_retries++;
1480 		}
1481 		vreq->error  = 0;
1482 		vreq->status = BLKIF_RSP_OKAY;
1483 		DBG(TLOG_DBG, "retry #%d of req %"PRIu64", "
1484 		    "sec 0x%08"PRIx64", nr_segs: %d\n", vreq->num_retries,
1485 		    vreq->req.id, vreq->req.sector_number,
1486 		    vreq->req.nr_segments);
1487 
1488 		err = tapdisk_vbd_issue_request(vbd, vreq);
1489 		if (err)
1490 			break;
1491 	}
1492 
1493 	if (list_empty(&vbd->failed_requests))
1494 		td_flag_clear(vbd->state, TD_VBD_RETRY_NEEDED);
1495 	else
1496 		td_flag_set(vbd->state, TD_VBD_RETRY_NEEDED);
1497 
1498 	return err;
1499 }
1500 
1501 static int
tapdisk_vbd_issue_new_requests(td_vbd_t * vbd)1502 tapdisk_vbd_issue_new_requests(td_vbd_t *vbd)
1503 {
1504 	int err;
1505 	td_vbd_request_t *vreq, *tmp;
1506 
1507 	tapdisk_vbd_for_each_request(vreq, tmp, &vbd->new_requests) {
1508 		err = tapdisk_vbd_issue_request(vbd, vreq);
1509 		if (err)
1510 			return err;
1511 	}
1512 
1513 	return 0;
1514 }
1515 
1516 static int
tapdisk_vbd_kill_requests(td_vbd_t * vbd)1517 tapdisk_vbd_kill_requests(td_vbd_t *vbd)
1518 {
1519 	td_vbd_request_t *vreq, *tmp;
1520 
1521 	tapdisk_vbd_for_each_request(vreq, tmp, &vbd->new_requests) {
1522 		vreq->status = BLKIF_RSP_ERROR;
1523 		tapdisk_vbd_move_request(vreq, &vbd->completed_requests);
1524 	}
1525 
1526 	tapdisk_vbd_for_each_request(vreq, tmp, &vbd->failed_requests) {
1527 		vreq->status = BLKIF_RSP_ERROR;
1528 		tapdisk_vbd_move_request(vreq, &vbd->completed_requests);
1529 	}
1530 
1531 	return 0;
1532 }
1533 
1534 int
tapdisk_vbd_issue_requests(td_vbd_t * vbd)1535 tapdisk_vbd_issue_requests(td_vbd_t *vbd)
1536 {
1537 	int err;
1538 
1539 	if (td_flag_test(vbd->state, TD_VBD_DEAD))
1540 		return tapdisk_vbd_kill_requests(vbd);
1541 
1542 	if (!tapdisk_vbd_queue_ready(vbd))
1543 		return -EAGAIN;
1544 
1545 	err = tapdisk_vbd_reissue_failed_requests(vbd);
1546 	if (err)
1547 		return err;
1548 
1549 	return tapdisk_vbd_issue_new_requests(vbd);
1550 }
1551 
1552 static void
tapdisk_vbd_pull_ring_requests(td_vbd_t * vbd)1553 tapdisk_vbd_pull_ring_requests(td_vbd_t *vbd)
1554 {
1555 	int idx;
1556 	RING_IDX rp, rc;
1557 	td_ring_t *ring;
1558 	blkif_request_t *req;
1559 	td_vbd_request_t *vreq;
1560 
1561 	ring = &vbd->ring;
1562 	if (!ring->sring)
1563 		return;
1564 
1565 	rp   = ring->fe_ring.sring->req_prod;
1566 	xen_rmb();
1567 
1568 	for (rc = ring->fe_ring.req_cons; rc != rp; rc++) {
1569 		req = RING_GET_REQUEST(&ring->fe_ring, rc);
1570 		++ring->fe_ring.req_cons;
1571 
1572 		idx  = req->id;
1573 		vreq = &vbd->request_list[idx];
1574 
1575 		ASSERT(list_empty(&vreq->next));
1576 		ASSERT(vreq->secs_pending == 0);
1577 
1578 		memcpy(&vreq->req, req, sizeof(blkif_request_t));
1579 		vbd->received++;
1580 		vreq->vbd = vbd;
1581 
1582 		tapdisk_vbd_move_request(vreq, &vbd->new_requests);
1583 
1584 		DBG(TLOG_DBG, "%s: request %d \n", vbd->name, idx);
1585 	}
1586 }
1587 
1588 static int
tapdisk_vbd_pause_ring(td_vbd_t * vbd)1589 tapdisk_vbd_pause_ring(td_vbd_t *vbd)
1590 {
1591 	int err;
1592 
1593 	if (td_flag_test(vbd->state, TD_VBD_PAUSED))
1594 		return 0;
1595 
1596 	td_flag_set(vbd->state, TD_VBD_PAUSE_REQUESTED);
1597 
1598 	err = tapdisk_vbd_quiesce_queue(vbd);
1599 	if (err) {
1600 		EPRINTF("%s: ring pause request on active queue\n", vbd->name);
1601 		return err;
1602 	}
1603 
1604 	tapdisk_vbd_close_vdi(vbd);
1605 
1606 	err = ioctl(vbd->ring.fd, BLKTAP2_IOCTL_PAUSE, 0);
1607 	if (err)
1608 		EPRINTF("%s: pause ioctl failed: %d\n", vbd->name, errno);
1609 	else {
1610 		td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED);
1611 		td_flag_set(vbd->state, TD_VBD_PAUSED);
1612 	}
1613 
1614 	return err;
1615 }
1616 
1617 static int
tapdisk_vbd_resume_ring(td_vbd_t * vbd)1618 tapdisk_vbd_resume_ring(td_vbd_t *vbd)
1619 {
1620 	int i, err, type;
1621 	char message[BLKTAP2_MAX_MESSAGE_LEN];
1622 	const char *path;
1623 
1624 	memset(message, 0, sizeof(message));
1625 
1626 	if (!td_flag_test(vbd->state, TD_VBD_PAUSED)) {
1627 		EPRINTF("%s: resume message for unpaused vbd\n", vbd->name);
1628 		return -EINVAL;
1629 	}
1630 
1631 	err = ioctl(vbd->ring.fd, BLKTAP2_IOCTL_REOPEN, &message);
1632 	if (err) {
1633 		EPRINTF("%s: resume ioctl failed: %d\n", vbd->name, errno);
1634 		return err;
1635 	}
1636 
1637 	err = tapdisk_parse_disk_type(message, &path, &type);
1638 	if (err) {
1639 		EPRINTF("%s: invalid resume string %s\n", vbd->name, message);
1640 		goto out;
1641 	}
1642 
1643 	free(vbd->name);
1644 	vbd->name = strdup(path);
1645 	if (!vbd->name) {
1646 		EPRINTF("resume malloc failed\n");
1647 		err = -ENOMEM;
1648 		goto out;
1649 	}
1650 
1651 	tapdisk_vbd_start_queue(vbd);
1652 
1653 	for (i = 0; i < TD_VBD_EIO_RETRIES; i++) {
1654 		err = __tapdisk_vbd_open_vdi(vbd, TD_OPEN_STRICT);
1655 		if (err != -EIO)
1656 			break;
1657 
1658 		sleep(TD_VBD_EIO_SLEEP);
1659 	}
1660 
1661 out:
1662 	if (!err) {
1663 		image_t image;
1664 		struct blktap2_params params;
1665 
1666 		memset(&params, 0, sizeof(params));
1667 		tapdisk_vbd_get_image_info(vbd, &image);
1668 
1669 		params.sector_size = image.secsize;
1670 		params.capacity    = image.size;
1671 		snprintf(params.name, sizeof(params.name) - 1, "%s", message);
1672 
1673 		ioctl(vbd->ring.fd, BLKTAP2_IOCTL_SET_PARAMS, &params);
1674 		td_flag_clear(vbd->state, TD_VBD_PAUSED);
1675 	}
1676 
1677 	ioctl(vbd->ring.fd, BLKTAP2_IOCTL_RESUME, err);
1678 	return err;
1679 }
1680 
1681 static int
tapdisk_vbd_check_ring_message(td_vbd_t * vbd)1682 tapdisk_vbd_check_ring_message(td_vbd_t *vbd)
1683 {
1684 	if (!vbd->ring.sring)
1685 		return -EINVAL;
1686 
1687 	switch (vbd->ring.sring->pvt.tapif_user.msg) {
1688 	case 0:
1689 		return 0;
1690 
1691 	case BLKTAP2_RING_MESSAGE_PAUSE:
1692 		return tapdisk_vbd_pause_ring(vbd);
1693 
1694 	case BLKTAP2_RING_MESSAGE_RESUME:
1695 		return tapdisk_vbd_resume_ring(vbd);
1696 
1697 	case BLKTAP2_RING_MESSAGE_CLOSE:
1698 		return tapdisk_vbd_close(vbd);
1699 
1700 	default:
1701 		return -EINVAL;
1702 	}
1703 }
1704 
1705 static void
tapdisk_vbd_ring_event(event_id_t id,char mode,void * private)1706 tapdisk_vbd_ring_event(event_id_t id, char mode, void *private)
1707 {
1708 	td_vbd_t *vbd;
1709 
1710 	vbd = (td_vbd_t *)private;
1711 
1712 	tapdisk_vbd_pull_ring_requests(vbd);
1713 	tapdisk_vbd_issue_requests(vbd);
1714 
1715 	/* vbd may be destroyed after this call */
1716 	tapdisk_vbd_check_ring_message(vbd);
1717 }
1718 
1719 td_image_t *
tapdisk_vbd_first_image(td_vbd_t * vbd)1720 tapdisk_vbd_first_image(td_vbd_t *vbd)
1721 {
1722 	return list_entry(vbd->images.next, td_image_t, next);
1723 }
1724