1 /* block-qcow.c
2  *
3  * Asynchronous Qemu copy-on-write disk implementation.
4  * Code based on the Qemu implementation
5  * (see copyright notice below)
6  *
7  * (c) 2006 Andrew Warfield and Julian Chesterfield
8  *
9  */
10 
11 /*
12  * Block driver for the QCOW format
13  *
14  * Copyright (c) 2004 Fabrice Bellard
15  *
16  * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
18  * in the Software without restriction, including without limitation the rights
19  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
20  * copies of the Software, and to permit persons to whom the Software is
21  * furnished to do so, subject to the following conditions:
22  */
23 
24 #include <errno.h>
25 #include <fcntl.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <unistd.h>
29 #include <sys/statvfs.h>
30 #include <sys/stat.h>
31 #include <sys/ioctl.h>
32 #include <string.h>
33 #include <zlib.h>
34 #include <inttypes.h>
35 #include <libaio.h>
36 #include <limits.h>
37 #include "bswap.h"
38 #include "aes.h"
39 #include "md5.h"
40 
41 #include "tapdisk.h"
42 #include "tapdisk-driver.h"
43 #include "tapdisk-interface.h"
44 #include "tapdisk-disktype.h"
45 #include "qcow.h"
46 #include "blk.h"
47 #include "atomicio.h"
48 
49 /* *BSD has no O_LARGEFILE */
50 #ifndef O_LARGEFILE
51 #define O_LARGEFILE     0
52 #endif
53 
/* Compile-time switch for assertions: with "#if 1" a failed assertion
 * logs the expression, line and file, then deliberately dereferences
 * NULL to crash the process (so the failure is never silently
 * ignored).  Flip to "#if 0" to compile assertions out entirely. */
#if 1
#define ASSERT(_p) \
    if ( !(_p) ) { DPRINTF("Assertion '%s' failed, line %d, file %s", #_p , \
    __LINE__, __FILE__); *(int*)0=0; }
#else
#define ASSERT(_p) ((void)0)
#endif
61 
/* Bookkeeping for an in-flight asynchronous I/O request.
 * NOTE(review): nothing in this file references struct pending_aio;
 * it looks like a leftover from an older AIO scheme (struct
 * qcow_request is what the driver actually uses) — confirm before
 * removing. */
struct pending_aio {
        td_callback_t cb;       /* completion callback */
        int id;                 /* request identifier */
        void *private;          /* opaque callback argument */
	int nb_sectors;         /* request length in sectors */
	char *buf;              /* data buffer */
	uint64_t sector;        /* starting sector of the request */
};
70 
/* Index of an iocb within the state's iocb array (pointer arithmetic). */
#undef IOCB_IDX
#define IOCB_IDX(_s, _io) ((_io) - (_s)->iocb_list)

/* Identity on the byte value; used where a "touch the byte" expression
 * is wanted without changing it (OR with zero is a no-op). */
#define ZERO_TEST(_b) (_b | 0x00)
75 
/* One outstanding asynchronous request: the tapdisk request being
 * serviced, the libaio control block used to service it, and a back
 * pointer to the owning driver state (needed by the completion
 * callback to recycle this slot onto the free list). */
struct qcow_request {
	td_request_t         treq;   /* original tapdisk request */
	struct tiocb         tiocb;  /* AIO control block for this request */
	struct tdqcow_state  *state; /* owning driver state */
};
81 
82 static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset);
83 
/*
 * Checksum a buffer: MD5 the bytes and return the first 32 bits of
 * the digest.  Used to validate the L1 table against the extended
 * header's stored checksum.
 */
uint32_t gen_cksum(char *ptr, int len)
{
  uint32_t digest[4];

  md5_sum((const uint8_t *)ptr, len, (uint8_t *)digest);

  return digest[0];
}
94 
free_aio_state(struct tdqcow_state * s)95 static void free_aio_state(struct tdqcow_state* s)
96 {
97 	free(s->aio_requests);
98 	free(s->aio_free_list);
99 }
100 
/*
 * Allocate the pool of qcow_request slots and the free list of
 * pointers into it.  Returns 0 on success, -1 on allocation failure
 * (in which case any partial allocation is released and the state
 * pointers are left NULL).
 */
static int init_aio_state(td_driver_t *driver)
{
	int i;
	struct tdqcow_state *s = (struct tdqcow_state *)driver->data;

	/* A segment (i.e. a page) can span multiple clusters. */
	s->max_aio_reqs = ((getpagesize() / s->cluster_size) + 1) *
	  MAX_SEGMENTS_PER_REQ * MAX_REQUESTS;

	s->aio_free_count = s->max_aio_reqs;

	s->aio_requests = calloc(s->max_aio_reqs,
				 sizeof(struct qcow_request));
	/* The free list stores pointers, not whole request structs;
	 * the original sizeof(struct qcow_request) over-allocated. */
	s->aio_free_list = calloc(s->max_aio_reqs,
				  sizeof(struct qcow_request *));
	if (!s->aio_requests || !s->aio_free_list) {
		DPRINTF("Failed to allocate AIO structs (max_aio_reqs = %d)\n",
			s->max_aio_reqs);
		goto fail;
	}

	for (i = 0; i < s->max_aio_reqs; i++)
		s->aio_free_list[i] = &s->aio_requests[i];

        DPRINTF("AIO state initialised\n");

        return 0;

 fail:
	/* Don't leak the array that did get allocated. */
	free(s->aio_requests);
	free(s->aio_free_list);
	s->aio_requests  = NULL;
	s->aio_free_list = NULL;
	return -1;
}
129 
get_filesize(char * filename,uint64_t * size,struct stat * st)130 int get_filesize(char *filename, uint64_t *size, struct stat *st)
131 {
132 	int fd;
133 	QCowHeader header;
134 
135 	/*Set to the backing file size*/
136 	fd = open(filename, O_RDONLY);
137 	if (fd < 0)
138 		return -1;
139 	if (read(fd, &header, sizeof(header)) < sizeof(header)) {
140 		close(fd);
141 		return -1;
142 	}
143 	close(fd);
144 
145 	be32_to_cpus(&header.magic);
146 	be64_to_cpus(&header.size);
147 	if (header.magic == QCOW_MAGIC) {
148 		*size = header.size >> SECTOR_SHIFT;
149 		return 0;
150 	}
151 
152 	if(S_ISBLK(st->st_mode)) {
153 		fd = open(filename, O_RDONLY);
154 		if (fd < 0)
155 			return -1;
156 		if (blk_getimagesize(fd, size) != 0) {
157 			printf("Unable to get Block device size\n");
158 			close(fd);
159 			return -1;
160 		}
161 		close(fd);
162 	} else *size = (st->st_size >> SECTOR_SHIFT);
163 	return 0;
164 }
165 
qcow_set_key(struct tdqcow_state * s,const char * key)166 static int qcow_set_key(struct tdqcow_state *s, const char *key)
167 {
168 	uint8_t keybuf[16];
169 	int len, i;
170 
171 	memset(keybuf, 0, 16);
172 	len = strlen(key);
173 	if (len > 16)
174 		len = 16;
175 	/* XXX: we could compress the chars to 7 bits to increase
176 	   entropy */
177 	for (i = 0; i < len; i++) {
178 		keybuf[i] = key[i];
179 	}
180 	s->crypt_method = s->crypt_method_header;
181 
182 	if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
183 		return -1;
184 	if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
185 		return -1;
186 #if 0
187 	/* test */
188 	{
189 		uint8_t in[16];
190 		uint8_t out[16];
191 		uint8_t tmp[16];
192 		for (i=0; i<16; i++)
193 			in[i] = i;
194 		AES_encrypt(in, tmp, &s->aes_encrypt_key);
195 		AES_decrypt(tmp, out, &s->aes_decrypt_key);
196 		for (i = 0; i < 16; i++)
197 			DPRINTF(" %02x", tmp[i]);
198 		DPRINTF("\n");
199 		for (i = 0; i < 16; i++)
200 			DPRINTF(" %02x", out[i]);
201 		DPRINTF("\n");
202 	}
203 #endif
204 	return 0;
205 }
206 
tdqcow_complete(void * arg,struct tiocb * tiocb,int err)207 void tdqcow_complete(void *arg, struct tiocb *tiocb, int err)
208 {
209 	struct qcow_request *aio = (struct qcow_request *)arg;
210 	struct tdqcow_state *s = aio->state;
211 
212 	td_complete_request(aio->treq, err);
213 
214 	s->aio_free_list[s->aio_free_count++] = aio;
215 }
216 
/*
 * Submit an asynchronous read of treq's sector range from the image
 * file.  If no request slot is free, the request completes
 * immediately with -EBUSY so the layer above retries.
 */
static void async_read(td_driver_t *driver, td_request_t treq)
{
	struct tdqcow_state *s = (struct tdqcow_state *)driver->data;
	struct qcow_request *req;
	uint64_t byte_offset;
	int nbytes;

	nbytes      = treq.secs * driver->info.sector_size;
	byte_offset = treq.sec  * (uint64_t)driver->info.sector_size;

	if (s->aio_free_count == 0) {
		td_complete_request(treq, -EBUSY);
		return;
	}

	req        = s->aio_free_list[--s->aio_free_count];
	req->treq  = treq;
	req->state = s;

	td_prep_read(&req->tiocb, s->fd, treq.buf, nbytes, byte_offset,
		     tdqcow_complete, req);
	td_queue_tiocb(driver, &req->tiocb);
}
244 
/*
 * Submit an asynchronous write of treq's sector range to the image
 * file.  If no request slot is free, the request completes
 * immediately with -EBUSY so the layer above retries.
 */
static void async_write(td_driver_t *driver, td_request_t treq)
{
	struct tdqcow_state *s = (struct tdqcow_state *)driver->data;
	struct qcow_request *req;
	uint64_t byte_offset;
	int nbytes;

	nbytes      = treq.secs * driver->info.sector_size;
	byte_offset = treq.sec  * (uint64_t)driver->info.sector_size;

	if (s->aio_free_count == 0) {
		td_complete_request(treq, -EBUSY);
		return;
	}

	req        = s->aio_free_list[--s->aio_free_count];
	req->treq  = treq;
	req->state = s;

	td_prep_write(&req->tiocb, s->fd, treq.buf, nbytes, byte_offset,
		      tdqcow_complete, req);
	td_queue_tiocb(driver, &req->tiocb);
}
272 
273 /*
274  * The crypt function is compatible with the linux cryptoloop
275  * algorithm for < 4 GB images. NOTE: out_buf == in_buf is
276  * supported .
277  */
/*
 * Encrypt or decrypt (per 'enc') nb_sectors 512-byte sectors with
 * AES-CBC.  The IV for each sector is its little-endian sector
 * number, matching the linux cryptoloop scheme for < 4 GB images.
 * out_buf == in_buf (in-place) is supported.
 */
static void encrypt_sectors(struct tdqcow_state *s, int64_t sector_num,
                            uint8_t *out_buf, const uint8_t *in_buf,
                            int nb_sectors, int enc,
                            const AES_KEY *key)
{
	union {
		uint64_t ll[2];
		uint8_t b[16];
	} ivec;
	const uint8_t *src = in_buf;
	uint8_t *dst = out_buf;
	int64_t sect, end = sector_num + nb_sectors;

	for (sect = sector_num; sect < end; sect++) {
		/* IV = little-endian sector number, upper half zero. */
		ivec.ll[0] = cpu_to_le64(sect);
		ivec.ll[1] = 0;
		AES_cbc_encrypt(src, dst, 512, key, ivec.b, enc);
		src += 512;
		dst += 512;
	}
}
299 
/*
 * Resize a regular file to 'length' bytes (rounded up to sector
 * granularity).  Growing is done by synchronously writing zeroes so
 * the new extents tend to be allocated contiguously; shrinking (only
 * when 'sparse' is set) uses ftruncate.  Block devices are left
 * untouched.  Returns 0 on success, -1 on failure.
 */
int qtruncate(int fd, off_t length, int sparse)
{
	int ret, rem;
	/* 64-bit throughout: the original used int for 'current' and the
	 * loop index, overflowing for images of 2^31 sectors (>= 1 TB). */
	uint64_t i, sectors, current;
	struct stat st;
	char *buf;

	/* If length is greater than the current file len
	 * we synchronously write zeroes to the end of the
	 * file, otherwise we truncate the length down
	 */
	ret = fstat(fd, &st);
	if (ret == -1)
		return -1;
	/* Block devices have a fixed size; nothing to do. */
	if (S_ISBLK(st.st_mode))
		return 0;

	sectors = ((uint64_t)length + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE;
	current = ((uint64_t)st.st_size + DEFAULT_SECTOR_SIZE - 1)/DEFAULT_SECTOR_SIZE;
	rem     = st.st_size % DEFAULT_SECTOR_SIZE;

	/* If we are extending this file, we write zeros to the end --
	 * this tries to ensure that the extents allocated wind up being
	 * contiguous on disk.
	 */
	if((uint64_t)st.st_size < sectors * DEFAULT_SECTOR_SIZE) {
		/*We are extending the file*/
		if ((ret = posix_memalign((void **)&buf,
					  512, DEFAULT_SECTOR_SIZE))) {
			DPRINTF("posix_memalign failed: %d\n", ret);
			return -1;
		}
		memset(buf, 0x00, DEFAULT_SECTOR_SIZE);
		if (lseek(fd, 0, SEEK_END)==-1) {
			DPRINTF("Lseek EOF failed (%d), internal error\n",
				errno);
			free(buf);
			return -1;
		}
		/* First pad the trailing partial sector, if any. */
		if (rem) {
			ret = write(fd, buf, rem);
			if (ret != rem) {
				DPRINTF("write failed: ret = %d, err = %s\n",
					ret, strerror(errno));
				free(buf);
				return -1;
			}
		}
		/* Then append whole zero sectors up to the target size. */
		for (i = current; i < sectors; i++ ) {
			ret = write(fd, buf, DEFAULT_SECTOR_SIZE);
			if (ret != DEFAULT_SECTOR_SIZE) {
				DPRINTF("write failed: ret = %d, err = %s\n",
					ret, strerror(errno));
				free(buf);
				return -1;
			}
		}
		free(buf);
	} else if(sparse && ((uint64_t)st.st_size > sectors * DEFAULT_SECTOR_SIZE))
		if (ftruncate(fd, (off_t)(sectors * DEFAULT_SECTOR_SIZE))==-1) {
			DPRINTF("Ftruncate failed (%s)\n", strerror(errno));
			return -1;
		}
	return 0;
}
366 
367 /* 'allocate' is:
368  *
369  * 0 to not allocate.
370  *
371  * 1 to allocate a normal cluster (for sector indexes 'n_start' to
372  * 'n_end')
373  *
374  * 2 to allocate a compressed cluster of size
375  * 'compressed_size'. 'compressed_size' must be > 0 and <
376  * cluster_size
377  *
378  * return 0 if not allocated.
379  */
get_cluster_offset(struct tdqcow_state * s,uint64_t offset,int allocate,int compressed_size,int n_start,int n_end)380 static uint64_t get_cluster_offset(struct tdqcow_state *s,
381                                    uint64_t offset, int allocate,
382                                    int compressed_size,
383                                    int n_start, int n_end)
384 {
385 	int min_index, i, j, l1_index, l2_index, l2_sector, l1_sector;
386 	char *tmp_ptr2, *l2_ptr, *l1_ptr;
387 	uint64_t *tmp_ptr;
388 	uint64_t l2_offset, *l2_table, cluster_offset, tmp;
389 	uint32_t min_count;
390 	int new_l2_table;
391 
392 	/*Check L1 table for the extent offset*/
393 	l1_index = offset >> (s->l2_bits + s->cluster_bits);
394 	l2_offset = s->l1_table[l1_index];
395 	new_l2_table = 0;
396 	if (!l2_offset) {
397 		if (!allocate)
398 			return 0;
399 		/*
400 		 * allocating a new l2 entry + extent
401 		 * at the end of the file, we must also
402 		 * update the L1 entry safely.
403 		 */
404 		l2_offset = s->fd_end;
405 
406 		/* round to cluster size */
407 		l2_offset = (l2_offset + s->cluster_size - 1)
408 			& ~(s->cluster_size - 1);
409 
410 		/* update the L1 entry */
411 		s->l1_table[l1_index] = l2_offset;
412 
413 		/*Truncate file for L2 table
414 		 *(initialised to zero in case we crash)*/
415 		if (qtruncate(s->fd,
416 			      l2_offset + (s->l2_size * sizeof(uint64_t)),
417 			      s->sparse) != 0) {
418 			DPRINTF("ERROR truncating file\n");
419 			return 0;
420 		}
421 		s->fd_end = l2_offset + (s->l2_size * sizeof(uint64_t));
422 
423 		/*Update the L1 table entry on disk
424                  * (for O_DIRECT we write 4KByte blocks)*/
425 		l1_sector = (l1_index * sizeof(uint64_t)) >> 12;
426 		l1_ptr = (char *)s->l1_table + (l1_sector << 12);
427 
428 		if (posix_memalign((void **)&tmp_ptr, 4096, 4096) != 0) {
429 			DPRINTF("ERROR allocating memory for L1 table\n");
430                         return 0;
431 		}
432 		memcpy(tmp_ptr, l1_ptr, 4096);
433 
434 		/* Convert block to write to big endian */
435 		for(i = 0; i < 4096 / sizeof(uint64_t); i++) {
436 			cpu_to_be64s(&tmp_ptr[i]);
437 		}
438 
439 		/*
440 		 * Issue non-asynchronous L1 write.
441 		 * For safety, we must ensure that
442 		 * entry is written before blocks.
443 		 */
444 		lseek(s->fd, s->l1_table_offset + (l1_sector << 12), SEEK_SET);
445 		if (write(s->fd, tmp_ptr, 4096) != 4096) {
446 			free(tmp_ptr);
447 		 	return 0;
448 		}
449 		free(tmp_ptr);
450 
451 		new_l2_table = 1;
452 		goto cache_miss;
453 	} else if (s->min_cluster_alloc == s->l2_size) {
454 		/*Fast-track the request*/
455 		cluster_offset = l2_offset + (s->l2_size * sizeof(uint64_t));
456 		l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
457 		return cluster_offset + (l2_index * s->cluster_size);
458 	}
459 
460 	/*Check to see if L2 entry is already cached*/
461 	for (i = 0; i < L2_CACHE_SIZE; i++) {
462 		if (l2_offset == s->l2_cache_offsets[i]) {
463 			/* increment the hit count */
464 			if (++s->l2_cache_counts[i] == 0xffffffff) {
465 				for (j = 0; j < L2_CACHE_SIZE; j++) {
466 					s->l2_cache_counts[j] >>= 1;
467 				}
468 			}
469 			l2_table = s->l2_cache + (i << s->l2_bits);
470 			goto found;
471 		}
472 	}
473 
474 cache_miss:
475 	/* not found: load a new entry in the least used one */
476 	min_index = 0;
477 	min_count = 0xffffffff;
478 	for (i = 0; i < L2_CACHE_SIZE; i++) {
479 		if (s->l2_cache_counts[i] < min_count) {
480 			min_count = s->l2_cache_counts[i];
481 			min_index = i;
482 		}
483 	}
484 	l2_table = s->l2_cache + (min_index << s->l2_bits);
485 
486 	/*If extent pre-allocated, read table from disk,
487 	 *otherwise write new table to disk*/
488 	if (new_l2_table) {
489 		/*Should we allocate the whole extent? Adjustable parameter.*/
490 		if (s->cluster_alloc == s->l2_size) {
491 			cluster_offset = l2_offset +
492 				(s->l2_size * sizeof(uint64_t));
493 			cluster_offset = (cluster_offset + s->cluster_size - 1)
494 				& ~(s->cluster_size - 1);
495 			if (qtruncate(s->fd, cluster_offset +
496 				  (s->cluster_size * s->l2_size),
497 				      s->sparse) != 0) {
498 				DPRINTF("ERROR truncating file\n");
499 				return 0;
500 			}
501 			s->fd_end = cluster_offset +
502 				(s->cluster_size * s->l2_size);
503 			for (i = 0; i < s->l2_size; i++) {
504 				l2_table[i] = cpu_to_be64(cluster_offset +
505 							  (i*s->cluster_size));
506 			}
507 		} else memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
508 
509 		lseek(s->fd, l2_offset, SEEK_SET);
510 		if (write(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
511 		   s->l2_size * sizeof(uint64_t))
512 			return 0;
513 	} else {
514 		lseek(s->fd, l2_offset, SEEK_SET);
515 		if (read(s->fd, l2_table, s->l2_size * sizeof(uint64_t)) !=
516 		    s->l2_size * sizeof(uint64_t))
517 			return 0;
518 	}
519 
520 	/*Update the cache entries*/
521 	s->l2_cache_offsets[min_index] = l2_offset;
522 	s->l2_cache_counts[min_index] = 1;
523 
524 found:
525 	/*The extent is split into 's->l2_size' blocks of
526 	 *size 's->cluster_size'*/
527 	l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
528 	cluster_offset = be64_to_cpu(l2_table[l2_index]);
529 
530 	if (!cluster_offset ||
531 	    ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1) ) {
532 		if (!allocate)
533 			return 0;
534 
535 		if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
536 		    (n_end - n_start) < s->cluster_sectors) {
537 			/* cluster is already allocated but compressed, we must
538 			   decompress it in the case it is not completely
539 			   overwritten */
540 			if (decompress_cluster(s, cluster_offset) < 0)
541 				return 0;
542 			cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET);
543 			cluster_offset = (cluster_offset + s->cluster_size - 1)
544 				& ~(s->cluster_size - 1);
545 			/* write the cluster content - not asynchronous */
546 			lseek(s->fd, cluster_offset, SEEK_SET);
547 			if (write(s->fd, s->cluster_cache, s->cluster_size) !=
548 			    s->cluster_size)
549 			    return -1;
550 		} else {
551 			/* allocate a new cluster */
552 			cluster_offset = lseek(s->fd, s->fd_end, SEEK_SET);
553 			if (allocate == 1) {
554 				/* round to cluster size */
555 				cluster_offset =
556 					(cluster_offset + s->cluster_size - 1)
557 					& ~(s->cluster_size - 1);
558 				if (qtruncate(s->fd, cluster_offset +
559 					      s->cluster_size, s->sparse)!=0) {
560 					DPRINTF("ERROR truncating file\n");
561 					return 0;
562 				}
563 				s->fd_end = (cluster_offset + s->cluster_size);
564 				/* if encrypted, we must initialize the cluster
565 				   content which won't be written */
566 				if (s->crypt_method &&
567 				    (n_end - n_start) < s->cluster_sectors) {
568 					uint64_t start_sect;
569 					start_sect = (offset &
570 						      ~(s->cluster_size - 1))
571 							      >> 9;
572 					memset(s->cluster_data + 512,
573 					       0xaa, 512);
574 					for (i = 0; i < s->cluster_sectors;i++)
575 					{
576 						if (i < n_start || i >= n_end)
577 						{
578 							encrypt_sectors(s, start_sect + i,
579 									s->cluster_data,
580 									s->cluster_data + 512, 1, 1,
581 									&s->aes_encrypt_key);
582 							lseek(s->fd, cluster_offset + i * 512, SEEK_SET);
583 							if (write(s->fd, s->cluster_data, 512) != 512)
584 								return -1;
585 						}
586 					}
587 				}
588 			} else {
589 				cluster_offset |= QCOW_OFLAG_COMPRESSED |
590 					(uint64_t)compressed_size
591 						<< (63 - s->cluster_bits);
592 			}
593 		}
594 		/* update L2 table */
595 		tmp = cpu_to_be64(cluster_offset);
596 		l2_table[l2_index] = tmp;
597 
598 		/*For IO_DIRECT we write 4KByte blocks*/
599 		l2_sector = (l2_index * sizeof(uint64_t)) >> 12;
600 		l2_ptr = (char *)l2_table + (l2_sector << 12);
601 
602 		if (posix_memalign((void **)&tmp_ptr2, 4096, 4096) != 0) {
603 			DPRINTF("ERROR allocating memory for L1 table\n");
604                         return 0;
605 		}
606 		memcpy(tmp_ptr2, l2_ptr, 4096);
607 		lseek(s->fd, l2_offset + (l2_sector << 12), SEEK_SET);
608 		if (write(s->fd, tmp_ptr2, 4096) != 4096) {
609 			free(tmp_ptr2);
610 			return -1;
611 		}
612 		free(tmp_ptr2);
613 	}
614 	return cluster_offset;
615 }
616 
qcow_is_allocated(struct tdqcow_state * s,int64_t sector_num,int nb_sectors,int * pnum)617 static int qcow_is_allocated(struct tdqcow_state *s, int64_t sector_num,
618                              int nb_sectors, int *pnum)
619 {
620 	int index_in_cluster, n;
621 	uint64_t cluster_offset;
622 
623 	cluster_offset = get_cluster_offset(s, sector_num << 9, 0, 0, 0, 0);
624 	index_in_cluster = sector_num & (s->cluster_sectors - 1);
625 	n = s->cluster_sectors - index_in_cluster;
626 	if (n > nb_sectors)
627 		n = nb_sectors;
628 	*pnum = n;
629 	return (cluster_offset != 0);
630 }
631 
/*
 * Inflate a raw-deflate buffer (windowBits = -12: no zlib header,
 * 4 KB window) into out_buf.  Succeeds only if exactly out_buf_size
 * bytes are produced.  Returns 0 on success, -1 on failure.
 */
static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
                             const uint8_t *buf, int buf_size)
{
	z_stream strm;
	int ret, produced;

	memset(&strm, 0, sizeof(strm));
	strm.next_in   = (uint8_t *)buf;
	strm.avail_in  = buf_size;
	strm.next_out  = out_buf;
	strm.avail_out = out_buf_size;

	if (inflateInit2(&strm, -12) != Z_OK)
		return -1;

	ret = inflate(&strm, Z_FINISH);
	produced = strm.next_out - out_buf;
	inflateEnd(&strm);

	/* The stream must end cleanly and fill the whole output. */
	if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
	    produced != out_buf_size)
		return -1;

	return 0;
}
658 
decompress_cluster(struct tdqcow_state * s,uint64_t cluster_offset)659 static int decompress_cluster(struct tdqcow_state *s, uint64_t cluster_offset)
660 {
661 	int ret, csize;
662 	uint64_t coffset;
663 
664 	coffset = cluster_offset & s->cluster_offset_mask;
665 	if (s->cluster_cache_offset != coffset) {
666 		csize = cluster_offset >> (63 - s->cluster_bits);
667 		csize &= (s->cluster_size - 1);
668 		lseek(s->fd, coffset, SEEK_SET);
669 		ret = read(s->fd, s->cluster_data, csize);
670 		if (ret != csize)
671 			return -1;
672 		if (decompress_buffer(s->cluster_cache, s->cluster_size,
673 				      s->cluster_data, csize) < 0) {
674 			return -1;
675 		}
676 		s->cluster_cache_offset = coffset;
677 	}
678 	return 0;
679 }
680 
681 static int
/*
 * Read the qcow header from the start of fd into *header, converting
 * multi-byte fields from big endian to host order.  The read is done
 * through a 512-aligned bounce buffer sized for O_DIRECT.  Returns 0
 * on success, a negative errno on failure.
 */
static int
tdqcow_read_header(int fd, QCowHeader *header)
{
	int err;
	char *buf;
	struct stat st;
	size_t size, expected;
	ssize_t n;

	memset(header, 0, sizeof(*header));

	err = fstat(fd, &st);
	if (err)
		return -errno;

	/* Compare lseek's off_t result directly; stuffing it through an
	 * int (as before) truncates on 64-bit offsets. */
	if (lseek(fd, 0, SEEK_SET) == (off_t)-1)
		return -errno;

	/* Round up to a 512-byte multiple for O_DIRECT. */
	size = (sizeof(*header) + 511) & ~511;
	err = posix_memalign((void **)&buf, 512, size);
	if (err)
		return err;

	/* Zero the bounce buffer: if the file is shorter than the
	 * header, the unread tail copied into *header below would
	 * otherwise be uninitialized memory. */
	memset(buf, 0, size);

	expected = size;
	if (st.st_size < size)
		expected = st.st_size;

	errno = 0;
	n = read(fd, buf, size);
	if (n < 0 || (size_t)n != expected) {
		err = (errno ? -errno : -EIO);
		goto out;
	}

	memcpy(header, buf, sizeof(*header));
	be32_to_cpus(&header->magic);
	be32_to_cpus(&header->version);
	be64_to_cpus(&header->backing_file_offset);
	be32_to_cpus(&header->backing_file_size);
	be32_to_cpus(&header->mtime);
	be64_to_cpus(&header->size);
	be32_to_cpus(&header->crypt_method);
	be64_to_cpus(&header->l1_table_offset);

	err = 0;

out:
	free(buf);
	return err;
}
731 
732 static int
/*
 * Allocate and load the L1 table described by *header into
 * s->l1_table (native endian), detecting and handling the Xen
 * extended header (checksum, sparse flag, min_cluster_alloc, and a
 * one-time endianness fix-up of legacy images).  Returns 0 on
 * success, negative errno on failure; on failure s->l1_table is
 * freed and NULLed.
 */
static int
tdqcow_load_l1_table(struct tdqcow_state *s, QCowHeader *header)
{
	char *buf;
	struct stat st;
	size_t expected;
	int i, err, shift;
	QCowHeader_ext *exthdr;
	uint32_t l1_table_bytes, l1_table_block, l1_table_size;

	buf         = NULL;
	s->l1_table = NULL;

	/* Each L1 entry covers one L2 table's worth of clusters. */
	shift = s->cluster_bits + s->l2_bits;

	s->l1_size = (header->size + (1LL << shift) - 1) >> shift;
	s->l1_table_offset = header->l1_table_offset;

	s->min_cluster_alloc = 1; /* default */

	/* l1_table_size: table rounded up to 4K; l1_table_block: file
	 * span from offset 0 through the end of the table, rounded to
	 * 4K, so one aligned read covers header + ext header + table. */
	l1_table_bytes = s->l1_size * sizeof(uint64_t);
	l1_table_size  = (l1_table_bytes + 4095) & ~4095;
	l1_table_block = (l1_table_bytes + s->l1_table_offset + 4095) & ~4095;

	DPRINTF("L1 Table offset detected: %"PRIu64", size %d (%d)\n",
		(uint64_t)s->l1_table_offset,
		(int) (s->l1_size * sizeof(uint64_t)),
		l1_table_size);

	err = fstat(s->fd, &st);
	if (err) {
		err = -errno;
		goto out;
	}

	err = lseek(s->fd, 0, SEEK_SET);
	if (err == (off_t)-1) {
		err = -errno;
		goto out;
	}

	err = posix_memalign((void **)&buf, 512, l1_table_block);
	if (err) {
		buf = NULL;
		goto out;
	}

	err = posix_memalign((void **)&s->l1_table, 4096, l1_table_size);
	if (err) {
		s->l1_table = NULL;
		goto out;
	}

	memset(buf, 0, l1_table_block);
	memset(s->l1_table, 0, l1_table_size);

	/* Short files read fewer bytes; the zeroed buffer covers the rest. */
	expected = l1_table_block;
	if (st.st_size < l1_table_block)
		expected = st.st_size;

	errno = 0;
	err = read(s->fd, buf, l1_table_block);
	if (err != expected) {
		err = (errno ? -errno : -EIO);
		goto out;
	}

	memcpy(s->l1_table, buf + s->l1_table_offset, l1_table_size);
	exthdr = (QCowHeader_ext *)(buf + sizeof(QCowHeader));

	/* check for xen extended header */
	if (s->l1_table_offset % 4096 == 0 &&
	    be32_to_cpu(exthdr->xmagic) == XEN_MAGIC) {
		uint32_t flags = be32_to_cpu(exthdr->flags);
		uint32_t cksum = be32_to_cpu(exthdr->cksum);

		/*
		 * Try to detect old tapdisk images. They have to be fixed
		 * because they use big endian rather than native endian for
		 * the L1 table.  After this block, the l1 table will
		 * definitely be in BIG endian.
		 */
		if (!(flags & EXTHDR_L1_BIG_ENDIAN)) {
			DPRINTF("qcow: converting to big endian L1 table\n");

			/* convert to big endian */
			for (i = 0; i < s->l1_size; i++)
				cpu_to_be64s(&s->l1_table[i]);

			flags |= EXTHDR_L1_BIG_ENDIAN;
			exthdr->flags = cpu_to_be32(flags);

			memcpy(buf + s->l1_table_offset,
			       s->l1_table, l1_table_size);

			err = lseek(s->fd, 0, SEEK_SET);
			if (err == (off_t)-1) {
				err = -errno;
				goto out;
			}

			/* Persist the converted table in one atomic write. */
			err = atomicio(vwrite, s->fd, buf, l1_table_block);
			if (err != l1_table_block) {
				err = -errno;
				goto out;
			}
		}

		/* check the L1 table checksum */
		if (cksum != gen_cksum((char *)s->l1_table,
				       s->l1_size * sizeof(uint64_t)))
			DPRINTF("qcow: bad L1 checksum\n");
		else {
			s->extended = 1;
			s->sparse = (be32_to_cpu(exthdr->flags) & SPARSE_FILE);
			s->min_cluster_alloc =
				be32_to_cpu(exthdr->min_cluster_alloc);
		}
	}

	/* convert L1 table to native endian for operation */
	for (i = 0; i < s->l1_size; i++)
		be64_to_cpus(&s->l1_table[i]);

	err = 0;

out:
	if (err) {
		free(buf);
		free(s->l1_table);
		s->l1_table = NULL;
	}
	return err;
}
866 
867 /* Open the disk file and initialize qcow state. */
tdqcow_open(td_driver_t * driver,const char * name,td_flag_t flags)868 int tdqcow_open (td_driver_t *driver, const char *name, td_flag_t flags)
869 {
870 	int fd, len, i, ret, size, o_flags;
871 	td_disk_info_t *bs = &(driver->info);
872 	struct tdqcow_state   *s  = (struct tdqcow_state *)driver->data;
873 	QCowHeader header;
874 	uint64_t final_cluster = 0;
875 
876  	DPRINTF("QCOW: Opening %s\n", name);
877 
878 	o_flags = O_DIRECT | O_LARGEFILE |
879 		((flags == TD_OPEN_RDONLY) ? O_RDONLY : O_RDWR);
880 	fd = open(name, o_flags);
881 	if (fd < 0) {
882 		DPRINTF("Unable to open %s (%d)\n", name, -errno);
883 		return -1;
884 	}
885 
886 	s->fd = fd;
887 	s->name = strdup(name);
888 	if (!s->name)
889 		goto fail;
890 
891 	if (tdqcow_read_header(fd, &header))
892 		goto fail;
893 
894 	if (header.magic != QCOW_MAGIC)
895 		goto fail;
896 
897 	switch (header.version) {
898 	case QCOW_VERSION:
899 		break;
900 	case 2:
901 	  //TODO: Port qcow2 to new blktap framework.
902 	  //		close(fd);
903 	  //		dd->drv = &tapdisk_qcow2;
904 	  //		return dd->drv->td_open(dd, name, flags);
905 	  goto fail;
906 	default:
907 		goto fail;
908 	}
909 
910 	if (header.size <= 1 || header.cluster_bits < 9)
911 		goto fail;
912 	if (header.crypt_method > QCOW_CRYPT_AES)
913 		goto fail;
914 	s->crypt_method_header = header.crypt_method;
915 	if (s->crypt_method_header)
916 		s->encrypted = 1;
917 	s->cluster_bits = header.cluster_bits;
918 	s->cluster_size = 1 << s->cluster_bits;
919 	s->cluster_sectors = 1 << (s->cluster_bits - 9);
920 	s->l2_bits = header.l2_bits;
921 	s->l2_size = 1 << s->l2_bits;
922 	s->cluster_alloc = s->l2_size;
923 	bs->size = header.size / 512;
924 	s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;
925 	s->backing_file_offset = header.backing_file_offset;
926 	s->backing_file_size   = header.backing_file_size;
927 
928 	/* allocate and load l1 table */
929 	if (tdqcow_load_l1_table(s, &header))
930 		goto fail;
931 
932 	/* alloc L2 cache */
933 	size = s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t);
934 	ret = posix_memalign((void **)&s->l2_cache, 4096, size);
935 	if(ret != 0) goto fail;
936 
937 	size = s->cluster_size;
938 	ret = posix_memalign((void **)&s->cluster_cache, 4096, size);
939 	if(ret != 0) goto fail;
940 
941 	ret = posix_memalign((void **)&s->cluster_data, 4096, size);
942 	if(ret != 0) goto fail;
943 	s->cluster_cache_offset = -1;
944 
945 	if (s->backing_file_offset != 0)
946 		s->cluster_alloc = 1; /*Cannot use pre-alloc*/
947 
948         bs->sector_size = 512;
949         bs->info = 0;
950 
951 	for(i = 0; i < s->l1_size; i++)
952 		if (s->l1_table[i] > final_cluster)
953 			final_cluster = s->l1_table[i];
954 
955 	if (init_aio_state(driver)!=0) {
956 	  DPRINTF("Unable to initialise AIO state\n");
957 	  free_aio_state(s);
958 	  goto fail;
959 	}
960 
961 	if (!final_cluster)
962 		s->fd_end = s->l1_table_offset +
963 			((s->l1_size * sizeof(uint64_t) + 4095) & ~4095);
964 	else {
965 		s->fd_end = lseek(fd, 0, SEEK_END);
966 		if (s->fd_end == (off_t)-1)
967 			goto fail;
968 	}
969 
970 	return 0;
971 
972 fail:
973 	DPRINTF("QCOW Open failed\n");
974 
975 	free_aio_state(s);
976 	free(s->l1_table);
977 	free(s->l2_cache);
978 	free(s->cluster_cache);
979 	free(s->cluster_data);
980 	close(fd);
981 	return -1;
982 }
983 
/*
 * Service a read request, cluster by cluster.  Each cluster-sized
 * piece is handled one of three ways: unallocated -> forwarded to the
 * parent image (the whole request at once if every sector is
 * unallocated), compressed -> decompressed from the cache and
 * completed synchronously, allocated -> issued as an async read at
 * the mapped file offset.
 */
void tdqcow_queue_read(td_driver_t *driver, td_request_t treq)
{
	struct tdqcow_state   *s  = (struct tdqcow_state *)driver->data;
	int ret = 0, index_in_cluster, n, i;
	uint64_t cluster_offset, sector, nb_sectors;
	struct qcow_prv* prv;
	td_request_t clone = treq;
	char* buf = treq.buf;

	sector     = treq.sec;
	nb_sectors = treq.secs;

	/*We store a local record of the request*/
	while (nb_sectors > 0) {
		cluster_offset =
			get_cluster_offset(s, sector << 9, 0, 0, 0, 0);
		/* n = sectors remaining within the current cluster,
		 * capped at what's left of the request. */
		index_in_cluster = sector & (s->cluster_sectors - 1);
		n = s->cluster_sectors - index_in_cluster;
		if (n > nb_sectors)
			n = nb_sectors;

		if (s->aio_free_count == 0) {
			td_complete_request(treq, -EBUSY);
			return;
		}

		if(!cluster_offset) {
            int i;
            /* Forward entire request if possible. */
            for(i=0; i<nb_sectors; i++)
                if(get_cluster_offset(s, (sector+i) << 9, 0, 0, 0, 0))
                    goto coalesce_failed;
            treq.buf  = buf;
            treq.sec  = sector;
            treq.secs = nb_sectors;
			td_forward_request(treq);
            return;
coalesce_failed:
			/* Some later sector is allocated: forward only this
			 * cluster's unallocated span to the parent. */
			treq.buf  = buf;
			treq.sec  = sector;
			treq.secs = n;
			td_forward_request(treq);

		} else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
			/* Compressed cluster: satisfy synchronously from the
			 * decompression cache. */
			if (decompress_cluster(s, cluster_offset) < 0) {
				td_complete_request(treq, -EIO);
				goto done;
			}
			memcpy(buf, s->cluster_cache + index_in_cluster * 512,
			       512 * n);

			treq.buf  = buf;
			treq.sec  = sector;
			treq.secs = n;
			td_complete_request(treq, 0);
		} else {
		  /* Plain allocated cluster: async read from the file. */
		  clone.buf  = buf;
		  clone.sec  = (cluster_offset>>9)+index_in_cluster;
		  clone.secs = n;
		  async_read(driver, clone);
		}
		nb_sectors -= n;
		sector += n;
		buf += n * 512;
	}
done:
	return;
}
1052 
/*
 * Service a write request, cluster by cluster: map (allocating if
 * necessary) each cluster and issue an async write at the mapped
 * file offset.  The compressed-cluster cache is invalidated at the
 * end since a write may overwrite a cached cluster.
 */
void tdqcow_queue_write(td_driver_t *driver, td_request_t treq)
{
	struct tdqcow_state   *s  = (struct tdqcow_state *)driver->data;
	int ret = 0, index_in_cluster, n, i;
	uint64_t cluster_offset, sector, nb_sectors;
	td_callback_t cb;
	struct qcow_prv* prv;
	char* buf = treq.buf;
	td_request_t clone=treq;

	sector     = treq.sec;
	nb_sectors = treq.secs;

	/*We store a local record of the request*/
	while (nb_sectors > 0) {
		/* n = sectors remaining within the current cluster,
		 * capped at what's left of the request. */
		index_in_cluster = sector & (s->cluster_sectors - 1);
		n = s->cluster_sectors - index_in_cluster;
		if (n > nb_sectors)
			n = nb_sectors;

		if (s->aio_free_count == 0) {
			td_complete_request(treq, -EBUSY);
			return;
		}

		/* allocate=1: create the cluster on demand. */
		cluster_offset = get_cluster_offset(s, sector << 9, 1, 0,
						    index_in_cluster,
						    index_in_cluster+n);
		if (!cluster_offset) {
			DPRINTF("Ooops, no write cluster offset!\n");
			td_complete_request(treq, -EIO);
			return;
		}

		if (s->crypt_method) {
			/* NOTE(review): the ciphertext is produced in
			 * s->cluster_data but clone.buf is set to the
			 * plaintext 'buf', so the encrypted bytes are never
			 * written — both branches are identical.  Looks like
			 * a bug; verify against the original qemu code. */
			encrypt_sectors(s, sector, s->cluster_data,
					(unsigned char *)buf, n, 1,
					&s->aes_encrypt_key);

			clone.buf  = buf;
			clone.sec  = (cluster_offset>>9) + index_in_cluster;
			clone.secs = n;
			async_write(driver, clone);
		} else {
		  clone.buf  = buf;
		  clone.sec  = (cluster_offset>>9) + index_in_cluster;
		  clone.secs = n;

		  async_write(driver, clone);
		}

		nb_sectors -= n;
		sector += n;
		buf += n * 512;
	}
	s->cluster_cache_offset = -1; /* disable compressed cache */

	return;
}
1112 
1113 static int
tdqcow_update_checksum(struct tdqcow_state * s)1114 tdqcow_update_checksum(struct tdqcow_state *s)
1115 {
1116 	int i, fd, err;
1117 	uint32_t offset, cksum, out;
1118 
1119 	if (!s->extended)
1120 		return 0;
1121 
1122 	fd = open(s->name, O_WRONLY | O_LARGEFILE); /* open without O_DIRECT */
1123 	if (fd == -1) {
1124 		err = errno;
1125 		goto out;
1126 	}
1127 
1128 	offset = sizeof(QCowHeader) + offsetof(QCowHeader_ext, cksum);
1129 	if (lseek(fd, offset, SEEK_SET) == (off_t)-1) {
1130 		err = errno;
1131 		goto out;
1132 	}
1133 
1134 	/* convert to big endian for checksum */
1135 	for (i = 0; i < s->l1_size; i++)
1136 		cpu_to_be64s(&s->l1_table[i]);
1137 
1138 	cksum = gen_cksum((char *)s->l1_table, s->l1_size * sizeof(uint64_t));
1139 
1140 	/* and back again... */
1141 	for (i = 0; i < s->l1_size; i++)
1142 		be64_to_cpus(&s->l1_table[i]);
1143 
1144 	DPRINTF("Writing cksum: %d", cksum);
1145 
1146 	out = cpu_to_be32(cksum);
1147 	if (write(fd, &out, sizeof(out)) != sizeof(out)) {
1148 		err = errno;
1149 		goto out;
1150 	}
1151 
1152 	err = 0;
1153 
1154 out:
1155 	if (err)
1156 		DPRINTF("failed to update checksum: %d\n", err);
1157 	if (fd != -1)
1158 		close(fd);
1159 	return err;
1160 }
1161 
/*
 * Tear down a qcow driver instance: persist the header checksum,
 * release AIO state, close the image fd and free all per-image buffers.
 * Always returns 0.
 */
int tdqcow_close(td_driver_t *driver)
{
	struct tdqcow_state *s = (struct tdqcow_state *)driver->data;

	/*Update the hdr cksum*/
	tdqcow_update_checksum(s);

	free_aio_state(s);
	close(s->fd);

	/* release caches and tables (independent allocations) */
	free(s->cluster_data);
	free(s->cluster_cache);
	free(s->l2_cache);
	free(s->l1_table);
	free(s->name);

	return 0;
}
1178 
qcow_create(const char * filename,uint64_t total_size,const char * backing_file,int sparse)1179 int qcow_create(const char *filename, uint64_t total_size,
1180 		const char *backing_file, int sparse)
1181 {
1182 	int fd, header_size, backing_filename_len, l1_size, i;
1183 	int shift, length, adjust, flags = 0, ret = 0;
1184 	QCowHeader header;
1185 	QCowHeader_ext exthdr;
1186 	char backing_filename[PATH_MAX], *ptr;
1187 	uint64_t tmp, size, total_length;
1188 	struct stat st;
1189 
1190 	DPRINTF("Qcow_create: size %"PRIu64"\n",total_size);
1191 
1192 	fd = open(filename,
1193 		  O_WRONLY | O_CREAT | O_TRUNC | O_BINARY | O_LARGEFILE,
1194 		  0644);
1195 	if (fd < 0)
1196 		return -1;
1197 
1198 	memset(&header, 0, sizeof(header));
1199 	header.magic = cpu_to_be32(QCOW_MAGIC);
1200 	header.version = cpu_to_be32(QCOW_VERSION);
1201 
1202 	/*Create extended header fields*/
1203 	exthdr.xmagic = cpu_to_be32(XEN_MAGIC);
1204 
1205 	header_size = sizeof(header) + sizeof(QCowHeader_ext);
1206 	backing_filename_len = 0;
1207 	size = (total_size >> SECTOR_SHIFT);
1208 	if (backing_file) {
1209 		if (strcmp(backing_file, "fat:")) {
1210 			const char *p;
1211 			/* XXX: this is a hack: we do not attempt to
1212 			 *check for URL like syntax */
1213 			p = strchr(backing_file, ':');
1214 			if (p && (p - backing_file) >= 2) {
1215 				/* URL like but exclude "c:" like filenames */
1216 				strncpy(backing_filename, backing_file,
1217 					sizeof(backing_filename));
1218 			} else {
1219 				if (realpath(backing_file, backing_filename) == NULL ||
1220 				    stat(backing_filename, &st) != 0) {
1221 					return -1;
1222 				}
1223 			}
1224 			header.backing_file_offset = cpu_to_be64(header_size);
1225 			backing_filename_len = strlen(backing_filename);
1226 			header.backing_file_size = cpu_to_be32(
1227 				backing_filename_len);
1228 			header_size += backing_filename_len;
1229 
1230 			/*Set to the backing file size*/
1231 			if(get_filesize(backing_filename, &size, &st)) {
1232 				return -1;
1233 			}
1234 			DPRINTF("Backing file size detected: %"PRId64" sectors"
1235 				"(total %"PRId64" [%"PRId64" MB])\n",
1236 				size,
1237 				(uint64_t)(size << SECTOR_SHIFT),
1238 				(uint64_t)(size >> 11));
1239 		} else {
1240 			backing_file = NULL;
1241 			DPRINTF("Setting file size: %"PRId64" (total %"PRId64")\n",
1242 				total_size,
1243 				(uint64_t) (total_size << SECTOR_SHIFT));
1244 		}
1245 		header.mtime = cpu_to_be32(st.st_mtime);
1246 		header.cluster_bits = 9; /* 512 byte cluster to avoid copying
1247 					    unmodifyed sectors */
1248 		header.l2_bits = 12; /* 32 KB L2 tables */
1249 		exthdr.min_cluster_alloc = cpu_to_be32(1);
1250 	} else {
1251 		DPRINTF("Setting file size: %"PRId64" sectors"
1252 			"(total %"PRId64" [%"PRId64" MB])\n",
1253 			size,
1254 			(uint64_t) (size << SECTOR_SHIFT),
1255 			(uint64_t) (size >> 11));
1256 		header.cluster_bits = 12; /* 4 KB clusters */
1257 		header.l2_bits = 9; /* 4 KB L2 tables */
1258 		exthdr.min_cluster_alloc = cpu_to_be32(1 << 9);
1259 	}
1260 	/*Set the header size value*/
1261 	header.size = cpu_to_be64(size * 512);
1262 
1263 	header_size = (header_size + 7) & ~7;
1264 	if (header_size % 4096 > 0) {
1265 		header_size = ((header_size >> 12) + 1) << 12;
1266 	}
1267 
1268 	shift = header.cluster_bits + header.l2_bits;
1269 	l1_size = ((size * 512) + (1LL << shift) - 1) >> shift;
1270 
1271 	header.l1_table_offset = cpu_to_be64(header_size);
1272 	DPRINTF("L1 Table offset: %d, size %d\n",
1273 		header_size,
1274 		(int)(l1_size * sizeof(uint64_t)));
1275 	header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
1276 
1277 	ptr = calloc(1, l1_size * sizeof(uint64_t));
1278 	exthdr.cksum = cpu_to_be32(gen_cksum(ptr, l1_size * sizeof(uint64_t)));
1279 	printf("Created cksum: %d\n",exthdr.cksum);
1280 	free(ptr);
1281 
1282 	/*adjust file length to system page size boundary*/
1283 	length = ROUNDUP(header_size + (l1_size * sizeof(uint64_t)),
1284 		getpagesize());
1285 	if (qtruncate(fd, length, 0)!=0) {
1286 		DPRINTF("ERROR truncating file\n");
1287 		return -1;
1288 	}
1289 
1290 	if (sparse == 0) {
1291 		/*Filesize is length+l1_size*(1 << s->l2_bits)+(size*512)*/
1292 		total_length = length + (l1_size * (1 << 9)) + (size * 512);
1293 		if (qtruncate(fd, total_length, 0)!=0) {
1294                         DPRINTF("ERROR truncating file\n");
1295                         return -1;
1296 		}
1297 		printf("File truncated to length %"PRIu64"\n",total_length);
1298 	} else
1299 		flags = SPARSE_FILE;
1300 
1301 	flags |= EXTHDR_L1_BIG_ENDIAN;
1302 	exthdr.flags = cpu_to_be32(flags);
1303 
1304 	/* write all the data */
1305 	lseek(fd, 0, SEEK_SET);
1306 	ret += write(fd, &header, sizeof(header));
1307 	ret += write(fd, &exthdr, sizeof(exthdr));
1308 	if (backing_file)
1309 		ret += write(fd, backing_filename, backing_filename_len);
1310 
1311 	lseek(fd, header_size, SEEK_SET);
1312 	tmp = 0;
1313 	for (i = 0;i < l1_size; i++) {
1314 		ret += write(fd, &tmp, sizeof(tmp));
1315 	}
1316 
1317 	close(fd);
1318 
1319 	return 0;
1320 }
1321 
qcow_make_empty(struct tdqcow_state * s)1322 static int qcow_make_empty(struct tdqcow_state *s)
1323 {
1324 	uint32_t l1_length = s->l1_size * sizeof(uint64_t);
1325 
1326 	memset(s->l1_table, 0, l1_length);
1327 	lseek(s->fd, s->l1_table_offset, SEEK_SET);
1328 	if (write(s->fd, s->l1_table, l1_length) < 0)
1329 		return -1;
1330 	if (qtruncate(s->fd, s->l1_table_offset + l1_length, s->sparse)!=0) {
1331 		DPRINTF("ERROR truncating file\n");
1332 		return -1;
1333 	}
1334 
1335 	memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
1336 	memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
1337 	memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
1338 
1339 	return 0;
1340 }
1341 
/* Accessor: the image's cluster size in bytes. */
static int qcow_get_cluster_size(struct tdqcow_state *s)
{
	return s->cluster_size;
}
1346 
1347 /* XXX: put compressed sectors first, then all the cluster aligned
1348    tables to avoid losing bytes in alignment */
/*
 * Deflate one cluster of data and store it as a compressed cluster.
 * If the data does not compress below the cluster size it is left
 * unwritten (the fallback plain write is still commented out, as
 * before).  Returns 0 on success, -1 on failure.
 */
static int qcow_compress_cluster(struct tdqcow_state *s, int64_t sector_num,
                          const uint8_t *buf)
{
	z_stream strm;
	int ret, out_len;
	uint8_t *out_buf;
	uint64_t cluster_offset;

	/* worst-case deflate expansion bound */
	out_buf = malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
	if (!out_buf)
		return -1;

	/* best compression, small window, no zlib header */
	memset(&strm, 0, sizeof(strm));
	ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
			   Z_DEFLATED, -12,
			   9, Z_DEFAULT_STRATEGY);
	if (ret != Z_OK) {
		free(out_buf);
		return -1;
	}

	strm.avail_in = s->cluster_size;
	strm.next_in = (uint8_t *)buf;
	strm.avail_out = s->cluster_size;
	strm.next_out = out_buf;

	ret = deflate(&strm, Z_FINISH);
	if (ret != Z_STREAM_END && ret != Z_OK) {
		free(out_buf);
		deflateEnd(&strm);
		return -1;
	}
	out_len = strm.next_out - out_buf;

	deflateEnd(&strm);

	if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
		/* could not compress: write normal cluster */
		//tdqcow_queue_write(bs, sector_num, buf, s->cluster_sectors);
	} else {
		cluster_offset = get_cluster_offset(s, sector_num << 9, 2,
						    out_len, 0, 0);
		if (!cluster_offset) {
			/* allocation failed; without this check we would
			 * seek to offset 0 and clobber the image header */
			free(out_buf);
			return -1;
		}
		cluster_offset &= s->cluster_offset_mask;
		if (lseek(s->fd, cluster_offset, SEEK_SET) == (off_t)-1 ||
		    write(s->fd, out_buf, out_len) != out_len) {
			free(out_buf);
			return -1;
		}
	}

	free(out_buf);
	return 0;
}
1403 
1404 static int
tdqcow_get_image_type(const char * file,int * type)1405 tdqcow_get_image_type(const char *file, int *type)
1406 {
1407 	int fd;
1408 	size_t size;
1409 	QCowHeader header;
1410 
1411 	fd = open(file, O_RDONLY);
1412 	if (fd == -1)
1413 		return -errno;
1414 
1415 	size = read(fd, &header, sizeof(header));
1416 	close(fd);
1417 	if (size != sizeof(header))
1418 		return (errno ? -errno : -EIO);
1419 
1420 	be32_to_cpus(&header.magic);
1421 	if (header.magic == QCOW_MAGIC)
1422 		*type = DISK_TYPE_QCOW;
1423 	else
1424 		*type = DISK_TYPE_AIO;
1425 
1426 	return 0;
1427 }
1428 
/*
 * Fill @id with this image's backing ("parent") file name and disk type.
 * Returns 0 on success, TD_NO_PARENT when there is no backing file, or a
 * negative value on error.  On success id->name is strdup'd — the caller
 * owns and must free it.
 */
int tdqcow_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
{
	off_t off;
	char *buf, *filename;
	int len, secs, type = 0, err = -EINVAL;
	struct tdqcow_state *child  = (struct tdqcow_state *)driver->data;

	if (!child->backing_file_offset)
		return TD_NO_PARENT;

	/* read the backing file name (stored without a terminator);
	 * round down to a sector boundary for the aligned read */
	len  = child->backing_file_size;
	off  = child->backing_file_offset - (child->backing_file_offset % 512);
	secs = (len + (child->backing_file_offset - off) + 511) >> 9;

	/* +1 byte for the NUL appended below: filename[len] would write
	 * one past the buffer when name end falls exactly on a sector
	 * boundary */
	if (posix_memalign((void **)&buf, 512, (secs << 9) + 1))
		return -1;

	if (lseek(child->fd, off, SEEK_SET) == (off_t)-1)
		goto out;

	if (read(child->fd, buf, secs << 9) != secs << 9)
		goto out;
	filename       = buf + (child->backing_file_offset - off);
	filename[len]  = '\0';

	if (tdqcow_get_image_type(filename, &type))
		goto out;

	id->name       = strdup(filename);
	if (!id->name)
		goto out;
	id->drivertype = type;
	err            = 0;
 out:
	free(buf);
	return err;
}
1465 
/*
 * Check that @pdriver is an acceptable parent for @driver: both images
 * must exist and report the same size via get_filesize().
 * Returns 0 when they match, -EINVAL otherwise.  @flags is unused.
 */
int tdqcow_validate_parent(td_driver_t *driver,
			  td_driver_t *pdriver, td_flag_t flags)
{
	struct stat stats;
	uint64_t psize, csize;
	struct tdqcow_state *child  = (struct tdqcow_state *)driver->data;
	struct tdqcow_state *parent = (struct tdqcow_state *)pdriver->data;

	if (stat(parent->name, &stats) ||
	    get_filesize(parent->name, &psize, &stats))
		return -EINVAL;

	if (stat(child->name, &stats) ||
	    get_filesize(child->name, &csize, &stats))
		return -EINVAL;

	return (csize == psize) ? 0 : -EINVAL;
}
1489 
/* tapdisk driver operations table for the QCOW image format.
 * tdqcow_open is declared elsewhere in this file (outside this chunk). */
struct tap_disk tapdisk_qcow = {
	.disk_type           = "tapdisk_qcow",
	.flags              = 0,
	.private_data_size   = sizeof(struct tdqcow_state),
	.td_open             = tdqcow_open,
	.td_close            = tdqcow_close,
	.td_queue_read       = tdqcow_queue_read,
	.td_queue_write      = tdqcow_queue_write,
	.td_get_parent_id    = tdqcow_get_parent_id,
	.td_validate_parent  = tdqcow_validate_parent,
	.td_debug           = NULL,	/* no debug hook for qcow */
};
1502