1 /*
2  * Copyright (c) 2008, XenSource Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of XenSource Inc. nor the names of its contributors
13  *       may be used to endorse or promote products derived from this software
14  *       without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
20  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 #include <errno.h>
29 #include <fcntl.h>
30 #include <unistd.h>
31 #include <stdlib.h>
32 #include <sys/mman.h>
33 
34 #include "tapdisk.h"
35 #include "tapdisk-utils.h"
36 #include "tapdisk-driver.h"
37 #include "tapdisk-server.h"
38 #include "tapdisk-interface.h"
39 
40 #ifdef DEBUG
41 #define DBG(_f, _a...) tlog_write(TLOG_DBG, _f, ##_a)
42 #else
43 #define DBG(_f, _a...) ((void)0)
44 #endif
45 
46 #define WARN(_f, _a...) tlog_write(TLOG_WARN, _f, ##_a)
47 
48 #define RADIX_TREE_PAGE_SHIFT           12 /* 4K pages */
49 #define RADIX_TREE_PAGE_SIZE            (1 << RADIX_TREE_PAGE_SHIFT)
50 
51 #define RADIX_TREE_NODE_SHIFT           9 /* 512B nodes */
52 #define RADIX_TREE_NODE_SIZE            (1 << RADIX_TREE_NODE_SHIFT)
53 #define RADIX_TREE_NODE_MASK            (RADIX_TREE_NODE_SIZE - 1)
54 
55 #define BLOCK_CACHE_NODES_PER_PAGE      (1 << (RADIX_TREE_PAGE_SHIFT - RADIX_TREE_NODE_SHIFT))
56 
57 #define BLOCK_CACHE_MAX_SIZE            (10 << 20) /* 100MB cache */
58 #define BLOCK_CACHE_REQUESTS            (TAPDISK_DATA_REQUESTS << 3)
59 #define BLOCK_CACHE_PAGE_IDLETIME       60
60 
61 typedef struct radix_tree               radix_tree_t;
62 typedef struct radix_tree_node          radix_tree_node_t;
63 typedef struct radix_tree_link          radix_tree_link_t;
64 typedef struct radix_tree_leaf          radix_tree_leaf_t;
65 typedef struct radix_tree_page          radix_tree_page_t;
66 
67 typedef struct block_cache              block_cache_t;
68 typedef struct block_cache_request      block_cache_request_t;
69 typedef struct block_cache_stats        block_cache_stats_t;
70 
71 struct radix_tree_page {
72 	char                           *buf;
73 	size_t                          size;
74 	uint64_t                        sec;
75 	radix_tree_link_t              *owners[BLOCK_CACHE_NODES_PER_PAGE];
76 };
77 
78 struct radix_tree_leaf {
79 	radix_tree_page_t              *page;
80 	char                           *buf;
81 };
82 
83 struct radix_tree_link {
84 	uint32_t                        time;
85 	union {
86 		radix_tree_node_t      *next;
87 		radix_tree_leaf_t       leaf;
88 	} u;
89 };
90 
91 struct radix_tree_node {
92 	int                             height;
93 	radix_tree_link_t               links[RADIX_TREE_NODE_SIZE];
94 };
95 
96 struct radix_tree {
97 	int                             height;
98 	uint64_t                        size;
99 	uint32_t                        nodes;
100 	radix_tree_node_t              *root;
101 
102 	block_cache_t                  *cache;
103 };
104 
105 struct block_cache_request {
106 	int                             err;
107 	char                           *buf;
108 	uint64_t                        secs;
109 	td_request_t                    treq;
110 	block_cache_t                  *cache;
111 };
112 
113 struct block_cache_stats {
114 	uint64_t                        reads;
115 	uint64_t                        hits;
116 	uint64_t                        misses;
117 	uint64_t                        prunes;
118 };
119 
120 struct block_cache {
121 	int                             ptype;
122 	char                           *name;
123 
124 	uint64_t                        sectors;
125 
126 	block_cache_request_t           requests[BLOCK_CACHE_REQUESTS];
127 	block_cache_request_t          *request_free_list[BLOCK_CACHE_REQUESTS];
128 	int                             requests_free;
129 
130 	event_id_t                      timeout_id;
131 
132 	radix_tree_t                    tree;
133 
134 	block_cache_stats_t             stats;
135 };
136 
137 static inline uint64_t
radix_tree_calculate_size(int height)138 radix_tree_calculate_size(int height)
139 {
140 	return (uint64_t)RADIX_TREE_NODE_SIZE <<
141 	  (height * RADIX_TREE_NODE_SHIFT);
142 }
143 
144 static inline int
radix_tree_calculate_height(uint64_t sectors)145 radix_tree_calculate_height(uint64_t sectors)
146 {
147 	int height;
148 	uint64_t tree_size;
149 
150 	height = 1;  /* always allocate root node */
151 	tree_size = radix_tree_calculate_size(height);
152 	while (sectors > tree_size)
153 		tree_size = radix_tree_calculate_size(++height);
154 
155 	return height;
156 }
157 
158 static inline int
radix_tree_index(radix_tree_node_t * node,uint64_t sector)159 radix_tree_index(radix_tree_node_t *node, uint64_t sector)
160 {
161 	return ((sector >> (node->height * RADIX_TREE_NODE_SHIFT)) &
162 		RADIX_TREE_NODE_MASK);
163 }
164 
165 static inline int
radix_tree_node_contains_leaves(radix_tree_t * tree,radix_tree_node_t * node)166 radix_tree_node_contains_leaves(radix_tree_t *tree, radix_tree_node_t *node)
167 {
168 	return (node->height == 0);
169 }
170 
171 static inline int
radix_tree_node_is_root(radix_tree_t * tree,radix_tree_node_t * node)172 radix_tree_node_is_root(radix_tree_t *tree, radix_tree_node_t *node)
173 {
174 	return (node->height == tree->height);
175 }
176 
177 static inline uint64_t
radix_tree_size(radix_tree_t * tree)178 radix_tree_size(radix_tree_t *tree)
179 {
180 	return tree->size + tree->nodes * sizeof(radix_tree_node_t);
181 }
182 
183 static inline void
radix_tree_clear_link(radix_tree_link_t * link)184 radix_tree_clear_link(radix_tree_link_t *link)
185 {
186 	if (link)
187 		memset(link, 0, sizeof(radix_tree_link_t));
188 }
189 
190 static inline radix_tree_node_t *
radix_tree_allocate_node(radix_tree_t * tree,int height)191 radix_tree_allocate_node(radix_tree_t *tree, int height)
192 {
193 	radix_tree_node_t *node;
194 
195 	node = calloc(1, sizeof(radix_tree_node_t));
196 	if (!node)
197 		return NULL;
198 
199 	node->height = height;
200 	tree->nodes++;
201 
202 	return node;
203 }
204 
205 static inline radix_tree_node_t *
radix_tree_allocate_child_node(radix_tree_t * tree,radix_tree_node_t * parent)206 radix_tree_allocate_child_node(radix_tree_t *tree, radix_tree_node_t *parent)
207 {
208 	return radix_tree_allocate_node(tree, parent->height - 1);
209 }
210 
211 void
radix_tree_free_node(radix_tree_t * tree,radix_tree_node_t * node)212 radix_tree_free_node(radix_tree_t *tree, radix_tree_node_t *node)
213 {
214 	if (!node)
215 		return;
216 
217 	free(node);
218 	tree->nodes--;
219 }
220 
221 static inline radix_tree_page_t *
radix_tree_allocate_page(radix_tree_t * tree,char * buf,uint64_t sec,size_t size)222 radix_tree_allocate_page(radix_tree_t *tree,
223 			 char *buf, uint64_t sec, size_t size)
224 {
225 	radix_tree_page_t *page;
226 
227 	page = calloc(1, sizeof(radix_tree_page_t));
228 	if (!page)
229 		return NULL;
230 
231 	page->buf   = buf;
232 	page->sec   = sec;
233 	page->size  = size;
234 	tree->size += size;
235 
236 	return page;
237 }
238 
239 static inline void
radix_tree_free_page(radix_tree_t * tree,radix_tree_page_t * page)240 radix_tree_free_page(radix_tree_t *tree, radix_tree_page_t *page)
241 {
242 	int i;
243 
244 	for (i = 0; i < page->size >> RADIX_TREE_NODE_SHIFT; i++)
245 		DBG("%s: ejecting sector 0x%llx\n",
246 		    tree->cache->name, page->sec + i);
247 
248 	tree->cache->stats.prunes += (page->size >> RADIX_TREE_NODE_SHIFT);
249 	tree->size -= page->size;
250 	free(page->buf);
251 	free(page);
252 }
253 
254 /*
255  * remove a leaf and the shared radix_tree_page_t containing its buffer.
256  * leaves are deleted, nodes are not; gc will reap the nodes later.
257  */
258 static void
radix_tree_remove_page(radix_tree_t * tree,radix_tree_page_t * page)259 radix_tree_remove_page(radix_tree_t *tree, radix_tree_page_t *page)
260 {
261 	int i;
262 
263 	if (!page)
264 		return;
265 
266 	for (i = 0; i < BLOCK_CACHE_NODES_PER_PAGE; i++)
267 		radix_tree_clear_link(page->owners[i]);
268 
269 	radix_tree_free_page(tree, page);
270 }
271 
272 static void
radix_tree_insert_leaf(radix_tree_t * tree,radix_tree_link_t * link,radix_tree_page_t * page,off_t off)273 radix_tree_insert_leaf(radix_tree_t *tree, radix_tree_link_t *link,
274 		       radix_tree_page_t *page, off_t off)
275 {
276 	int i;
277 
278 	if (off + RADIX_TREE_NODE_SIZE > page->size)
279 		return;
280 
281 	for (i = 0; i < BLOCK_CACHE_NODES_PER_PAGE; i++) {
282 		if (page->owners[i])
283 			continue;
284 
285 		page->owners[i]   = link;
286 		link->u.leaf.page = page;
287 		link->u.leaf.buf  = page->buf + off;
288 
289 		break;
290 	}
291 }
292 
293 static char *
radix_tree_find_leaf(radix_tree_t * tree,uint64_t sector)294 radix_tree_find_leaf(radix_tree_t *tree, uint64_t sector)
295 {
296 	int idx;
297 	struct timeval now;
298 	radix_tree_link_t *link;
299 	radix_tree_node_t *node;
300 
301 	node = tree->root;
302 	gettimeofday(&now, NULL);
303 
304 	do {
305 		idx        = radix_tree_index(node, sector);
306 		link       = node->links + idx;
307 		link->time = now.tv_sec;
308 
309 		if (radix_tree_node_contains_leaves(tree, node))
310 			return link->u.leaf.buf;
311 
312 		if (!link->u.next)
313 			return NULL;
314 
315 		node = link->u.next;
316 	} while (1);
317 }
318 
319 static char *
radix_tree_add_leaf(radix_tree_t * tree,uint64_t sector,radix_tree_page_t * page,off_t off)320 radix_tree_add_leaf(radix_tree_t *tree, uint64_t sector,
321 		    radix_tree_page_t *page, off_t off)
322 {
323 	int idx;
324 	struct timeval now;
325 	radix_tree_link_t *link;
326 	radix_tree_node_t *node;
327 
328 	node = tree->root;
329 	gettimeofday(&now, NULL);
330 
331 	do {
332 		idx        = radix_tree_index(node, sector);
333 		link       = node->links + idx;
334 		link->time = now.tv_sec;
335 
336 		if (radix_tree_node_contains_leaves(tree, node)) {
337 			radix_tree_remove_page(tree, link->u.leaf.page);
338 			radix_tree_insert_leaf(tree, link, page, off);
339 			return link->u.leaf.buf;
340 		}
341 
342 		if (!link->u.next) {
343 			link->u.next = radix_tree_allocate_child_node(tree,
344 								      node);
345 			if (!link->u.next)
346 				return NULL;
347 		}
348 
349 		node = link->u.next;
350 	} while (1);
351 }
352 
353 static int
radix_tree_add_leaves(radix_tree_t * tree,char * buf,uint64_t sector,uint64_t sectors)354 radix_tree_add_leaves(radix_tree_t *tree, char *buf,
355 		      uint64_t sector, uint64_t sectors)
356 {
357 	int i;
358 	radix_tree_page_t *page;
359 
360 	page = radix_tree_allocate_page(tree, buf, sector,
361 					sectors << RADIX_TREE_NODE_SHIFT);
362 	if (!page)
363 		return -ENOMEM;
364 
365 	for (i = 0; i < sectors; i++)
366 		if (!radix_tree_add_leaf(tree, sector + i,
367 					 page, (i << RADIX_TREE_NODE_SHIFT)))
368 			goto fail;
369 
370 	return 0;
371 
372 fail:
373 	page->buf = NULL;
374 	radix_tree_remove_page(tree, page);
375 	return -ENOMEM;
376 }
377 
378 static void
radix_tree_delete_branch(radix_tree_t * tree,radix_tree_node_t * node)379 radix_tree_delete_branch(radix_tree_t *tree, radix_tree_node_t *node)
380 {
381 	int i;
382 	radix_tree_link_t *link;
383 
384 	if (!node)
385 		return;
386 
387 	for (i = 0; i < RADIX_TREE_NODE_SIZE; i++) {
388 		link = node->links + i;
389 
390 		if (radix_tree_node_contains_leaves(tree, node))
391 			radix_tree_remove_page(tree, link->u.leaf.page);
392 		else
393 			radix_tree_delete_branch(tree, link->u.next);
394 
395 		radix_tree_clear_link(link);
396 	}
397 
398 	radix_tree_free_node(tree, node);
399 }
400 
401 static inline void
radix_tree_destroy(radix_tree_t * tree)402 radix_tree_destroy(radix_tree_t *tree)
403 {
404 	radix_tree_delete_branch(tree, tree->root);
405 	tree->root = NULL;
406 }
407 
408 /*
409  * returns 1 if @node is empty after pruning, 0 otherwise
410  */
411 static int
radix_tree_prune_branch(radix_tree_t * tree,radix_tree_node_t * node,uint32_t now)412 radix_tree_prune_branch(radix_tree_t *tree,
413 			radix_tree_node_t *node, uint32_t now)
414 {
415 	int i, empty;
416 	radix_tree_link_t *link;
417 
418 	empty = 1;
419 	if (!node)
420 		return empty;
421 
422 	for (i = 0; i < RADIX_TREE_NODE_SIZE; i++) {
423 		link = node->links + i;
424 
425 		if (now - link->time < BLOCK_CACHE_PAGE_IDLETIME) {
426 			if (radix_tree_node_contains_leaves(tree, node)) {
427 				empty = 0;
428 				continue;
429 			}
430 
431 			if (radix_tree_prune_branch(tree, link->u.next, now))
432 				radix_tree_clear_link(link);
433 			else
434 				empty = 0;
435 
436 			continue;
437 		}
438 
439 		if (radix_tree_node_contains_leaves(tree, node))
440 			radix_tree_remove_page(tree, link->u.leaf.page);
441 		else
442 			radix_tree_delete_branch(tree, link->u.next);
443 
444 		radix_tree_clear_link(link);
445 	}
446 
447 	if (empty && !radix_tree_node_is_root(tree, node))
448 		radix_tree_free_node(tree, node);
449 
450 	return empty;
451 }
452 
453 /*
454  * walk tree and free any node that has been idle for too long
455  */
456 static void
radix_tree_prune(radix_tree_t * tree)457 radix_tree_prune(radix_tree_t *tree)
458 {
459 	struct timeval now;
460 
461 	if (!tree->root)
462 		return;
463 
464 	DPRINTF("tree %s has %"PRIu64" bytes\n",
465 		tree->cache->name, tree->size);
466 
467 	gettimeofday(&now, NULL);
468 	radix_tree_prune_branch(tree, tree->root, now.tv_sec);
469 
470 	DPRINTF("tree %s now has %"PRIu64" bytes\n",
471 		tree->cache->name, tree->size);
472 }
473 
474 static inline int
radix_tree_initialize(radix_tree_t * tree,uint64_t sectors)475 radix_tree_initialize(radix_tree_t *tree, uint64_t sectors)
476 {
477 	tree->height = radix_tree_calculate_height(sectors);
478 	tree->root   = radix_tree_allocate_node(tree, tree->height);
479 	if (!tree->root)
480 		return -ENOMEM;
481 
482 	return 0;
483 }
484 
485 static inline void
radix_tree_free(radix_tree_t * tree)486 radix_tree_free(radix_tree_t *tree)
487 {
488 	radix_tree_destroy(tree);
489 }
490 
491 static void
block_cache_prune_event(event_id_t id,char mode,void * private)492 block_cache_prune_event(event_id_t id, char mode, void *private)
493 {
494 	radix_tree_t *tree;
495 	block_cache_t *cache;
496 
497 	cache = (block_cache_t *)private;
498 	tree  = &cache->tree;
499 
500 	radix_tree_prune(tree);
501 }
502 
503 static inline block_cache_request_t *
block_cache_get_request(block_cache_t * cache)504 block_cache_get_request(block_cache_t *cache)
505 {
506 	if (!cache->requests_free)
507 		return NULL;
508 
509 	return cache->request_free_list[--cache->requests_free];
510 }
511 
512 static inline void
block_cache_put_request(block_cache_t * cache,block_cache_request_t * breq)513 block_cache_put_request(block_cache_t *cache, block_cache_request_t *breq)
514 {
515 	memset(breq, 0, sizeof(block_cache_request_t));
516 	cache->request_free_list[cache->requests_free++] = breq;
517 }
518 
519 static int
block_cache_open(td_driver_t * driver,const char * name,td_flag_t flags)520 block_cache_open(td_driver_t *driver, const char *name, td_flag_t flags)
521 {
522 	int i, err;
523 	radix_tree_t *tree;
524 	block_cache_t *cache;
525 
526 	if (!td_flag_test(flags, TD_OPEN_RDONLY))
527 		return -EINVAL;
528 
529 	if (driver->info.sector_size != RADIX_TREE_NODE_SIZE)
530 		return -EINVAL;
531 
532 	cache = (block_cache_t *)driver->data;
533 	err   = tapdisk_namedup(&cache->name, (char *)name);
534 	if (err)
535 		return -ENOMEM;
536 
537 	cache->sectors = driver->info.size;
538 
539 	tree = &cache->tree;
540 	err  = radix_tree_initialize(tree, cache->sectors);
541 	if (err)
542 		goto fail;
543 
544 	tree->cache = cache;
545 	cache->requests_free = BLOCK_CACHE_REQUESTS;
546 	for (i = 0; i < BLOCK_CACHE_REQUESTS; i++)
547 		cache->request_free_list[i] = cache->requests + i;
548 
549 	cache->timeout_id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT,
550 							  -1, /* dummy fd */
551 							  BLOCK_CACHE_PAGE_IDLETIME << 1,
552 							  block_cache_prune_event,
553 							  cache);
554 	if (cache->timeout_id < 0)
555 		goto fail;
556 
557 	DPRINTF("opening cache for %s, sectors: %"PRIu64", "
558 		"tree: %p, height: %d\n",
559 		cache->name, cache->sectors, tree, tree->height);
560 
561 	if (mlockall(MCL_CURRENT | MCL_FUTURE))
562 		DPRINTF("mlockall failed: %d\n", -errno);
563 
564 	return 0;
565 
566 fail:
567 	free(cache->name);
568 	radix_tree_free(&cache->tree);
569 	return err;
570 }
571 
572 static int
block_cache_close(td_driver_t * driver)573 block_cache_close(td_driver_t *driver)
574 {
575 	radix_tree_t *tree;
576 	block_cache_t *cache;
577 
578 	cache = (block_cache_t *)driver->data;
579 	tree  = &cache->tree;
580 
581 	DPRINTF("closing cache for %s\n", cache->name);
582 
583 	tapdisk_server_unregister_event(cache->timeout_id);
584 	radix_tree_free(tree);
585 	free(cache->name);
586 
587 	return 0;
588 }
589 
590 static inline uint64_t
block_cache_hash(block_cache_t * cache,char * buf)591 block_cache_hash(block_cache_t *cache, char *buf)
592 {
593 	int i, n;
594 	uint64_t cksm, *data;
595 
596 	return 0;
597 
598 	cksm = 0;
599 	data = (uint64_t *)buf;
600 	n    = RADIX_TREE_NODE_SIZE / sizeof(uint64_t);
601 
602 	for (i = 0; i < n; i++)
603 		cksm += data[i];
604 
605 	return ~cksm;
606 }
607 
608 static void
block_cache_hit(block_cache_t * cache,td_request_t treq,char * iov[])609 block_cache_hit(block_cache_t *cache, td_request_t treq, char *iov[])
610 {
611 	int i;
612 	off_t off;
613 
614 	cache->stats.hits += treq.secs;
615 
616 	for (i = 0; i < treq.secs; i++) {
617 		DBG("%s: block cache hit: sec 0x%08llx, hash: 0x%08llx\n",
618 		    cache->name, treq.sec + i, block_cache_hash(cache, iov[i]));
619 
620 		off = i << RADIX_TREE_NODE_SHIFT;
621 		memcpy(treq.buf + off, iov[i], RADIX_TREE_NODE_SIZE);
622 	}
623 
624 	td_complete_request(treq, 0);
625 }
626 
627 static void
block_cache_populate_cache(td_request_t clone,int err)628 block_cache_populate_cache(td_request_t clone, int err)
629 {
630 	int i;
631 	radix_tree_t *tree;
632 	block_cache_t *cache;
633 	block_cache_request_t *breq;
634 
635 	breq        = (block_cache_request_t *)clone.cb_data;
636 	cache       = breq->cache;
637 	tree        = &cache->tree;
638 	breq->secs -= clone.secs;
639 	breq->err   = (breq->err ? breq->err : err);
640 
641 	if (breq->secs)
642 		return;
643 
644 	if (breq->err) {
645 		free(breq->buf);
646 		goto out;
647 	}
648 
649 	for (i = 0; i < breq->treq.secs; i++) {
650 		off_t off = i << RADIX_TREE_NODE_SHIFT;
651 		DBG("%s: populating sec 0x%08llx\n",
652 		    cache->name, breq->treq.sec + i);
653 		memcpy(breq->treq.buf + off,
654 		       breq->buf + off, RADIX_TREE_NODE_SIZE);
655 	}
656 
657 	if (radix_tree_add_leaves(tree, breq->buf,
658 				  breq->treq.sec, breq->treq.secs))
659 		free(breq->buf);
660 
661 out:
662 	td_complete_request(breq->treq, breq->err);
663 	block_cache_put_request(cache, breq);
664 }
665 
666 static void
block_cache_miss(block_cache_t * cache,td_request_t treq)667 block_cache_miss(block_cache_t *cache, td_request_t treq)
668 {
669 	char *buf;
670 	size_t size;
671 	td_request_t clone;
672 	radix_tree_t *tree;
673 	block_cache_request_t *breq;
674 
675 	DBG("%s: block cache miss: sec 0x%08llx\n", cache->name, treq.sec);
676 
677 	clone = treq;
678 	tree  = &cache->tree;
679 	size  = treq.secs << RADIX_TREE_NODE_SHIFT;
680 
681 	cache->stats.misses += treq.secs;
682 
683 	if (radix_tree_size(tree) + size >= BLOCK_CACHE_MAX_SIZE)
684 		goto out;
685 
686 	breq = block_cache_get_request(cache);
687 	if (!breq)
688 		goto out;
689 
690 	if (posix_memalign((void **)&buf, RADIX_TREE_NODE_SIZE, size)) {
691 		block_cache_put_request(cache, breq);
692 		goto out;
693 	}
694 
695 	breq->treq    = treq;
696 	breq->secs    = treq.secs;
697 	breq->err     = 0;
698 	breq->buf     = buf;
699 	breq->cache   = cache;
700 
701 	clone.buf     = buf;
702 	clone.cb      = block_cache_populate_cache;
703 	clone.cb_data = breq;
704 
705 out:
706 	td_forward_request(clone);
707 }
708 
709 static void
block_cache_queue_read(td_driver_t * driver,td_request_t treq)710 block_cache_queue_read(td_driver_t *driver, td_request_t treq)
711 {
712 	int i;
713 	radix_tree_t *tree;
714 	block_cache_t *cache;
715 	char *iov[BLOCK_CACHE_NODES_PER_PAGE];
716 
717 	cache = (block_cache_t *)driver->data;
718 	tree  = &cache->tree;
719 
720 	cache->stats.reads += treq.secs;
721 
722 	if (treq.secs > BLOCK_CACHE_NODES_PER_PAGE)
723 		return td_forward_request(treq);
724 
725 	for (i = 0; i < treq.secs; i++) {
726 		iov[i] = radix_tree_find_leaf(tree, treq.sec + i);
727 		if (!iov[i])
728 			return block_cache_miss(cache, treq);
729 	}
730 
731 	return block_cache_hit(cache, treq, iov);
732 }
733 
734 static void
block_cache_queue_write(td_driver_t * driver,td_request_t treq)735 block_cache_queue_write(td_driver_t *driver, td_request_t treq)
736 {
737 	td_complete_request(treq, -EPERM);
738 }
739 
740 static int
block_cache_get_parent_id(td_driver_t * driver,td_disk_id_t * id)741 block_cache_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
742 {
743 	return -EINVAL;
744 }
745 
746 static int
block_cache_validate_parent(td_driver_t * driver,td_driver_t * pdriver,td_flag_t flags)747 block_cache_validate_parent(td_driver_t *driver,
748 			    td_driver_t *pdriver, td_flag_t flags)
749 {
750 	block_cache_t *cache;
751 
752 	if (!td_flag_test(pdriver->state, TD_DRIVER_RDONLY))
753 		return -EINVAL;
754 
755 	cache = (block_cache_t *)driver->data;
756 	if (strcmp(driver->name, pdriver->name))
757 		return -EINVAL;
758 
759 	return 0;
760 }
761 
762 static void
block_cache_debug(td_driver_t * driver)763 block_cache_debug(td_driver_t *driver)
764 {
765 	block_cache_t *cache;
766 	block_cache_stats_t *stats;
767 
768 	cache = (block_cache_t *)driver->data;
769 	stats = &cache->stats;
770 
771 	WARN("BLOCK CACHE %s\n", cache->name);
772 	WARN("reads: %"PRIu64", hits: %"PRIu64", misses: %"PRIu64", prunes: %"PRIu64"\n",
773 	     stats->reads, stats->hits, stats->misses, stats->prunes);
774 }
775 
776 struct tap_disk tapdisk_block_cache = {
777 	.disk_type                  = "tapdisk_block_cache",
778 	.flags                      = 0,
779 	.private_data_size          = sizeof(block_cache_t),
780 	.td_open                    = block_cache_open,
781 	.td_close                   = block_cache_close,
782 	.td_queue_read              = block_cache_queue_read,
783 	.td_queue_write             = block_cache_queue_write,
784 	.td_get_parent_id           = block_cache_get_parent_id,
785 	.td_validate_parent         = block_cache_validate_parent,
786 	.td_debug                   = block_cache_debug,
787 };
788