1 /*
2 * Copyright (c) 2008, XenSource Inc.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * * Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * * Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 * * Neither the name of XenSource Inc. nor the names of its contributors
13 * may be used to endorse or promote products derived from this software
14 * without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
20 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /* Driver to sit on top of another disk and log writes, in order
30 * to synchronize two distinct disks
31 *
32 * On receipt of a control request it can export a list of dirty
33 * sectors in the following format:
34 * struct writerange {
35 * u64 sector;
36 * u32 count;
37 * }
38 * terminated by { 0, 0 }
39 */
40
41 #include <errno.h>
42 #include <stdio.h>
43 #include <fcntl.h>
44 #include <unistd.h>
45 #include <stdlib.h>
46 #include <sys/mman.h>
47 #include <sys/socket.h>
48 #include <sys/un.h>
49
50 #include "xc_bitops.h"
51 #include "log.h"
52 #include "tapdisk.h"
53 #include "tapdisk-server.h"
54 #include "tapdisk-driver.h"
55 #include "tapdisk-interface.h"
56
57 #define MAX_CONNECTIONS 1
58
59 typedef struct poll_fd {
60 int fd;
61 event_id_t id;
62 } poll_fd_t;
63
64 struct tdlog_state {
65 uint64_t size;
66
67 void* writelog;
68
69 char* ctlpath;
70 poll_fd_t ctl;
71
72 int connected;
73 poll_fd_t connections[MAX_CONNECTIONS];
74
75 char* shmpath;
76 void* shm;
77
78 log_sring_t* sring;
79 log_back_ring_t bring;
80 };
81
82 #define BDPRINTF(_f, _a...) syslog (LOG_DEBUG, "log: " _f "\n", ## _a)
83
84 #define BWPRINTF(_f, _a...) syslog (LOG_WARNING, "log: " _f "\n", ## _a)
85
86 static void ctl_accept(event_id_t, char, void *);
87 static void ctl_request(event_id_t, char, void *);
88
89 /* -- write log -- */
90
91 /* large flat bitmaps don't scale particularly well either in size or scan
92 * time, but they'll do for now */
93
writelog_create(struct tdlog_state * s)94 static int writelog_create(struct tdlog_state *s)
95 {
96 uint64_t bmsize;
97
98 bmsize = bitmap_size(s->size);
99
100 BDPRINTF("allocating %"PRIu64" bytes for dirty bitmap", bmsize);
101
102 s->writelog = bitmap_alloc(s->size);
103 if (!s->writelog) {
104 BWPRINTF("could not allocate dirty bitmap of size %"PRIu64, bmsize);
105 return -1;
106 }
107
108 return 0;
109 }
110
writelog_free(struct tdlog_state * s)111 static int writelog_free(struct tdlog_state *s)
112 {
113 if (s->writelog)
114 free(s->writelog);
115
116 return 0;
117 }
118
writelog_set(struct tdlog_state * s,uint64_t sector,int count)119 static int writelog_set(struct tdlog_state* s, uint64_t sector, int count)
120 {
121 int i;
122
123 for (i = 0; i < count; i++)
124 set_bit(sector + i, s->writelog);
125
126 return 0;
127 }
128
129 /* if end is 0, clear to end of disk */
writelog_clear(struct tdlog_state * s,uint64_t start,uint64_t end)130 int writelog_clear(struct tdlog_state* s, uint64_t start, uint64_t end)
131 {
132 if (!end)
133 end = s->size;
134
135 /* clear to word boundaries */
136 while (BITMAP_SHIFT(start))
137 clear_bit(start++, s->writelog);
138 while (BITMAP_SHIFT(end))
139 clear_bit(end--, s->writelog);
140
141 memset(s->writelog + start / BITS_PER_LONG, 0, (end - start) >> 3);
142
143 return 0;
144 }
145
146 /* returns last block exported (may not be end of disk if shm region
147 * overflows) */
writelog_export(struct tdlog_state * s)148 static uint64_t writelog_export(struct tdlog_state* s)
149 {
150 struct disk_range* range = s->shm;
151 uint64_t i = 0;
152
153 BDPRINTF("sector count: %"PRIu64, s->size);
154
155 for (i = 0; i < s->size; i++) {
156 if (test_bit(i, s->writelog)) {
157 /* range start */
158 range->sector = i;
159 range->count = 1;
160 /* find end */
161 for (i++; i < s->size && test_bit(i, s->writelog); i++)
162 range->count++;
163
164 BDPRINTF("export: dirty extent %"PRIu64":%u",
165 range->sector, range->count);
166 range++;
167
168 /* out of space in shared memory region */
169 if ((void*)range >= bmend(s->shm)) {
170 BDPRINTF("out of space in shm region at sector %"PRIu64, i);
171 return i;
172 }
173
174 /* undo forloop increment */
175 i--;
176 }
177 }
178
179 /* NULL-terminate range list */
180 range->sector = 0;
181 range->count = 0;
182
183 return i;
184 }
185
186 /* -- communication channel -- */
187
/*
 * Replace the filesystem-special characters ':' and '/' with '_' in at
 * most len bytes of path, stopping early at the terminating NUL.
 */
static inline void path_escape(char* path, size_t len)
{
  size_t i;

  /* size_t index: matches the type of len, no signed/unsigned mix */
  for (i = 0; i < len && path[i] != '\0'; i++) {
    if (path[i] == ':' || path[i] == '/')
      path[i] = '_';
  }
}
196
ctl_makepath(const char * name,const char * ext)197 static char* ctl_makepath(const char* name, const char* ext)
198 {
199 char* res;
200 char *file;
201
202 file = strrchr(name, '/');
203 if (!file) {
204 BWPRINTF("invalid name %s\n", name);
205 return NULL;
206 }
207
208 if (asprintf(&res, BLKTAP_CTRL_DIR "/log_%s.%s", file, ext) < 0) {
209 BWPRINTF("could not allocate path");
210 return NULL;
211 }
212
213 path_escape(res + strlen(BLKTAP_CTRL_DIR) + 5, strlen(file));
214
215 return res;
216 }
217
shmem_open(struct tdlog_state * s,const char * name)218 static int shmem_open(struct tdlog_state* s, const char* name)
219 {
220 int i, l, fd;
221
222 /* device name -> path */
223 if (asprintf(&s->shmpath, "/log_%s.wlog", name) < 0) {
224 BWPRINTF("could not allocate shm path");
225 return -1;
226 }
227
228 path_escape(s->shmpath + 5, strlen(name));
229
230 if ((fd = shm_open(s->shmpath, O_CREAT|O_RDWR, 0750)) < 0) {
231 BWPRINTF("could not open shared memory file %s: %s", s->shmpath,
232 strerror(errno));
233 goto err;
234 }
235 if (ftruncate(fd, SHMSIZE) < 0) {
236 BWPRINTF("error truncating shmem to size %u", SHMSIZE);
237 close(fd);
238 goto err;
239 }
240
241 s->shm = mmap(NULL, SHMSIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
242 close(fd);
243 if (s->shm == MAP_FAILED) {
244 BWPRINTF("could not mmap write log shm: %s", strerror(errno));
245 goto err;
246 }
247 return 0;
248
249 err:
250 s->shm = NULL;
251 free(s->shmpath);
252 s->shmpath = NULL;
253 return -1;
254 }
255
shmem_close(struct tdlog_state * s)256 static int shmem_close(struct tdlog_state* s)
257 {
258 if (s->shm) {
259 munmap(s->shm, SHMSIZE);
260 s->shm = NULL;
261 }
262
263 if (s->shmpath) {
264 shm_unlink(s->shmpath);
265 s->shmpath = NULL;
266 }
267
268 return 0;
269 }
270
271 /* control socket */
272
ctl_open(struct tdlog_state * s,const char * name)273 static int ctl_open(struct tdlog_state* s, const char* name)
274 {
275 struct sockaddr_un saddr;
276
277 if (!(s->ctlpath = ctl_makepath(name, "ctl")))
278 return -1;
279
280 if ((s->ctl.fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
281 BWPRINTF("error opening control socket: %s", strerror(errno));
282 goto err;
283 }
284
285 memset(&saddr, 0, sizeof(saddr));
286 saddr.sun_family = AF_UNIX;
287 memcpy(saddr.sun_path, s->ctlpath, strlen(s->ctlpath));
288 if (unlink(s->ctlpath) && errno != ENOENT) {
289 BWPRINTF("error unlinking old socket path %s: %s", s->ctlpath,
290 strerror(errno));
291 goto err_sock;
292 }
293
294 if (bind(s->ctl.fd, (const struct sockaddr *)&saddr, sizeof(saddr)) < 0) {
295 BWPRINTF("error binding control socket to %s: %s", s->ctlpath,
296 strerror(errno));
297 goto err_sock;
298 }
299
300 if (listen(s->ctl.fd, 1) < 0) {
301 BWPRINTF("error listening on control socket: %s", strerror(errno));
302 goto err_sock;
303 }
304
305 s->ctl.id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
306 s->ctl.fd, 0, ctl_accept, s);
307 if (s->ctl.id < 0) {
308 BWPRINTF("error register event handler: %s", strerror(s->ctl.id));
309 goto err_sock;
310 }
311
312 return 0;
313
314 err_sock:
315 close(s->ctl.fd);
316 s->ctl.fd = -1;
317 err:
318 free(s->ctlpath);
319 s->ctlpath = NULL;
320
321 return -1;
322 }
323
ctl_close(struct tdlog_state * s)324 static int ctl_close(struct tdlog_state* s)
325 {
326 while (s->connected) {
327 s->connected--;
328 tapdisk_server_unregister_event(s->connections[s->connected].id);
329 close(s->connections[s->connected].fd);
330 s->connections[s->connected].fd = -1;
331 s->connections[s->connected].id = 0;
332 }
333
334 if (s->ctl.fd >= 0) {
335 tapdisk_server_unregister_event(s->ctl.id);
336 close(s->ctl.fd);
337 s->ctl.fd = -1;
338 s->ctl.id = 0;
339 }
340
341 if (s->ctlpath) {
342 unlink(s->ctlpath);
343 free(s->ctlpath);
344 s->ctlpath = NULL;
345 }
346
347 /* XXX this must be fixed once requests are actually in flight */
348 /* could just drain the existing ring here first */
349 if (s->sring) {
350 SHARED_RING_INIT(s->sring);
351 BACK_RING_INIT(&s->bring, s->sring, SRINGSIZE);
352 }
353
354 return 0;
355 }
356
357 /* walk list of open sockets, close matching fd */
ctl_close_sock(struct tdlog_state * s,int fd)358 static int ctl_close_sock(struct tdlog_state* s, int fd)
359 {
360 int i;
361
362 for (i = 0; i < s->connected; i++) {
363 if (s->connections[i].fd == fd) {
364 tapdisk_server_unregister_event(s->connections[i].id);
365 close(s->connections[i].fd);
366 s->connections[i].fd = -1;
367 s->connections[i].id = 0;
368 s->connected--;
369 return 0;
370 }
371 }
372
373 BWPRINTF("requested to close unknown socket %d", fd);
374 return -1;
375 }
376
/*
 * Event callback for the listening control socket: accept one client.
 *
 * The connection is accepted first and then closed again if all
 * MAX_CONNECTIONS slots are in use.  Otherwise ctl_request is registered
 * as its read handler and the connection is recorded in s->connections.
 *
 * id and mode are not used; private is the struct tdlog_state.
 */
static void ctl_accept(event_id_t id, char mode, void *private)
{
  struct tdlog_state* s = (struct tdlog_state *)private;
  event_id_t cid;
  int fd;

  if ((fd = accept(s->ctl.fd, NULL, NULL)) < 0) {
    BWPRINTF("error accepting control connection: %s", strerror(errno));
    return;
  }

  if (s->connected >= MAX_CONNECTIONS) {
    BWPRINTF("control session in progress, closing new connection");
    close(fd);
    return;
  }

  cid = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
                                      fd, 0, ctl_request, s);
  if (cid < 0) {
    /* NOTE(review): a negative id is taken to be -errno -- confirm
     * against tapdisk_server_register_event */
    BWPRINTF("error registering connection event handler: %s",
             strerror(-cid));
    close(fd);
    return;
  }

  s->connections[s->connected].fd = fd;
  s->connections[s->connected].id = cid;
  s->connected++;
}
406
407 /* response format: 4 bytes shmsize, 0-terminated path */
ctl_get_shmpath(struct tdlog_state * s,int fd)408 static int ctl_get_shmpath(struct tdlog_state* s, int fd)
409 {
410 char msg[CTLRSPLEN_SHMP + 1];
411 uint32_t sz;
412 int rc;
413
414 BDPRINTF("ctl: sending shared memory parameters (size: %u, path: %s)",
415 SHMSIZE, s->shmpath);
416
417 /* TMP: sanity-check shm */
418 sz = 0xdeadbeef;
419 memcpy(s->shm, &sz, sizeof(sz));
420
421 sz = SHMSIZE;
422 memcpy(msg, &sz, sizeof(sz));
423 snprintf(msg + sizeof(sz), sizeof(msg) - sizeof(sz), "%s", s->shmpath);
424 if ((rc = write(fd, msg, CTLRSPLEN_SHMP)) < 0) {
425 BWPRINTF("error writing shmpath: %s", strerror(errno));
426 return -1;
427 }
428
429 return 0;
430 }
431
ctl_peek_writes(struct tdlog_state * s,int fd)432 static int ctl_peek_writes(struct tdlog_state* s, int fd)
433 {
434 int rc;
435
436 BDPRINTF("ctl: peeking bitmap");
437
438 writelog_export(s);
439
440 if ((rc = write(fd, "done", CTLRSPLEN_PEEK)) < 0) {
441 BWPRINTF("error writing peek ack: %s", strerror(errno));
442 return -1;
443 }
444
445 return 0;
446 }
447
ctl_clear_writes(struct tdlog_state * s,int fd)448 static int ctl_clear_writes(struct tdlog_state* s, int fd)
449 {
450 int rc;
451
452 BDPRINTF("ctl: clearing bitmap");
453
454 writelog_clear(s, 0, 0);
455
456 if ((rc = write(fd, "done", CTLRSPLEN_CLEAR)) < 0) {
457 BWPRINTF("error writing clear ack: %s", strerror(errno));
458 return -1;
459 }
460
461 return 0;
462 }
463
464 /* get dirty bitmap and clear it atomically */
ctl_get_writes(struct tdlog_state * s,int fd)465 static int ctl_get_writes(struct tdlog_state* s, int fd)
466 {
467 int rc;
468
469 BDPRINTF("ctl: getting bitmap");
470
471 writelog_export(s);
472 writelog_clear(s, 0, 0);
473
474 if ((rc = write(fd, "done", CTLRSPLEN_GET)) < 0) {
475 BWPRINTF("error writing get ack: %s", strerror(errno));
476 return -1;
477 }
478
479 return 0;
480 }
481
482 /* get requests from ring */
ctl_kick(struct tdlog_state * s,int fd)483 static int ctl_kick(struct tdlog_state* s, int fd)
484 {
485 RING_IDX reqstart, reqend;
486 log_request_t req;
487
488 /* XXX testing */
489 RING_IDX rspstart, rspend;
490 log_response_t rsp;
491 struct log_ctlmsg msg;
492 int rc;
493
494 reqstart = s->bring.req_cons;
495 reqend = s->sring->req_prod;
496
497 BDPRINTF("ctl: ring kicked (start = %u, end = %u)", reqstart, reqend);
498
499 while (reqstart != reqend) {
500 /* XXX actually submit these! */
501 memcpy(&req, RING_GET_REQUEST(&s->bring, reqstart), sizeof(req));
502 BDPRINTF("ctl: read request %"PRIu64":%u", req.sector, req.count);
503 s->bring.req_cons = ++reqstart;
504
505 rsp.sector = req.sector;
506 rsp.count = req.count;
507 memcpy(RING_GET_RESPONSE(&s->bring, s->bring.rsp_prod_pvt), &rsp,
508 sizeof(rsp));
509 s->bring.rsp_prod_pvt++;
510 }
511
512 RING_PUSH_RESPONSES(&s->bring);
513 memset(&msg, 0, sizeof(msg));
514 memcpy(msg.msg, LOGCMD_KICK, 4);
515 if ((rc = write(fd, &msg, sizeof(msg))) < 0) {
516 BWPRINTF("error sending notify: %s", strerror(errno));
517 return -1;
518 } else if (rc < sizeof(msg)) {
519 BWPRINTF("short notify write (%d/%zd)", rc, sizeof(msg));
520 return -1;
521 }
522
523 return 0;
524 }
525
ctl_do_request(struct tdlog_state * s,int fd,struct log_ctlmsg * msg)526 static int ctl_do_request(struct tdlog_state* s, int fd, struct log_ctlmsg* msg)
527 {
528 if (!strncmp(msg->msg, LOGCMD_SHMP, 4)) {
529 return ctl_get_shmpath(s, fd);
530 } else if (!strncmp(msg->msg, LOGCMD_PEEK, 4)) {
531 return ctl_peek_writes(s, fd);
532 } else if (!strncmp(msg->msg, LOGCMD_CLEAR, 4)) {
533 return ctl_clear_writes(s, fd);
534 } else if (!strncmp(msg->msg, LOGCMD_GET, 4)) {
535 return ctl_get_writes(s, fd);
536 } else if (!strncmp(msg->msg, LOGCMD_KICK, 4)) {
537 return ctl_kick(s, fd);
538 }
539
540 BWPRINTF("unknown control request %.4s", msg->msg);
541 return -1;
542 }
543
/*
 * Look up the client connection whose registered event id is id.
 *
 * Returns its file descriptor, or -1 if no live connection matches.
 */
static inline int ctl_find_connection(struct tdlog_state *s, event_id_t id)
{
  const poll_fd_t *conn = s->connections;
  const poll_fd_t *end = s->connections + s->connected;

  while (conn < end) {
    if (conn->id == id)
      return conn->fd;
    conn++;
  }

  BWPRINTF("unrecognized event callback id %d", id);
  return -1;
}
555
/*
 * Event callback for a client connection: read one control message and
 * act on it.
 *
 * The connection is closed on a read error or on EOF.  A message
 * shorter than struct log_ctlmsg is logged and ignored.  The result of
 * the request handler is not examined.
 *
 * mode is not used; private is the struct tdlog_state.
 */
static void ctl_request(event_id_t id, char mode, void *private)
{
  struct tdlog_state* s = (struct tdlog_state*)private;
  struct log_ctlmsg msg;
  ssize_t rc;
  int fd;

  fd = ctl_find_connection(s, id);
  if (fd == -1)
    return;

  rc = read(fd, &msg, sizeof(msg));
  if (rc < 0) {
    BWPRINTF("error reading from ctl socket %d, closing: %s", fd,
             strerror(errno));
    ctl_close_sock(s, fd);
    return;
  }

  if (rc == 0) {
    BDPRINTF("ctl_request: EOF, closing socket");
    ctl_close_sock(s, fd);
    return;
  }

  if ((size_t)rc < sizeof(msg)) {
    BWPRINTF("short request received (%zd/%zu bytes), ignoring", rc,
             sizeof(msg));
    return;
  }

  ctl_do_request(s, fd, &msg);
}
583
584 /* -- interface -- */
585
586 static int tdlog_close(td_driver_t*);
587
tdlog_open(td_driver_t * driver,const char * name,td_flag_t flags)588 static int tdlog_open(td_driver_t* driver, const char* name, td_flag_t flags)
589 {
590 struct tdlog_state* s = (struct tdlog_state*)driver->data;
591 int rc;
592
593 memset(s, 0, sizeof(*s));
594
595 s->size = driver->info.size;
596
597 if ((rc = writelog_create(s))) {
598 tdlog_close(driver);
599 return rc;
600 }
601 if ((rc = shmem_open(s, name))) {
602 tdlog_close(driver);
603 return rc;
604 }
605 if ((rc = ctl_open(s, name))) {
606 tdlog_close(driver);
607 return rc;
608 }
609
610 s->sring = (log_sring_t*)sringstart(s->shm);
611 SHARED_RING_INIT(s->sring);
612 BACK_RING_INIT(&s->bring, s->sring, SRINGSIZE);
613
614 BDPRINTF("opened ctl socket");
615
616 return 0;
617 }
618
/*
 * Close the logging driver: shut down the control channel, release the
 * shared memory region and free the dirty bitmap.  Also used to unwind
 * a partially completed tdlog_open().  Always returns 0.
 */
static int tdlog_close(td_driver_t* driver)
{
  struct tdlog_state* s = (struct tdlog_state*)driver->data;

  /* ctl_close resets the ring, so it has to run while the shared region
   * is still mapped */
  ctl_close(s);
  shmem_close(s);
  /* the ring lived inside the region that was just unmapped */
  s->sring = NULL;
  writelog_free(s);

  return 0;
}
629
/* Reads are not logged; hand the request on unchanged. */
static void tdlog_queue_read(td_driver_t* driver, td_request_t treq)
{
  (void)driver;

  td_forward_request(treq);
}
634
/*
 * Record the sectors covered by a write request in the dirty bitmap,
 * then hand the request on unchanged.
 */
static void tdlog_queue_write(td_driver_t* driver, td_request_t treq)
{
  struct tdlog_state* s = (struct tdlog_state*)driver->data;

  writelog_set(s, treq.sec, treq.secs);
  td_forward_request(treq);
}
643
/* This driver reports no parent id: always fails with -EINVAL. */
static int tdlog_get_parent_id(td_driver_t* driver, td_disk_id_t* id)
{
  (void)driver;
  (void)id;

  return -EINVAL;
}
648
/* No checks are made on the parent: always returns 0. */
static int tdlog_validate_parent(td_driver_t *driver,
                                 td_driver_t *parent, td_flag_t flags)
{
  (void)driver;
  (void)parent;
  (void)flags;

  return 0;
}
654
655 struct tap_disk tapdisk_log = {
656 .disk_type = "tapdisk_log",
657 .private_data_size = sizeof(struct tdlog_state),
658 .flags = 0,
659 .td_open = tdlog_open,
660 .td_close = tdlog_close,
661 .td_queue_read = tdlog_queue_read,
662 .td_queue_write = tdlog_queue_write,
663 .td_get_parent_id = tdlog_get_parent_id,
664 .td_validate_parent = tdlog_validate_parent,
665 };
666