1 /*
2  * Copyright (c) 2008, XenSource Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *     * Redistributions of source code must retain the above copyright
8  *       notice, this list of conditions and the following disclaimer.
9  *     * Redistributions in binary form must reproduce the above copyright
10  *       notice, this list of conditions and the following disclaimer in the
11  *       documentation and/or other materials provided with the distribution.
12  *     * Neither the name of XenSource Inc. nor the names of its contributors
13  *       may be used to endorse or promote products derived from this software
14  *       without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
20  * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /* Driver to sit on top of another disk and log writes, in order
30  * to synchronize two distinct disks
31  *
32  * On receipt of a control request it can export a list of dirty
33  * sectors in the following format:
 * struct disk_range {
35  *   u64 sector;
36  *   u32 count;
37  * }
38  * terminated by { 0, 0 }
39  */
40 
41 #include <errno.h>
42 #include <stdio.h>
43 #include <fcntl.h>
44 #include <unistd.h>
45 #include <stdlib.h>
46 #include <sys/mman.h>
47 #include <sys/socket.h>
48 #include <sys/un.h>
49 
50 #include "xc_bitops.h"
51 #include "log.h"
52 #include "tapdisk.h"
53 #include "tapdisk-server.h"
54 #include "tapdisk-driver.h"
55 #include "tapdisk-interface.h"
56 
/* Number of slots in the connections[] array of struct tdlog_state, i.e.
 * the most control connections that can be tracked at once. */
#define MAX_CONNECTIONS 1
58 
59 typedef struct poll_fd {
60   int          fd;
61   event_id_t   id;
62 } poll_fd_t;
63 
64 struct tdlog_state {
65   uint64_t     size;
66 
67   void*        writelog;
68 
69   char*        ctlpath;
70   poll_fd_t    ctl;
71 
72   int          connected;
73   poll_fd_t    connections[MAX_CONNECTIONS];
74 
75   char*        shmpath;
76   void*        shm;
77 
78   log_sring_t* sring;
79   log_back_ring_t bring;
80 };
81 
/* syslog helpers at debug and warning level. Both prepend "log: " to the
 * format and append a newline, so callers pass the bare message. */
#define BDPRINTF(_f, ...) syslog(LOG_DEBUG, "log: " _f "\n", ##__VA_ARGS__)

#define BWPRINTF(_f, ...) syslog(LOG_WARNING, "log: " _f "\n", ##__VA_ARGS__)
85 
86 static void ctl_accept(event_id_t, char, void *);
87 static void ctl_request(event_id_t, char, void *);
88 
89 /* -- write log -- */
90 
91 /* large flat bitmaps don't scale particularly well either in size or scan
92  * time, but they'll do for now */
93 
writelog_create(struct tdlog_state * s)94 static int writelog_create(struct tdlog_state *s)
95 {
96   uint64_t bmsize;
97 
98   bmsize = bitmap_size(s->size);
99 
100   BDPRINTF("allocating %"PRIu64" bytes for dirty bitmap", bmsize);
101 
102   s->writelog = bitmap_alloc(s->size);
103   if (!s->writelog) {
104     BWPRINTF("could not allocate dirty bitmap of size %"PRIu64, bmsize);
105     return -1;
106   }
107 
108   return 0;
109 }
110 
writelog_free(struct tdlog_state * s)111 static int writelog_free(struct tdlog_state *s)
112 {
113   if (s->writelog)
114     free(s->writelog);
115 
116   return 0;
117 }
118 
writelog_set(struct tdlog_state * s,uint64_t sector,int count)119 static int writelog_set(struct tdlog_state* s, uint64_t sector, int count)
120 {
121   int i;
122 
123   for (i = 0; i < count; i++)
124     set_bit(sector + i, s->writelog);
125 
126   return 0;
127 }
128 
129 /* if end is 0, clear to end of disk */
writelog_clear(struct tdlog_state * s,uint64_t start,uint64_t end)130 int writelog_clear(struct tdlog_state* s, uint64_t start, uint64_t end)
131 {
132   if (!end)
133     end = s->size;
134 
135   /* clear to word boundaries */
136   while (BITMAP_SHIFT(start))
137     clear_bit(start++, s->writelog);
138   while (BITMAP_SHIFT(end))
139     clear_bit(end--, s->writelog);
140 
141   memset(s->writelog + start / BITS_PER_LONG, 0, (end - start) >> 3);
142 
143   return 0;
144 }
145 
146 /* returns last block exported (may not be end of disk if shm region
147  * overflows) */
writelog_export(struct tdlog_state * s)148 static uint64_t writelog_export(struct tdlog_state* s)
149 {
150   struct disk_range* range = s->shm;
151   uint64_t i = 0;
152 
153   BDPRINTF("sector count: %"PRIu64, s->size);
154 
155   for (i = 0; i < s->size; i++) {
156     if (test_bit(i, s->writelog)) {
157       /* range start */
158       range->sector = i;
159       range->count = 1;
160       /* find end */
161       for (i++; i < s->size && test_bit(i, s->writelog); i++)
162 	range->count++;
163 
164       BDPRINTF("export: dirty extent %"PRIu64":%u",
165 	       range->sector, range->count);
166       range++;
167 
168       /* out of space in shared memory region */
169       if ((void*)range >= bmend(s->shm)) {
170 	BDPRINTF("out of space in shm region at sector %"PRIu64, i);
171 	return i;
172       }
173 
174       /* undo forloop increment */
175       i--;
176     }
177   }
178 
179   /* NULL-terminate range list */
180   range->sector = 0;
181   range->count = 0;
182 
183   return i;
184 }
185 
186 /* -- communication channel -- */
187 
/* Replace the filesystem-special characters ':' and '/' with '_' in at
 * most the first len bytes of path, stopping early at the terminating NUL.
 * The string is modified in place. */
static inline void path_escape(char* path, size_t len) {
  size_t i;

  for (i = 0; i < len && path[i]; i++)
    if (path[i] == ':' || path[i] == '/')
      path[i] = '_';
}
196 
ctl_makepath(const char * name,const char * ext)197 static char* ctl_makepath(const char* name, const char* ext)
198 {
199   char* res;
200   char *file;
201 
202   file = strrchr(name, '/');
203   if (!file) {
204     BWPRINTF("invalid name %s\n", name);
205     return NULL;
206   }
207 
208   if (asprintf(&res, BLKTAP_CTRL_DIR "/log_%s.%s", file, ext) < 0) {
209     BWPRINTF("could not allocate path");
210     return NULL;
211   }
212 
213   path_escape(res + strlen(BLKTAP_CTRL_DIR) + 5, strlen(file));
214 
215   return res;
216 }
217 
shmem_open(struct tdlog_state * s,const char * name)218 static int shmem_open(struct tdlog_state* s, const char* name)
219 {
220   int i, l, fd;
221 
222   /* device name -> path */
223   if (asprintf(&s->shmpath, "/log_%s.wlog", name) < 0) {
224     BWPRINTF("could not allocate shm path");
225     return -1;
226   }
227 
228   path_escape(s->shmpath + 5, strlen(name));
229 
230   if ((fd = shm_open(s->shmpath, O_CREAT|O_RDWR, 0750)) < 0) {
231     BWPRINTF("could not open shared memory file %s: %s", s->shmpath,
232 	     strerror(errno));
233     goto err;
234   }
235   if (ftruncate(fd, SHMSIZE) < 0) {
236     BWPRINTF("error truncating shmem to size %u", SHMSIZE);
237     close(fd);
238     goto err;
239   }
240 
241   s->shm = mmap(NULL, SHMSIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
242   close(fd);
243   if (s->shm == MAP_FAILED) {
244     BWPRINTF("could not mmap write log shm: %s", strerror(errno));
245     goto err;
246   }
247   return 0;
248 
249   err:
250   s->shm = NULL;
251   free(s->shmpath);
252   s->shmpath = NULL;
253   return -1;
254 }
255 
shmem_close(struct tdlog_state * s)256 static int shmem_close(struct tdlog_state* s)
257 {
258   if (s->shm) {
259     munmap(s->shm, SHMSIZE);
260     s->shm = NULL;
261   }
262 
263   if (s->shmpath) {
264     shm_unlink(s->shmpath);
265     s->shmpath = NULL;
266   }
267 
268   return 0;
269 }
270 
271 /* control socket */
272 
ctl_open(struct tdlog_state * s,const char * name)273 static int ctl_open(struct tdlog_state* s, const char* name)
274 {
275   struct sockaddr_un saddr;
276 
277   if (!(s->ctlpath = ctl_makepath(name, "ctl")))
278     return -1;
279 
280   if ((s->ctl.fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
281     BWPRINTF("error opening control socket: %s", strerror(errno));
282     goto err;
283   }
284 
285   memset(&saddr, 0, sizeof(saddr));
286   saddr.sun_family = AF_UNIX;
287   memcpy(saddr.sun_path, s->ctlpath, strlen(s->ctlpath));
288   if (unlink(s->ctlpath) && errno != ENOENT) {
289     BWPRINTF("error unlinking old socket path %s: %s", s->ctlpath,
290 	     strerror(errno));
291     goto err_sock;
292   }
293 
294   if (bind(s->ctl.fd, (const struct sockaddr *)&saddr, sizeof(saddr)) < 0) {
295     BWPRINTF("error binding control socket to %s: %s", s->ctlpath,
296 	     strerror(errno));
297     goto err_sock;
298   }
299 
300   if (listen(s->ctl.fd, 1) < 0) {
301     BWPRINTF("error listening on control socket: %s", strerror(errno));
302     goto err_sock;
303   }
304 
305   s->ctl.id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
306 					    s->ctl.fd, 0, ctl_accept, s);
307   if (s->ctl.id < 0) {
308     BWPRINTF("error register event handler: %s", strerror(s->ctl.id));
309     goto err_sock;
310   }
311 
312   return 0;
313 
314   err_sock:
315   close(s->ctl.fd);
316   s->ctl.fd = -1;
317   err:
318   free(s->ctlpath);
319   s->ctlpath = NULL;
320 
321   return -1;
322 }
323 
ctl_close(struct tdlog_state * s)324 static int ctl_close(struct tdlog_state* s)
325 {
326   while (s->connected) {
327     s->connected--;
328     tapdisk_server_unregister_event(s->connections[s->connected].id);
329     close(s->connections[s->connected].fd);
330     s->connections[s->connected].fd = -1;
331     s->connections[s->connected].id = 0;
332   }
333 
334   if (s->ctl.fd >= 0) {
335     tapdisk_server_unregister_event(s->ctl.id);
336     close(s->ctl.fd);
337     s->ctl.fd = -1;
338     s->ctl.id = 0;
339   }
340 
341   if (s->ctlpath) {
342     unlink(s->ctlpath);
343     free(s->ctlpath);
344     s->ctlpath = NULL;
345   }
346 
347   /* XXX this must be fixed once requests are actually in flight */
348   /* could just drain the existing ring here first */
349   if (s->sring) {
350     SHARED_RING_INIT(s->sring);
351     BACK_RING_INIT(&s->bring, s->sring, SRINGSIZE);
352   }
353 
354   return 0;
355 }
356 
357 /* walk list of open sockets, close matching fd */
ctl_close_sock(struct tdlog_state * s,int fd)358 static int ctl_close_sock(struct tdlog_state* s, int fd)
359 {
360   int i;
361 
362   for (i = 0; i < s->connected; i++) {
363     if (s->connections[i].fd == fd) {
364       tapdisk_server_unregister_event(s->connections[i].id);
365       close(s->connections[i].fd);
366       s->connections[i].fd = -1;
367       s->connections[i].id = 0;
368       s->connected--;
369       return 0;
370     }
371   }
372 
373   BWPRINTF("requested to close unknown socket %d", fd);
374   return -1;
375 }
376 
/* Event callback for the listening control socket: accept a new client.
 *
 * The new connection is closed straight away if all MAX_CONNECTIONS slots
 * are in use or if its request handler cannot be registered; otherwise it
 * is recorded in connections[] with ctl_request() as its read handler.
 *
 * private is the struct tdlog_state passed at registration; id and mode
 * are not used. */
static void ctl_accept(event_id_t id, char mode, void *private)
{
  struct tdlog_state* s = (struct tdlog_state *)private;
  int fd;
  event_id_t cid;

  if ((fd = accept(s->ctl.fd, NULL, NULL)) < 0) {
    BWPRINTF("error accepting control connection: %s", strerror(errno));
    return;
  }

  if (s->connected >= MAX_CONNECTIONS) {
    BWPRINTF("control session in progress, closing new connection");
    close(fd);
    return;
  }

  cid = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
                                      fd, 0, ctl_request, s);
  if (cid < 0) {
    /* NOTE(review): assumes the negative id is a negated errno value */
    BWPRINTF("error registering connection event handler: %s",
             strerror(-cid));
    close(fd);
    return;
  }

  s->connections[s->connected].fd = fd;
  s->connections[s->connected].id = cid;
  s->connected++;
}
406 
407 /* response format: 4 bytes shmsize, 0-terminated path */
ctl_get_shmpath(struct tdlog_state * s,int fd)408 static int ctl_get_shmpath(struct tdlog_state* s, int fd)
409 {
410   char msg[CTLRSPLEN_SHMP + 1];
411   uint32_t sz;
412   int rc;
413 
414   BDPRINTF("ctl: sending shared memory parameters (size: %u, path: %s)",
415 	   SHMSIZE, s->shmpath);
416 
417   /* TMP: sanity-check shm */
418   sz = 0xdeadbeef;
419   memcpy(s->shm, &sz, sizeof(sz));
420 
421   sz = SHMSIZE;
422   memcpy(msg, &sz, sizeof(sz));
423   snprintf(msg + sizeof(sz), sizeof(msg) - sizeof(sz), "%s", s->shmpath);
424   if ((rc = write(fd, msg, CTLRSPLEN_SHMP)) < 0) {
425     BWPRINTF("error writing shmpath: %s", strerror(errno));
426     return -1;
427   }
428 
429   return 0;
430 }
431 
ctl_peek_writes(struct tdlog_state * s,int fd)432 static int ctl_peek_writes(struct tdlog_state* s, int fd)
433 {
434   int rc;
435 
436   BDPRINTF("ctl: peeking bitmap");
437 
438   writelog_export(s);
439 
440   if ((rc = write(fd, "done", CTLRSPLEN_PEEK)) < 0) {
441     BWPRINTF("error writing peek ack: %s", strerror(errno));
442     return -1;
443   }
444 
445   return 0;
446 }
447 
ctl_clear_writes(struct tdlog_state * s,int fd)448 static int ctl_clear_writes(struct tdlog_state* s, int fd)
449 {
450   int rc;
451 
452   BDPRINTF("ctl: clearing bitmap");
453 
454   writelog_clear(s, 0, 0);
455 
456   if ((rc = write(fd, "done", CTLRSPLEN_CLEAR)) < 0) {
457     BWPRINTF("error writing clear ack: %s", strerror(errno));
458     return -1;
459   }
460 
461   return 0;
462 }
463 
464 /* get dirty bitmap and clear it atomically */
ctl_get_writes(struct tdlog_state * s,int fd)465 static int ctl_get_writes(struct tdlog_state* s, int fd)
466 {
467   int rc;
468 
469   BDPRINTF("ctl: getting bitmap");
470 
471   writelog_export(s);
472   writelog_clear(s, 0, 0);
473 
474   if ((rc = write(fd, "done", CTLRSPLEN_GET)) < 0) {
475     BWPRINTF("error writing get ack: %s", strerror(errno));
476     return -1;
477   }
478 
479   return 0;
480 }
481 
482 /* get requests from ring */
ctl_kick(struct tdlog_state * s,int fd)483 static int ctl_kick(struct tdlog_state* s, int fd)
484 {
485   RING_IDX reqstart, reqend;
486   log_request_t req;
487 
488   /* XXX testing */
489   RING_IDX rspstart, rspend;
490   log_response_t rsp;
491   struct log_ctlmsg msg;
492   int rc;
493 
494   reqstart = s->bring.req_cons;
495   reqend = s->sring->req_prod;
496 
497   BDPRINTF("ctl: ring kicked (start = %u, end = %u)", reqstart, reqend);
498 
499   while (reqstart != reqend) {
500     /* XXX actually submit these! */
501     memcpy(&req, RING_GET_REQUEST(&s->bring, reqstart), sizeof(req));
502     BDPRINTF("ctl: read request %"PRIu64":%u", req.sector, req.count);
503     s->bring.req_cons = ++reqstart;
504 
505     rsp.sector = req.sector;
506     rsp.count = req.count;
507     memcpy(RING_GET_RESPONSE(&s->bring, s->bring.rsp_prod_pvt), &rsp,
508 	   sizeof(rsp));
509     s->bring.rsp_prod_pvt++;
510   }
511 
512   RING_PUSH_RESPONSES(&s->bring);
513   memset(&msg, 0, sizeof(msg));
514   memcpy(msg.msg, LOGCMD_KICK, 4);
515   if ((rc = write(fd, &msg, sizeof(msg))) < 0) {
516     BWPRINTF("error sending notify: %s", strerror(errno));
517     return -1;
518   } else if (rc < sizeof(msg)) {
519     BWPRINTF("short notify write (%d/%zd)", rc, sizeof(msg));
520     return -1;
521   }
522 
523   return 0;
524 }
525 
ctl_do_request(struct tdlog_state * s,int fd,struct log_ctlmsg * msg)526 static int ctl_do_request(struct tdlog_state* s, int fd, struct log_ctlmsg* msg)
527 {
528   if (!strncmp(msg->msg, LOGCMD_SHMP, 4)) {
529     return ctl_get_shmpath(s, fd);
530   } else if (!strncmp(msg->msg, LOGCMD_PEEK, 4)) {
531     return ctl_peek_writes(s, fd);
532   } else if (!strncmp(msg->msg, LOGCMD_CLEAR, 4)) {
533     return ctl_clear_writes(s, fd);
534   } else if (!strncmp(msg->msg, LOGCMD_GET, 4)) {
535     return ctl_get_writes(s, fd);
536   } else if (!strncmp(msg->msg, LOGCMD_KICK, 4)) {
537     return ctl_kick(s, fd);
538   }
539 
540   BWPRINTF("unknown control request %.4s", msg->msg);
541   return -1;
542 }
543 
/* Look up the client connection that was registered with event id.
 *
 * Returns the connection's file descriptor, or -1 (after logging a
 * warning) if no live connection has that id. */
static inline int ctl_find_connection(struct tdlog_state *s, event_id_t id)
{
  int slot = 0;

  while (slot < s->connected) {
    if (s->connections[slot].id == id)
      return s->connections[slot].fd;
    slot++;
  }

  BWPRINTF("unrecognized event callback id %d", id);
  return -1;
}
555 
/* Event callback for a client connection: read one control message and
 * act on it.
 *
 * A read error or end-of-file closes the connection. A message shorter
 * than struct log_ctlmsg is logged and dropped. The result of handling a
 * complete message is not checked.
 *
 * private is the struct tdlog_state passed at registration; id identifies
 * the connection; mode is not used. */
static void ctl_request(event_id_t id, char mode, void *private)
{
  struct tdlog_state* s = (struct tdlog_state*)private;
  struct log_ctlmsg msg;
  int rc;
  int fd = ctl_find_connection(s, id);

  if (fd == -1)
    return;

  rc = read(fd, &msg, sizeof(msg));

  if (rc < 0) {
    BWPRINTF("error reading from ctl socket %d, closing: %s", fd,
             strerror(errno));
    ctl_close_sock(s, fd);
    return;
  }

  if (rc == 0) {
    BDPRINTF("ctl_request: EOF, closing socket");
    ctl_close_sock(s, fd);
    return;
  }

  if (rc < sizeof(msg)) {
    BWPRINTF("short request received (%d/%zd bytes), ignoring", rc,
             sizeof(msg));
    return;
  }

  ctl_do_request(s, fd, &msg);
}
583 
584 /* -- interface -- */
585 
586 static int tdlog_close(td_driver_t*);
587 
/* Open the log driver for the disk called name.
 *
 * Allocates the dirty bitmap, creates and maps the shared memory region,
 * opens the control socket and initialises the shared ring inside the
 * mapping. If any step fails, everything set up so far is released again
 * through tdlog_close().
 *
 * Returns 0 on success, or the non-zero result of the step that failed.
 * flags is not used. */
static int tdlog_open(td_driver_t* driver, const char* name, td_flag_t flags)
{
  struct tdlog_state* s = (struct tdlog_state*)driver->data;
  int rc;

  memset(s, 0, sizeof(*s));
  /* 0 is a valid descriptor: mark the control socket as not open so that
   * tdlog_close() after an early failure does not close fd 0 */
  s->ctl.fd = -1;

  s->size = driver->info.size;

  if ((rc = writelog_create(s)))
    goto fail;
  if ((rc = shmem_open(s, name)))
    goto fail;
  if ((rc = ctl_open(s, name)))
    goto fail;

  s->sring = (log_sring_t*)sringstart(s->shm);
  SHARED_RING_INIT(s->sring);
  BACK_RING_INIT(&s->bring, s->sring, SRINGSIZE);

  BDPRINTF("opened ctl socket");

  return 0;

  fail:
  tdlog_close(driver);
  return rc;
}
618 
/* Close the log driver: shut down the control channel, release the shared
 * memory region and free the dirty bitmap, in that order.
 * Always returns 0. */
static int tdlog_close(td_driver_t* driver)
{
  struct tdlog_state* state = (struct tdlog_state*)driver->data;

  ctl_close(state);
  shmem_close(state);
  writelog_free(state);

  return 0;
}
629 
/* Reads do not dirty anything; pass the request on unchanged. */
static void tdlog_queue_read(td_driver_t* driver, td_request_t treq)
{
  (void)driver;

  td_forward_request(treq);
}
634 
/* Record the sectors covered by a write in the dirty bitmap, then pass the
 * request on. */
static void tdlog_queue_write(td_driver_t* driver, td_request_t treq)
{
  struct tdlog_state* state = (struct tdlog_state*)driver->data;

  writelog_set(state, treq.sec, treq.secs);

  td_forward_request(treq);
}
643 
/* This driver does not report a parent; always fails with -EINVAL. */
static int tdlog_get_parent_id(td_driver_t* driver, td_disk_id_t* id)
{
  (void)driver;
  (void)id;

  return -EINVAL;
}
648 
/* Nothing is checked here; every parent is accepted. Always returns 0. */
static int tdlog_validate_parent(td_driver_t *driver,
                                 td_driver_t *parent, td_flag_t flags)
{
  (void)driver;
  (void)parent;
  (void)flags;

  return 0;
}
654 
655 struct tap_disk tapdisk_log = {
656   .disk_type          = "tapdisk_log",
657   .private_data_size  = sizeof(struct tdlog_state),
658   .flags              = 0,
659   .td_open            = tdlog_open,
660   .td_close           = tdlog_close,
661   .td_queue_read      = tdlog_queue_read,
662   .td_queue_write     = tdlog_queue_write,
663   .td_get_parent_id   = tdlog_get_parent_id,
664   .td_validate_parent = tdlog_validate_parent,
665 };
666