1 /*
2 Domain communications for Xen Store Daemon.
3 Copyright (C) 2005 Rusty Russell IBM Corporation
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 #include <stdio.h>
20 #include <sys/mman.h>
21 #include <unistd.h>
22 #include <stdlib.h>
23 #include <stdarg.h>
24 #include <time.h>
25 #include <syslog.h>
26
27 #include "utils.h"
28 #include "talloc.h"
29 #include "xenstored_core.h"
30 #include "xenstored_domain.h"
31 #include "xenstored_transaction.h"
32 #include "xenstored_watch.h"
33
34 #include <xenevtchn.h>
35 #include <xenctrl.h>
36 #include <xen/grant_table.h>
37
/* Hypervisor control interface; allocated and opened in domain_init(). */
static xc_interface **xc_handle;
/*
 * Grant-table device; allocated in domain_init(). *xgt_handle stays NULL
 * when the device could not be opened, in which case the foreign-mapping
 * fallback is used by map_interface()/unmap_interface().
 */
xengnttab_handle **xgt_handle;
/* Local port bound to VIRQ_DOM_EXC in domain_init(). */
static evtchn_port_t virq_port;

/* Event channel device handle, opened in domain_init(). */
xenevtchn_handle *xce_handle = NULL;
43
/* State kept for every domain known to the daemon. */
struct domain
{
	/* Linkage on the file-scope "domains" list. */
	struct list_head list;

	/* The id of this domain */
	unsigned int domid;

	/* Local event channel port; 0 while no port is bound. */
	evtchn_port_t port;

	/* The remote end of the event channel, used only to validate
	   repeated domain introductions. */
	evtchn_port_t remote_port;

	/* The mfn supplied when the domain was introduced, used only to
	   validate repeated domain introductions. */
	unsigned long mfn;

	/* Domain path in store ("/local/domain/<domid>"). */
	char *path;

	/* Shared page holding the request and response rings. */
	struct xenstore_domain_interface *interface;

	/* The connection associated with this domain. */
	struct connection *conn;

	/* Set once domain_cleanup() has seen the domain crashed or shut
	   down; cleared again by do_resume(). */
	int shutdown;

	/* Number of store entries accounted to this domain. */
	int nbentry;

	/* Number of watches accounted to this domain. */
	int nbwatch;

	/* write rate limit */
	wrl_creditt wrl_credit; /* [ -wrl_config_writecost, +_dburst ] */
	/* Time of the last credit update (see wrl_credit_update()). */
	struct wrl_timestampt wrl_timestamp;
	/* True once the "is affected" warning was logged for this domain. */
	bool wrl_delay_logged;
};
85
/* All known domains: linked in by new_domain(), unlinked by destroy_domain(). */
static LIST_HEAD(domains);
87
/*
 * Validate a consumer/producer index pair read from a shared ring.
 * Returns true when the amount of outstanding data (prod - cons) does not
 * exceed the ring size.
 */
static bool check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod)
{
	XENSTORE_RING_IDX in_flight = prod - cons;

	if (in_flight > XENSTORE_RING_SIZE)
		return false;
	return true;
}
92
/*
 * Locate the next contiguous writable region of a ring buffer.
 *
 * Returns a pointer to the producer position inside buf and stores in *len
 * the number of bytes that may be written there: the smaller of the
 * distance to the end of the buffer and the free space left in the ring.
 */
static void *get_output_chunk(XENSTORE_RING_IDX cons,
			      XENSTORE_RING_IDX prod,
			      char *buf, uint32_t *len)
{
	uint32_t offset = MASK_XENSTORE_IDX(prod);
	uint32_t until_wrap = XENSTORE_RING_SIZE - offset;
	uint32_t free_space = XENSTORE_RING_SIZE - (prod - cons);

	*len = (free_space < until_wrap) ? free_space : until_wrap;
	return buf + offset;
}
102
/*
 * Locate the next contiguous readable region of a ring buffer.
 *
 * Returns a pointer to the consumer position inside buf and stores in *len
 * the number of bytes that may be read there: the smaller of the distance
 * to the end of the buffer and the amount of data pending in the ring.
 */
static const void *get_input_chunk(XENSTORE_RING_IDX cons,
				   XENSTORE_RING_IDX prod,
				   const char *buf, uint32_t *len)
{
	uint32_t offset = MASK_XENSTORE_IDX(cons);
	uint32_t until_wrap = XENSTORE_RING_SIZE - offset;
	uint32_t pending = prod - cons;

	*len = (pending < until_wrap) ? pending : until_wrap;
	return buf + offset;
}
112
writechn(struct connection * conn,const void * data,unsigned int len)113 static int writechn(struct connection *conn,
114 const void *data, unsigned int len)
115 {
116 uint32_t avail;
117 void *dest;
118 struct xenstore_domain_interface *intf = conn->domain->interface;
119 XENSTORE_RING_IDX cons, prod;
120
121 /* Must read indexes once, and before anything else, and verified. */
122 cons = intf->rsp_cons;
123 prod = intf->rsp_prod;
124 xen_mb();
125
126 if (!check_indexes(cons, prod)) {
127 errno = EIO;
128 return -1;
129 }
130
131 dest = get_output_chunk(cons, prod, intf->rsp, &avail);
132 if (avail < len)
133 len = avail;
134
135 memcpy(dest, data, len);
136 xen_mb();
137 intf->rsp_prod += len;
138
139 xenevtchn_notify(xce_handle, conn->domain->port);
140
141 return len;
142 }
143
readchn(struct connection * conn,void * data,unsigned int len)144 static int readchn(struct connection *conn, void *data, unsigned int len)
145 {
146 uint32_t avail;
147 const void *src;
148 struct xenstore_domain_interface *intf = conn->domain->interface;
149 XENSTORE_RING_IDX cons, prod;
150
151 /* Must read indexes once, and before anything else, and verified. */
152 cons = intf->req_cons;
153 prod = intf->req_prod;
154 xen_mb();
155
156 if (!check_indexes(cons, prod)) {
157 errno = EIO;
158 return -1;
159 }
160
161 src = get_input_chunk(cons, prod, intf->req, &avail);
162 if (avail < len)
163 len = avail;
164
165 memcpy(data, src, len);
166 xen_mb();
167 intf->req_cons += len;
168
169 xenevtchn_notify(xce_handle, conn->domain->port);
170
171 return len;
172 }
173
/*
 * Map the xenstore shared page of domain domid read/write.
 *
 * The grant-table mapping is used when the gnttab device is open;
 * otherwise the page at mfn is mapped with xc_map_foreign_range().
 * Returns whatever the underlying mapping call returns.
 */
static void *map_interface(domid_t domid, unsigned long mfn)
{
	const int prot = PROT_READ|PROT_WRITE;

	if (*xgt_handle == NULL)
		return xc_map_foreign_range(*xc_handle, domid,
					    XC_PAGE_SIZE, prot, mfn);

	/* this is the preferred method */
	return xengnttab_map_grant_ref(*xgt_handle, domid,
				       GNTTAB_RESERVED_XENSTORE, prot);
}
185
unmap_interface(void * interface)186 static void unmap_interface(void *interface)
187 {
188 if (*xgt_handle != NULL)
189 xengnttab_unmap(*xgt_handle, interface, 1);
190 else
191 munmap(interface, XC_PAGE_SIZE);
192 }
193
destroy_domain(void * _domain)194 static int destroy_domain(void *_domain)
195 {
196 struct domain *domain = _domain;
197
198 list_del(&domain->list);
199
200 if (domain->port) {
201 if (xenevtchn_unbind(xce_handle, domain->port) == -1)
202 eprintf("> Unbinding port %i failed!\n", domain->port);
203 }
204
205 if (domain->interface) {
206 /* Domain 0 was mapped by dom0_init, so it must be unmapped
207 using munmap() and not the grant unmap call. */
208 if (domain->domid == 0)
209 unmap_xenbus(domain->interface);
210 else
211 unmap_interface(domain->interface);
212 }
213
214 fire_watches(NULL, domain, "@releaseDomain", false);
215
216 wrl_domain_destroy(domain);
217
218 return 0;
219 }
220
domain_cleanup(void)221 static void domain_cleanup(void)
222 {
223 xc_dominfo_t dominfo;
224 struct domain *domain;
225 int notify = 0;
226
227 again:
228 list_for_each_entry(domain, &domains, list) {
229 if (xc_domain_getinfo(*xc_handle, domain->domid, 1,
230 &dominfo) == 1 &&
231 dominfo.domid == domain->domid) {
232 if ((dominfo.crashed || dominfo.shutdown)
233 && !domain->shutdown) {
234 domain->shutdown = 1;
235 notify = 1;
236 }
237 if (!dominfo.dying)
238 continue;
239 }
240 if (domain->conn) {
241 talloc_unlink(talloc_autofree_context(), domain->conn);
242 domain->conn = NULL;
243 notify = 0; /* destroy_domain() fires the watch */
244 goto again;
245 }
246 }
247
248 if (notify)
249 fire_watches(NULL, NULL, "@releaseDomain", false);
250 }
251
252 /* We scan all domains rather than use the information given here. */
handle_event(void)253 void handle_event(void)
254 {
255 evtchn_port_t port;
256
257 if ((port = xenevtchn_pending(xce_handle)) == -1)
258 barf_perror("Failed to read from event fd");
259
260 if (port == virq_port)
261 domain_cleanup();
262
263 if (xenevtchn_unmask(xce_handle, port) == -1)
264 barf_perror("Failed to write to event fd");
265 }
266
domain_can_read(struct connection * conn)267 bool domain_can_read(struct connection *conn)
268 {
269 struct xenstore_domain_interface *intf = conn->domain->interface;
270
271 if (domain_is_unprivileged(conn) && conn->domain->wrl_credit < 0)
272 return false;
273 return (intf->req_cons != intf->req_prod);
274 }
275
domid_is_unprivileged(unsigned int domid)276 static bool domid_is_unprivileged(unsigned int domid)
277 {
278 return domid != 0 && domid != priv_domid;
279 }
280
domain_is_unprivileged(struct connection * conn)281 bool domain_is_unprivileged(struct connection *conn)
282 {
283 return conn && conn->domain &&
284 domid_is_unprivileged(conn->domain->domid);
285 }
286
domain_can_write(struct connection * conn)287 bool domain_can_write(struct connection *conn)
288 {
289 struct xenstore_domain_interface *intf = conn->domain->interface;
290 return ((intf->rsp_prod - intf->rsp_cons) != XENSTORE_RING_SIZE);
291 }
292
/*
 * Build the store path "/local/domain/<domid>" as a talloc string owned by
 * context. Returns the result of talloc_asprintf().
 */
static char *talloc_domain_path(void *context, unsigned int domid)
{
	char *path = talloc_asprintf(context, "/local/domain/%u", domid);

	return path;
}
297
new_domain(void * context,unsigned int domid,int port)298 static struct domain *new_domain(void *context, unsigned int domid,
299 int port)
300 {
301 struct domain *domain;
302 int rc;
303
304 domain = talloc(context, struct domain);
305 if (!domain)
306 return NULL;
307
308 domain->port = 0;
309 domain->shutdown = 0;
310 domain->domid = domid;
311 domain->path = talloc_domain_path(domain, domid);
312 if (!domain->path)
313 return NULL;
314
315 wrl_domain_new(domain);
316
317 list_add(&domain->list, &domains);
318 talloc_set_destructor(domain, destroy_domain);
319
320 /* Tell kernel we're interested in this event. */
321 rc = xenevtchn_bind_interdomain(xce_handle, domid, port);
322 if (rc == -1)
323 return NULL;
324 domain->port = rc;
325
326 domain->conn = new_connection(writechn, readchn);
327 if (!domain->conn)
328 return NULL;
329
330 domain->conn->domain = domain;
331 domain->conn->id = domid;
332
333 domain->remote_port = port;
334 domain->nbentry = 0;
335 domain->nbwatch = 0;
336
337 return domain;
338 }
339
340
find_domain_by_domid(unsigned int domid)341 static struct domain *find_domain_by_domid(unsigned int domid)
342 {
343 struct domain *i;
344
345 list_for_each_entry(i, &domains, list) {
346 if (i->domid == domid)
347 return i;
348 }
349 return NULL;
350 }
351
domain_conn_reset(struct domain * domain)352 static void domain_conn_reset(struct domain *domain)
353 {
354 struct connection *conn = domain->conn;
355 struct buffered_data *out;
356
357 conn_delete_all_watches(conn);
358 conn_delete_all_transactions(conn);
359
360 while ((out = list_top(&conn->out_list, struct buffered_data, list))) {
361 list_del(&out->list);
362 talloc_free(out);
363 }
364
365 talloc_free(conn->in);
366
367 domain->interface->req_cons = domain->interface->req_prod = 0;
368 domain->interface->rsp_cons = domain->interface->rsp_prod = 0;
369 }
370
371 /* domid, mfn, evtchn, path */
do_introduce(struct connection * conn,struct buffered_data * in)372 int do_introduce(struct connection *conn, struct buffered_data *in)
373 {
374 struct domain *domain;
375 char *vec[3];
376 unsigned int domid;
377 unsigned long mfn;
378 evtchn_port_t port;
379 int rc;
380 struct xenstore_domain_interface *interface;
381
382 if (get_strings(in, vec, ARRAY_SIZE(vec)) < ARRAY_SIZE(vec))
383 return EINVAL;
384
385 if (domain_is_unprivileged(conn) || !conn->can_write)
386 return EACCES;
387
388 domid = atoi(vec[0]);
389 mfn = atol(vec[1]);
390 port = atoi(vec[2]);
391
392 /* Sanity check args. */
393 if (port <= 0)
394 return EINVAL;
395
396 domain = find_domain_by_domid(domid);
397
398 if (domain == NULL) {
399 interface = map_interface(domid, mfn);
400 if (!interface)
401 return errno;
402 /* Hang domain off "in" until we're finished. */
403 domain = new_domain(in, domid, port);
404 if (!domain) {
405 rc = errno;
406 unmap_interface(interface);
407 return rc;
408 }
409 domain->interface = interface;
410 domain->mfn = mfn;
411
412 /* Now domain belongs to its connection. */
413 talloc_steal(domain->conn, domain);
414
415 fire_watches(NULL, in, "@introduceDomain", false);
416 } else if ((domain->mfn == mfn) && (domain->conn != conn)) {
417 /* Use XS_INTRODUCE for recreating the xenbus event-channel. */
418 if (domain->port)
419 xenevtchn_unbind(xce_handle, domain->port);
420 rc = xenevtchn_bind_interdomain(xce_handle, domid, port);
421 domain->port = (rc == -1) ? 0 : rc;
422 domain->remote_port = port;
423 } else
424 return EINVAL;
425
426 domain_conn_reset(domain);
427
428 send_ack(conn, XS_INTRODUCE);
429
430 return 0;
431 }
432
find_connected_domain(unsigned int domid)433 static struct domain *find_connected_domain(unsigned int domid)
434 {
435 struct domain *domain;
436
437 domain = find_domain_by_domid(domid);
438 if (!domain)
439 return ERR_PTR(-ENOENT);
440 if (!domain->conn)
441 return ERR_PTR(-EINVAL);
442 return domain;
443 }
444
do_set_target(struct connection * conn,struct buffered_data * in)445 int do_set_target(struct connection *conn, struct buffered_data *in)
446 {
447 char *vec[2];
448 unsigned int domid, tdomid;
449 struct domain *domain, *tdomain;
450 if (get_strings(in, vec, ARRAY_SIZE(vec)) < ARRAY_SIZE(vec))
451 return EINVAL;
452
453 if (domain_is_unprivileged(conn) || !conn->can_write)
454 return EACCES;
455
456 domid = atoi(vec[0]);
457 tdomid = atoi(vec[1]);
458
459 domain = find_connected_domain(domid);
460 if (IS_ERR(domain))
461 return -PTR_ERR(domain);
462
463 tdomain = find_connected_domain(tdomid);
464 if (IS_ERR(tdomain))
465 return -PTR_ERR(tdomain);
466
467 talloc_reference(domain->conn, tdomain->conn);
468 domain->conn->target = tdomain->conn;
469
470 send_ack(conn, XS_SET_TARGET);
471
472 return 0;
473 }
474
/*
 * Parse a request carrying a single domid and resolve it to a connected
 * domain on behalf of a privileged caller.
 *
 * Returns the domain, or an ERR_PTR: -EINVAL for a missing or zero domid,
 * -EACCES for an unprivileged caller, or the find_connected_domain() error.
 */
static struct domain *onearg_domain(struct connection *conn,
				    struct buffered_data *in)
{
	const char *arg = onearg(in);
	unsigned int domid;

	if (arg == NULL)
		return ERR_PTR(-EINVAL);

	domid = atoi(arg);
	if (domid == 0)
		return ERR_PTR(-EINVAL);

	if (domain_is_unprivileged(conn))
		return ERR_PTR(-EACCES);

	return find_connected_domain(domid);
}
493
494 /* domid */
do_release(struct connection * conn,struct buffered_data * in)495 int do_release(struct connection *conn, struct buffered_data *in)
496 {
497 struct domain *domain;
498
499 domain = onearg_domain(conn, in);
500 if (IS_ERR(domain))
501 return -PTR_ERR(domain);
502
503 talloc_free(domain->conn);
504
505 send_ack(conn, XS_RELEASE);
506
507 return 0;
508 }
509
do_resume(struct connection * conn,struct buffered_data * in)510 int do_resume(struct connection *conn, struct buffered_data *in)
511 {
512 struct domain *domain;
513
514 domain = onearg_domain(conn, in);
515 if (IS_ERR(domain))
516 return -PTR_ERR(domain);
517
518 domain->shutdown = 0;
519
520 send_ack(conn, XS_RESUME);
521
522 return 0;
523 }
524
do_get_domain_path(struct connection * conn,struct buffered_data * in)525 int do_get_domain_path(struct connection *conn, struct buffered_data *in)
526 {
527 char *path;
528 const char *domid_str = onearg(in);
529
530 if (!domid_str)
531 return EINVAL;
532
533 path = talloc_domain_path(conn, atoi(domid_str));
534 if (!path)
535 return errno;
536
537 send_reply(conn, XS_GET_DOMAIN_PATH, path, strlen(path) + 1);
538
539 talloc_free(path);
540
541 return 0;
542 }
543
do_is_domain_introduced(struct connection * conn,struct buffered_data * in)544 int do_is_domain_introduced(struct connection *conn, struct buffered_data *in)
545 {
546 int result;
547 unsigned int domid;
548 const char *domid_str = onearg(in);
549
550 if (!domid_str)
551 return EINVAL;
552
553 domid = atoi(domid_str);
554 if (domid == DOMID_SELF)
555 result = 1;
556 else
557 result = (find_domain_by_domid(domid) != NULL);
558
559 send_reply(conn, XS_IS_DOMAIN_INTRODUCED, result ? "T" : "F", 2);
560
561 return 0;
562 }
563
564 /* Allow guest to reset all watches */
do_reset_watches(struct connection * conn,struct buffered_data * in)565 int do_reset_watches(struct connection *conn, struct buffered_data *in)
566 {
567 conn_delete_all_watches(conn);
568 conn_delete_all_transactions(conn);
569
570 send_ack(conn, XS_RESET_WATCHES);
571
572 return 0;
573 }
574
close_xc_handle(void * _handle)575 static int close_xc_handle(void *_handle)
576 {
577 xc_interface_close(*(xc_interface**)_handle);
578 return 0;
579 }
580
close_xgt_handle(void * _handle)581 static int close_xgt_handle(void *_handle)
582 {
583 xengnttab_close(*(xengnttab_handle **)_handle);
584 return 0;
585 }
586
587 /* Returns the implicit path of a connection (only domains have this) */
get_implicit_path(const struct connection * conn)588 const char *get_implicit_path(const struct connection *conn)
589 {
590 if (!conn->domain)
591 return "/local/domain/0";
592 return conn->domain->path;
593 }
594
/*
 * Restore existing connections.
 * The body is empty in this file: no connection state is restored here.
 */
void restore_existing_connections(void)
{
}
599
dom0_init(void)600 static int dom0_init(void)
601 {
602 evtchn_port_t port;
603 struct domain *dom0;
604
605 port = xenbus_evtchn();
606 if (port == -1)
607 return -1;
608
609 dom0 = new_domain(NULL, xenbus_master_domid(), port);
610 if (dom0 == NULL)
611 return -1;
612
613 dom0->interface = xenbus_map();
614 if (dom0->interface == NULL)
615 return -1;
616
617 talloc_steal(dom0->conn, dom0);
618
619 xenevtchn_notify(xce_handle, dom0->port);
620
621 return 0;
622 }
623
domain_init(void)624 void domain_init(void)
625 {
626 int rc;
627
628 xc_handle = talloc(talloc_autofree_context(), xc_interface*);
629 if (!xc_handle)
630 barf_perror("Failed to allocate domain handle");
631
632 *xc_handle = xc_interface_open(0,0,0);
633 if (!*xc_handle)
634 barf_perror("Failed to open connection to hypervisor");
635
636 talloc_set_destructor(xc_handle, close_xc_handle);
637
638 xgt_handle = talloc(talloc_autofree_context(), xengnttab_handle*);
639 if (!xgt_handle)
640 barf_perror("Failed to allocate domain gnttab handle");
641
642 *xgt_handle = xengnttab_open(NULL, 0);
643 if (*xgt_handle == NULL)
644 xprintf("WARNING: Failed to open connection to gnttab\n");
645 else
646 talloc_set_destructor(xgt_handle, close_xgt_handle);
647
648 xce_handle = xenevtchn_open(NULL, 0);
649
650 if (xce_handle == NULL)
651 barf_perror("Failed to open evtchn device");
652
653 if (dom0_init() != 0)
654 barf_perror("Failed to initialize dom0 state");
655
656 if ((rc = xenevtchn_bind_virq(xce_handle, VIRQ_DOM_EXC)) == -1)
657 barf_perror("Failed to bind to domain exception virq port");
658 virq_port = rc;
659 }
660
domain_entry_inc(struct connection * conn,struct node * node)661 void domain_entry_inc(struct connection *conn, struct node *node)
662 {
663 struct domain *d;
664
665 if (!conn)
666 return;
667
668 if (node->perms && node->perms[0].id != conn->id) {
669 if (conn->transaction) {
670 transaction_entry_inc(conn->transaction,
671 node->perms[0].id);
672 } else {
673 d = find_domain_by_domid(node->perms[0].id);
674 if (d)
675 d->nbentry++;
676 }
677 } else if (conn->domain) {
678 if (conn->transaction) {
679 transaction_entry_inc(conn->transaction,
680 conn->domain->domid);
681 } else {
682 conn->domain->nbentry++;
683 }
684 }
685 }
686
domain_entry_dec(struct connection * conn,struct node * node)687 void domain_entry_dec(struct connection *conn, struct node *node)
688 {
689 struct domain *d;
690
691 if (!conn)
692 return;
693
694 if (node->perms && node->perms[0].id != conn->id) {
695 if (conn->transaction) {
696 transaction_entry_dec(conn->transaction,
697 node->perms[0].id);
698 } else {
699 d = find_domain_by_domid(node->perms[0].id);
700 if (d && d->nbentry)
701 d->nbentry--;
702 }
703 } else if (conn->domain && conn->domain->nbentry) {
704 if (conn->transaction) {
705 transaction_entry_dec(conn->transaction,
706 conn->domain->domid);
707 } else {
708 conn->domain->nbentry--;
709 }
710 }
711 }
712
domain_entry_fix(unsigned int domid,int num,bool update)713 int domain_entry_fix(unsigned int domid, int num, bool update)
714 {
715 struct domain *d;
716 int cnt;
717
718 d = find_domain_by_domid(domid);
719 if (!d)
720 return 0;
721
722 cnt = d->nbentry + num;
723 if (cnt < 0)
724 cnt = 0;
725
726 if (update)
727 d->nbentry = cnt;
728
729 return domid_is_unprivileged(domid) ? cnt : 0;
730 }
731
domain_entry(struct connection * conn)732 int domain_entry(struct connection *conn)
733 {
734 return (domain_is_unprivileged(conn))
735 ? conn->domain->nbentry
736 : 0;
737 }
738
domain_watch_inc(struct connection * conn)739 void domain_watch_inc(struct connection *conn)
740 {
741 if (!conn || !conn->domain)
742 return;
743 conn->domain->nbwatch++;
744 }
745
domain_watch_dec(struct connection * conn)746 void domain_watch_dec(struct connection *conn)
747 {
748 if (!conn || !conn->domain)
749 return;
750 if (conn->domain->nbwatch)
751 conn->domain->nbwatch--;
752 }
753
domain_watch(struct connection * conn)754 int domain_watch(struct connection *conn)
755 {
756 return (domain_is_unprivileged(conn))
757 ? conn->domain->nbwatch
758 : 0;
759 }
760
/* Write rate limit settings, each scaled by WRL_FACTOR. */
static wrl_creditt wrl_config_writecost      = WRL_FACTOR;
static wrl_creditt wrl_config_rate           = WRL_RATE   * WRL_FACTOR;
static wrl_creditt wrl_config_dburst         = WRL_DBURST * WRL_FACTOR;
static wrl_creditt wrl_config_gburst         = WRL_GBURST * WRL_FACTOR;
static wrl_creditt wrl_config_newdoms_dburst =
	                         WRL_DBURST * WRL_NEWDOMS * WRL_FACTOR;

/* Transaction counter consulted by the wrl_apply_debit_*() functions;
   it is not modified in this file. */
long wrl_ntransactions;

/* Number of domains: raised in wrl_domain_new(), lowered in
   wrl_domain_destroy(). */
static long wrl_ndomains;
static wrl_creditt wrl_reserve; /* [-wrl_config_newdoms_dburst, +_gburst ] */
static time_t wrl_log_last_warning; /* 0: no previous warning */
773
wrl_gettime_now(struct wrl_timestampt * now_wt)774 void wrl_gettime_now(struct wrl_timestampt *now_wt)
775 {
776 struct timespec now_ts;
777 int r;
778
779 r = clock_gettime(CLOCK_MONOTONIC, &now_ts);
780 if (r)
781 barf_perror("Could not find time (clock_gettime failed)");
782
783 now_wt->sec = now_ts.tv_sec;
784 now_wt->msec = now_ts.tv_nsec / 1000000;
785 }
786
/*
 * Transfers zero or more credit from "debit" to "credit".
 * Moves as much as possible while keeping
 *   *debit >= debit_floor  and  *credit <= credit_ceil.
 * (If either bound is violated already, nothing is moved.)
 *
 * Sufficient conditions to avoid overflow, either of:
 *   |every argument| <= 0x3fffffff
 *   |every argument| <= 1E9
 *   |every argument| <= WRL_CREDIT_MAX
 * (And this condition is preserved.)
 */
static void wrl_xfer_credit(wrl_creditt *debit,  wrl_creditt debit_floor,
			    wrl_creditt *credit, wrl_creditt credit_ceil)
{
	wrl_creditt can_give = *debit - debit_floor;
	wrl_creditt can_take = credit_ceil - *credit;
	wrl_creditt amount = MIN(can_give, can_take);

	if (amount <= 0)
		return;

	*debit -= amount;
	*credit += amount;
}
809
wrl_domain_new(struct domain * domain)810 void wrl_domain_new(struct domain *domain)
811 {
812 domain->wrl_credit = 0;
813 wrl_gettime_now(&domain->wrl_timestamp);
814 wrl_ndomains++;
815 /* Steal up to DBURST from the reserve */
816 wrl_xfer_credit(&wrl_reserve, -wrl_config_newdoms_dburst,
817 &domain->wrl_credit, wrl_config_dburst);
818 }
819
wrl_domain_destroy(struct domain * domain)820 void wrl_domain_destroy(struct domain *domain)
821 {
822 wrl_ndomains--;
823 /*
824 * Don't bother recalculating domain's credit - this just
825 * means we don't give the reserve the ending domain's credit
826 * for time elapsed since last update.
827 */
828 wrl_xfer_credit(&domain->wrl_credit, 0,
829 &wrl_reserve, wrl_config_dburst);
830 }
831
wrl_credit_update(struct domain * domain,struct wrl_timestampt now)832 void wrl_credit_update(struct domain *domain, struct wrl_timestampt now)
833 {
834 /*
835 * We want to calculate
836 * credit += (now - timestamp) * RATE / ndoms;
837 * But we want it to saturate, and to avoid floating point.
838 * To avoid rounding errors from constantly adding small
839 * amounts of credit, we only add credit for whole milliseconds.
840 */
841 long seconds = now.sec - domain->wrl_timestamp.sec;
842 long milliseconds = now.msec - domain->wrl_timestamp.msec;
843 long msec;
844 int64_t denom, num;
845 wrl_creditt surplus;
846
847 seconds = MIN(seconds, 1000*1000); /* arbitrary, prevents overflow */
848 msec = seconds * 1000 + milliseconds;
849
850 if (msec < 0)
851 /* shouldn't happen with CLOCK_MONOTONIC */
852 msec = 0;
853
854 /* 32x32 -> 64 cannot overflow */
855 denom = (int64_t)msec * wrl_config_rate;
856 num = (int64_t)wrl_ndomains * 1000;
857 /* denom / num <= 1E6 * wrl_config_rate, so with
858 reasonable wrl_config_rate, denom / num << 2^64 */
859
860 /* at last! */
861 domain->wrl_credit = MIN( (int64_t)domain->wrl_credit + denom / num,
862 WRL_CREDIT_MAX );
863 /* (maybe briefly violating the DBURST cap on wrl_credit) */
864
865 /* maybe take from the reserve to make us nonnegative */
866 wrl_xfer_credit(&wrl_reserve, 0,
867 &domain->wrl_credit, 0);
868
869 /* return any surplus (over DBURST) to the reserve */
870 surplus = 0;
871 wrl_xfer_credit(&domain->wrl_credit, wrl_config_dburst,
872 &surplus, WRL_CREDIT_MAX);
873 wrl_xfer_credit(&surplus, 0,
874 &wrl_reserve, wrl_config_gburst);
875 /* surplus is now implicitly discarded */
876
877 domain->wrl_timestamp = now;
878
879 trace("wrl: dom %4d %6ld msec %9ld credit %9ld reserve"
880 " %9ld discard\n",
881 domain->domid,
882 msec,
883 (long)domain->wrl_credit, (long)wrl_reserve,
884 (long)surplus);
885 }
886
wrl_check_timeout(struct domain * domain,struct wrl_timestampt now,int * ptimeout)887 void wrl_check_timeout(struct domain *domain,
888 struct wrl_timestampt now,
889 int *ptimeout)
890 {
891 uint64_t num, denom;
892 int wakeup;
893
894 wrl_credit_update(domain, now);
895
896 if (domain->wrl_credit >= 0)
897 /* not blocked */
898 return;
899
900 if (!*ptimeout)
901 /* already decided on immediate wakeup,
902 so no need to calculate our timeout */
903 return;
904
905 /* calculate wakeup = now + -credit / (RATE / ndoms); */
906
907 /* credit cannot go more -ve than one transaction,
908 * so the first multiplication cannot overflow even 32-bit */
909 num = (uint64_t)(-domain->wrl_credit * 1000) * wrl_ndomains;
910 denom = wrl_config_rate;
911
912 wakeup = MIN( num / denom /* uint64_t */, INT_MAX );
913 if (*ptimeout==-1 || wakeup < *ptimeout)
914 *ptimeout = wakeup;
915
916 trace("wrl: domain %u credit=%ld (reserve=%ld) SLEEPING for %d\n",
917 domain->domid,
918 (long)domain->wrl_credit, (long)wrl_reserve,
919 wakeup);
920 }
921
/*
 * Log a write-rate-limit message through syslog at LOG_WARNING, prefixed
 * with "write rate limit: ". The first variadic argument must be a string
 * literal, since it is pasted onto the prefix. The "now" parameter is
 * accepted but not used by the expansion.
 */
#define WRL_LOG(now, ...) \
	(syslog(LOG_WARNING, "write rate limit: " __VA_ARGS__))
924
wrl_apply_debit_actual(struct domain * domain)925 void wrl_apply_debit_actual(struct domain *domain)
926 {
927 struct wrl_timestampt now;
928
929 if (!domain)
930 /* sockets escape the write rate limit */
931 return;
932
933 wrl_gettime_now(&now);
934 wrl_credit_update(domain, now);
935
936 domain->wrl_credit -= wrl_config_writecost;
937 trace("wrl: domain %u credit=%ld (reserve=%ld)\n",
938 domain->domid,
939 (long)domain->wrl_credit, (long)wrl_reserve);
940
941 if (domain->wrl_credit < 0) {
942 if (!domain->wrl_delay_logged) {
943 domain->wrl_delay_logged = true;
944 WRL_LOG(now, "domain %ld is affected",
945 (long)domain->domid);
946 } else if (!wrl_log_last_warning) {
947 WRL_LOG(now, "rate limiting restarts");
948 }
949 wrl_log_last_warning = now.sec;
950 }
951 }
952
wrl_log_periodic(struct wrl_timestampt now)953 void wrl_log_periodic(struct wrl_timestampt now)
954 {
955 if (wrl_log_last_warning &&
956 (now.sec - wrl_log_last_warning) > WRL_LOGEVERY) {
957 WRL_LOG(now, "not in force recently");
958 wrl_log_last_warning = 0;
959 }
960 }
961
wrl_apply_debit_direct(struct connection * conn)962 void wrl_apply_debit_direct(struct connection *conn)
963 {
964 if (!conn)
965 /* some writes are generated internally */
966 return;
967
968 if (conn->transaction)
969 /* these are accounted for when the transaction ends */
970 return;
971
972 if (!wrl_ntransactions)
973 /* we don't conflict with anyone */
974 return;
975
976 wrl_apply_debit_actual(conn->domain);
977 }
978
wrl_apply_debit_trans_commit(struct connection * conn)979 void wrl_apply_debit_trans_commit(struct connection *conn)
980 {
981 if (wrl_ntransactions <= 1)
982 /* our own transaction appears in the counter */
983 return;
984
985 wrl_apply_debit_actual(conn->domain);
986 }
987
988 /*
989 * Local variables:
990 * c-file-style: "linux"
991 * indent-tabs-mode: t
992 * c-indent-level: 8
993 * c-basic-offset: 8
994 * tab-width: 8
995 * End:
996 */
997