1 /*
2     Domain communications for Xen Store Daemon.
3     Copyright (C) 2005 Rusty Russell IBM Corporation
4 
5     This program is free software; you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation; either version 2 of the License, or
8     (at your option) any later version.
9 
10     This program is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14 
15     You should have received a copy of the GNU General Public License
16     along with this program; If not, see <http://www.gnu.org/licenses/>.
17 */
18 
#include <errno.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdarg.h>
#include <time.h>
#include <syslog.h>
26 
27 #include "utils.h"
28 #include "talloc.h"
29 #include "xenstored_core.h"
30 #include "xenstored_domain.h"
31 #include "xenstored_transaction.h"
32 #include "xenstored_watch.h"
33 
34 #include <xenevtchn.h>
35 #include <xenctrl.h>
36 #include <xen/grant_table.h>
37 
/* Hypervisor control handle; allocated and opened in domain_init(). */
static xc_interface **xc_handle;
/* Grant table handle; *xgt_handle is NULL when xengnttab_open() failed. */
xengnttab_handle **xgt_handle;
/* Local port bound to VIRQ_DOM_EXC by domain_init(). */
static evtchn_port_t virq_port;

/* Event channel handle, opened by domain_init(). */
xenevtchn_handle *xce_handle = NULL;
43 
/* Per-domain state kept by the daemon; entries are linked on "domains". */
struct domain
{
	/* Linkage in the global "domains" list. */
	struct list_head list;

	/* The id of this domain */
	unsigned int domid;

	/* Local event channel port (0 when no port is bound) */
	evtchn_port_t port;

	/* The remote end of the event channel, used only to validate
	   repeated domain introductions. */
	evtchn_port_t remote_port;

	/* The mfn given when the domain was introduced; compared against
	   the mfn of repeated domain introductions. */
	unsigned long mfn;

	/* Domain path in store. */
	char *path;

	/* Shared page. */
	struct xenstore_domain_interface *interface;

	/* The connection associated with this. */
	struct connection *conn;

	/* Have we noticed that this domain is shutdown? */
	int shutdown;

	/* number of entries from this domain in the store */
	int nbentry;

	/* number of watches for this domain */
	int nbwatch;

	/* write rate limit */
	wrl_creditt wrl_credit; /* [ -wrl_config_writecost, +_dburst ] */
	/* time of the last wrl_credit_update() for this domain */
	struct wrl_timestampt wrl_timestamp;
	/* set once the "domain ... is affected" warning has been logged */
	bool wrl_delay_logged;
};
85 
/* All known domains; entries are added by new_domain() and removed by
   destroy_domain(). */
static LIST_HEAD(domains);
87 
/*
 * Sanity-check a consumer/producer index pair: the producer may be ahead
 * of the consumer by at most XENSTORE_RING_SIZE.
 */
static bool check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod)
{
	XENSTORE_RING_IDX in_flight = prod - cons;

	return in_flight <= XENSTORE_RING_SIZE;
}
92 
/*
 * Locate the next contiguous writable region of a ring buffer.
 *
 * Returns a pointer into buf at the masked producer index and stores in
 * *len the smaller of the distance to the end of the buffer and the
 * amount of unused ring space.
 */
static void *get_output_chunk(XENSTORE_RING_IDX cons,
			      XENSTORE_RING_IDX prod,
			      char *buf, uint32_t *len)
{
	uint32_t offset = MASK_XENSTORE_IDX(prod);
	uint32_t until_wrap = XENSTORE_RING_SIZE - offset;
	uint32_t unused = XENSTORE_RING_SIZE - (prod - cons);

	*len = (unused < until_wrap) ? unused : until_wrap;
	return buf + offset;
}
102 
/*
 * Locate the next contiguous readable region of a ring buffer.
 *
 * Returns a pointer into buf at the masked consumer index and stores in
 * *len the smaller of the distance to the end of the buffer and the
 * amount of data pending between cons and prod.
 */
static const void *get_input_chunk(XENSTORE_RING_IDX cons,
				   XENSTORE_RING_IDX prod,
				   const char *buf, uint32_t *len)
{
	uint32_t offset = MASK_XENSTORE_IDX(cons);
	uint32_t until_wrap = XENSTORE_RING_SIZE - offset;
	uint32_t pending = prod - cons;

	*len = (pending < until_wrap) ? pending : until_wrap;
	return buf + offset;
}
112 
writechn(struct connection * conn,const void * data,unsigned int len)113 static int writechn(struct connection *conn,
114 		    const void *data, unsigned int len)
115 {
116 	uint32_t avail;
117 	void *dest;
118 	struct xenstore_domain_interface *intf = conn->domain->interface;
119 	XENSTORE_RING_IDX cons, prod;
120 
121 	/* Must read indexes once, and before anything else, and verified. */
122 	cons = intf->rsp_cons;
123 	prod = intf->rsp_prod;
124 	xen_mb();
125 
126 	if (!check_indexes(cons, prod)) {
127 		errno = EIO;
128 		return -1;
129 	}
130 
131 	dest = get_output_chunk(cons, prod, intf->rsp, &avail);
132 	if (avail < len)
133 		len = avail;
134 
135 	memcpy(dest, data, len);
136 	xen_mb();
137 	intf->rsp_prod += len;
138 
139 	xenevtchn_notify(xce_handle, conn->domain->port);
140 
141 	return len;
142 }
143 
readchn(struct connection * conn,void * data,unsigned int len)144 static int readchn(struct connection *conn, void *data, unsigned int len)
145 {
146 	uint32_t avail;
147 	const void *src;
148 	struct xenstore_domain_interface *intf = conn->domain->interface;
149 	XENSTORE_RING_IDX cons, prod;
150 
151 	/* Must read indexes once, and before anything else, and verified. */
152 	cons = intf->req_cons;
153 	prod = intf->req_prod;
154 	xen_mb();
155 
156 	if (!check_indexes(cons, prod)) {
157 		errno = EIO;
158 		return -1;
159 	}
160 
161 	src = get_input_chunk(cons, prod, intf->req, &avail);
162 	if (avail < len)
163 		len = avail;
164 
165 	memcpy(data, src, len);
166 	xen_mb();
167 	intf->req_cons += len;
168 
169 	xenevtchn_notify(xce_handle, conn->domain->port);
170 
171 	return len;
172 }
173 
/*
 * Map the xenstore shared page of domain domid.
 *
 * Uses the reserved xenstore grant reference when a grant table handle is
 * available (mfn is ignored then); otherwise maps the frame mfn through
 * the hypervisor control interface.  Returns whatever the mapping call
 * returns.
 */
static void *map_interface(domid_t domid, unsigned long mfn)
{
	if (*xgt_handle == NULL)
		return xc_map_foreign_range(*xc_handle, domid,
			XC_PAGE_SIZE, PROT_READ|PROT_WRITE, mfn);

	/* this is the preferred method */
	return xengnttab_map_grant_ref(*xgt_handle, domid,
		GNTTAB_RESERVED_XENSTORE, PROT_READ|PROT_WRITE);
}
185 
unmap_interface(void * interface)186 static void unmap_interface(void *interface)
187 {
188 	if (*xgt_handle != NULL)
189 		xengnttab_unmap(*xgt_handle, interface, 1);
190 	else
191 		munmap(interface, XC_PAGE_SIZE);
192 }
193 
destroy_domain(void * _domain)194 static int destroy_domain(void *_domain)
195 {
196 	struct domain *domain = _domain;
197 
198 	list_del(&domain->list);
199 
200 	if (domain->port) {
201 		if (xenevtchn_unbind(xce_handle, domain->port) == -1)
202 			eprintf("> Unbinding port %i failed!\n", domain->port);
203 	}
204 
205 	if (domain->interface) {
206 		/* Domain 0 was mapped by dom0_init, so it must be unmapped
207 		   using munmap() and not the grant unmap call. */
208 		if (domain->domid == 0)
209 			unmap_xenbus(domain->interface);
210 		else
211 			unmap_interface(domain->interface);
212 	}
213 
214 	fire_watches(NULL, domain, "@releaseDomain", false);
215 
216 	wrl_domain_destroy(domain);
217 
218 	return 0;
219 }
220 
/*
 * Scan all known domains and react to state changes reported by the
 * hypervisor:
 *  - a domain newly seen as crashed or shut down is flagged and causes a
 *    single "@releaseDomain" watch event at the end of the scan;
 *  - a domain that is dying, or for which no matching info is returned,
 *    has its connection released.
 */
static void domain_cleanup(void)
{
	xc_dominfo_t dominfo;
	struct domain *domain;
	int notify = 0;

 again:
	list_for_each_entry(domain, &domains, list) {
		/* Only trust the answer if it is about this very domid. */
		if (xc_domain_getinfo(*xc_handle, domain->domid, 1,
				      &dominfo) == 1 &&
		    dominfo.domid == domain->domid) {
			if ((dominfo.crashed || dominfo.shutdown)
			    && !domain->shutdown) {
				domain->shutdown = 1;
				notify = 1;
			}
			/* Still present and not dying: keep it. */
			if (!dominfo.dying)
				continue;
		}
		if (domain->conn) {
			talloc_unlink(talloc_autofree_context(), domain->conn);
			domain->conn = NULL;
			notify = 0; /* destroy_domain() fires the watch */
			/* Restart the scan; presumably the list may have
			   changed under us - TODO confirm. */
			goto again;
		}
	}

	if (notify)
		fire_watches(NULL, NULL, "@releaseDomain", false);
}
251 
252 /* We scan all domains rather than use the information given here. */
handle_event(void)253 void handle_event(void)
254 {
255 	evtchn_port_t port;
256 
257 	if ((port = xenevtchn_pending(xce_handle)) == -1)
258 		barf_perror("Failed to read from event fd");
259 
260 	if (port == virq_port)
261 		domain_cleanup();
262 
263 	if (xenevtchn_unmask(xce_handle, port) == -1)
264 		barf_perror("Failed to write to event fd");
265 }
266 
domain_can_read(struct connection * conn)267 bool domain_can_read(struct connection *conn)
268 {
269 	struct xenstore_domain_interface *intf = conn->domain->interface;
270 
271 	if (domain_is_unprivileged(conn) && conn->domain->wrl_credit < 0)
272 		return false;
273 	return (intf->req_cons != intf->req_prod);
274 }
275 
domid_is_unprivileged(unsigned int domid)276 static bool domid_is_unprivileged(unsigned int domid)
277 {
278 	return domid != 0 && domid != priv_domid;
279 }
280 
domain_is_unprivileged(struct connection * conn)281 bool domain_is_unprivileged(struct connection *conn)
282 {
283 	return conn && conn->domain &&
284 	       domid_is_unprivileged(conn->domain->domid);
285 }
286 
domain_can_write(struct connection * conn)287 bool domain_can_write(struct connection *conn)
288 {
289 	struct xenstore_domain_interface *intf = conn->domain->interface;
290 	return ((intf->rsp_prod - intf->rsp_cons) != XENSTORE_RING_SIZE);
291 }
292 
/*
 * Build the store path "/local/domain/<domid>" as a talloc string owned
 * by context.  Returns the result of talloc_asprintf().
 */
static char *talloc_domain_path(void *context, unsigned int domid)
{
	char *path = talloc_asprintf(context, "/local/domain/%u", domid);

	return path;
}
297 
new_domain(void * context,unsigned int domid,int port)298 static struct domain *new_domain(void *context, unsigned int domid,
299 				 int port)
300 {
301 	struct domain *domain;
302 	int rc;
303 
304 	domain = talloc(context, struct domain);
305 	if (!domain)
306 		return NULL;
307 
308 	domain->port = 0;
309 	domain->shutdown = 0;
310 	domain->domid = domid;
311 	domain->path = talloc_domain_path(domain, domid);
312 	if (!domain->path)
313 		return NULL;
314 
315 	wrl_domain_new(domain);
316 
317 	list_add(&domain->list, &domains);
318 	talloc_set_destructor(domain, destroy_domain);
319 
320 	/* Tell kernel we're interested in this event. */
321 	rc = xenevtchn_bind_interdomain(xce_handle, domid, port);
322 	if (rc == -1)
323 	    return NULL;
324 	domain->port = rc;
325 
326 	domain->conn = new_connection(writechn, readchn);
327 	if (!domain->conn)
328 		return NULL;
329 
330 	domain->conn->domain = domain;
331 	domain->conn->id = domid;
332 
333 	domain->remote_port = port;
334 	domain->nbentry = 0;
335 	domain->nbwatch = 0;
336 
337 	return domain;
338 }
339 
340 
find_domain_by_domid(unsigned int domid)341 static struct domain *find_domain_by_domid(unsigned int domid)
342 {
343 	struct domain *i;
344 
345 	list_for_each_entry(i, &domains, list) {
346 		if (i->domid == domid)
347 			return i;
348 	}
349 	return NULL;
350 }
351 
domain_conn_reset(struct domain * domain)352 static void domain_conn_reset(struct domain *domain)
353 {
354 	struct connection *conn = domain->conn;
355 	struct buffered_data *out;
356 
357 	conn_delete_all_watches(conn);
358 	conn_delete_all_transactions(conn);
359 
360 	while ((out = list_top(&conn->out_list, struct buffered_data, list))) {
361 		list_del(&out->list);
362 		talloc_free(out);
363 	}
364 
365 	talloc_free(conn->in);
366 
367 	domain->interface->req_cons = domain->interface->req_prod = 0;
368 	domain->interface->rsp_cons = domain->interface->rsp_prod = 0;
369 }
370 
371 /* domid, mfn, evtchn, path */
do_introduce(struct connection * conn,struct buffered_data * in)372 int do_introduce(struct connection *conn, struct buffered_data *in)
373 {
374 	struct domain *domain;
375 	char *vec[3];
376 	unsigned int domid;
377 	unsigned long mfn;
378 	evtchn_port_t port;
379 	int rc;
380 	struct xenstore_domain_interface *interface;
381 
382 	if (get_strings(in, vec, ARRAY_SIZE(vec)) < ARRAY_SIZE(vec))
383 		return EINVAL;
384 
385 	if (domain_is_unprivileged(conn) || !conn->can_write)
386 		return EACCES;
387 
388 	domid = atoi(vec[0]);
389 	mfn = atol(vec[1]);
390 	port = atoi(vec[2]);
391 
392 	/* Sanity check args. */
393 	if (port <= 0)
394 		return EINVAL;
395 
396 	domain = find_domain_by_domid(domid);
397 
398 	if (domain == NULL) {
399 		interface = map_interface(domid, mfn);
400 		if (!interface)
401 			return errno;
402 		/* Hang domain off "in" until we're finished. */
403 		domain = new_domain(in, domid, port);
404 		if (!domain) {
405 			rc = errno;
406 			unmap_interface(interface);
407 			return rc;
408 		}
409 		domain->interface = interface;
410 		domain->mfn = mfn;
411 
412 		/* Now domain belongs to its connection. */
413 		talloc_steal(domain->conn, domain);
414 
415 		fire_watches(NULL, in, "@introduceDomain", false);
416 	} else if ((domain->mfn == mfn) && (domain->conn != conn)) {
417 		/* Use XS_INTRODUCE for recreating the xenbus event-channel. */
418 		if (domain->port)
419 			xenevtchn_unbind(xce_handle, domain->port);
420 		rc = xenevtchn_bind_interdomain(xce_handle, domid, port);
421 		domain->port = (rc == -1) ? 0 : rc;
422 		domain->remote_port = port;
423 	} else
424 		return EINVAL;
425 
426 	domain_conn_reset(domain);
427 
428 	send_ack(conn, XS_INTRODUCE);
429 
430 	return 0;
431 }
432 
find_connected_domain(unsigned int domid)433 static struct domain *find_connected_domain(unsigned int domid)
434 {
435 	struct domain *domain;
436 
437 	domain = find_domain_by_domid(domid);
438 	if (!domain)
439 		return ERR_PTR(-ENOENT);
440 	if (!domain->conn)
441 		return ERR_PTR(-EINVAL);
442 	return domain;
443 }
444 
do_set_target(struct connection * conn,struct buffered_data * in)445 int do_set_target(struct connection *conn, struct buffered_data *in)
446 {
447 	char *vec[2];
448 	unsigned int domid, tdomid;
449         struct domain *domain, *tdomain;
450 	if (get_strings(in, vec, ARRAY_SIZE(vec)) < ARRAY_SIZE(vec))
451 		return EINVAL;
452 
453 	if (domain_is_unprivileged(conn) || !conn->can_write)
454 		return EACCES;
455 
456 	domid = atoi(vec[0]);
457 	tdomid = atoi(vec[1]);
458 
459         domain = find_connected_domain(domid);
460 	if (IS_ERR(domain))
461 		return -PTR_ERR(domain);
462 
463         tdomain = find_connected_domain(tdomid);
464 	if (IS_ERR(tdomain))
465 		return -PTR_ERR(tdomain);
466 
467         talloc_reference(domain->conn, tdomain->conn);
468         domain->conn->target = tdomain->conn;
469 
470 	send_ack(conn, XS_SET_TARGET);
471 
472 	return 0;
473 }
474 
/*
 * Resolve the single domid argument of a request to a connected domain.
 *
 * Returns an error pointer on failure: -EINVAL when the argument is
 * missing or parses to 0, -EACCES when the caller is unprivileged,
 * otherwise whatever find_connected_domain() reports.
 */
static struct domain *onearg_domain(struct connection *conn,
				    struct buffered_data *in)
{
	const char *arg = onearg(in);
	unsigned int id;

	if (arg == NULL)
		return ERR_PTR(-EINVAL);

	id = atoi(arg);
	if (id == 0)
		return ERR_PTR(-EINVAL);

	if (domain_is_unprivileged(conn))
		return ERR_PTR(-EACCES);

	return find_connected_domain(id);
}
493 
494 /* domid */
do_release(struct connection * conn,struct buffered_data * in)495 int do_release(struct connection *conn, struct buffered_data *in)
496 {
497 	struct domain *domain;
498 
499 	domain = onearg_domain(conn, in);
500 	if (IS_ERR(domain))
501 		return -PTR_ERR(domain);
502 
503 	talloc_free(domain->conn);
504 
505 	send_ack(conn, XS_RELEASE);
506 
507 	return 0;
508 }
509 
do_resume(struct connection * conn,struct buffered_data * in)510 int do_resume(struct connection *conn, struct buffered_data *in)
511 {
512 	struct domain *domain;
513 
514 	domain = onearg_domain(conn, in);
515 	if (IS_ERR(domain))
516 		return -PTR_ERR(domain);
517 
518 	domain->shutdown = 0;
519 
520 	send_ack(conn, XS_RESUME);
521 
522 	return 0;
523 }
524 
do_get_domain_path(struct connection * conn,struct buffered_data * in)525 int do_get_domain_path(struct connection *conn, struct buffered_data *in)
526 {
527 	char *path;
528 	const char *domid_str = onearg(in);
529 
530 	if (!domid_str)
531 		return EINVAL;
532 
533 	path = talloc_domain_path(conn, atoi(domid_str));
534 	if (!path)
535 		return errno;
536 
537 	send_reply(conn, XS_GET_DOMAIN_PATH, path, strlen(path) + 1);
538 
539 	talloc_free(path);
540 
541 	return 0;
542 }
543 
do_is_domain_introduced(struct connection * conn,struct buffered_data * in)544 int do_is_domain_introduced(struct connection *conn, struct buffered_data *in)
545 {
546 	int result;
547 	unsigned int domid;
548 	const char *domid_str = onearg(in);
549 
550 	if (!domid_str)
551 		return EINVAL;
552 
553 	domid = atoi(domid_str);
554 	if (domid == DOMID_SELF)
555 		result = 1;
556 	else
557 		result = (find_domain_by_domid(domid) != NULL);
558 
559 	send_reply(conn, XS_IS_DOMAIN_INTRODUCED, result ? "T" : "F", 2);
560 
561 	return 0;
562 }
563 
564 /* Allow guest to reset all watches */
do_reset_watches(struct connection * conn,struct buffered_data * in)565 int do_reset_watches(struct connection *conn, struct buffered_data *in)
566 {
567 	conn_delete_all_watches(conn);
568 	conn_delete_all_transactions(conn);
569 
570 	send_ack(conn, XS_RESET_WATCHES);
571 
572 	return 0;
573 }
574 
close_xc_handle(void * _handle)575 static int close_xc_handle(void *_handle)
576 {
577 	xc_interface_close(*(xc_interface**)_handle);
578 	return 0;
579 }
580 
close_xgt_handle(void * _handle)581 static int close_xgt_handle(void *_handle)
582 {
583 	xengnttab_close(*(xengnttab_handle **)_handle);
584 	return 0;
585 }
586 
587 /* Returns the implicit path of a connection (only domains have this) */
get_implicit_path(const struct connection * conn)588 const char *get_implicit_path(const struct connection *conn)
589 {
590 	if (!conn->domain)
591 		return "/local/domain/0";
592 	return conn->domain->path;
593 }
594 
/* Restore existing connections.  Intentionally a no-op in this file. */
void restore_existing_connections(void)
{
}
599 
dom0_init(void)600 static int dom0_init(void)
601 {
602 	evtchn_port_t port;
603 	struct domain *dom0;
604 
605 	port = xenbus_evtchn();
606 	if (port == -1)
607 		return -1;
608 
609 	dom0 = new_domain(NULL, xenbus_master_domid(), port);
610 	if (dom0 == NULL)
611 		return -1;
612 
613 	dom0->interface = xenbus_map();
614 	if (dom0->interface == NULL)
615 		return -1;
616 
617 	talloc_steal(dom0->conn, dom0);
618 
619 	xenevtchn_notify(xce_handle, dom0->port);
620 
621 	return 0;
622 }
623 
domain_init(void)624 void domain_init(void)
625 {
626 	int rc;
627 
628 	xc_handle = talloc(talloc_autofree_context(), xc_interface*);
629 	if (!xc_handle)
630 		barf_perror("Failed to allocate domain handle");
631 
632 	*xc_handle = xc_interface_open(0,0,0);
633 	if (!*xc_handle)
634 		barf_perror("Failed to open connection to hypervisor");
635 
636 	talloc_set_destructor(xc_handle, close_xc_handle);
637 
638 	xgt_handle = talloc(talloc_autofree_context(), xengnttab_handle*);
639 	if (!xgt_handle)
640 		barf_perror("Failed to allocate domain gnttab handle");
641 
642 	*xgt_handle = xengnttab_open(NULL, 0);
643 	if (*xgt_handle == NULL)
644 		xprintf("WARNING: Failed to open connection to gnttab\n");
645 	else
646 		talloc_set_destructor(xgt_handle, close_xgt_handle);
647 
648 	xce_handle = xenevtchn_open(NULL, 0);
649 
650 	if (xce_handle == NULL)
651 		barf_perror("Failed to open evtchn device");
652 
653 	if (dom0_init() != 0)
654 		barf_perror("Failed to initialize dom0 state");
655 
656 	if ((rc = xenevtchn_bind_virq(xce_handle, VIRQ_DOM_EXC)) == -1)
657 		barf_perror("Failed to bind to domain exception virq port");
658 	virq_port = rc;
659 }
660 
domain_entry_inc(struct connection * conn,struct node * node)661 void domain_entry_inc(struct connection *conn, struct node *node)
662 {
663 	struct domain *d;
664 
665 	if (!conn)
666 		return;
667 
668 	if (node->perms && node->perms[0].id != conn->id) {
669 		if (conn->transaction) {
670 			transaction_entry_inc(conn->transaction,
671 				node->perms[0].id);
672 		} else {
673 			d = find_domain_by_domid(node->perms[0].id);
674 			if (d)
675 				d->nbentry++;
676 		}
677 	} else if (conn->domain) {
678 		if (conn->transaction) {
679 			transaction_entry_inc(conn->transaction,
680 				conn->domain->domid);
681  		} else {
682  			conn->domain->nbentry++;
683 		}
684 	}
685 }
686 
domain_entry_dec(struct connection * conn,struct node * node)687 void domain_entry_dec(struct connection *conn, struct node *node)
688 {
689 	struct domain *d;
690 
691 	if (!conn)
692 		return;
693 
694 	if (node->perms && node->perms[0].id != conn->id) {
695 		if (conn->transaction) {
696 			transaction_entry_dec(conn->transaction,
697 				node->perms[0].id);
698 		} else {
699 			d = find_domain_by_domid(node->perms[0].id);
700 			if (d && d->nbentry)
701 				d->nbentry--;
702 		}
703 	} else if (conn->domain && conn->domain->nbentry) {
704 		if (conn->transaction) {
705 			transaction_entry_dec(conn->transaction,
706 				conn->domain->domid);
707 		} else {
708 			conn->domain->nbentry--;
709 		}
710 	}
711 }
712 
domain_entry_fix(unsigned int domid,int num,bool update)713 int domain_entry_fix(unsigned int domid, int num, bool update)
714 {
715 	struct domain *d;
716 	int cnt;
717 
718 	d = find_domain_by_domid(domid);
719 	if (!d)
720 		return 0;
721 
722 	cnt = d->nbentry + num;
723 	if (cnt < 0)
724 		cnt = 0;
725 
726 	if (update)
727 		d->nbentry = cnt;
728 
729 	return domid_is_unprivileged(domid) ? cnt : 0;
730 }
731 
domain_entry(struct connection * conn)732 int domain_entry(struct connection *conn)
733 {
734 	return (domain_is_unprivileged(conn))
735 		? conn->domain->nbentry
736 		: 0;
737 }
738 
domain_watch_inc(struct connection * conn)739 void domain_watch_inc(struct connection *conn)
740 {
741 	if (!conn || !conn->domain)
742 		return;
743 	conn->domain->nbwatch++;
744 }
745 
domain_watch_dec(struct connection * conn)746 void domain_watch_dec(struct connection *conn)
747 {
748 	if (!conn || !conn->domain)
749 		return;
750 	if (conn->domain->nbwatch)
751 		conn->domain->nbwatch--;
752 }
753 
domain_watch(struct connection * conn)754 int domain_watch(struct connection *conn)
755 {
756 	return (domain_is_unprivileged(conn))
757 		? conn->domain->nbwatch
758 		: 0;
759 }
760 
/* Write rate limit configuration, all scaled by WRL_FACTOR. */
/* credit charged per accounted write (see wrl_apply_debit_actual) */
static wrl_creditt wrl_config_writecost      = WRL_FACTOR;
/* credit replenishment rate used by wrl_credit_update() */
static wrl_creditt wrl_config_rate           = WRL_RATE   * WRL_FACTOR;
/* cap on a single domain's credit */
static wrl_creditt wrl_config_dburst         = WRL_DBURST * WRL_FACTOR;
/* cap on the reserve when surplus is returned to it */
static wrl_creditt wrl_config_gburst         = WRL_GBURST * WRL_FACTOR;
/* how far below zero new domains may drive the reserve */
static wrl_creditt wrl_config_newdoms_dburst =
	                         WRL_DBURST * WRL_NEWDOMS * WRL_FACTOR;

/* Read below to decide whether a write competes with any transaction;
   not modified in this file. */
long wrl_ntransactions;

/* Number of domains currently accounted (see wrl_domain_new/_destroy). */
static long wrl_ndomains;
static wrl_creditt wrl_reserve; /* [-wrl_config_newdoms_dburst, +_gburst ] */
static time_t wrl_log_last_warning; /* 0: no previous warning */
773 
wrl_gettime_now(struct wrl_timestampt * now_wt)774 void wrl_gettime_now(struct wrl_timestampt *now_wt)
775 {
776 	struct timespec now_ts;
777 	int r;
778 
779 	r = clock_gettime(CLOCK_MONOTONIC, &now_ts);
780 	if (r)
781 		barf_perror("Could not find time (clock_gettime failed)");
782 
783 	now_wt->sec = now_ts.tv_sec;
784 	now_wt->msec = now_ts.tv_nsec / 1000000;
785 }
786 
/*
 * Move zero or more credit from *debit to *credit.
 *
 * As much as possible is moved while keeping *debit >= debit_floor and
 * *credit <= credit_ceil; if either bound is already violated nothing is
 * moved.
 *
 * Overflow is avoided as long as one of these holds (and the condition
 * is preserved by the transfer):
 *  |every argument| <= 0x3fffffff
 *  |every argument| <= 1E9
 *  |every argument| <= WRL_CREDIT_MAX
 */
static void wrl_xfer_credit(wrl_creditt *debit,  wrl_creditt debit_floor,
			    wrl_creditt *credit, wrl_creditt credit_ceil)
{
	wrl_creditt can_give = *debit - debit_floor;
	wrl_creditt can_take = credit_ceil - *credit;
	wrl_creditt xfer = MIN(can_give, can_take);

	if (xfer <= 0)
		return;

	*debit -= xfer;
	*credit += xfer;
}
809 
wrl_domain_new(struct domain * domain)810 void wrl_domain_new(struct domain *domain)
811 {
812 	domain->wrl_credit = 0;
813 	wrl_gettime_now(&domain->wrl_timestamp);
814 	wrl_ndomains++;
815 	/* Steal up to DBURST from the reserve */
816 	wrl_xfer_credit(&wrl_reserve, -wrl_config_newdoms_dburst,
817 			&domain->wrl_credit, wrl_config_dburst);
818 }
819 
wrl_domain_destroy(struct domain * domain)820 void wrl_domain_destroy(struct domain *domain)
821 {
822 	wrl_ndomains--;
823 	/*
824 	 * Don't bother recalculating domain's credit - this just
825 	 * means we don't give the reserve the ending domain's credit
826 	 * for time elapsed since last update.
827 	 */
828 	wrl_xfer_credit(&domain->wrl_credit, 0,
829 			&wrl_reserve, wrl_config_dburst);
830 }
831 
wrl_credit_update(struct domain * domain,struct wrl_timestampt now)832 void wrl_credit_update(struct domain *domain, struct wrl_timestampt now)
833 {
834 	/*
835 	 * We want to calculate
836 	 *    credit += (now - timestamp) * RATE / ndoms;
837 	 * But we want it to saturate, and to avoid floating point.
838 	 * To avoid rounding errors from constantly adding small
839 	 * amounts of credit, we only add credit for whole milliseconds.
840 	 */
841 	long seconds      = now.sec -  domain->wrl_timestamp.sec;
842 	long milliseconds = now.msec - domain->wrl_timestamp.msec;
843 	long msec;
844 	int64_t denom, num;
845 	wrl_creditt surplus;
846 
847 	seconds = MIN(seconds, 1000*1000); /* arbitrary, prevents overflow */
848 	msec = seconds * 1000 + milliseconds;
849 
850 	if (msec < 0)
851                 /* shouldn't happen with CLOCK_MONOTONIC */
852 		msec = 0;
853 
854 	/* 32x32 -> 64 cannot overflow */
855 	denom = (int64_t)msec * wrl_config_rate;
856 	num  =  (int64_t)wrl_ndomains * 1000;
857 	/* denom / num <= 1E6 * wrl_config_rate, so with
858 	   reasonable wrl_config_rate, denom / num << 2^64 */
859 
860 	/* at last! */
861 	domain->wrl_credit = MIN( (int64_t)domain->wrl_credit + denom / num,
862 				  WRL_CREDIT_MAX );
863 	/* (maybe briefly violating the DBURST cap on wrl_credit) */
864 
865 	/* maybe take from the reserve to make us nonnegative */
866 	wrl_xfer_credit(&wrl_reserve,        0,
867 			&domain->wrl_credit, 0);
868 
869 	/* return any surplus (over DBURST) to the reserve */
870 	surplus = 0;
871 	wrl_xfer_credit(&domain->wrl_credit, wrl_config_dburst,
872 			&surplus,            WRL_CREDIT_MAX);
873 	wrl_xfer_credit(&surplus,     0,
874 			&wrl_reserve, wrl_config_gburst);
875 	/* surplus is now implicitly discarded */
876 
877 	domain->wrl_timestamp = now;
878 
879 	trace("wrl: dom %4d %6ld  msec  %9ld credit   %9ld reserve"
880 	      "  %9ld discard\n",
881 	      domain->domid,
882 	      msec,
883 	      (long)domain->wrl_credit, (long)wrl_reserve,
884 	      (long)surplus);
885 }
886 
wrl_check_timeout(struct domain * domain,struct wrl_timestampt now,int * ptimeout)887 void wrl_check_timeout(struct domain *domain,
888 		       struct wrl_timestampt now,
889 		       int *ptimeout)
890 {
891 	uint64_t num, denom;
892 	int wakeup;
893 
894 	wrl_credit_update(domain, now);
895 
896 	if (domain->wrl_credit >= 0)
897 		/* not blocked */
898 		return;
899 
900 	if (!*ptimeout)
901 		/* already decided on immediate wakeup,
902 		   so no need to calculate our timeout */
903 		return;
904 
905 	/* calculate  wakeup = now + -credit / (RATE / ndoms); */
906 
907 	/* credit cannot go more -ve than one transaction,
908 	 * so the first multiplication cannot overflow even 32-bit */
909 	num   = (uint64_t)(-domain->wrl_credit * 1000) * wrl_ndomains;
910 	denom = wrl_config_rate;
911 
912 	wakeup = MIN( num / denom /* uint64_t */, INT_MAX );
913 	if (*ptimeout==-1 || wakeup < *ptimeout)
914 		*ptimeout = wakeup;
915 
916 	trace("wrl: domain %u credit=%ld (reserve=%ld) SLEEPING for %d\n",
917 	      domain->domid,
918 	      (long)domain->wrl_credit, (long)wrl_reserve,
919 	      wakeup);
920 }
921 
/* Log a write-rate-limit warning through syslog with a fixed prefix.
   The "now" argument is accepted but not used by the expansion. */
#define WRL_LOG(now, ...) \
	(syslog(LOG_WARNING, "write rate limit: " __VA_ARGS__))
924 
wrl_apply_debit_actual(struct domain * domain)925 void wrl_apply_debit_actual(struct domain *domain)
926 {
927 	struct wrl_timestampt now;
928 
929 	if (!domain)
930 		/* sockets escape the write rate limit */
931 		return;
932 
933 	wrl_gettime_now(&now);
934 	wrl_credit_update(domain, now);
935 
936 	domain->wrl_credit -= wrl_config_writecost;
937 	trace("wrl: domain %u credit=%ld (reserve=%ld)\n",
938 	      domain->domid,
939 	      (long)domain->wrl_credit, (long)wrl_reserve);
940 
941 	if (domain->wrl_credit < 0) {
942 		if (!domain->wrl_delay_logged) {
943 			domain->wrl_delay_logged = true;
944 			WRL_LOG(now, "domain %ld is affected",
945 				(long)domain->domid);
946 		} else if (!wrl_log_last_warning) {
947 			WRL_LOG(now, "rate limiting restarts");
948 		}
949 		wrl_log_last_warning = now.sec;
950 	}
951 }
952 
wrl_log_periodic(struct wrl_timestampt now)953 void wrl_log_periodic(struct wrl_timestampt now)
954 {
955 	if (wrl_log_last_warning &&
956 	    (now.sec - wrl_log_last_warning) > WRL_LOGEVERY) {
957 		WRL_LOG(now, "not in force recently");
958 		wrl_log_last_warning = 0;
959 	}
960 }
961 
wrl_apply_debit_direct(struct connection * conn)962 void wrl_apply_debit_direct(struct connection *conn)
963 {
964 	if (!conn)
965 		/* some writes are generated internally */
966 		return;
967 
968 	if (conn->transaction)
969 		/* these are accounted for when the transaction ends */
970 		return;
971 
972 	if (!wrl_ntransactions)
973 		/* we don't conflict with anyone */
974 		return;
975 
976 	wrl_apply_debit_actual(conn->domain);
977 }
978 
wrl_apply_debit_trans_commit(struct connection * conn)979 void wrl_apply_debit_trans_commit(struct connection *conn)
980 {
981 	if (wrl_ntransactions <= 1)
982 		/* our own transaction appears in the counter */
983 		return;
984 
985 	wrl_apply_debit_actual(conn->domain);
986 }
987 
988 /*
989  * Local variables:
990  *  c-file-style: "linux"
991  *  indent-tabs-mode: t
992  *  c-indent-level: 8
993  *  c-basic-offset: 8
994  *  tab-width: 8
995  * End:
996  */
997