1 /*
2     Transaction code for Xen Store Daemon.
3     Copyright (C) 2005 Rusty Russell IBM Corporation
4 
5     This program is free software; you can redistribute it and/or modify
6     it under the terms of the GNU General Public License as published by
7     the Free Software Foundation; either version 2 of the License, or
8     (at your option) any later version.
9 
10     This program is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU General Public License for more details.
14 
15     You should have received a copy of the GNU General Public License
16     along with this program; If not, see <http://www.gnu.org/licenses/>.
17 */
18 
19 #include <inttypes.h>
20 #include <stdio.h>
21 #include <sys/types.h>
22 #include <sys/stat.h>
23 #include <sys/wait.h>
24 #include <sys/time.h>
25 #include <time.h>
26 #include <assert.h>
27 #include <stdarg.h>
28 #include <stdlib.h>
29 #include <fcntl.h>
30 #include <unistd.h>
31 #include "talloc.h"
32 #include "list.h"
33 #include "xenstored_transaction.h"
34 #include "xenstored_watch.h"
35 #include "xenstored_domain.h"
36 #include "xenstore_lib.h"
37 #include "utils.h"
38 
39 /*
40  * Some notes regarding detection and handling of transaction conflicts:
41  *
42  * Basic source of reference is the 'generation' count. Each writing access
43  * (either normal write or in a transaction) to the tdb data base will set
44  * the node specific generation count to the global generation count.
45  * For being able to identify a transaction the transaction specific generation
46  * count is initialized with the global generation count when starting the
47  * transaction.
48  * Each time the global generation count is copied to either a node or a
49  * transaction it is incremented. This ensures all nodes and/or transactions
50  * are having a unique generation count.
51  *
52  * Transaction conflicts are detected by checking the generation count of all
53  * nodes read in the transaction to match with the generation count in the
54  * global data base at the end of the transaction. Nodes which have been
55  * modified in the transaction don't have to be checked to match even if they
56  * have been read, as the modified node will be globally visible after the
57  * succeeded transaction possibly overwriting another modification which may
58  * have occurred concurrent to the transaction.
59  *
60  * Examples:
61  * ---------
62  * The following notation is used:
63  * I:      initial state
64  * G       global generation count
65  * g(X)    generation count of node X
66  * G(1)    generation count of transaction 1
67  * g(1:Y)  saved generation count of node Y in transaction 1
68  * TA1:    operation in transaction 1
69  * X=1:X   replace global node X with transaction 1 specific value of X
70  *
71  * 1. Simple transaction doing: read node A, write node B
72  *    I: g(A) = 1, g(B) = 2, G = 3
73  *    Start transaction 1: G(1) = 3, G = 4
74  *    TA1: read node A:    g(1:A) = 1
75  *    TA1: write node B:   g(1:B) = 4, G = 5
76  *    End TA1: g(1:A) == g(A) => okay, B = 1:B, g(B) = 5, G = 6
77  *
78  * 2. Transaction with conflicting write
79  *    I: g(A) = 1, g(B) = 2, G = 3
80  *    Start transaction 1: G(1) = 3, G = 4
81  *    TA1: read node A:    g(1:A) = 1
82  *    write node A:        g(A) = 4, G = 5
83  *    TA1: write node B:   g(1:B) = 5, G = 6
84  *    End TA1: g(1:A) != g(A) => EAGAIN
85  *
86  * 3. Transaction with conflicting delete
87  *    I: g(A) = 1, g(B) = 2, G = 3
88  *    Start transaction 1: G(1) = 3, G = 4
89  *    TA1: read node A:    g(1:A) = 1
90  *    delete node A:       g(A) = ~0
91  *    TA1: write node B:   g(1:B) = 4, G = 5
92  *    End TA1: g(1:A) != g(A) => EAGAIN
93  *
94  * 4. Two interfering transactions
95  *    I: g(A) = 1, g(B) = 2, G = 3
96  *    Start transaction 1: G(1) = 3, G = 4
97  *    Start transaction 2: G(2) = 4, G = 5
98  *    TA1: read node A:    g(1:A) = 1
99  *    TA2: read node B:    g(2:B) = 2
100  *    TA1: write node B:   g(1:B) = 5, G = 6
101  *    TA2: write node A:   g(2:A) = 6, G = 7
102  *    End TA1: g(1:A) == g(A) => okay, B = 1:B, g(B) = 7, G = 8
103  *    End TA2: g(2:B) != g(B) => EAGAIN
104  */
105 
106 struct accessed_node
107 {
108 	/* List of all changed nodes in the context of this transaction. */
109 	struct list_head list;
110 
111 	/* The name of the node. */
112 	char *node;
113 
114 	/* Generation count (or NO_GENERATION) for conflict checking. */
115 	uint64_t generation;
116 
117 	/* Generation count checking required? */
118 	bool check_gen;
119 
120 	/* Modified? */
121 	bool modified;
122 
123 	/* Transaction node in data base? */
124 	bool ta_node;
125 };
126 
127 struct changed_domain
128 {
129 	/* List of all changed domains in the context of this transaction. */
130 	struct list_head list;
131 
132 	/* Identifier of the changed domain. */
133 	unsigned int domid;
134 
135 	/* Amount by which this domain's nbentry field has changed. */
136 	int nbentry;
137 };
138 
139 struct transaction
140 {
141 	/* List of all transactions active on this connection. */
142 	struct list_head list;
143 
144 	/* Connection-local identifier for this transaction. */
145 	uint32_t id;
146 
147 	/* Generation when transaction started. */
148 	uint64_t generation;
149 
150 	/* List of accessed nodes. */
151 	struct list_head accessed;
152 
153 	/* List of changed domains - to record the changed domain entry number */
154 	struct list_head changed_domains;
155 
156 	/* Flag for letting transaction fail. */
157 	bool fail;
158 };
159 
160 extern int quota_max_transaction;
161 static uint64_t generation;
162 
set_tdb_key(const char * name,TDB_DATA * key)163 static void set_tdb_key(const char *name, TDB_DATA *key)
164 {
165 	key->dptr = (char *)name;
166 	key->dsize = strlen(name);
167 }
168 
find_accessed_node(struct transaction * trans,const char * name)169 static struct accessed_node *find_accessed_node(struct transaction *trans,
170 						const char *name)
171 {
172 	struct accessed_node *i;
173 
174 	list_for_each_entry(i, &trans->accessed, list)
175 		if (streq(i->node, name))
176 			return i;
177 
178 	return NULL;
179 }
180 
transaction_get_node_name(void * ctx,struct transaction * trans,const char * name)181 static char *transaction_get_node_name(void *ctx, struct transaction *trans,
182 				       const char *name)
183 {
184 	return talloc_asprintf(ctx, "%"PRIu64"/%s", trans->generation, name);
185 }
186 
187 /*
188  * Prepend the transaction to name if node has been modified in the current
189  * transaction.
190  */
transaction_prepend(struct connection * conn,const char * name,TDB_DATA * key)191 int transaction_prepend(struct connection *conn, const char *name,
192 			TDB_DATA *key)
193 {
194 	char *tdb_name;
195 
196 	if (!conn || !conn->transaction ||
197 	    !find_accessed_node(conn->transaction, name)) {
198 		set_tdb_key(name, key);
199 		return 0;
200 	}
201 
202 	tdb_name = transaction_get_node_name(conn->transaction,
203 					     conn->transaction, name);
204 	if (!tdb_name)
205 		return errno;
206 
207 	set_tdb_key(tdb_name, key);
208 
209 	return 0;
210 }
211 
212 /*
213  * A node has been accessed.
214  *
215  * Modifying accesses (write, delete) always update the generation (global and
216  * node->generation).
217  *
218  * Accesses in a transaction will be added to the list of accessed nodes
219  * if not already done. Read type accesses will copy the node to the
220  * transaction specific data base part, write type accesses go there
221  * anyway.
222  *
223  * If not NULL, key will be supplied with name and length of name of the node
224  * to be accessed in the data base.
225  */
access_node(struct connection * conn,struct node * node,enum node_access_type type,TDB_DATA * key)226 int access_node(struct connection *conn, struct node *node,
227 		enum node_access_type type, TDB_DATA *key)
228 {
229 	struct accessed_node *i = NULL;
230 	struct transaction *trans;
231 	TDB_DATA local_key;
232 	const char *trans_name = NULL;
233 	int ret;
234 	bool introduce = false;
235 
236 	if (type != NODE_ACCESS_READ) {
237 		node->generation = generation++;
238 		if (conn && !conn->transaction)
239 			wrl_apply_debit_direct(conn);
240 	}
241 
242 	if (!conn || !conn->transaction) {
243 		/* They're changing the global database. */
244 		if (key)
245 			set_tdb_key(node->name, key);
246 		return 0;
247 	}
248 
249 	trans = conn->transaction;
250 
251 	trans_name = transaction_get_node_name(node, trans, node->name);
252 	if (!trans_name)
253 		goto nomem;
254 
255 	i = find_accessed_node(trans, node->name);
256 	if (!i) {
257 		i = talloc_zero(trans, struct accessed_node);
258 		if (!i)
259 			goto nomem;
260 		i->node = talloc_strdup(i, node->name);
261 		if (!i->node)
262 			goto nomem;
263 
264 		introduce = true;
265 		i->ta_node = false;
266 
267 		/*
268 		 * Additional transaction-specific node for read type. We only
269 		 * have to verify read nodes if we didn't write them.
270 		 *
271 		 * The node is created and written to DB here to distinguish
272 		 * from the write types.
273 		 */
274 		if (type == NODE_ACCESS_READ) {
275 			i->generation = node->generation;
276 			i->check_gen = true;
277 			if (node->generation != NO_GENERATION) {
278 				set_tdb_key(trans_name, &local_key);
279 				ret = write_node_raw(conn, &local_key, node);
280 				if (ret)
281 					goto err;
282 				i->ta_node = true;
283 			}
284 		}
285 		list_add_tail(&i->list, &trans->accessed);
286 	}
287 
288 	if (type != NODE_ACCESS_READ)
289 		i->modified = true;
290 
291 	if (introduce && type == NODE_ACCESS_DELETE)
292 		/* Nothing to delete. */
293 		return -1;
294 
295 	if (key) {
296 		set_tdb_key(trans_name, key);
297 		if (type == NODE_ACCESS_WRITE)
298 			i->ta_node = true;
299 		if (type == NODE_ACCESS_DELETE)
300 			i->ta_node = false;
301 	}
302 
303 	return 0;
304 
305 nomem:
306 	ret = ENOMEM;
307 err:
308 	talloc_free((void *)trans_name);
309 	talloc_free(i);
310 	trans->fail = true;
311 	errno = ret;
312 	return ret;
313 }
314 
315 /*
316  * Finalize transaction:
317  * Walk through accessed nodes and check generation against global data.
318  * If all entries match, read the transaction entries and write them without
319  * transaction prepended. Delete all transaction specific nodes in the data
320  * base.
321  */
finalize_transaction(struct connection * conn,struct transaction * trans)322 static int finalize_transaction(struct connection *conn,
323 				struct transaction *trans)
324 {
325 	struct accessed_node *i;
326 	TDB_DATA key, ta_key, data;
327 	struct xs_tdb_record_hdr *hdr;
328 	uint64_t gen;
329 	char *trans_name;
330 	int ret;
331 
332 	list_for_each_entry(i, &trans->accessed, list) {
333 		if (!i->check_gen)
334 			continue;
335 
336 		set_tdb_key(i->node, &key);
337 		data = tdb_fetch(tdb_ctx, key);
338 		hdr = (void *)data.dptr;
339 		if (!data.dptr) {
340 			if (tdb_error(tdb_ctx) != TDB_ERR_NOEXIST)
341 				return EIO;
342 			gen = NO_GENERATION;
343 		} else
344 			gen = hdr->generation;
345 		talloc_free(data.dptr);
346 		if (i->generation != gen)
347 			return EAGAIN;
348 	}
349 
350 	while ((i = list_top(&trans->accessed, struct accessed_node, list))) {
351 		trans_name = transaction_get_node_name(i, trans, i->node);
352 		if (!trans_name)
353 			/* We are doomed: the transaction is only partial. */
354 			goto err;
355 
356 		set_tdb_key(trans_name, &ta_key);
357 
358 		if (i->modified) {
359 			set_tdb_key(i->node, &key);
360 			if (i->ta_node) {
361 				data = tdb_fetch(tdb_ctx, ta_key);
362 				if (!data.dptr)
363 					goto err;
364 				hdr = (void *)data.dptr;
365 				hdr->generation = generation++;
366 				ret = tdb_store(tdb_ctx, key, data,
367 						TDB_REPLACE);
368 				talloc_free(data.dptr);
369 				if (ret)
370 					goto err;
371 			} else if (tdb_delete(tdb_ctx, key))
372 					goto err;
373 			fire_watches(conn, trans, i->node, false);
374 		}
375 
376 		if (i->ta_node && tdb_delete(tdb_ctx, ta_key))
377 			goto err;
378 		list_del(&i->list);
379 		talloc_free(i);
380 	}
381 
382 	return 0;
383 
384 err:
385 	corrupt(conn, "Partial transaction");
386 	return EIO;
387 }
388 
destroy_transaction(void * _transaction)389 static int destroy_transaction(void *_transaction)
390 {
391 	struct transaction *trans = _transaction;
392 	struct accessed_node *i;
393 	char *trans_name;
394 	TDB_DATA key;
395 
396 	wrl_ntransactions--;
397 	trace_destroy(trans, "transaction");
398 	while ((i = list_top(&trans->accessed, struct accessed_node, list))) {
399 		if (i->ta_node) {
400 			trans_name = transaction_get_node_name(i, trans,
401 							       i->node);
402 			if (trans_name) {
403 				set_tdb_key(trans_name, &key);
404 				tdb_delete(tdb_ctx, key);
405 			}
406 		}
407 		list_del(&i->list);
408 		talloc_free(i);
409 	}
410 
411 	return 0;
412 }
413 
transaction_lookup(struct connection * conn,uint32_t id)414 struct transaction *transaction_lookup(struct connection *conn, uint32_t id)
415 {
416 	struct transaction *trans;
417 
418 	if (id == 0)
419 		return NULL;
420 
421 	list_for_each_entry(trans, &conn->transaction_list, list)
422 		if (trans->id == id)
423 			return trans;
424 
425 	return ERR_PTR(-ENOENT);
426 }
427 
do_transaction_start(struct connection * conn,struct buffered_data * in)428 int do_transaction_start(struct connection *conn, struct buffered_data *in)
429 {
430 	struct transaction *trans, *exists;
431 	char id_str[20];
432 
433 	/* We don't support nested transactions. */
434 	if (conn->transaction)
435 		return EBUSY;
436 
437 	if (conn->id && conn->transaction_started > quota_max_transaction)
438 		return ENOSPC;
439 
440 	/* Attach transaction to input for autofree until it's complete */
441 	trans = talloc_zero(in, struct transaction);
442 	if (!trans)
443 		return ENOMEM;
444 
445 	INIT_LIST_HEAD(&trans->accessed);
446 	INIT_LIST_HEAD(&trans->changed_domains);
447 	trans->fail = false;
448 	trans->generation = generation++;
449 
450 	/* Pick an unused transaction identifier. */
451 	do {
452 		trans->id = conn->next_transaction_id;
453 		exists = transaction_lookup(conn, conn->next_transaction_id++);
454 	} while (!IS_ERR(exists));
455 
456 	/* Now we own it. */
457 	list_add_tail(&trans->list, &conn->transaction_list);
458 	talloc_steal(conn, trans);
459 	talloc_set_destructor(trans, destroy_transaction);
460 	conn->transaction_started++;
461 	wrl_ntransactions++;
462 
463 	snprintf(id_str, sizeof(id_str), "%u", trans->id);
464 	send_reply(conn, XS_TRANSACTION_START, id_str, strlen(id_str)+1);
465 
466 	return 0;
467 }
468 
transaction_fix_domains(struct transaction * trans,bool update)469 static int transaction_fix_domains(struct transaction *trans, bool update)
470 {
471 	struct changed_domain *d;
472 	int cnt;
473 
474 	list_for_each_entry(d, &trans->changed_domains, list) {
475 		cnt = domain_entry_fix(d->domid, d->nbentry, update);
476 		if (!update && cnt >= quota_nb_entry_per_domain)
477 			return ENOSPC;
478 	}
479 
480 	return 0;
481 }
482 
do_transaction_end(struct connection * conn,struct buffered_data * in)483 int do_transaction_end(struct connection *conn, struct buffered_data *in)
484 {
485 	const char *arg = onearg(in);
486 	struct transaction *trans;
487 	int ret;
488 
489 	if (!arg || (!streq(arg, "T") && !streq(arg, "F")))
490 		return EINVAL;
491 
492 	if ((trans = conn->transaction) == NULL)
493 		return ENOENT;
494 
495 	conn->transaction = NULL;
496 	list_del(&trans->list);
497 	conn->transaction_started--;
498 
499 	/* Attach transaction to in for auto-cleanup */
500 	talloc_steal(in, trans);
501 
502 	if (streq(arg, "T")) {
503 		if (trans->fail)
504 			return ENOMEM;
505 		ret = transaction_fix_domains(trans, false);
506 		if (ret)
507 			return ret;
508 		if (finalize_transaction(conn, trans))
509 			return EAGAIN;
510 
511 		wrl_apply_debit_trans_commit(conn);
512 
513 		/* fix domain entry for each changed domain */
514 		transaction_fix_domains(trans, true);
515 	}
516 	send_ack(conn, XS_TRANSACTION_END);
517 
518 	return 0;
519 }
520 
transaction_entry_inc(struct transaction * trans,unsigned int domid)521 void transaction_entry_inc(struct transaction *trans, unsigned int domid)
522 {
523 	struct changed_domain *d;
524 
525 	list_for_each_entry(d, &trans->changed_domains, list)
526 		if (d->domid == domid) {
527 			d->nbentry++;
528 			return;
529 		}
530 
531 	d = talloc(trans, struct changed_domain);
532 	if (!d) {
533 		/* Let the transaction fail. */
534 		trans->fail = true;
535 		return;
536 	}
537 	d->domid = domid;
538 	d->nbentry = 1;
539 	list_add_tail(&d->list, &trans->changed_domains);
540 }
541 
transaction_entry_dec(struct transaction * trans,unsigned int domid)542 void transaction_entry_dec(struct transaction *trans, unsigned int domid)
543 {
544 	struct changed_domain *d;
545 
546 	list_for_each_entry(d, &trans->changed_domains, list)
547 		if (d->domid == domid) {
548 			d->nbentry--;
549 			return;
550 		}
551 
552 	d = talloc(trans, struct changed_domain);
553 	if (!d) {
554 		/* Let the transaction fail. */
555 		trans->fail = true;
556 		return;
557 	}
558 	d->domid = domid;
559 	d->nbentry = -1;
560 	list_add_tail(&d->list, &trans->changed_domains);
561 }
562 
conn_delete_all_transactions(struct connection * conn)563 void conn_delete_all_transactions(struct connection *conn)
564 {
565 	struct transaction *trans;
566 
567 	while ((trans = list_top(&conn->transaction_list,
568 				 struct transaction, list))) {
569 		list_del(&trans->list);
570 		talloc_free(trans);
571 	}
572 
573 	assert(conn->transaction == NULL);
574 
575 	conn->transaction_started = 0;
576 }
577 
check_transactions(struct hashtable * hash)578 int check_transactions(struct hashtable *hash)
579 {
580 	struct connection *conn;
581 	struct transaction *trans;
582 	struct accessed_node *i;
583 	char *tname, *tnode;
584 
585 	list_for_each_entry(conn, &connections, list) {
586 		list_for_each_entry(trans, &conn->transaction_list, list) {
587 			tname = talloc_asprintf(trans, "%"PRIu64,
588 						trans->generation);
589 			if (!tname || !remember_string(hash, tname))
590 				goto nomem;
591 
592 			list_for_each_entry(i, &trans->accessed, list) {
593 				if (!i->ta_node)
594 					continue;
595 				tnode = transaction_get_node_name(tname, trans,
596 								  i->node);
597 				if (!tnode || !remember_string(hash, tnode))
598 					goto nomem;
599 				talloc_free(tnode);
600 			}
601 
602 			talloc_free(tname);
603 		}
604 	}
605 
606 	return 0;
607 
608 nomem:
609 	talloc_free(tname);
610 	return ENOMEM;
611 }
612 
613 /*
614  * Local variables:
615  *  c-file-style: "linux"
616  *  indent-tabs-mode: t
617  *  c-indent-level: 8
618  *  c-basic-offset: 8
619  *  tab-width: 8
620  * End:
621  */
622