1 /*
2 Transaction code for Xen Store Daemon.
3 Copyright (C) 2005 Rusty Russell IBM Corporation
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 #include <inttypes.h>
20 #include <stdio.h>
21 #include <sys/types.h>
22 #include <sys/stat.h>
23 #include <sys/wait.h>
24 #include <sys/time.h>
25 #include <time.h>
26 #include <assert.h>
27 #include <stdarg.h>
28 #include <stdlib.h>
29 #include <fcntl.h>
30 #include <unistd.h>
31 #include "talloc.h"
32 #include "list.h"
33 #include "xenstored_transaction.h"
34 #include "xenstored_watch.h"
35 #include "xenstored_domain.h"
36 #include "xenstore_lib.h"
37 #include "utils.h"
38
39 /*
40 * Some notes regarding detection and handling of transaction conflicts:
41 *
42 * Basic source of reference is the 'generation' count. Each writing access
43 * (either normal write or in a transaction) to the tdb data base will set
44 * the node specific generation count to the global generation count.
45 * For being able to identify a transaction the transaction specific generation
46 * count is initialized with the global generation count when starting the
47 * transaction.
48 * Each time the global generation count is copied to either a node or a
49 * transaction it is incremented. This ensures all nodes and/or transactions
50 * are having a unique generation count.
51 *
52 * Transaction conflicts are detected by checking the generation count of all
53 * nodes read in the transaction to match with the generation count in the
54 * global data base at the end of the transaction. Nodes which have been
55 * modified in the transaction don't have to be checked to match even if they
56 * have been read, as the modified node will be globally visible after the
57 * succeeded transaction possibly overwriting another modification which may
58 * have occurred concurrent to the transaction.
59 *
60 * Examples:
61 * ---------
62 * The following notation is used:
63 * I: initial state
64 * G global generation count
65 * g(X) generation count of node X
66 * G(1) generation count of transaction 1
67 * g(1:Y) saved generation count of node Y in transaction 1
68 * TA1: operation in transaction 1
69 * X=1:X replace global node X with transaction 1 specific value of X
70 *
71 * 1. Simple transaction doing: read node A, write node B
72 * I: g(A) = 1, g(B) = 2, G = 3
73 * Start transaction 1: G(1) = 3, G = 4
74 * TA1: read node A: g(1:A) = 1
75 * TA1: write node B: g(1:B) = 4, G = 5
76 * End TA1: g(1:A) == g(A) => okay, B = 1:B, g(B) = 5, G = 6
77 *
78 * 2. Transaction with conflicting write
79 * I: g(A) = 1, g(B) = 2, G = 3
80 * Start transaction 1: G(1) = 3, G = 4
81 * TA1: read node A: g(1:A) = 1
82 * write node A: g(A) = 4, G = 5
83 * TA1: write node B: g(1:B) = 5, G = 6
84 * End TA1: g(1:A) != g(A) => EAGAIN
85 *
86 * 3. Transaction with conflicting delete
87 * I: g(A) = 1, g(B) = 2, G = 3
88 * Start transaction 1: G(1) = 3, G = 4
89 * TA1: read node A: g(1:A) = 1
90 * delete node A: g(A) = ~0
91 * TA1: write node B: g(1:B) = 4, G = 5
92 * End TA1: g(1:A) != g(A) => EAGAIN
93 *
94 * 4. Two interfering transactions
95 * I: g(A) = 1, g(B) = 2, G = 3
96 * Start transaction 1: G(1) = 3, G = 4
97 * Start transaction 2: G(2) = 4, G = 5
98 * TA1: read node A: g(1:A) = 1
99 * TA2: read node B: g(2:B) = 2
100 * TA1: write node B: g(1:B) = 5, G = 6
101 * TA2: write node A: g(2:A) = 6, G = 7
102 * End TA1: g(1:A) == g(A) => okay, B = 1:B, g(B) = 7, G = 8
103 * End TA2: g(2:B) != g(B) => EAGAIN
104 */
105
106 struct accessed_node
107 {
108 /* List of all changed nodes in the context of this transaction. */
109 struct list_head list;
110
111 /* The name of the node. */
112 char *node;
113
114 /* Generation count (or NO_GENERATION) for conflict checking. */
115 uint64_t generation;
116
117 /* Generation count checking required? */
118 bool check_gen;
119
120 /* Modified? */
121 bool modified;
122
123 /* Transaction node in data base? */
124 bool ta_node;
125 };
126
127 struct changed_domain
128 {
129 /* List of all changed domains in the context of this transaction. */
130 struct list_head list;
131
132 /* Identifier of the changed domain. */
133 unsigned int domid;
134
135 /* Amount by which this domain's nbentry field has changed. */
136 int nbentry;
137 };
138
139 struct transaction
140 {
141 /* List of all transactions active on this connection. */
142 struct list_head list;
143
144 /* Connection-local identifier for this transaction. */
145 uint32_t id;
146
147 /* Generation when transaction started. */
148 uint64_t generation;
149
150 /* List of accessed nodes. */
151 struct list_head accessed;
152
153 /* List of changed domains - to record the changed domain entry number */
154 struct list_head changed_domains;
155
156 /* Flag for letting transaction fail. */
157 bool fail;
158 };
159
160 extern int quota_max_transaction;
161 static uint64_t generation;
162
set_tdb_key(const char * name,TDB_DATA * key)163 static void set_tdb_key(const char *name, TDB_DATA *key)
164 {
165 key->dptr = (char *)name;
166 key->dsize = strlen(name);
167 }
168
find_accessed_node(struct transaction * trans,const char * name)169 static struct accessed_node *find_accessed_node(struct transaction *trans,
170 const char *name)
171 {
172 struct accessed_node *i;
173
174 list_for_each_entry(i, &trans->accessed, list)
175 if (streq(i->node, name))
176 return i;
177
178 return NULL;
179 }
180
transaction_get_node_name(void * ctx,struct transaction * trans,const char * name)181 static char *transaction_get_node_name(void *ctx, struct transaction *trans,
182 const char *name)
183 {
184 return talloc_asprintf(ctx, "%"PRIu64"/%s", trans->generation, name);
185 }
186
187 /*
188 * Prepend the transaction to name if node has been modified in the current
189 * transaction.
190 */
transaction_prepend(struct connection * conn,const char * name,TDB_DATA * key)191 int transaction_prepend(struct connection *conn, const char *name,
192 TDB_DATA *key)
193 {
194 char *tdb_name;
195
196 if (!conn || !conn->transaction ||
197 !find_accessed_node(conn->transaction, name)) {
198 set_tdb_key(name, key);
199 return 0;
200 }
201
202 tdb_name = transaction_get_node_name(conn->transaction,
203 conn->transaction, name);
204 if (!tdb_name)
205 return errno;
206
207 set_tdb_key(tdb_name, key);
208
209 return 0;
210 }
211
212 /*
213 * A node has been accessed.
214 *
215 * Modifying accesses (write, delete) always update the generation (global and
216 * node->generation).
217 *
218 * Accesses in a transaction will be added to the list of accessed nodes
219 * if not already done. Read type accesses will copy the node to the
220 * transaction specific data base part, write type accesses go there
221 * anyway.
222 *
223 * If not NULL, key will be supplied with name and length of name of the node
224 * to be accessed in the data base.
225 */
access_node(struct connection * conn,struct node * node,enum node_access_type type,TDB_DATA * key)226 int access_node(struct connection *conn, struct node *node,
227 enum node_access_type type, TDB_DATA *key)
228 {
229 struct accessed_node *i = NULL;
230 struct transaction *trans;
231 TDB_DATA local_key;
232 const char *trans_name = NULL;
233 int ret;
234 bool introduce = false;
235
236 if (type != NODE_ACCESS_READ) {
237 node->generation = generation++;
238 if (conn && !conn->transaction)
239 wrl_apply_debit_direct(conn);
240 }
241
242 if (!conn || !conn->transaction) {
243 /* They're changing the global database. */
244 if (key)
245 set_tdb_key(node->name, key);
246 return 0;
247 }
248
249 trans = conn->transaction;
250
251 trans_name = transaction_get_node_name(node, trans, node->name);
252 if (!trans_name)
253 goto nomem;
254
255 i = find_accessed_node(trans, node->name);
256 if (!i) {
257 i = talloc_zero(trans, struct accessed_node);
258 if (!i)
259 goto nomem;
260 i->node = talloc_strdup(i, node->name);
261 if (!i->node)
262 goto nomem;
263
264 introduce = true;
265 i->ta_node = false;
266
267 /*
268 * Additional transaction-specific node for read type. We only
269 * have to verify read nodes if we didn't write them.
270 *
271 * The node is created and written to DB here to distinguish
272 * from the write types.
273 */
274 if (type == NODE_ACCESS_READ) {
275 i->generation = node->generation;
276 i->check_gen = true;
277 if (node->generation != NO_GENERATION) {
278 set_tdb_key(trans_name, &local_key);
279 ret = write_node_raw(conn, &local_key, node);
280 if (ret)
281 goto err;
282 i->ta_node = true;
283 }
284 }
285 list_add_tail(&i->list, &trans->accessed);
286 }
287
288 if (type != NODE_ACCESS_READ)
289 i->modified = true;
290
291 if (introduce && type == NODE_ACCESS_DELETE)
292 /* Nothing to delete. */
293 return -1;
294
295 if (key) {
296 set_tdb_key(trans_name, key);
297 if (type == NODE_ACCESS_WRITE)
298 i->ta_node = true;
299 if (type == NODE_ACCESS_DELETE)
300 i->ta_node = false;
301 }
302
303 return 0;
304
305 nomem:
306 ret = ENOMEM;
307 err:
308 talloc_free((void *)trans_name);
309 talloc_free(i);
310 trans->fail = true;
311 errno = ret;
312 return ret;
313 }
314
315 /*
316 * Finalize transaction:
317 * Walk through accessed nodes and check generation against global data.
318 * If all entries match, read the transaction entries and write them without
319 * transaction prepended. Delete all transaction specific nodes in the data
320 * base.
321 */
finalize_transaction(struct connection * conn,struct transaction * trans)322 static int finalize_transaction(struct connection *conn,
323 struct transaction *trans)
324 {
325 struct accessed_node *i;
326 TDB_DATA key, ta_key, data;
327 struct xs_tdb_record_hdr *hdr;
328 uint64_t gen;
329 char *trans_name;
330 int ret;
331
332 list_for_each_entry(i, &trans->accessed, list) {
333 if (!i->check_gen)
334 continue;
335
336 set_tdb_key(i->node, &key);
337 data = tdb_fetch(tdb_ctx, key);
338 hdr = (void *)data.dptr;
339 if (!data.dptr) {
340 if (tdb_error(tdb_ctx) != TDB_ERR_NOEXIST)
341 return EIO;
342 gen = NO_GENERATION;
343 } else
344 gen = hdr->generation;
345 talloc_free(data.dptr);
346 if (i->generation != gen)
347 return EAGAIN;
348 }
349
350 while ((i = list_top(&trans->accessed, struct accessed_node, list))) {
351 trans_name = transaction_get_node_name(i, trans, i->node);
352 if (!trans_name)
353 /* We are doomed: the transaction is only partial. */
354 goto err;
355
356 set_tdb_key(trans_name, &ta_key);
357
358 if (i->modified) {
359 set_tdb_key(i->node, &key);
360 if (i->ta_node) {
361 data = tdb_fetch(tdb_ctx, ta_key);
362 if (!data.dptr)
363 goto err;
364 hdr = (void *)data.dptr;
365 hdr->generation = generation++;
366 ret = tdb_store(tdb_ctx, key, data,
367 TDB_REPLACE);
368 talloc_free(data.dptr);
369 if (ret)
370 goto err;
371 } else if (tdb_delete(tdb_ctx, key))
372 goto err;
373 fire_watches(conn, trans, i->node, false);
374 }
375
376 if (i->ta_node && tdb_delete(tdb_ctx, ta_key))
377 goto err;
378 list_del(&i->list);
379 talloc_free(i);
380 }
381
382 return 0;
383
384 err:
385 corrupt(conn, "Partial transaction");
386 return EIO;
387 }
388
destroy_transaction(void * _transaction)389 static int destroy_transaction(void *_transaction)
390 {
391 struct transaction *trans = _transaction;
392 struct accessed_node *i;
393 char *trans_name;
394 TDB_DATA key;
395
396 wrl_ntransactions--;
397 trace_destroy(trans, "transaction");
398 while ((i = list_top(&trans->accessed, struct accessed_node, list))) {
399 if (i->ta_node) {
400 trans_name = transaction_get_node_name(i, trans,
401 i->node);
402 if (trans_name) {
403 set_tdb_key(trans_name, &key);
404 tdb_delete(tdb_ctx, key);
405 }
406 }
407 list_del(&i->list);
408 talloc_free(i);
409 }
410
411 return 0;
412 }
413
transaction_lookup(struct connection * conn,uint32_t id)414 struct transaction *transaction_lookup(struct connection *conn, uint32_t id)
415 {
416 struct transaction *trans;
417
418 if (id == 0)
419 return NULL;
420
421 list_for_each_entry(trans, &conn->transaction_list, list)
422 if (trans->id == id)
423 return trans;
424
425 return ERR_PTR(-ENOENT);
426 }
427
do_transaction_start(struct connection * conn,struct buffered_data * in)428 int do_transaction_start(struct connection *conn, struct buffered_data *in)
429 {
430 struct transaction *trans, *exists;
431 char id_str[20];
432
433 /* We don't support nested transactions. */
434 if (conn->transaction)
435 return EBUSY;
436
437 if (conn->id && conn->transaction_started > quota_max_transaction)
438 return ENOSPC;
439
440 /* Attach transaction to input for autofree until it's complete */
441 trans = talloc_zero(in, struct transaction);
442 if (!trans)
443 return ENOMEM;
444
445 INIT_LIST_HEAD(&trans->accessed);
446 INIT_LIST_HEAD(&trans->changed_domains);
447 trans->fail = false;
448 trans->generation = generation++;
449
450 /* Pick an unused transaction identifier. */
451 do {
452 trans->id = conn->next_transaction_id;
453 exists = transaction_lookup(conn, conn->next_transaction_id++);
454 } while (!IS_ERR(exists));
455
456 /* Now we own it. */
457 list_add_tail(&trans->list, &conn->transaction_list);
458 talloc_steal(conn, trans);
459 talloc_set_destructor(trans, destroy_transaction);
460 conn->transaction_started++;
461 wrl_ntransactions++;
462
463 snprintf(id_str, sizeof(id_str), "%u", trans->id);
464 send_reply(conn, XS_TRANSACTION_START, id_str, strlen(id_str)+1);
465
466 return 0;
467 }
468
transaction_fix_domains(struct transaction * trans,bool update)469 static int transaction_fix_domains(struct transaction *trans, bool update)
470 {
471 struct changed_domain *d;
472 int cnt;
473
474 list_for_each_entry(d, &trans->changed_domains, list) {
475 cnt = domain_entry_fix(d->domid, d->nbentry, update);
476 if (!update && cnt >= quota_nb_entry_per_domain)
477 return ENOSPC;
478 }
479
480 return 0;
481 }
482
do_transaction_end(struct connection * conn,struct buffered_data * in)483 int do_transaction_end(struct connection *conn, struct buffered_data *in)
484 {
485 const char *arg = onearg(in);
486 struct transaction *trans;
487 int ret;
488
489 if (!arg || (!streq(arg, "T") && !streq(arg, "F")))
490 return EINVAL;
491
492 if ((trans = conn->transaction) == NULL)
493 return ENOENT;
494
495 conn->transaction = NULL;
496 list_del(&trans->list);
497 conn->transaction_started--;
498
499 /* Attach transaction to in for auto-cleanup */
500 talloc_steal(in, trans);
501
502 if (streq(arg, "T")) {
503 if (trans->fail)
504 return ENOMEM;
505 ret = transaction_fix_domains(trans, false);
506 if (ret)
507 return ret;
508 if (finalize_transaction(conn, trans))
509 return EAGAIN;
510
511 wrl_apply_debit_trans_commit(conn);
512
513 /* fix domain entry for each changed domain */
514 transaction_fix_domains(trans, true);
515 }
516 send_ack(conn, XS_TRANSACTION_END);
517
518 return 0;
519 }
520
transaction_entry_inc(struct transaction * trans,unsigned int domid)521 void transaction_entry_inc(struct transaction *trans, unsigned int domid)
522 {
523 struct changed_domain *d;
524
525 list_for_each_entry(d, &trans->changed_domains, list)
526 if (d->domid == domid) {
527 d->nbentry++;
528 return;
529 }
530
531 d = talloc(trans, struct changed_domain);
532 if (!d) {
533 /* Let the transaction fail. */
534 trans->fail = true;
535 return;
536 }
537 d->domid = domid;
538 d->nbentry = 1;
539 list_add_tail(&d->list, &trans->changed_domains);
540 }
541
transaction_entry_dec(struct transaction * trans,unsigned int domid)542 void transaction_entry_dec(struct transaction *trans, unsigned int domid)
543 {
544 struct changed_domain *d;
545
546 list_for_each_entry(d, &trans->changed_domains, list)
547 if (d->domid == domid) {
548 d->nbentry--;
549 return;
550 }
551
552 d = talloc(trans, struct changed_domain);
553 if (!d) {
554 /* Let the transaction fail. */
555 trans->fail = true;
556 return;
557 }
558 d->domid = domid;
559 d->nbentry = -1;
560 list_add_tail(&d->list, &trans->changed_domains);
561 }
562
conn_delete_all_transactions(struct connection * conn)563 void conn_delete_all_transactions(struct connection *conn)
564 {
565 struct transaction *trans;
566
567 while ((trans = list_top(&conn->transaction_list,
568 struct transaction, list))) {
569 list_del(&trans->list);
570 talloc_free(trans);
571 }
572
573 assert(conn->transaction == NULL);
574
575 conn->transaction_started = 0;
576 }
577
check_transactions(struct hashtable * hash)578 int check_transactions(struct hashtable *hash)
579 {
580 struct connection *conn;
581 struct transaction *trans;
582 struct accessed_node *i;
583 char *tname, *tnode;
584
585 list_for_each_entry(conn, &connections, list) {
586 list_for_each_entry(trans, &conn->transaction_list, list) {
587 tname = talloc_asprintf(trans, "%"PRIu64,
588 trans->generation);
589 if (!tname || !remember_string(hash, tname))
590 goto nomem;
591
592 list_for_each_entry(i, &trans->accessed, list) {
593 if (!i->ta_node)
594 continue;
595 tnode = transaction_get_node_name(tname, trans,
596 i->node);
597 if (!tnode || !remember_string(hash, tnode))
598 goto nomem;
599 talloc_free(tnode);
600 }
601
602 talloc_free(tname);
603 }
604 }
605
606 return 0;
607
608 nomem:
609 talloc_free(tname);
610 return ENOMEM;
611 }
612
613 /*
614 * Local variables:
615 * c-file-style: "linux"
616 * indent-tabs-mode: t
617 * c-indent-level: 8
618 * c-basic-offset: 8
619 * tab-width: 8
620 * End:
621 */
622