/*
Simple prototype Xen Store Daemon providing simple tree-like database.
Copyright (C) 2005 Rusty Russell IBM Corporation
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <inttypes.h>
#include <string.h>
#include <errno.h>
#include <time.h>
#include <limits.h>
#include <getopt.h>
#include <assert.h>
#include <signal.h>
#include <poll.h>
#include <syslog.h>
#include "utils.h"
#include "list.h"
#include "talloc.h"
#include "core.h"
#include "watch.h"
#include "transaction.h"
#include "domain.h"
#include "control.h"
#include "lu.h"
static int xce_pollfd_idx = -1;
struct pollfd *poll_fds;
static unsigned int current_array_size;
static unsigned int nr_fds;
static unsigned int delayed_requests;
int orig_argc;
char **orig_argv;
LIST_HEAD(connections);
int tracefd = -1;
bool keep_orphans = false;
const char *tracefile = NULL;
static struct hashtable *nodes;
unsigned int trace_flags = TRACE_OBJ | TRACE_IO;
static const char *sockmsg_string(enum xsd_sockmsg_type type);
unsigned int timeout_watch_event_msec = 20000;
void trace(const char *fmt, ...)
{
va_list arglist;
char *str;
char sbuf[1024];
int ret, dummy;
if (tracefd < 0)
return;
/* try to use a static buffer */
va_start(arglist, fmt);
ret = vsnprintf(sbuf, sizeof(sbuf), fmt, arglist);
va_end(arglist);
if (ret >= 0 && ret < (int)sizeof(sbuf)) {
dummy = write(tracefd, sbuf, ret);
return;
}
/* fall back to dynamic allocation */
va_start(arglist, fmt);
str = talloc_vasprintf(NULL, fmt, arglist);
va_end(arglist);
if (str) {
dummy = write(tracefd, str, strlen(str));
talloc_free(str);
}
}
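/* Trace a single request or reply including its payload (TRACE_IO flag). */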
static void trace_io(const struct connection *conn,
const struct buffered_data *data,
const char *type)
{
unsigned int i;
time_t now;
struct tm *tm;
if (tracefd < 0 || !(trace_flags & TRACE_IO))
return;
now = time(NULL);
tm = localtime(&now);
trace("io: %s %p (d%u) %04d%02d%02d %02d:%02d:%02d %s (",
type, conn, conn->id, tm->tm_year + 1900, tm->tm_mon + 1,
tm->tm_mday, tm->tm_hour, tm->tm_min, tm->tm_sec,
sockmsg_string(data->hdr.msg.type));
for (i = 0; i < data->hdr.msg.len; i++)
trace("%c", (data->buffer[i] != '\0') ? data->buffer[i] : ' ');
trace(")\n");
}
void trace_create(const void *data, const char *type)
{
if (trace_flags & TRACE_OBJ)
trace("obj: CREATE %s %p\n", type, data);
}
void trace_destroy(const void *data, const char *type)
{
if (trace_flags & TRACE_OBJ)
trace("obj: DESTROY %s %p\n", type, data);
}
/*
* Return an absolute filename.
* In case of a relative filename given as input, prepend XENSTORE_LIB_DIR.
*/
const char *absolute_filename(const void *ctx, const char *filename)
{
if (filename[0] != '/')
return talloc_asprintf(ctx, XENSTORE_LIB_DIR "/%s", filename);
return talloc_strdup(ctx, filename);
}
void close_log(void)
{
if (tracefd >= 0)
close(tracefd);
tracefd = -1;
}
void reopen_log(void)
{
if (tracefile) {
close_log();
tracefd = open(tracefile,
O_WRONLY | O_CREAT | O_APPEND | O_CLOEXEC, 0600);
if (tracefd < 0)
perror("Could not open tracefile");
else
trace("\n***\n");
}
}
uint64_t get_now_msec(void)
{
struct timespec now_ts;
if (clock_gettime(CLOCK_MONOTONIC, &now_ts))
barf_perror("Could not find time (clock_gettime failed)");
return now_ts.tv_sec * 1000 + now_ts.tv_nsec / 1000000;
}
/*
* Remove a struct buffered_data from the list of outgoing data.
* A struct buffered_data related to a request having caused watch events to be
* sent is kept until all those events have been written out.
* Each watch event is referencing the related request via pend.req, while the
* number of watch events caused by a request is kept in pend.ref.event_cnt
* (those two cases are mutually exclusive, so the two fields can share memory
* via a union).
* The struct buffered_data is freed only if no related watch event is
* referencing it. The related return data can be freed right away.
*/
static void free_buffered_data(struct buffered_data *out,
struct connection *conn)
{
struct buffered_data *req;
list_del(&out->list);
out->on_out_list = false;
/*
* Update conn->timeout_msec with the next found timeout value in the
* queued pending requests.
*/
if (out->timeout_msec) {
conn->timeout_msec = 0;
list_for_each_entry(req, &conn->out_list, list) {
if (req->timeout_msec) {
conn->timeout_msec = req->timeout_msec;
break;
}
}
}
domain_memory_add_nochk(conn, conn->id,
-out->hdr.msg.len - sizeof(out->hdr));
if (out->hdr.msg.type == XS_WATCH_EVENT) {
req = out->pend.req;
if (req) {
req->pend.ref.event_cnt--;
if (!req->pend.ref.event_cnt && !req->on_out_list) {
if (req->on_ref_list) {
domain_outstanding_dec(conn,
req->pend.ref.domid);
list_del(&req->list);
}
talloc_free(req);
}
}
} else if (out->pend.ref.event_cnt) {
/* Detach "out" from conn, as watch events still reference it. */
talloc_steal(NULL, out);
if (out->buffer != out->default_buffer)
talloc_free(out->buffer);
list_add(&out->list, &conn->ref_list);
out->on_ref_list = true;
return;
} else
domain_outstanding_dec(conn, conn->id);
talloc_free(out);
}
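/*
 * Drop pending watch events of the connection which have timed out (and
 * haven't been partially written to the ring yet) and lower *ptimeout to
 * the time remaining until the next event will expire.
 */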
static void check_event_timeout(struct connection *conn, uint64_t msecs,
int *ptimeout)
{
uint64_t delta;
struct buffered_data *out, *tmp;
if (!conn->timeout_msec)
return;
delta = conn->timeout_msec - msecs;
if (conn->timeout_msec <= msecs) {
delta = 0;
list_for_each_entry_safe(out, tmp, &conn->out_list, list) {
/*
* Only look at buffers with timeout and no data
* already written to the ring.
*/
if (out->timeout_msec && out->inhdr && !out->used) {
if (out->timeout_msec > msecs) {
conn->timeout_msec = out->timeout_msec;
delta = conn->timeout_msec - msecs;
break;
}
/*
* Free out without updating conn->timeout_msec,
* as the update is done in this loop already.
*/
out->timeout_msec = 0;
trace("watch event path %s for domain %u timed out\n",
out->buffer, conn->id);
free_buffered_data(out, conn);
}
}
if (!delta) {
conn->timeout_msec = 0;
return;
}
}
if (*ptimeout == -1 || *ptimeout > delta)
*ptimeout = delta;
}
void conn_free_buffered_data(struct connection *conn)
{
struct buffered_data *out;
while ((out = list_top(&conn->out_list, struct buffered_data, list)))
free_buffered_data(out, conn);
conn->timeout_msec = 0;
}
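/*
 * Write the first queued outgoing message to the connection, header first
 * and then the payload, coping with partial writes. Returns false on a
 * write error.
 */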
static bool write_messages(struct connection *conn)
{
int ret;
struct buffered_data *out;
bool started = false;
out = list_top(&conn->out_list, struct buffered_data, list);
if (out == NULL)
return true;
if (out->inhdr) {
started = !out->used;
ret = conn->funcs->write(conn, out->hdr.raw + out->used,
sizeof(out->hdr) - out->used);
if (ret < 0)
goto err;
out->used += ret;
if (out->used < sizeof(out->hdr))
goto start;
out->inhdr = false;
out->used = 0;
/* Second write might block if non-zero. */
if (out->hdr.msg.len && !conn->domain)
goto start;
}
ret = conn->funcs->write(conn, out->buffer + out->used,
out->hdr.msg.len - out->used);
if (ret < 0)
goto err;
out->used += ret;
if (out->used != out->hdr.msg.len)
goto start;
trace_io(conn, out, started ? "OUT" : "OUT(END)");
free_buffered_data(out, conn);
return true;
err:
trace_io(conn, out, "OUT(ERR)");
return false;
start:
if (started)
trace_io(conn, out, "OUT(START)");
return true;
}
static int undelay_request(void *_req)
{
struct delayed_request *req = _req;
list_del(&req->list);
delayed_requests--;
return 0;
}
static void call_delayed(struct delayed_request *req)
{
if (req->func(req)) {
undelay_request(req);
talloc_set_destructor(req, NULL);
}
}
int delay_request(struct connection *conn, struct buffered_data *in,
bool (*func)(struct delayed_request *), void *data,
bool no_quota_check)
{
struct delayed_request *req;
/*
 * Only allow a single delayed request per unprivileged
 * connection.
 */
if (!no_quota_check && domain_is_unprivileged(conn) &&
!list_empty(&conn->delayed))
return ENOSPC;
req = talloc(in, struct delayed_request);
if (!req)
return ENOMEM;
/* For the case of connection being closed. */
talloc_set_destructor(req, undelay_request);
req->in = in;
req->func = func;
req->data = data;
delayed_requests++;
list_add(&req->list, &conn->delayed);
/* Unlink the request from conn if this is the current one */
if (conn->in == in)
conn->in = NULL;
return 0;
}
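/*
 * talloc destructor of a connection: try to flush pending output without
 * blocking, then drop buffered data, watches and the list membership.
 */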
static int destroy_conn(void *_conn)
{
struct connection *conn = _conn;
struct buffered_data *req;
/* Flush outgoing if possible, but don't block. */
if (!conn->domain) {
struct pollfd pfd;
pfd.fd = conn->fd;
pfd.events = POLLOUT;
while (!list_empty(&conn->out_list)
&& poll(&pfd, 1, 0) == 1)
if (!write_messages(conn))
break;
close(conn->fd);
}
conn_free_buffered_data(conn);
conn_delete_all_watches(conn);
list_for_each_entry(req, &conn->ref_list, list)
req->on_ref_list = false;
if (conn->target)
talloc_unlink(conn, conn->target);
list_del(&conn->list);
trace_destroy(conn, "connection");
return 0;
}
static bool conn_can_read(struct connection *conn)
{
if (conn->is_ignored)
return false;
if (!conn->funcs->can_read(conn))
return false;
/*
* For stalled connection, we want to process the pending
* command as soon as live-update has aborted.
*/
if (conn->is_stalled)
return !lu_is_pending();
return true;
}
static bool conn_can_write(struct connection *conn)
{
return !conn->is_ignored && conn->funcs->can_write(conn);
}
/* Returns the index inside the poll_fds array on success, -1 on failure. */
int set_fd(int fd, short events)
{
int ret;
if (current_array_size < nr_fds + 1) {
struct pollfd *new_fds = NULL;
unsigned long newsize;
/* Round up to a 2^8 boundary; in practice this just
 * makes newsize larger than current_array_size.
 */
newsize = ROUNDUP(nr_fds + 1, 8);
new_fds = realloc(poll_fds, sizeof(struct pollfd)*newsize);
if (!new_fds)
goto fail;
poll_fds = new_fds;
memset(&poll_fds[0] + current_array_size, 0,
sizeof(struct pollfd ) * (newsize-current_array_size));
current_array_size = newsize;
}
poll_fds[nr_fds].fd = fd;
poll_fds[nr_fds].events = events;
ret = nr_fds;
nr_fds++;
return ret;
fail:
syslog(LOG_ERR, "realloc failed, ignoring fd %d\n", fd);
return -1;
}
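/*
 * Rebuild the poll_fds array for the next poll() call and calculate the
 * poll timeout from delayed requests and per-connection deadlines.
 */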
static void initialize_fds(int *ptimeout)
{
struct connection *conn;
uint64_t msecs;
if (poll_fds)
memset(poll_fds, 0, sizeof(struct pollfd) * current_array_size);
nr_fds = 0;
/* In case of delayed requests pause for max 1 second. */
*ptimeout = delayed_requests ? 1000 : -1;
set_special_fds();
if (xce_handle != NULL)
xce_pollfd_idx = set_fd(xenevtchn_fd(xce_handle),
POLLIN|POLLPRI);
msecs = get_now_msec();
wrl_log_periodic(msecs);
list_for_each_entry(conn, &connections, list) {
if (conn->domain) {
wrl_check_timeout(conn->domain, msecs, ptimeout);
check_event_timeout(conn, msecs, ptimeout);
if (conn_can_read(conn) ||
(conn_can_write(conn) &&
!list_empty(&conn->out_list)))
*ptimeout = 0;
} else {
short events = POLLIN|POLLPRI;
if (!list_empty(&conn->out_list))
events |= POLLOUT;
conn->pollfd_idx = set_fd(conn->fd, events);
/*
* For stalled connection, we want to process the
* pending command as soon as live-update has aborted.
*/
if (conn->is_stalled && !lu_is_pending())
*ptimeout = 0;
}
}
}
static size_t calc_node_acc_size(const struct node_hdr *hdr)
{
return sizeof(*hdr) + hdr->num_perms * sizeof(struct xs_permissions) +
hdr->datalen + hdr->childlen;
}
const struct node_hdr *db_fetch(const char *db_name, size_t *size)
{
const struct node_hdr *hdr;
hdr = hashtable_search(nodes, db_name);
if (!hdr) {
errno = ENOENT;
return NULL;
}
*size = calc_node_acc_size(hdr);
trace_tdb("read %s size %zu\n", db_name, *size + strlen(db_name));
return hdr;
}
static const struct xs_permissions *perms_from_node_hdr(
const struct node_hdr *hdr)
{
return (const struct xs_permissions *)(hdr + 1);
}
static void get_acc_data(const char *name, struct node_account_data *acc)
{
size_t size;
const struct node_hdr *hdr;
if (acc->memory < 0) {
hdr = db_fetch(name, &size);
/* No check for error, as the node might not exist. */
if (hdr == NULL) {
acc->memory = 0;
} else {
acc->memory = size;
acc->domid = perms_from_node_hdr(hdr)->id;
}
}
}
/*
 * Per-transaction nodes need to be accounted to the transaction owner.
 * Those nodes are stored in the data base with the transaction generation
 * count prepended (e.g. 123/local/domain/...), so it is sufficient to test
 * whether the node's key starts with something other than "/" or "@".
 */
static unsigned int get_acc_domid(struct connection *conn, const char *name,
unsigned int domid)
{
return (!conn || name[0] == '/' || name[0] == '@') ? domid : conn->id;
}
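/*
 * Store a serialized node (header, permissions, data, children) in the
 * nodes hashtable under db_name, adjusting the per-domain memory
 * accounting for both the old and the new owner.
 */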
int db_write(struct connection *conn, const char *db_name, void *data,
size_t size, struct node_account_data *acc,
enum write_node_mode mode, bool no_quota_check)
{
const struct node_hdr *hdr = data;
struct node_account_data old_acc = {};
unsigned int old_domid, new_domid;
size_t name_len = strlen(db_name);
const char *name;
int ret;
if (!acc)
old_acc.memory = -1;
else
old_acc = *acc;
get_acc_data(db_name, &old_acc);
old_domid = get_acc_domid(conn, db_name, old_acc.domid);
new_domid = get_acc_domid(conn, db_name, perms_from_node_hdr(hdr)->id);
/*
* Don't check for ENOENT, as we want to be able to switch orphaned
* nodes to new owners.
*/
if (old_acc.memory)
domain_memory_add_nochk(conn, old_domid,
-old_acc.memory - name_len);
ret = domain_memory_add(conn, new_domid, size + name_len,
no_quota_check);
if (ret) {
/* Error path, so no quota check. */
if (old_acc.memory)
domain_memory_add_nochk(conn, old_domid,
old_acc.memory + name_len);
return ret;
}
if (mode == NODE_CREATE) {
/* db_name could be modified later, so allocate a copy. */
name = talloc_strdup(data, db_name);
ret = name ? hashtable_add(nodes, name, data) : ENOMEM;
} else
ret = hashtable_replace(nodes, db_name, data);
if (ret) {
/* Free data, as it isn't owned by the hashtable now. */
talloc_free(data);
domain_memory_add_nochk(conn, new_domid, -size - name_len);
/* Error path, so no quota check. */
if (old_acc.memory)
domain_memory_add_nochk(conn, old_domid,
old_acc.memory + name_len);
errno = ret;
return errno;
}
trace_tdb("store %s size %zu\n", db_name, size + name_len);
if (acc) {
/* Don't use new_domid, as it might be a transaction node. */
acc->domid = perms_from_node_hdr(hdr)->id;
acc->memory = size;
}
return 0;
}
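/*
 * Remove a node from the nodes hashtable and subtract its size from the
 * owning domain's memory accounting.
 */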
void db_delete(struct connection *conn, const char *name,
struct node_account_data *acc)
{
struct node_account_data tmp_acc;
unsigned int domid;
if (!acc) {
acc = &tmp_acc;
acc->memory = -1;
}
get_acc_data(name, acc);
hashtable_remove(nodes, name);
trace_tdb("delete %s\n", name);
if (acc->memory) {
domid = get_acc_domid(conn, name, acc->domid);
domain_memory_add_nochk(conn, domid,
-acc->memory - strlen(name));
}
}
/*
* If it fails, returns NULL and sets errno.
* Temporary memory allocations will be done with ctx.
*/
static struct node *read_node_alloc(struct connection *conn, const void *ctx,
const char *name,
const struct node_hdr **hdr)
{
size_t size;
struct node *node;
const char *db_name;
int err;
node = talloc(ctx, struct node);
if (!node) {
errno = ENOMEM;
return NULL;
}
node->name = talloc_strdup(node, name);
if (!node->name) {
errno = ENOMEM;
goto error;
}
db_name = transaction_prepend(conn, name);
*hdr = db_fetch(db_name, &size);
if (*hdr == NULL) {
node->hdr.generation = NO_GENERATION;
err = access_node(conn, node, NODE_ACCESS_READ, NULL);
errno = err ? : ENOENT;
goto error;
}
node->parent = NULL;
/* Datalen, childlen, number of permissions */
node->hdr = **hdr;
node->acc.domid = perms_from_node_hdr(*hdr)->id;
node->acc.memory = size;
return node;
error:
talloc_free(node);
return NULL;
}
static bool read_node_helper(struct connection *conn, struct node *node)
{
/* Data is binary blob (usually ascii, no nul). */
node->data = node->perms + node->hdr.num_perms;
/* Children are nul-separated strings. */
node->children = node->data + node->hdr.datalen;
if (domain_adjust_node_perms(node))
return false;
/* If owner is gone reset currently accounted memory size. */
if (node->acc.domid != get_node_owner(node))
node->acc.memory = 0;
if (access_node(conn, node, NODE_ACCESS_READ, NULL))
return false;
return true;
}
struct node *read_node(struct connection *conn, const void *ctx,
const char *name)
{
size_t size;
const struct node_hdr *hdr;
struct node *node;
node = read_node_alloc(conn, ctx, name, &hdr);
if (!node)
return NULL;
/* Copy node data to new memory area, starting with permissions. */
size = node->acc.memory - sizeof(*hdr);
node->perms = talloc_memdup(node, perms_from_node_hdr(hdr), size);
if (node->perms == NULL) {
errno = ENOMEM;
goto error;
}
if (!read_node_helper(conn, node))
goto error;
return node;
error:
talloc_free(node);
return NULL;
}
const struct node *read_node_const(struct connection *conn, const void *ctx,
const char *name)
{
const struct node_hdr *hdr;
struct node *node;
node = read_node_alloc(conn, ctx, name, &hdr);
if (!node)
return NULL;
/* Unfortunately node->perms isn't const. */
node->perms = (void *)perms_from_node_hdr(hdr);
if (!read_node_helper(conn, node))
goto error;
return node;
error:
talloc_free(node);
return NULL;
}
static bool read_node_can_propagate_errno(void)
{
/*
* 2 error cases for read_node() can always be propagated up:
* ENOMEM, because this has nothing to do with the node being in the
* data base or not, but is caused by a general lack of memory.
* ENOSPC, because this is related to hitting quota limits which need
* to be respected.
*/
return errno == ENOMEM || errno == ENOSPC;
}
int write_node_raw(struct connection *conn, const char *db_name,
struct node *node, enum write_node_mode mode,
bool no_quota_check)
{
void *data;
size_t size;
void *p;
struct node_hdr *hdr;
if (domain_adjust_node_perms(node))
return errno;
size = calc_node_acc_size(&node->hdr);
/* Call domain_max_chk() in any case in order to record max values. */
if (domain_max_chk(conn, ACC_NODESZ, size) && !no_quota_check) {
errno = ENOSPC;
return errno;
}
data = talloc_size(node, size);
if (!data) {
errno = ENOMEM;
return errno;
}
BUILD_BUG_ON(XENSTORE_PAYLOAD_MAX >= (typeof(hdr->datalen))(-1));
hdr = data;
*hdr = node->hdr;
/* Open code perms_from_node_hdr() for the non-const case. */
p = hdr + 1;
memcpy(p, node->perms, node->hdr.num_perms * sizeof(*node->perms));
p += node->hdr.num_perms * sizeof(*node->perms);
memcpy(p, node->data, node->hdr.datalen);
p += node->hdr.datalen;
memcpy(p, node->children, node->hdr.childlen);
if (db_write(conn, db_name, data, size, &node->acc, mode,
no_quota_check))
return EIO;
return 0;
}
/*
* Write the node. If the node is written, caller can find the DB name used in
* node->db_name. This can later be used if the change needs to be reverted.
*/
static int write_node(struct connection *conn, struct node *node,
enum write_node_mode mode, bool no_quota_check)
{
int ret;
if (access_node(conn, node, NODE_ACCESS_WRITE, &node->db_name))
return errno;
ret = write_node_raw(conn, node->db_name, node, mode, no_quota_check);
if (ret && conn && conn->transaction) {
/*
* Reverting access_node() is hard, so just fail the
* transaction.
*/
fail_transaction(conn->transaction);
}
return ret;
}
unsigned int perm_for_conn(struct connection *conn,
const struct node_perms *perms)
{
unsigned int i;
unsigned int mask = XS_PERM_READ|XS_PERM_WRITE|XS_PERM_OWNER;
/* Owners and tools get it all... */
if (!domain_is_unprivileged(conn) || perms->p[0].id == conn->id
|| (conn->target && perms->p[0].id == conn->target->id))
return (XS_PERM_READ|XS_PERM_WRITE|XS_PERM_OWNER) & mask;
for (i = 1; i < perms->num; i++)
if (!(perms->p[i].perms & XS_PERM_IGNORE) &&
(perms->p[i].id == conn->id ||
(conn->target && perms->p[i].id == conn->target->id)))
return perms->p[i].perms & mask;
return perms->p[0].perms & mask;
}
/*
* Get name of node parent.
* Temporary memory allocations are done with ctx.
*/
char *get_parent(const void *ctx, const char *node)
{
char *parent;
char *slash = strrchr(node + 1, '/');
parent = slash ? talloc_asprintf(ctx, "%.*s", (int)(slash - node), node)
: talloc_strdup(ctx, "/");
if (!parent)
errno = ENOMEM;
return parent;
}
/*
* What do parents say?
* Temporary memory allocations are done with ctx.
*/
static int ask_parents(struct connection *conn, const void *ctx,
const char *name, unsigned int *perm)
{
const struct node *node;
do {
name = get_parent(ctx, name);
if (!name)
return errno;
node = read_node_const(conn, ctx, name);
if (node)
break;
if (read_node_can_propagate_errno())
return errno;
} while (!streq(name, "/"));
/* No permission at root? We're in trouble. */
if (!node) {
corrupt(conn, "No permissions file at root");
*perm = XS_PERM_NONE;
return 0;
}
*perm = perm_for_conn_from_node(conn, node);
return 0;
}
/*
* We have a weird permissions system. You can allow someone into a
* specific node without allowing it in the parents. If it's going to
* fail, however, we don't want the errno to indicate any information
* about the node.
* Temporary memory allocations are done with ctx.
*/
static int errno_from_parents(struct connection *conn, const void *ctx,
const char *node, int errnum, unsigned int perm)
{
unsigned int parent_perm = XS_PERM_NONE;
/* We always tell them about memory failures. */
if (errnum == ENOMEM)
return errnum;
if (ask_parents(conn, ctx, node, &parent_perm))
return errno;
if (parent_perm & perm)
return errnum;
return EACCES;
}
/*
 * If the permission check fails, returns false and sets errno.
 * Temporary memory allocations are done with ctx.
 */
static bool get_node_chk_perm(struct connection *conn, const void *ctx,
const struct node *node, const char *name,
unsigned int perm)
{
bool success = node;
/* If we don't have permission, we don't have the node. */
if (node && (perm_for_conn_from_node(conn, node) & perm) != perm) {
errno = EACCES;
success = false;
}
/* Clean up errno if they weren't supposed to know. */
if (!success && !read_node_can_propagate_errno())
errno = errno_from_parents(conn, ctx, name, errno, perm);
return success;
}
static struct buffered_data *new_buffer(void *ctx)
{
struct buffered_data *data;
data = talloc_zero(ctx, struct buffered_data);
if (data == NULL)
return NULL;
data->inhdr = true;
return data;
}
/* Return length of string (including nul) at this offset.
* If there is no nul, returns 0 for failure.
*/
unsigned int get_string(const struct buffered_data *data, unsigned int offset)
{
const char *nul;
if (offset >= data->used)
return 0;
nul = memchr(data->buffer + offset, 0, data->used - offset);
if (!nul)
return 0;
return nul - (data->buffer + offset) + 1;
}
/* Break input into vectors, return the number, fill in up to num of them.
* Always returns the actual number of nuls in the input. Stores the
* positions of the starts of the nul-terminated strings in vec.
* Callers who use this and then rely only on vec[] will
* ignore any data after the final nul.
*/
unsigned int get_strings(struct buffered_data *data,
const char *vec[], unsigned int num)
{
unsigned int off, i, len;
off = i = 0;
while ((len = get_string(data, off)) != 0) {
if (i < num)
vec[i] = data->buffer + off;
i++;
off += len;
}
return i;
}
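/* Send an XS_ERROR reply carrying the string name of the passed errno value. */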
static void send_error(struct connection *conn, int error)
{
unsigned int i;
for (i = 0; error != xsd_errors[i].errnum; i++) {
if (i == ARRAY_SIZE(xsd_errors) - 1) {
eprintf("xenstored: error %i untranslatable", error);
i = 0; /* EINVAL */
break;
}
}
acc_drop(conn);
send_reply(conn, XS_ERROR, xsd_errors[i].errstring,
strlen(xsd_errors[i].errstring) + 1);
}
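/*
 * Queue a reply for the currently processed request. The connection's
 * input buffer (conn->in) is reused as the outgoing buffer and appended
 * to conn->out_list for later transmission.
 */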
void send_reply(struct connection *conn, enum xsd_sockmsg_type type,
const void *data, unsigned int len)
{
struct buffered_data *bdata = conn->in;
assert(type != XS_WATCH_EVENT);
/* Commit accounting now, as later errors won't undo any changes. */
acc_commit(conn);
if ( len > XENSTORE_PAYLOAD_MAX ) {
send_error(conn, E2BIG);
return;
}
if (!bdata)
return;
bdata->inhdr = true;
bdata->used = 0;
bdata->timeout_msec = 0;
bdata->watch_event = false;
if (len <= DEFAULT_BUFFER_SIZE) {
bdata->buffer = bdata->default_buffer;
/* Don't check quota, path might be used for returning error. */
domain_memory_add_nochk(conn, conn->id,
len + sizeof(bdata->hdr));
} else {
bdata->buffer = talloc_array(bdata, char, len);
if (!bdata->buffer ||
domain_memory_add_chk(conn, conn->id,
len + sizeof(bdata->hdr))) {
send_error(conn, ENOMEM);
return;
}
}
conn->in = NULL;
/* Update relevant header fields and fill in the message body. */
bdata->hdr.msg.type = type;
bdata->hdr.msg.len = len;
memcpy(bdata->buffer, data, len);
/* Queue for later transmission. */
list_add_tail(&bdata->list, &conn->out_list);
bdata->on_out_list = true;
domain_outstanding_inc(conn);
}
/*
* Send a watch event.
* As this is not directly related to the current command, errors can't be
* reported.
*/
void send_event(struct buffered_data *req, struct connection *conn,
const char *path, const char *token)
{
struct buffered_data *bdata, *bd;
unsigned int len;
len = strlen(path) + 1 + strlen(token) + 1;
/* Don't try to send over-long events. */
if (len > XENSTORE_PAYLOAD_MAX)
return;
bdata = new_buffer(conn);
if (!bdata)
return;
bdata->buffer = talloc_array(bdata, char, len);
if (!bdata->buffer) {
talloc_free(bdata);
return;
}
strcpy(bdata->buffer, path);
strcpy(bdata->buffer + strlen(path) + 1, token);
bdata->hdr.msg.type = XS_WATCH_EVENT;
bdata->hdr.msg.len = len;
/*
* Check whether an identical event is pending already.
* Special events are excluded from that check.
*/
if (path[0] != '@') {
list_for_each_entry(bd, &conn->out_list, list) {
if (bd->watch_event && bd->hdr.msg.len == len &&
!memcmp(bdata->buffer, bd->buffer, len)) {
trace("dropping duplicate watch %s %s for domain %u\n",
path, token, conn->id);
talloc_free(bdata);
return;
}
}
}
if (domain_memory_add_chk(conn, conn->id, len + sizeof(bdata->hdr))) {
talloc_free(bdata);
return;
}
if (timeout_watch_event_msec && domain_is_unprivileged(conn)) {
bdata->timeout_msec = get_now_msec() + timeout_watch_event_msec;
if (!conn->timeout_msec)
conn->timeout_msec = bdata->timeout_msec;
}
bdata->watch_event = true;
bdata->pend.req = req;
if (req)
req->pend.ref.event_cnt++;
/* Queue for later transmission. */
list_add_tail(&bdata->list, &conn->out_list);
bdata->on_out_list = true;
}
/* Some routines (write, mkdir, etc) just need a non-error return */
void send_ack(struct connection *conn, enum xsd_sockmsg_type type)
{
send_reply(conn, type, "OK", sizeof("OK"));
}
static bool valid_chars(const char *node)
{
/* Nodes can have lots of crap. */
return (strspn(node,
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789-/_@") == strlen(node));
}
/* We expect one arg in the input: return NULL otherwise.
* The payload must contain exactly one nul, at the end.
*/
const char *onearg(struct buffered_data *in)
{
if (!in->used || get_string(in, 0) != in->used)
return NULL;
return in->buffer;
}
static char *node_perms_to_strings(const struct node *node, unsigned int *len)
{
unsigned int i;
char *strings = NULL;
char buffer[MAX_STRLEN(unsigned int) + 1];
for (*len = 0, i = 0; i < node->hdr.num_perms; i++) {
if (!xenstore_perm_to_string(&node->perms[i], buffer,
sizeof(buffer)))
return NULL;
strings = talloc_realloc(node, strings, char,
*len + strlen(buffer) + 1);
if (!strings)
return NULL;
strcpy(strings + *len, buffer);
*len += strlen(buffer) + 1;
}
return strings;
}
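/*
 * Turn a client supplied node name into a canonical absolute path:
 * relative names get the connection's implicit path prepended, special
 * "@..." names are only allowed if allow_special is set, and the result
 * is validated (character set, no trailing or double slashes, length
 * quota). Returns NULL on failure.
 */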
const char *canonicalize(struct connection *conn, const void *ctx,
const char *node, bool allow_special)
{
const char *name;
int local_off = 0;
unsigned int domid;
/*
* Invalid if any of:
* - no node at all
* - illegal character in node
* - starts with '@' but no special node allowed
*/
errno = EINVAL;
if (!node ||
!valid_chars(node) ||
(node[0] == '@' && !allow_special))
return NULL;
if (node[0] != '/' && node[0] != '@') {
name = talloc_asprintf(ctx, "%s/%s", get_implicit_path(conn),
node);
if (!name)
return NULL;
} else
name = node;
if (sscanf(name, "/local/domain/%5u/%n", &domid, &local_off) != 1)
local_off = 0;
/*
* Only valid if:
* - doesn't end in / (unless it's just "/")
* - no double //
* - not violating max allowed path length
*/
if (!(strends(name, "/") && !streq(name, "/")) &&
!strstr(name, "//") &&
!domain_max_chk(conn, ACC_PATHLEN, strlen(name) - local_off))
return name;
/* Release the memory if 'name' was allocated by us. */
if (name != node)
talloc_free(name);
return NULL;
}
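/*
 * Get a modifiable copy of a node after checking access permissions.
 * The canonicalized name is returned via *canonical_name even if the node
 * doesn't exist.
 */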
static struct node *get_node(struct connection *conn, const void *ctx,
const char *name, const char **canonical_name,
unsigned int perm, bool allow_special)
{
struct node *node;
*canonical_name = canonicalize(conn, ctx, name, allow_special);
if (!*canonical_name)
return NULL;
node = read_node(conn, ctx, *canonical_name);
return get_node_chk_perm(conn, ctx, node, *canonical_name, perm)
? node : NULL;
}
static const struct node *get_node_const(struct connection *conn,
const void *ctx, const char *name,
unsigned int perm, bool allow_special)
{
const char *tmp_name;
const struct node *node;
tmp_name = canonicalize(conn, ctx, name, allow_special);
if (!tmp_name)
return NULL;
node = read_node_const(conn, ctx, tmp_name);
return get_node_chk_perm(conn, ctx, node, tmp_name, perm) ? node : NULL;
}
static int send_directory(const void *ctx, struct connection *conn,
struct buffered_data *in)
{
const struct node *node;
node = get_node_const(conn, ctx, onearg(in), XS_PERM_READ, false);
if (!node)
return errno;
send_reply(conn, XS_DIRECTORY, node->children, node->hdr.childlen);
return 0;
}
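/*
 * XS_DIRECTORY_PART: return the generation count of the node followed by
 * as many child names as fit into one payload, starting at the requested
 * offset. The list is terminated with an empty string once the final
 * child has been sent.
 */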
static int send_directory_part(const void *ctx, struct connection *conn,
struct buffered_data *in)
{
unsigned int off, len, maxlen, genlen;
char *child, *data;
const struct node *node;
char gen[24];
if (xenstore_count_strings(in->buffer, in->used) != 2)
return EINVAL;
/* First arg is node name. */
node = get_node_const(conn, ctx, in->buffer, XS_PERM_READ, false);
if (!node)
return errno;
/* Second arg is childlist offset. */
off = atoi(in->buffer + strlen(in->buffer) + 1);
genlen = snprintf(gen, sizeof(gen), "%"PRIu64, node->hdr.generation) +
1;
/* Offset behind list: just return a list with an empty string. */
if (off >= node->hdr.childlen) {
gen[genlen] = 0;
send_reply(conn, XS_DIRECTORY_PART, gen, genlen + 1);
return 0;
}
len = 0;
maxlen = XENSTORE_PAYLOAD_MAX - genlen - 1;
child = node->children + off;
while (len + strlen(child) < maxlen) {
len += strlen(child) + 1;
child += strlen(child) + 1;
if (off + len == node->hdr.childlen)
break;
}
data = talloc_array(ctx, char, genlen + len + 1);
if (!data)
return ENOMEM;
memcpy(data, gen, genlen);
memcpy(data + genlen, node->children + off, len);
if (off + len == node->hdr.childlen) {
data[genlen + len] = 0;
len++;
}
send_reply(conn, XS_DIRECTORY_PART, data, genlen + len);
return 0;
}
static int do_read(const void *ctx, struct connection *conn,
struct buffered_data *in)
{
const struct node *node;
node = get_node_const(conn, ctx, onearg(in), XS_PERM_READ, false);
if (!node)
return errno;
send_reply(conn, XS_READ, node->data, node->hdr.datalen);
return 0;
}
/* Must not be / */
static char *basename(const char *name)
{
return strrchr(name, '/') + 1;
}
static int add_child(const void *ctx, struct node *parent, const char *name)
{
const char *base;
unsigned int baselen;
char *children;
base = basename(name);
baselen = strlen(base) + 1;
children = talloc_array(ctx, char, parent->hdr.childlen + baselen);
if (!children)
return ENOMEM;
memcpy(children, parent->children, parent->hdr.childlen);
memcpy(children + parent->hdr.childlen, base, baselen);
parent->children = children;
parent->hdr.childlen += baselen;
return 0;
}
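/*
 * Build the node "name" in memory, creating node structures for any
 * missing parents on the way up. Nothing is written to the data base
 * here; that is left to the caller.
 */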
static struct node *construct_node(struct connection *conn, const void *ctx,
const char *name)
{
const char **names = NULL;
unsigned int levels = 0;
struct node *node = NULL;
struct node *parent = NULL;
const char *parentname = talloc_strdup(ctx, name);
if (!parentname)
return NULL;
/* Walk the path up until an existing node is found. */
while (!parent) {
names = talloc_realloc(ctx, names, const char *, levels + 1);
if (!names)
goto nomem;
/*
* names[0] is the name of the node to construct initially,
* names[1] is its parent, and so on.
*/
names[levels] = parentname;
parentname = get_parent(ctx, parentname);
if (!parentname)
return NULL;
/* Try to read the parent node until we find an existing one. */
parent = read_node(conn, ctx, parentname);
if (!parent && (errno != ENOENT || !strcmp(parentname, "/")))
return NULL;
levels++;
}
/* Walk the path down again constructing the missing nodes. */
for (; levels > 0; levels--) {
/* Add child to parent. */
if (add_child(ctx, parent, names[levels - 1]))
goto nomem;
/* Allocate node */
node = talloc(ctx, struct node);
if (!node)
goto nomem;
node->name = talloc_steal(node, names[levels - 1]);
/* Inherit permissions, unpriv domains own what they create. */
node->hdr.num_perms = parent->hdr.num_perms;
node->perms = talloc_memdup(node, parent->perms,
node->hdr.num_perms *
sizeof(*node->perms));
if (!node->perms)
goto nomem;
if (domain_is_unprivileged(conn))
node->perms[0].id = conn->id;
/* No children, no data */
node->children = node->data = NULL;
node->hdr.childlen = node->hdr.datalen = 0;
node->acc.memory = 0;
node->parent = parent;
parent = node;
}
return node;
nomem:
errno = ENOMEM;
return NULL;
}
static void destroy_node_rm(struct connection *conn, struct node *node)
{
if (streq(node->name, "/"))
corrupt(NULL, "Destroying root node!");
db_delete(conn, node->db_name, &node->acc);
}
static int destroy_node(struct connection *conn, struct node *node)
{
destroy_node_rm(conn, node);
/*
* It is not possible to easily revert the changes in a transaction.
* So if the failure happens in a transaction, mark it as fail to
* prevent any commit.
*/
if ( conn->transaction )
fail_transaction(conn->transaction);
return 0;
}
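/*
 * Create a new node (and any missing parent nodes) and write all newly
 * created nodes to the data base, enforcing the per-domain node quota.
 * On failure the nodes written so far are removed again.
 */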
static struct node *create_node(struct connection *conn, const void *ctx,
const char *name,
void *data, unsigned int datalen)
{
struct node *node, *i, *j;
int ret;
node = construct_node(conn, ctx, name);
if (!node)
return NULL;
if (conn && conn->transaction)
ta_node_created(conn->transaction);
node->data = data;
node->hdr.datalen = datalen;
/*
 * We write out the nodes bottom up.
 * All newly created nodes will have i->parent set, while the final
 * node already exists and won't have i->parent set.
 * New nodes are subject to quota handling.
 */
for (i = node; i; i = i->parent) {
/* i->parent is set for each new node, so check quota. */
if (i->parent &&
domain_nbentry(conn) >= hard_quotas[ACC_NODES].val) {
ret = ENOSPC;
goto err;
}
ret = write_node(conn, i, i->parent ? NODE_CREATE : NODE_MODIFY,
false);
if (ret)
goto err;
/* Account for new node */
if (i->parent) {
if (domain_nbentry_inc(conn, get_node_owner(i))) {
destroy_node_rm(conn, i);
return NULL;
}
}
}
return node;
err:
/*
 * We failed to update the data base for some of the nodes. Undo any
 * work that has already been done.
 */
for (j = node; j != i; j = j->parent)
destroy_node(conn, j);
/* We don't need to keep the nodes around, so free them. */
i = node;
while (i) {
j = i;
i = i->parent;
talloc_free(j);
}
errno = ret;
return NULL;
}
/* path, data... */
static int do_write(const void *ctx, struct connection *conn,
struct buffered_data *in)
{
unsigned int offset, datalen;
struct node *node;
const char *vec[1] = { NULL }; /* gcc4 + -W + -Werror fucks code. */
const char *name;
/* Extra "strings" can be created by binary data. */
if (get_strings(in, vec, ARRAY_SIZE(vec)) < ARRAY_SIZE(vec))
return EINVAL;
offset = strlen(vec[0]) + 1;
datalen = in->used - offset;
node = get_node(conn, ctx, vec[0], &name, XS_PERM_WRITE, false);
if (!node) {
/* No permissions, invalid input? */
if (errno != ENOENT)
return errno;
node = create_node(conn, ctx, name, in->buffer + offset,
datalen);
if (!node)
return errno;
} else {
node->data = in->buffer + offset;
node->hdr.datalen = datalen;
if (write_node(conn, node, NODE_MODIFY, false))
return errno;
}
fire_watches(conn, ctx, name, node, false, NULL);
send_ack(conn, XS_WRITE);
return 0;
}
static int do_mkdir(const void *ctx, struct connection *conn,
struct buffered_data *in)
{
struct node *node;
const char *name;
node = get_node(conn, ctx, onearg(in), &name, XS_PERM_WRITE, false);
/* If it already exists, fine. */
if (!node) {
/* No permissions? */
if (errno != ENOENT)
return errno;
if (!name)
return ENOMEM;
node = create_node(conn, ctx, name, NULL, 0);
if (!node)
return errno;
fire_watches(conn, ctx, name, node, false, NULL);
}
send_ack(conn, XS_MKDIR);
return 0;
}
/* Delete memory using memmove. */
static void memdel(void *mem, unsigned off, unsigned len, unsigned total)
{
memmove(mem + off, mem + off + len, total - off - len);
}
static int remove_child_entry(struct connection *conn, struct node *node,
size_t offset)
{
size_t childlen = strlen(node->children + offset);
memdel(node->children, offset, childlen + 1, node->hdr.childlen);
node->hdr.childlen -= childlen + 1;
return write_node(conn, node, NODE_MODIFY, true);
}
static int delete_child(struct connection *conn,
struct node *node, const char *childname)
{
unsigned int i;
for (i = 0; i < node->hdr.childlen;
i += strlen(node->children + i) + 1) {
if (streq(node->children + i, childname)) {
errno = remove_child_entry(conn, node, i) ? EIO : 0;
return errno;
}
}
corrupt(conn, "Can't find child '%s' in %s", childname, node->name);
errno = EIO;
return errno;
}
static int delnode_sub(const void *ctx, struct connection *conn,
struct node *node, void *arg)
{
const char *root = arg;
bool watch_exact;
int ret;
const char *db_name;
/* Any error here will probably be repeated for all following calls. */
ret = access_node(conn, node, NODE_ACCESS_DELETE, &db_name);
if (ret > 0)
return WALK_TREE_SUCCESS_STOP;
if (domain_nbentry_dec(conn, get_node_owner(node)))
return WALK_TREE_ERROR_STOP;
if (!ret)
db_delete(conn, db_name, &node->acc);
/*
 * Fire the watches now, while we can still see the node permissions.
 * This is fine as we are single threaded and the next possible read
 * will be handled only after the node has really been removed.
 */
watch_exact = strcmp(root, node->name);
fire_watches(conn, ctx, node->name, node, watch_exact, NULL);
return WALK_TREE_RM_CHILDENTRY;
}
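/*
 * Remove the node "name" together with all of its children, firing the
 * appropriate watches and removing the child entry from the parent node.
 */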
int rm_node(struct connection *conn, const void *ctx, const char *name)
{
struct node *parent;
char *parentname = get_parent(ctx, name);
struct walk_funcs walkfuncs = { .exit = delnode_sub };
int ret;
if (!parentname)
return errno;
parent = read_node(conn, ctx, parentname);
if (!parent)
return read_node_can_propagate_errno() ? errno : EINVAL;
ret = walk_node_tree(ctx, conn, name, &walkfuncs, (void *)name);
if (ret < 0) {
if (ret == WALK_TREE_ERROR_STOP) {
/*
* This can't be triggered by an unprivileged guest,
* so calling corrupt() is fine here.
* In fact it is needed in order to fix a potential
* accounting inconsistency.
*/
corrupt(conn, "error when deleting sub-nodes of %s\n",
name);
errno = EIO;
}
return errno;
}
if (delete_child(conn, parent, basename(name)))
return errno;
return 0;
}
static int do_rm(const void *ctx, struct connection *conn,
struct buffered_data *in)
{
struct node *node;
int ret;
const char *name;
char *parentname;
node = get_node(conn, ctx, onearg(in), &name, XS_PERM_WRITE, false);
if (!node) {
/* Didn't exist already? Fine, if parent exists. */
if (errno == ENOENT) {
if (!name)
return ENOMEM;
parentname = get_parent(ctx, name);
if (!parentname)
return errno;
node = read_node(conn, ctx, parentname);
if (node) {
send_ack(conn, XS_RM);
return 0;
}
/* Restore errno, just in case. */
if (!read_node_can_propagate_errno())
errno = ENOENT;
}
return errno;
}
if (streq(name, "/"))
return EINVAL;
ret = rm_node(conn, ctx, name);
if (ret)
return ret;
send_ack(conn, XS_RM);
return 0;
}
static int do_get_perms(const void *ctx, struct connection *conn,
struct buffered_data *in)
{
const struct node *node;
char *strings;
unsigned int len;
node = get_node_const(conn, ctx, onearg(in), XS_PERM_READ, true);
if (!node)
return errno;
strings = node_perms_to_strings(node, &len);
if (!strings)
return errno;
send_reply(conn, XS_GET_PERMS, strings, len);
return 0;
}
static int do_set_perms(const void *ctx, struct connection *conn,
struct buffered_data *in)
{
struct node_perms perms, old_perms;
const char *name;
char *permstr;
struct node *node;
perms.num = xenstore_count_strings(in->buffer, in->used);
if (perms.num < 2)
return EINVAL;
perms.num--;
if (domain_max_chk(conn, ACC_NPERM, perms.num))
return ENOSPC;
permstr = in->buffer + strlen(in->buffer) + 1;
perms.p = talloc_array(ctx, struct xs_permissions, perms.num);
if (!perms.p)
return ENOMEM;
if (!xenstore_strings_to_perms(perms.p, perms.num, permstr))
return errno;
if (domain_alloc_permrefs(&perms))
return ENOMEM;
if (perms.p[0].perms & XS_PERM_IGNORE)
return ENOENT;
/* We must own node to do this (tools can do this too). */
node = get_node(conn, ctx, in->buffer, &name,
XS_PERM_WRITE | XS_PERM_OWNER, true);
if (!node)
return errno;
/* Unprivileged domains may not change the owner. */
if (domain_is_unprivileged(conn) &&
perms.p[0].id != get_node_owner(node))
return EPERM;
node_to_node_perms(node, &old_perms);
if (domain_nbentry_dec(conn, get_node_owner(node)))
return ENOMEM;
node_perms_to_node(&perms, node);
if (domain_nbentry_inc(conn, get_node_owner(node)))
return ENOMEM;
if (write_node(conn, node, NODE_MODIFY, false))
return errno;
fire_watches(conn, ctx, name, node, false, &old_perms);
send_ack(conn, XS_SET_PERMS);
return 0;
}
static char *child_name(const void *ctx, const char *s1, const char *s2)
{
if (strcmp(s1, "/"))
return talloc_asprintf(ctx, "%s/%s", s1, s2);
return talloc_asprintf(ctx, "/%s", s2);
}
static int rm_from_parent(struct connection *conn, struct node *parent,
const char *name)
{
size_t off;
if (!parent)
return WALK_TREE_ERROR_STOP;
for (off = parent->childoff - 1; off && parent->children[off - 1];
off--);
if (remove_child_entry(conn, parent, off)) {
log("treewalk: child entry could not be removed from '%s'",
parent->name);
return WALK_TREE_ERROR_STOP;
}
parent->childoff = off;
return WALK_TREE_OK;
}
static int walk_call_func(const void *ctx, struct connection *conn,
struct node *node, struct node *parent, void *arg,
int (*func)(const void *ctx, struct connection *conn,
struct node *node, void *arg))
{
int ret;
if (!func)
return WALK_TREE_OK;
ret = func(ctx, conn, node, arg);
if (ret == WALK_TREE_RM_CHILDENTRY && parent)
ret = rm_from_parent(conn, parent, node->name);
return ret;
}
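/*
 * Walk the node tree below "root" iteratively (no recursion), calling
 * funcs->enter before and funcs->exit after processing a node's children.
 * node->childoff is used as the cursor into the children string list.
 * A callback returning a value > 0 skips the children of the current
 * node, while a value < 0 aborts the walk.
 */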
int walk_node_tree(const void *ctx, struct connection *conn, const char *root,
struct walk_funcs *funcs, void *arg)
{
int ret = 0;
void *tmpctx;
char *name;
struct node *node = NULL;
struct node *parent = NULL;
tmpctx = talloc_new(ctx);
if (!tmpctx) {
errno = ENOMEM;
return WALK_TREE_ERROR_STOP;
}
name = talloc_strdup(tmpctx, root);
if (!name) {
errno = ENOMEM;
talloc_free(tmpctx);
return WALK_TREE_ERROR_STOP;
}
/* Continue the walk until an error is returned. */
while (ret >= 0) {
/* node == NULL possible only for the initial loop iteration. */
if (node) {
/* Go one step up if ret != 0 or if the last child was processed. */
if (ret || node->childoff >= node->hdr.childlen) {
parent = node->parent;
/* Call function AFTER processing a node. */
ret = walk_call_func(ctx, conn, node, parent,
arg, funcs->exit);
/* Last node, so exit loop. */
if (!parent)
break;
talloc_free(node);
/* Continue with parent. */
node = parent;
continue;
}
/* Get next child of current node. */
name = child_name(tmpctx, node->name,
node->children + node->childoff);
if (!name) {
ret = WALK_TREE_ERROR_STOP;
break;
}
/* Point to next child. */
node->childoff += strlen(node->children +
node->childoff) + 1;
/* Descend into the children. */
parent = node;
}
/* Read next node (root node or next child). */
node = read_node(conn, tmpctx, name);
if (!node) {
/* Child not found - should not happen! */
/* ENOENT case can be handled by supplied function. */
if (errno == ENOENT && funcs->enoent)
ret = funcs->enoent(ctx, conn, parent, name,
arg);
else
ret = WALK_TREE_ERROR_STOP;
if (!parent)
break;
if (ret == WALK_TREE_RM_CHILDENTRY)
ret = rm_from_parent(conn, parent, name);
if (ret < 0)
break;
talloc_free(name);
node = parent;
continue;
}
talloc_free(name);
node->parent = parent;
node->childoff = 0;
/* Call function BEFORE processing a node. */
ret = walk_call_func(ctx, conn, node, parent, arg,
funcs->enter);
}
talloc_free(tmpctx);
return ret < 0 ? ret : WALK_TREE_OK;
}
static struct {
const char *str;
int (*func)(const void *ctx, struct connection *conn,
struct buffered_data *in);
unsigned int flags;
#define XS_FLAG_NOTID (1U << 0) /* Ignore transaction id. */
#define XS_FLAG_PRIV (1U << 1) /* Privileged domain only. */
} const wire_funcs[XS_TYPE_COUNT] = {
[XS_CONTROL] =
{ "CONTROL", do_control, XS_FLAG_PRIV },
[XS_DIRECTORY] = { "DIRECTORY", send_directory },
[XS_READ] = { "READ", do_read },
[XS_GET_PERMS] = { "GET_PERMS", do_get_perms },
[XS_WATCH] =
{ "WATCH", do_watch, XS_FLAG_NOTID },
[XS_UNWATCH] =
{ "UNWATCH", do_unwatch, XS_FLAG_NOTID },
[XS_TRANSACTION_START] = { "TRANSACTION_START", do_transaction_start },
[XS_TRANSACTION_END] = { "TRANSACTION_END", do_transaction_end },
[XS_INTRODUCE] =
{ "INTRODUCE", do_introduce, XS_FLAG_PRIV },
[XS_RELEASE] =
{ "RELEASE", do_release, XS_FLAG_PRIV },
[XS_GET_DOMAIN_PATH] = { "GET_DOMAIN_PATH", do_get_domain_path },
[XS_WRITE] = { "WRITE", do_write },
[XS_MKDIR] = { "MKDIR", do_mkdir },
[XS_RM] = { "RM", do_rm },
[XS_SET_PERMS] = { "SET_PERMS", do_set_perms },
[XS_WATCH_EVENT] = { "WATCH_EVENT", NULL },
[XS_ERROR] = { "ERROR", NULL },
[XS_IS_DOMAIN_INTRODUCED] =
{ "IS_DOMAIN_INTRODUCED", do_is_domain_introduced, XS_FLAG_PRIV },
[XS_RESUME] =
{ "RESUME", do_resume, XS_FLAG_PRIV },
[XS_SET_TARGET] =
{ "SET_TARGET", do_set_target, XS_FLAG_PRIV },
[XS_RESET_WATCHES] = { "RESET_WATCHES", do_reset_watches },
[XS_DIRECTORY_PART] = { "DIRECTORY_PART", send_directory_part },
[XS_GET_FEATURE] =
{ "GET_FEATURE", do_get_feature, XS_FLAG_PRIV },
[XS_SET_FEATURE] =
{ "SET_FEATURE", do_set_feature, XS_FLAG_PRIV },
};
static const char *sockmsg_string(enum xsd_sockmsg_type type)
{
if ((unsigned int)type < ARRAY_SIZE(wire_funcs) && wire_funcs[type].str)
return wire_funcs[type].str;
return "**UNKNOWN**";
}
/* Process "in" for conn: "in" will vanish after this conversation, so
* we can talloc off it for temporary variables. May free "conn".
*/
static void process_message(struct connection *conn, struct buffered_data *in)
{
struct transaction *trans;
enum xsd_sockmsg_type type = in->hdr.msg.type;
int ret;
void *ctx;
/* At least send_error() and send_reply() expect conn->in == in. */
assert(conn->in == in);
trace_io(conn, in, "IN");
if ((unsigned int)type >= XS_TYPE_COUNT || !wire_funcs[type].func) {
eprintf("Client unknown operation %i", type);
send_error(conn, ENOSYS);
return;
}
if ((wire_funcs[type].flags & XS_FLAG_PRIV) &&
domain_is_unprivileged(conn)) {
send_error(conn, EACCES);
return;
}
trans = (wire_funcs[type].flags & XS_FLAG_NOTID)
? NULL : transaction_lookup(conn, in->hdr.msg.tx_id);
if (IS_ERR(trans)) {
send_error(conn, -PTR_ERR(trans));
return;
}
ctx = talloc_new(NULL);
if (!ctx) {
send_error(conn, ENOMEM);
return;
}
assert(conn->transaction == NULL);
conn->transaction = trans;
ret = wire_funcs[type].func(ctx, conn, in);
talloc_free(ctx);
if (ret)
send_error(conn, ret);
conn->transaction = NULL;
}
static bool process_delayed_message(struct delayed_request *req)
{
struct connection *conn = req->data;
struct buffered_data *saved_in = conn->in;
if (lu_is_pending())
return false;
/*
 * Part of process_message() expects conn->in to contain the request
 * being processed. So save the current conn->in and restore it
 * afterwards.
 */
conn->in = req->in;
process_message(req->data, req->in);
conn->in = saved_in;
return true;
}
static void consider_message(struct connection *conn)
{
conn->is_stalled = false;
/*
 * Currently, Live-Update is not supported if there are active
 * transactions. In order to reduce the number of retries, delay any
 * new request to start a transaction if Live-Update is pending and
 * there are no transactions in-flight.
 *
 * If we can't delay the request, then mark the connection as
 * stalled. This will cause new requests to be ignored until
 * Live-Update has happened or been aborted.
 */
if (lu_is_pending() && list_empty(&conn->transaction_list) &&
conn->in->hdr.msg.type == XS_TRANSACTION_START) {
trace("Delaying transaction start for connection %p req_id %u\n",
conn, conn->in->hdr.msg.req_id);
if (delay_request(conn, conn->in, process_delayed_message,
conn, false) != 0) {
trace("Stalling connection %p\n", conn);
conn->is_stalled = true;
}
return;
}
process_message(conn, conn->in);
assert(conn->in == NULL);
}
/*
* Errors in reading or allocating here means we get out of sync, so we mark
* the connection as ignored.
*/
static void handle_input(struct connection *conn)
{
int bytes;
struct buffered_data *in;
unsigned int err;
if (!conn->in) {
conn->in = new_buffer(conn);
/* In case of no memory just try it again next time. */
if (!conn->in)
return;
}
in = conn->in;
in->pend.ref.domid = conn->id;
/* Not finished header yet? */
if (in->inhdr) {
if (in->used != sizeof(in->hdr)) {
bytes = conn->funcs->read(conn, in->hdr.raw + in->used,
sizeof(in->hdr) - in->used);
if (bytes < 0) {
err = XENSTORE_ERROR_RINGIDX;
goto bad_client;
}
in->used += bytes;
if (in->used != sizeof(in->hdr))
return;
/*
* The payload size is not only currently restricted by
* the protocol but also the internal implementation
* (see various BUILD_BUG_ON()).
* Any potential change of the maximum payload size
* needs to be negotiated between the involved parties.
*/
if (in->hdr.msg.len > XENSTORE_PAYLOAD_MAX) {
syslog(LOG_ERR, "Client tried to feed us %i",
in->hdr.msg.len);
err = XENSTORE_ERROR_PROTO;
goto bad_client;
}
}
if (in->hdr.msg.len <= DEFAULT_BUFFER_SIZE)
in->buffer = in->default_buffer;
else
in->buffer = talloc_array(in, char, in->hdr.msg.len);
/* In case of no memory just try it again next time. */
if (!in->buffer)
return;
in->used = 0;
in->inhdr = false;
}
bytes = conn->funcs->read(conn, in->buffer + in->used,
in->hdr.msg.len - in->used);
if (bytes < 0) {
err = XENSTORE_ERROR_RINGIDX;
goto bad_client;
}
in->used += bytes;
if (in->used != in->hdr.msg.len)
return;
consider_message(conn);
return;
bad_client:
ignore_connection(conn, err);
}
static void handle_output(struct connection *conn)
{
/* Ignore the connection if an error occurred. */
if (!write_messages(conn))
ignore_connection(conn, XENSTORE_ERROR_RINGIDX);
}
struct connection *new_connection(const struct interface_funcs *funcs)
{
struct connection *new;
new = talloc_zero(talloc_autofree_context(), struct connection);
if (!new)
return NULL;
new->fd = -1;
new->pollfd_idx = -1;
new->funcs = funcs;
new->is_ignored = false;
new->is_stalled = false;
INIT_LIST_HEAD(&new->out_list);
INIT_LIST_HEAD(&new->acc_list);
INIT_LIST_HEAD(&new->ref_list);
INIT_LIST_HEAD(&new->watches);
INIT_LIST_HEAD(&new->transaction_list);
INIT_LIST_HEAD(&new->delayed);
list_add_tail(&new->list, &connections);
talloc_set_destructor(new, destroy_conn);
trace_create(new, "connection");
return new;
}
struct connection *get_connection_by_id(unsigned int conn_id)
{
struct connection *conn;
list_for_each_entry(conn, &connections, list)
if (conn->conn_id == conn_id)
return conn;
return NULL;
}
/* We create initial nodes manually. */
static void manual_node(const char *name, const char *child)
{
struct node *node;
struct xs_permissions perms = { .id = priv_domid,
.perms = XS_PERM_NONE };
node = talloc_zero(NULL, struct node);
if (!node)
barf_perror("Could not allocate initial node %s", name);
node->name = name;
node->perms = &perms;
node->hdr.num_perms = 1;
node->children = (char *)child;
if (child)
node->hdr.childlen = strlen(child) + 1;
if (write_node(NULL, node, NODE_CREATE, false))
barf_perror("Could not create initial node %s", name);
talloc_free(node);
}
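/* djb2 string hash used for the node hashtables. */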
static unsigned int hash_from_key_fn(const void *k)
{
const char *str = k;
unsigned int hash = 5381;
char c;
while ((c = *str++))
hash = ((hash << 5) + hash) + (unsigned int)c;
return hash;
}
static int keys_equal_fn(const void *key1, const void *key2)
{
return 0 == strcmp(key1, key2);
}
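/*
 * Create the nodes hashtable and the initial store entries. When doing a
 * live update only the root node is created here, as the remaining nodes
 * are taken over from the old instance.
 */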
void setup_structure(bool live_update)
{
nodes = create_hashtable(NULL, "nodes", hash_from_key_fn, keys_equal_fn,
HASHTABLE_FREE_KEY | HASHTABLE_FREE_VALUE);
if (!nodes)
barf_perror("Could not create nodes hashtable");
if (live_update)
manual_node("/", NULL);
else {
manual_node("/", "tool");
manual_node("/tool", "xenstored");
manual_node("/tool/xenstored", NULL);
manual_node("@releaseDomain", NULL);
manual_node("@introduceDomain", NULL);
domain_nbentry_fix(priv_domid, 5, true);
}
}
int remember_string(struct hashtable *hash, const char *str)
{
char *k = talloc_strdup(NULL, str);
if (!k)
return ENOMEM;
return hashtable_add(hash, k, (void *)1);
}
/**
* A node has a children field that names the children of the node, separated
* by NULs. We check whether there are entries in there that are duplicated
* (and if so, delete the second one), and whether there are any that do not
* have a corresponding child node (and if so, delete them). Each valid child
* is then recursively checked.
*
* As we go, we record each node in the given reachable hashtable. These
* entries will be used later in clean_store.
*/
struct check_store_data {
struct hashtable *reachable;
struct hashtable *domains;
};
static int check_store_step(const void *ctx, struct connection *conn,
struct node *node, void *arg)
{
struct check_store_data *data = arg;
if (hashtable_search(data->reachable, (void *)node->name)) {
log("check_store: '%s' is duplicated!", node->name);
return WALK_TREE_RM_CHILDENTRY;
}
if (remember_string(data->reachable, node->name))
return WALK_TREE_ERROR_STOP;
domain_check_acc_add(node, data->domains);
return WALK_TREE_OK;
}
static int check_store_enoent(const void *ctx, struct connection *conn,
struct node *parent, char *name, void *arg)
{
log("check_store: node '%s' not found", name);
return WALK_TREE_RM_CHILDENTRY;
}
/**
* Helper to clean_store below.
*/
static int clean_store_(const void *key, void *val, void *private)
{
struct hashtable *reachable = private;
char *slash;
char *name = talloc_strdup(NULL, key);
if (!name) {
log("clean_store: ENOMEM");
return 1;
}
if (name[0] != '/') {
slash = strchr(name, '/');
if (slash)
*slash = 0;
}
if (!hashtable_search(reachable, name)) {
log("clean_store: '%s' is orphaned!", name);
db_delete(NULL, name, NULL);
}
talloc_free(name);
return 0;
}
/**
* Given the list of reachable nodes, iterate over the whole store, and
* remove any that were not reached.
*/
static void clean_store(struct check_store_data *data)
{
hashtable_iterate(nodes, clean_store_, data->reachable);
domain_check_acc(data->domains);
}
int check_store_path(const void *ctx, const char *name, struct check_store_data *data)
{
struct node *node;
node = read_node(NULL, ctx, name);
if (!node) {
log("check_store: error %d reading special node '%s'", errno,
name);
return errno;
}
return check_store_step(ctx, NULL, node, data);
}
void check_store(void)
{
struct walk_funcs walkfuncs = {
.enter = check_store_step,
.enoent = check_store_enoent,
};
struct check_store_data data;
void *ctx;
/* Don't free values (they are all void *1) */
data.reachable = create_hashtable(NULL, "checkstore", hash_from_key_fn,
keys_equal_fn, HASHTABLE_FREE_KEY);
if (!data.reachable) {
log("check_store: ENOMEM");
return;
}
data.domains = domain_check_acc_init();
if (!data.domains) {
log("check_store: ENOMEM");
goto out_hash;
}
ctx = talloc_new(NULL);
log("Checking store ...");
if (walk_node_tree(ctx, NULL, "/", &walkfuncs, &data)) {
if (errno == ENOMEM)
log("check_store: ENOMEM");
} else if (!check_store_path(ctx, "@introduceDomain", &data) &&
!check_store_path(ctx, "@releaseDomain", &data) &&
!check_transactions(data.reachable))
clean_store(&data);
log("Checking store complete.");
hashtable_destroy(data.domains);
talloc_free(ctx);
out_hash:
hashtable_destroy(data.reachable);
}
/* Something is horribly wrong: check the store. */
void corrupt(struct connection *conn, const char *fmt, ...)
{
va_list arglist;
char *str;
int saved_errno = errno;
va_start(arglist, fmt);
str = talloc_vasprintf(NULL, fmt, arglist);
va_end(arglist);
log("corruption detected by connection %i: err %s: %s",
conn ? (int)conn->id : -1, strerror(saved_errno),
str ?: "ENOMEM");
talloc_free(str);
check_store();
errno = saved_errno;
}
static void usage(void)
{
fprintf(stderr,
"Usage:\n"
"\n"
" xenstored \n"
"\n"
"where options may include:\n"
"\n"
" -F, --pid-file giving a file for the daemon's pid to be written,\n"
" -H, --help to output this message,\n"
" -N, --no-fork to request that the daemon does not fork,\n"
" -T, --trace-file giving the file for logging, and\n"
" --trace-control=+ activate a specific \n"
" --trace-control=- deactivate a specific \n"
" -E, --entry-nb limit the number of entries per domain,\n"
" -S, --entry-size limit the size of entry per domain, and\n"
" -W, --watch-nb limit the number of watches per domain,\n"
" -t, --transaction limit the number of transaction allowed per domain,\n"
" -A, --perm-nb limit the number of permissions per node,\n"
" -M, --path-max limit the allowed Xenstore node path length,\n"
" -Q, --quota = set the quota to the value , allowed\n"
" quotas are:\n"
" transaction-nodes: number of accessed node per\n"
" transaction\n"
" memory: total used memory per domain for nodes,\n"
" transactions, watches and requests, above\n"
" which Xenstore will stop talking to domain\n"
" nodes: number nodes owned by a domain\n"
" node-permissions: number of access permissions per\n"
" node\n"
" node-size: total size of a node (permissions +\n"
" children names + content)\n"
" outstanding: number of outstanding requests\n"
" path-length: length of a node path\n"
" transactions: number of concurrent transactions\n"
" per domain\n"
" watches: number of watches per domain"
" -q, --quota-soft = set a soft quota to the value ,\n"
" causing a warning to be issued via syslog() if the\n"
" limit is violated, allowed quotas are:\n"
" memory: see above\n"
" -w, --timeout = set the timeout in seconds for ,\n"
" allowed timeout candidates are:\n"
" watch-event: time a watch-event is kept pending\n"
" -K, --keep-orphans don't delete nodes owned by a domain when the\n"
" domain is deleted (this is a security risk!)\n"
" -m, --master-domid specify the domid of the domain where xenstored\n"
" is running. defaults to 0\n"
);
}
static struct option options[] = {
{ "entry-nb", 1, NULL, 'E' },
{ "pid-file", 1, NULL, 'F' },
{ "event", 1, NULL, 'e' },
{ "master-domid", 1, NULL, 'm' },
{ "help", 0, NULL, 'H' },
{ "no-fork", 0, NULL, 'N' },
{ "priv-domid", 1, NULL, 'p' },
{ "entry-size", 1, NULL, 'S' },
{ "trace-file", 1, NULL, 'T' },
{ "trace-control", 1, NULL, 1 },
{ "transaction", 1, NULL, 't' },
{ "perm-nb", 1, NULL, 'A' },
{ "path-max", 1, NULL, 'M' },
{ "quota", 1, NULL, 'Q' },
{ "quota-soft", 1, NULL, 'q' },
{ "timeout", 1, NULL, 'w' },
{ "keep-orphans", 0, NULL, 'K' },
{ "watch-nb", 1, NULL, 'W' },
#ifndef NO_LIVE_UPDATE
{ "live-update", 0, NULL, 'U' },
#endif
{ NULL, 0, NULL, 0 } };
int store_domid = DOMID_INVALID;
int dom0_event = 0;
int priv_domid = DOMID_INVALID;
domid_t stub_domid = DOMID_INVALID;
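/*
 * Parse a numeric command line parameter, calling barf() on malformed input
 * or values larger than INT_MAX.
 */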
static unsigned int get_optval_uint(const char *arg)
{
char *end;
unsigned long val;
val = strtoul(arg, &end, 10);
if (!*arg || *end || val > INT_MAX)
barf("invalid parameter value \"%s\"\n", arg);
return val;
}
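/*
 * Return true if the option argument starts with "<what>=" (false if what
 * is NULL).
 */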
static bool what_matches(const char *arg, const char *what)
{
unsigned int what_len;
if (!what)
return false;
what_len = strlen(what);
return !strncmp(arg, what, what_len) && arg[what_len] == '=';
}
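/*
 * Handle "--timeout <what>=<seconds>". Currently only "watch-event" is
 * supported; the value is stored internally in milliseconds.
 */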
static void set_timeout(const char *arg)
{
const char *eq = strchr(arg, '=');
unsigned int val;
	if (!eq)
		barf("timeouts must be specified via <what>=<seconds>\n");
val = get_optval_uint(eq + 1);
if (what_matches(arg, "watch-event"))
timeout_watch_event_msec = val * 1000;
else
barf("unknown timeout \"%s\"\n", arg);
}
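/*
 * Handle "--quota <what>=<nb>" and "--quota-soft <what>=<nb>" by looking up
 * the named quota in the hard or soft quota array and setting its value.
 */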
static void set_quota(const char *arg, bool soft)
{
const char *eq = strchr(arg, '=');
struct quota *q = soft ? soft_quotas : hard_quotas;
unsigned int val;
unsigned int i;
	if (!eq)
		barf("quotas must be specified via <what>=<nb>\n");
val = get_optval_uint(eq + 1);
for (i = 0; i < ACC_N; i++) {
if (what_matches(arg, q[i].name)) {
q[i].val = val;
return;
}
}
barf("unknown quota \"%s\"\n", arg);
}
/* Sorted by bit values of TRACE_* flags. Flag is (1u << index). */
const char *const trace_switches[] = {
"obj", "io", "wrl", "acc", "tdb",
NULL
};
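/*
 * Set or clear a single trace flag given as "+<switch>" or "-<switch>".
 * Returns 0 on success, EINVAL for an unknown prefix or switch name.
 */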
int set_trace_switch(const char *arg)
{
bool remove = (arg[0] == '-');
unsigned int idx;
switch (arg[0]) {
case '-':
remove = true;
break;
case '+':
remove = false;
break;
default:
return EINVAL;
}
arg++;
for (idx = 0; trace_switches[idx]; idx++) {
if (!strcmp(arg, trace_switches[idx])) {
if (remove)
trace_flags &= ~(1u << idx);
else
trace_flags |= 1u << idx;
return 0;
}
}
return EINVAL;
}
int main(int argc, char *argv[])
{
int opt;
bool dofork = true;
bool live_update = false;
const char *pidfile = NULL;
int timeout;
orig_argc = argc;
orig_argv = argv;
while ((opt = getopt_long(argc, argv,
"E:F:H::KNS:t:A:M:Q:q:T:W:w:U",
options, NULL)) != -1) {
switch (opt) {
case 'E':
hard_quotas[ACC_NODES].val = get_optval_uint(optarg);
break;
case 'F':
pidfile = optarg;
break;
case 'H':
usage();
return 0;
case 'N':
dofork = false;
break;
case 'S':
hard_quotas[ACC_NODESZ].val = get_optval_uint(optarg);
break;
case 't':
hard_quotas[ACC_TRANS].val = get_optval_uint(optarg);
break;
case 'T':
tracefile = optarg;
break;
case 1:
if (set_trace_switch(optarg))
barf("Illegal trace switch \"%s\"\n", optarg);
break;
case 'K':
keep_orphans = true;
break;
case 'W':
hard_quotas[ACC_WATCH].val = get_optval_uint(optarg);
break;
case 'A':
hard_quotas[ACC_NPERM].val = get_optval_uint(optarg);
break;
case 'M':
hard_quotas[ACC_PATHLEN].val = get_optval_uint(optarg);
hard_quotas[ACC_PATHLEN].val =
min((unsigned int)XENSTORE_REL_PATH_MAX,
hard_quotas[ACC_PATHLEN].val);
break;
case 'Q':
set_quota(optarg, false);
break;
case 'q':
set_quota(optarg, true);
break;
case 'w':
set_timeout(optarg);
break;
case 'e':
dom0_event = get_optval_uint(optarg);
break;
case 'm':
store_domid = get_optval_uint(optarg);
break;
case 'p':
priv_domid = get_optval_uint(optarg);
break;
#ifndef NO_LIVE_UPDATE
case 'U':
live_update = true;
break;
#endif
}
}
if (optind != argc)
barf("%s: No arguments desired", argv[0]);
early_init(live_update, dofork, pidfile);
talloc_enable_null_tracking();
domain_early_init();
/* Listen to hypervisor. */
if (!live_update) {
domain_init(-1);
init_domains();
}
/* redirect to /dev/null now we're ready to accept connections */
if (dofork && !live_update)
finish_daemonize();
#ifndef __MINIOS__
if (dofork)
xprintf = trace;
#endif
if (tracefile)
tracefile = absolute_filename(NULL, tracefile);
stubdom_init(live_update);
#ifndef NO_LIVE_UPDATE
/* Read state in case of live update. */
if (live_update)
lu_read_state();
#endif
check_store();
/* Get ready to listen to the tools. */
initialize_fds(&timeout);
late_init(live_update);
/* Main loop. */
for (;;) {
struct connection *conn, *next;
if (poll(poll_fds, nr_fds, timeout) < 0) {
if (errno == EINTR)
continue;
barf_perror("Poll failed");
}
handle_special_fds();
if (xce_pollfd_idx != -1) {
if (poll_fds[xce_pollfd_idx].revents & ~POLLIN) {
barf_perror("xce_handle poll failed");
break;
} else if (poll_fds[xce_pollfd_idx].revents & POLLIN) {
handle_event();
xce_pollfd_idx = -1;
}
}
		/*
		 * list_for_each_entry_safe is not suitable here because
		 * handle_input may delete entries besides the current one, and
		 * the deleted entry might be the one cached in the temporary
		 * "next" pointer, which would trigger a use-after-free.
		 * list_for_each_entry_safe is only safe for deleting the
		 * current entry.
		 */
next = list_entry(connections.next, typeof(*conn), list);
if (&next->list != &connections)
talloc_increase_ref_count(next);
while (&next->list != &connections) {
conn = next;
next = list_entry(conn->list.next,
typeof(*conn), list);
if (&next->list != &connections)
talloc_increase_ref_count(next);
if (conn_can_read(conn))
handle_input(conn);
if (talloc_free(conn) == 0)
continue;
talloc_increase_ref_count(conn);
if (conn_can_write(conn))
handle_output(conn);
if (talloc_free(conn) == 0)
continue;
conn->pollfd_idx = -1;
}
if (delayed_requests) {
list_for_each_entry(conn, &connections, list) {
struct delayed_request *req, *tmp;
list_for_each_entry_safe(req, tmp,
&conn->delayed, list)
call_delayed(req);
}
}
initialize_fds(&timeout);
}
}
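/* Live update: dump the global state (socket fd and event channel fd). */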
const char *dump_state_global(FILE *fp)
{
struct xs_state_record_header head;
struct xs_state_global glb;
head.type = XS_STATE_TYPE_GLOBAL;
head.length = sizeof(glb);
if (fwrite(&head, sizeof(head), 1, fp) != 1)
return "Dump global state error";
glb.socket_fd = get_socket_fd();
glb.evtchn_fd = xenevtchn_fd(xce_handle);
if (fwrite(&glb, sizeof(glb), 1, fp) != 1)
return "Dump global state error";
return NULL;
}
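/*
 * Dump a partially received request. Called with fp == NULL to only compute
 * the length of the data.
 */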
static const char *dump_input_buffered_data(FILE *fp,
const struct buffered_data *in,
unsigned int *total_len)
{
unsigned int hlen = in->inhdr ? in->used : sizeof(in->hdr);
*total_len += hlen;
if (fp && fwrite(&in->hdr, hlen, 1, fp) != 1)
return "Dump read data error";
if (!in->inhdr && in->used) {
*total_len += in->used;
if (fp && fwrite(in->buffer, in->used, 1, fp) != 1)
return "Dump read data error";
}
return NULL;
}
/* Called twice: first with fp == NULL to get length, then for writing data. */
const char *dump_state_buffered_data(FILE *fp, const struct connection *c,
struct xs_state_connection *sc)
{
unsigned int len = 0, used;
struct buffered_data *out;
bool partial = true;
struct delayed_request *req;
const char *ret;
/* Dump any command that was delayed */
list_for_each_entry(req, &c->delayed, list) {
/*
* We only want to preserve commands that weren't processed at
* all. All the other delayed requests (such as do_lu_start())
* must be processed before Live-Update.
*/
if (req->func != process_delayed_message)
continue;
assert(!req->in->inhdr);
if ((ret = dump_input_buffered_data(fp, req->in, &len)))
return ret;
}
if (c->in && (ret = dump_input_buffered_data(fp, c->in, &len)))
return ret;
if (sc) {
sc->data_in_len = len;
sc->data_resp_len = 0;
}
len = 0;
list_for_each_entry(out, &c->out_list, list) {
used = out->used;
if (out->inhdr) {
if (!used)
partial = false;
if (fp && fwrite(out->hdr.raw + out->used,
sizeof(out->hdr) - out->used, 1, fp) != 1)
return "Dump buffered data error";
len += sizeof(out->hdr) - out->used;
used = 0;
}
if (fp && out->hdr.msg.len &&
fwrite(out->buffer + used, out->hdr.msg.len - used,
1, fp) != 1)
return "Dump buffered data error";
len += out->hdr.msg.len - used;
if (partial && sc)
sc->data_resp_len = len;
partial = false;
}
/* Add "OK" for live-update command. */
if (c == lu_get_connection()) {
unsigned int rc = lu_write_response(fp);
if (!rc)
return "Dump buffered data error";
len += rc;
}
if (sc)
sc->data_out_len = len;
return NULL;
}
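/* Write the permissions of a node in the live update stream format. */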
const char *dump_state_node_perms(FILE *fp, const struct xs_permissions *perms,
unsigned int n_perms)
{
unsigned int p;
for (p = 0; p < n_perms; p++) {
struct xs_state_node_perm sp;
switch ((int)perms[p].perms & ~XS_PERM_IGNORE) {
case XS_PERM_READ:
sp.access = XS_STATE_NODE_PERM_READ;
break;
case XS_PERM_WRITE:
sp.access = XS_STATE_NODE_PERM_WRITE;
break;
case XS_PERM_READ | XS_PERM_WRITE:
sp.access = XS_STATE_NODE_PERM_BOTH;
break;
default:
sp.access = XS_STATE_NODE_PERM_NONE;
break;
}
sp.flags = (perms[p].perms & XS_PERM_IGNORE)
? XS_STATE_NODE_PERM_IGNORE : 0;
sp.domid = perms[p].id;
if (fwrite(&sp, sizeof(sp), 1, fp) != 1)
return "Dump node permission error";
}
return NULL;
}
struct dump_node_data {
FILE *fp;
const char *err;
};
static int dump_state_node_err(struct dump_node_data *data, const char *err)
{
data->err = err;
return WALK_TREE_ERROR_STOP;
}
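/*
 * Write a single node record: record header, node state, permissions, path
 * and data, padded to an 8 byte boundary.
 */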
static int dump_state_node(const void *ctx, struct connection *conn,
const struct node *node, struct dump_node_data *data)
{
FILE *fp = data->fp;
unsigned int pathlen;
struct xs_state_record_header head;
struct xs_state_node sn;
const char *ret;
pathlen = strlen(node->name) + 1;
head.type = XS_STATE_TYPE_NODE;
head.length = sizeof(sn);
sn.conn_id = 0;
sn.ta_id = 0;
sn.ta_access = 0;
sn.perm_n = node->hdr.num_perms;
sn.path_len = pathlen;
sn.data_len = node->hdr.datalen;
head.length += node->hdr.num_perms * sizeof(*sn.perms);
head.length += pathlen;
head.length += node->hdr.datalen;
head.length = ROUNDUP(head.length, 3);
if (fwrite(&head, sizeof(head), 1, fp) != 1)
return dump_state_node_err(data, "Dump node head error");
if (fwrite(&sn, sizeof(sn), 1, fp) != 1)
return dump_state_node_err(data, "Dump node state error");
ret = dump_state_node_perms(fp, node->perms, node->hdr.num_perms);
if (ret)
return dump_state_node_err(data, ret);
if (fwrite(node->name, pathlen, 1, fp) != 1)
return dump_state_node_err(data, "Dump node path error");
if (node->hdr.datalen &&
fwrite(node->data, node->hdr.datalen, 1, fp) != 1)
return dump_state_node_err(data, "Dump node data error");
ret = dump_state_align(fp);
if (ret)
return dump_state_node_err(data, ret);
return WALK_TREE_OK;
}
static int dump_state_node_enter(const void *ctx, struct connection *conn,
struct node *node, void *arg)
{
return dump_state_node(ctx, conn, node, arg);
}
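/* Dump a special node (e.g. "@releaseDomain") not covered by the tree walk. */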
static int dump_state_special_node(FILE *fp, const void *ctx,
struct dump_node_data *data,
const char *name)
{
const struct node *node;
int ret;
node = read_node_const(NULL, ctx, name);
if (!node)
return dump_state_node_err(data, "Dump node read node error");
ret = dump_state_node(ctx, NULL, node, data);
talloc_free(node);
return ret;
}
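/* Dump all nodes: walk the tree below "/" and add the special nodes. */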
const char *dump_state_nodes(FILE *fp, const void *ctx)
{
struct dump_node_data data = {
.fp = fp,
.err = "Dump node walk error"
};
struct walk_funcs walkfuncs = { .enter = dump_state_node_enter };
if (walk_node_tree(ctx, NULL, "/", &walkfuncs, &data))
return data.err;
if (dump_state_special_node(fp, ctx, &data, "@releaseDomain"))
return data.err;
if (dump_state_special_node(fp, ctx, &data, "@introduceDomain"))
return data.err;
return NULL;
}
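/* Live update: restore the global state saved by dump_state_global(). */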
void read_state_global(const void *ctx, const void *state)
{
const struct xs_state_global *glb = state;
set_socket_fd(glb->socket_fd);
domain_init(glb->evtchn_fd);
}
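/*
 * Queue a restored outgoing message on the connection's output list,
 * re-arming the watch event timeout and adjusting the memory accounting.
 */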
static void add_buffered_data(struct buffered_data *bdata,
struct connection *conn, const uint8_t *data,
unsigned int len)
{
bdata->hdr.msg.len = len;
if (len <= DEFAULT_BUFFER_SIZE)
bdata->buffer = bdata->default_buffer;
else
bdata->buffer = talloc_array(bdata, char, len);
if (!bdata->buffer)
barf("error restoring buffered data");
memcpy(bdata->buffer, data, len);
if (bdata->hdr.msg.type == XS_WATCH_EVENT && timeout_watch_event_msec &&
domain_is_unprivileged(conn)) {
bdata->timeout_msec = get_now_msec() + timeout_watch_event_msec;
if (!conn->timeout_msec)
conn->timeout_msec = bdata->timeout_msec;
}
/* Queue for later transmission. */
list_add_tail(&bdata->list, &conn->out_list);
bdata->on_out_list = true;
	/*
	 * Watch events are never "outstanding", but the request causing them
	 * is instead kept "outstanding" until all watch events caused by that
	 * request have been delivered.
	 */
if (bdata->hdr.msg.type != XS_WATCH_EVENT)
domain_outstanding_inc(conn);
	/*
	 * We are restoring the state after Live-Update and the new quota may
	 * be smaller. So ignore the quota here; the limits will be applied
	 * again for any resource access after the state has been fully
	 * restored.
	 */
domain_memory_add_nochk(conn, conn->id, len + sizeof(bdata->hdr));
}
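/*
 * Restore the partially read request and the pending responses of a
 * connection from the live update state.
 */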
void read_state_buffered_data(const void *ctx, struct connection *conn,
const struct xs_state_connection *sc)
{
struct buffered_data *bdata;
const uint8_t *data;
unsigned int len;
bool partial = sc->data_resp_len;
for (data = sc->data; data < sc->data + sc->data_in_len; data += len) {
bdata = new_buffer(conn);
if (!bdata)
barf("error restoring read data");
		/*
		 * We don't know yet whether there is more than one message to
		 * process, so start with len covering all the leftover data
		 * and trim it to the actual message size below.
		 */
len = sc->data_in_len - (data - sc->data);
if (len < sizeof(bdata->hdr)) {
bdata->inhdr = true;
memcpy(&bdata->hdr, data, len);
bdata->used = len;
} else {
bdata->inhdr = false;
memcpy(&bdata->hdr, data, sizeof(bdata->hdr));
if (bdata->hdr.msg.len <= DEFAULT_BUFFER_SIZE)
bdata->buffer = bdata->default_buffer;
else
bdata->buffer = talloc_array(bdata, char,
bdata->hdr.msg.len);
if (!bdata->buffer)
barf("Error allocating in buffer");
bdata->used = min_t(unsigned int,
len - sizeof(bdata->hdr),
bdata->hdr.msg.len);
memcpy(bdata->buffer, data + sizeof(bdata->hdr),
bdata->used);
/* Update len to match the size of the message. */
len = bdata->used + sizeof(bdata->hdr);
}
		/*
		 * If the message is not complete, it was the message being
		 * processed when the state was dumped. All other messages
		 * will be queued to be handled after restoring.
		 */
if (bdata->inhdr || bdata->used != bdata->hdr.msg.len) {
assert(conn->in == NULL);
conn->in = bdata;
} else if (delay_request(conn, bdata, process_delayed_message,
conn, true))
barf("Unable to delay the request");
}
for (data = sc->data + sc->data_in_len;
data < sc->data + sc->data_in_len + sc->data_out_len;
data += len) {
bdata = new_buffer(conn);
if (!bdata)
barf("error restoring buffered data");
if (partial) {
bdata->inhdr = false;
/* Make trace look nice. */
bdata->hdr.msg.type = XS_INVALID;
len = sc->data_resp_len;
add_buffered_data(bdata, conn, data, len);
partial = false;
continue;
}
memcpy(&bdata->hdr, data, sizeof(bdata->hdr));
data += sizeof(bdata->hdr);
len = bdata->hdr.msg.len;
add_buffered_data(bdata, conn, data, len);
}
}
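/*
 * Restore a single node from the live update state: rebuild its permissions,
 * add it to its parent's children and write it to the data base.
 */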
void read_state_node(const void *ctx, const void *state)
{
const struct xs_state_node *sn = state;
struct node *node, *parent;
char *name, *parentname;
unsigned int i;
struct connection conn = { .id = priv_domid };
name = (char *)(sn->perms + sn->perm_n);
node = talloc(ctx, struct node);
if (!node)
barf("allocation error restoring node");
node->acc.memory = 0;
node->name = name;
node->hdr.generation = ++generation;
node->hdr.datalen = sn->data_len;
node->data = name + sn->path_len;
node->hdr.childlen = 0;
node->children = NULL;
node->hdr.num_perms = sn->perm_n;
node->perms = talloc_array(node, struct xs_permissions,
node->hdr.num_perms);
if (!node->perms)
barf("allocation error restoring node");
for (i = 0; i < node->hdr.num_perms; i++) {
switch (sn->perms[i].access) {
case 'r':
node->perms[i].perms = XS_PERM_READ;
break;
case 'w':
node->perms[i].perms = XS_PERM_WRITE;
break;
case 'b':
node->perms[i].perms = XS_PERM_READ | XS_PERM_WRITE;
break;
default:
node->perms[i].perms = XS_PERM_NONE;
break;
}
if (sn->perms[i].flags & XS_STATE_NODE_PERM_IGNORE)
node->perms[i].perms |= XS_PERM_IGNORE;
node->perms[i].id = sn->perms[i].domid;
}
if (!strstarts(name, "@")) {
parentname = get_parent(node, name);
if (!parentname)
barf("allocation error restoring node");
parent = read_node(NULL, node, parentname);
if (!parent)
barf("read parent error restoring node");
if (add_child(node, parent, name))
barf("allocation error restoring node");
if (write_node_raw(NULL, parentname, parent, NODE_MODIFY, true))
barf("write parent error restoring node");
}
/* The "/" node is already existing, so it can only be modified here. */
if (write_node_raw(NULL, name, node,
strcmp(name, "/") ? NODE_CREATE : NODE_MODIFY, true))
barf("write node error restoring node");
if (domain_nbentry_inc(&conn, get_node_owner(node)))
barf("node accounting error restoring node");
talloc_free(node);
}
/*
* Local variables:
* mode: C
* c-file-style: "linux"
* indent-tabs-mode: t
* c-basic-offset: 8
* tab-width: 8
* End:
*/