1 /*
2  * Copyright (C) 2014      Citrix Ltd.
3  * Author Wei Liu <wei.liu2@citrix.com>
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public License as published
7  * by the Free Software Foundation; version 2.1 only. with the special
8  * exception on linking described in file LICENSE.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU Lesser General Public License for more details.
14  */
15 #include "libxl_osdeps.h" /* must come before any other headers */
16 #include "libxl_internal.h"
17 #include "libxl_arch.h"
18 #include <stdlib.h>
19 
20 #include <xc_dom.h>
21 
libxl__vnuma_configured(const libxl_domain_build_info * b_info)22 bool libxl__vnuma_configured(const libxl_domain_build_info *b_info)
23 {
24     return b_info->num_vnuma_nodes != 0;
25 }
26 
27 /* Sort vmemranges in ascending order with "start" */
compare_vmemrange(const void * a,const void * b)28 static int compare_vmemrange(const void *a, const void *b)
29 {
30     const xen_vmemrange_t *x = a, *y = b;
31     if (x->start < y->start)
32         return -1;
33     if (x->start > y->start)
34         return 1;
35     return 0;
36 }
37 
38 /* Check if a vcpu has an hard (or soft) affinity set in such
39  * a way that it does not match the pnode to which the vcpu itself
40  * is assigned to.
41  */
/*
 * Warn if @vcpu's scheduling affinity disagrees with its vNUMA placement.
 *
 * @vcpu:         the vcpu being checked
 * @pnode:        physical node the vcpu's vnode is assigned to
 * @num_affinity: size of the caller's affinity array (currently unused here;
 *                the caller has already verified vcpu < num_affinity)
 * @affinity:     the vcpu's hard or soft cpu affinity mask
 * @kind:         "hard" or "soft", used only in the warning message
 *
 * Returns 0 on success, or a libxl error if the bitmap allocation or
 * cpumap->nodemap conversion fails. An affinity/vNUMA mismatch is NOT an
 * error: it only hurts performance, so it is reported as a warning.
 */
static int check_vnuma_affinity(libxl__gc *gc,
                                 unsigned int vcpu,
                                 unsigned int pnode,
                                 unsigned int num_affinity,
                                 const libxl_bitmap *affinity,
                                 const char *kind)
{
    libxl_bitmap nodemap;
    int rc = 0;

    /* Initialize first so the dispose in the error path is always safe. */
    libxl_bitmap_init(&nodemap);

    rc = libxl_node_bitmap_alloc(CTX, &nodemap, 0);
    if (rc) {
        LOG(ERROR, "Can't allocate nodemap");
        goto out;
    }

    /* Project the cpu affinity mask onto the set of physical nodes. */
    rc = libxl_cpumap_to_nodemap(CTX, affinity, &nodemap);
    if (rc) {
        LOG(ERROR, "Can't convert Vcpu %d affinity to nodemap", vcpu);
        goto out;
    }

    /* Ideally the affinity spans exactly one node, and it is the node
     * the vcpu's vnode maps to. Anything else is only a warning. */
    if (libxl_bitmap_count_set(&nodemap) != 1 ||
        !libxl_bitmap_test(&nodemap, pnode))
        LOG(WARN, "Vcpu %d %s affinity and vnuma info mismatch", vcpu, kind);

out:
    libxl_bitmap_dispose(&nodemap);
    return rc;
}
74 
75 /* Check if vNUMA configuration is valid:
76  *  1. all pnodes inside vnode_to_pnode array are valid
77  *  2. each vcpu belongs to one and only one vnode
78  *  3. each vmemrange is valid and doesn't overlap with any other
79  *  4. local distance cannot be larger than remote distance
80  *
81  * Check also, if any hard or soft affinity is specified, whether
82  * they match with the vNUMA related bits (namely vcpus to vnodes
83  * mappings and vnodes to pnodes association). If that does not
84  * hold, however, just print a warning, as that has "only"
85  * performance implications.
86  */
int libxl__vnuma_config_check(libxl__gc *gc,
                              const libxl_domain_build_info *b_info,
                              const libxl__domain_build_state *state)
{
    int nr_nodes = 0, rc = ERROR_VNUMA_CONFIG_INVALID;
    unsigned int i, j;
    libxl_numainfo *ninfo = NULL;
    uint64_t total_memkb = 0;
    libxl_bitmap cpumap;
    libxl_vnode_info *v;

    /* Initialize first so the dispose in the error path is always safe. */
    libxl_bitmap_init(&cpumap);

    /* Check pnode specified is valid */
    ninfo = libxl_get_numainfo(CTX, &nr_nodes);
    if (!ninfo) {
        LOG(ERROR, "libxl_get_numainfo failed");
        goto out;
    }

    for (i = 0; i < b_info->num_vnuma_nodes; i++) {
        uint32_t pnode;

        v = &b_info->vnuma_nodes[i];
        pnode = v->pnode;

        /* The pnode specified is not valid? */
        if (pnode >= nr_nodes) {
            LOG(ERROR, "Invalid pnode %"PRIu32" specified", pnode);
            goto out;
        }

        total_memkb += v->memkb;
    }

    /* The vnode memory sizes must add up to the domain's maximum memory. */
    if (total_memkb != b_info->max_memkb) {
        LOG(ERROR, "Amount of memory mismatch (0x%"PRIx64" != 0x%"PRIx64")",
            total_memkb, b_info->max_memkb);
        goto out;
    }

    /* Check vcpu mapping: every vcpu must belong to exactly one vnode.
     * NB: check the allocation result; previously a failed allocation was
     * silently ignored and the checks below ran on an empty bitmap. */
    if (libxl_cpu_bitmap_alloc(CTX, &cpumap, b_info->max_vcpus)) {
        LOG(ERROR, "Can't allocate cpumap");
        goto out;
    }
    for (i = 0; i < b_info->num_vnuma_nodes; i++) {
        v = &b_info->vnuma_nodes[i];
        libxl_for_each_set_bit(j, v->vcpus) {
            if (!libxl_bitmap_test(&cpumap, j))
                libxl_bitmap_set(&cpumap, j);
            else {
                LOG(ERROR, "Vcpu %d assigned more than once", j);
                goto out;
            }
        }
    }

    for (i = 0; i < b_info->max_vcpus; i++) {
        if (!libxl_bitmap_test(&cpumap, i)) {
            LOG(ERROR, "Vcpu %d is not assigned to any vnode", i);
            goto out;
        }
    }

    /* Check whether vcpu affinity (if any) matches vnuma configuration.
     * Mismatches are warnings only (see check_vnuma_affinity). */
    for (i = 0; i < b_info->num_vnuma_nodes; i++) {
        v = &b_info->vnuma_nodes[i];
        libxl_for_each_set_bit(j, v->vcpus) {
            if (b_info->num_vcpu_hard_affinity > j)
                check_vnuma_affinity(gc, j, v->pnode,
                                     b_info->num_vcpu_hard_affinity,
                                     &b_info->vcpu_hard_affinity[j],
                                     "hard");
            if (b_info->num_vcpu_soft_affinity > j)
                check_vnuma_affinity(gc, j, v->pnode,
                                     b_info->num_vcpu_soft_affinity,
                                     &b_info->vcpu_soft_affinity[j],
                                     "soft");
        }
    }

    /* Check vmemranges: sort by start address so a single linear pass
     * can detect overlaps between adjacent ranges. */
    qsort(state->vmemranges, state->num_vmemranges, sizeof(xen_vmemrange_t),
          compare_vmemrange);

    for (i = 0; i < state->num_vmemranges; i++) {
        if (state->vmemranges[i].end < state->vmemranges[i].start) {
                LOG(ERROR, "Vmemrange end < start");
                goto out;
        }
    }

    /* "i + 1 < n" rather than "i < n - 1": the latter underflows to a
     * huge unsigned bound when num_vmemranges == 0. */
    for (i = 0; i + 1 < state->num_vmemranges; i++) {
        if (state->vmemranges[i].end > state->vmemranges[i+1].start) {
            LOG(ERROR,
                "Vmemranges overlapped, 0x%"PRIx64"-0x%"PRIx64", 0x%"PRIx64"-0x%"PRIx64,
                state->vmemranges[i].start, state->vmemranges[i].end,
                state->vmemranges[i+1].start, state->vmemranges[i+1].end);
            goto out;
        }
    }

    /* Check vdistances: a node's local distance must not exceed its
     * distance to any other node. */
    for (i = 0; i < b_info->num_vnuma_nodes; i++) {
        uint32_t local_distance, remote_distance;

        v = &b_info->vnuma_nodes[i];

        /* Guard the v->distances[i] read below: previously a distance
         * array shorter than the node count was read out of bounds. */
        if (v->num_distances <= i) {
            LOG(ERROR, "Missing local distance for vnode %u", i);
            goto out;
        }
        local_distance = v->distances[i];

        for (j = 0; j < v->num_distances; j++) {
            if (i == j) continue;
            remote_distance = v->distances[j];
            if (local_distance > remote_distance) {
                LOG(ERROR,
                    "Distance from %u to %u smaller than %u's local distance",
                    i, j, i);
                goto out;
            }
        }
    }

    rc = 0;
out:
    libxl_numainfo_list_free(ninfo, nr_nodes);
    libxl_bitmap_dispose(&cpumap);
    return rc;
}
212 
/*
 * Default (arch-independent) vmemrange layout for PV guests: one
 * contiguous vmemrange per virtual node, packed back to back starting
 * at guest physical address 0. The resulting array (gc-allocated) and
 * its length are stored in @state. Always returns 0.
 */
int libxl__vnuma_build_vmemrange_pv_generic(libxl__gc *gc,
                                            uint32_t domid,
                                            libxl_domain_build_info *b_info,
                                            libxl__domain_build_state *state)
{
    int i;
    uint64_t base = 0;
    xen_vmemrange_t *ranges = NULL;

    GCREALLOC_ARRAY(ranges, b_info->num_vnuma_nodes);

    for (i = 0; i < b_info->num_vnuma_nodes; i++) {
        libxl_vnode_info *node = &b_info->vnuma_nodes[i];
        uint64_t size = node->memkb << 10; /* KiB -> bytes */

        ranges[i].start = base;
        ranges[i].end = base + size;
        ranges[i].flags = 0;
        ranges[i].nid = i;

        base += size;
    }

    state->vmemranges = ranges;
    state->num_vmemranges = i;

    return 0;
}
241 
242 /* Build vmemranges for PV guest */
/* Build vmemranges for PV guest */
int libxl__vnuma_build_vmemrange_pv(libxl__gc *gc,
                                    uint32_t domid,
                                    libxl_domain_build_info *b_info,
                                    libxl__domain_build_state *state)
{
    /* Must only be called once per build: nothing populated yet. */
    assert(!state->vmemranges);

    /* The actual layout is arch-specific, so delegate to the arch hook. */
    return libxl__arch_vnuma_build_vmemrange(gc, domid, b_info, state);
}
251 
252 /* Build vmemranges for HVM guest */
/*
 * Build vmemranges for an HVM guest from the vnode sizes in @b_info and
 * the MMIO hole described by @dom. A vnode that straddles the hole is
 * split into multiple vmemranges. Results (gc-allocated array + count)
 * are stored in @state. Returns 0 on success, ERROR_INVAL if vnode 0
 * cannot hold the video RAM.
 */
int libxl__vnuma_build_vmemrange_hvm(libxl__gc *gc,
                                     uint32_t domid,
                                     libxl_domain_build_info *b_info,
                                     libxl__domain_build_state *state,
                                     struct xc_dom_image *dom)
{
    uint64_t hole_start, hole_end, next;
    int nid, nr_vmemrange;
    xen_vmemrange_t *vmemranges;
    int rc;

    /* Derive vmemranges from vnode size and memory hole.
     *
     * Guest physical address space layout:
     * [0, hole_start) [hole_start, hole_end) [hole_end, highmem_end)
     */
    /* Hole starts at whichever comes first: end of lowmem or MMIO base. */
    hole_start = dom->lowmem_end < dom->mmio_start ?
        dom->lowmem_end : dom->mmio_start;
    /* Hole ends at the end of the MMIO region, but never below 4GiB. */
    hole_end = (dom->mmio_start + dom->mmio_size) > (1ULL << 32) ?
        (dom->mmio_start + dom->mmio_size) : (1ULL << 32);

    assert(state->vmemranges == NULL);

    /* 'next' tracks the next free guest physical address to assign. */
    next = 0;
    nr_vmemrange = 0;
    vmemranges = NULL;
    for (nid = 0; nid < b_info->num_vnuma_nodes; nid++) {
        libxl_vnode_info *p = &b_info->vnuma_nodes[nid];
        uint64_t remaining_bytes = p->memkb << 10;

        /* Consider video ram belongs to vnode 0 */
        if (nid == 0) {
            if (p->memkb < b_info->video_memkb) {
                LOGD(ERROR, domid, "vnode 0 too small to contain video ram");
                rc = ERROR_INVAL;
                goto out;
            }
            remaining_bytes -= (b_info->video_memkb << 10);
        }

        /* Emit one vmemrange per contiguous chunk; a vnode crossing the
         * hole produces a chunk before it and a chunk after it. */
        while (remaining_bytes > 0) {
            uint64_t count = remaining_bytes;

            /* If we are inside the hole, skip to its end. */
            if (next >= hole_start && next < hole_end)
                next = hole_end;
            /* If the chunk would run into the hole, trim it to stop at
             * the hole's start; the remainder continues after the hole
             * on the next iteration. */
            if ((next < hole_start) && (next + remaining_bytes >= hole_start))
                count = hole_start - next;

            GCREALLOC_ARRAY(vmemranges, nr_vmemrange+1);
            vmemranges[nr_vmemrange].start = next;
            vmemranges[nr_vmemrange].end = next + count;
            vmemranges[nr_vmemrange].flags = 0;
            vmemranges[nr_vmemrange].nid = nid;

            nr_vmemrange++;
            remaining_bytes -= count;
            next += count;
        }
    }

    state->vmemranges = vmemranges;
    state->num_vmemranges = nr_vmemrange;

    rc = 0;
out:
    return rc;
}
320 
321 /*
322  * Local variables:
323  * mode: C
324  * c-basic-offset: 4
325  * indent-tabs-mode: nil
326  * End:
327  */
328