1 /*
2  * Copyright (C) 2014      Citrix Ltd.
3  * Author Wei Liu <wei.liu2@citrix.com>
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU Lesser General Public License as published
7  * by the Free Software Foundation; version 2.1 only. with the special
8  * exception on linking described in file LICENSE.
9  *
10  * This program is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU Lesser General Public License for more details.
14  */
15 #include "libxl_osdeps.h" /* must come before any other headers */
16 #include "libxl_internal.h"
17 #include "libxl_arch.h"
18 #include <stdlib.h>
19 
libxl__vnuma_configured(const libxl_domain_build_info * b_info)20 bool libxl__vnuma_configured(const libxl_domain_build_info *b_info)
21 {
22     return b_info->num_vnuma_nodes != 0;
23 }
24 
25 /* Sort vmemranges in ascending order with "start" */
compare_vmemrange(const void * a,const void * b)26 static int compare_vmemrange(const void *a, const void *b)
27 {
28     const xen_vmemrange_t *x = a, *y = b;
29     if (x->start < y->start)
30         return -1;
31     if (x->start > y->start)
32         return 1;
33     return 0;
34 }
35 
/* Check if a vcpu has a hard (or soft) affinity set in such
 * a way that it does not match the pnode to which the vcpu itself
 * is assigned.
 */
static int check_vnuma_affinity(libxl__gc *gc,
                                 unsigned int vcpu,
                                 unsigned int pnode,
                                 unsigned int num_affinity,
                                 const libxl_bitmap *affinity,
                                 const char *kind)
{
    libxl_bitmap nodemap;
    int rc = 0;

    /* Initialize before any goto so dispose below is always safe. */
    libxl_bitmap_init(&nodemap);

    rc = libxl_node_bitmap_alloc(CTX, &nodemap, 0);
    if (rc) {
        LOG(ERROR, "Can't allocate nodemap");
        goto out;
    }

    /* Collapse the cpu affinity mask into the set of nodes it spans. */
    rc = libxl_cpumap_to_nodemap(CTX, affinity, &nodemap);
    if (rc) {
        LOG(ERROR, "Can't convert Vcpu %d affinity to nodemap", vcpu);
        goto out;
    }

    /* A matching affinity pins the vcpu to exactly one node, and that
     * node is the vnode's pnode.  Anything else only costs performance,
     * so it is logged as WARN and does NOT set rc: only bitmap-operation
     * failures are reported to the caller.
     * NOTE(review): num_affinity is unused here; presumably kept for
     * symmetry with the caller's bounds check -- confirm before removing. */
    if (libxl_bitmap_count_set(&nodemap) != 1 ||
        !libxl_bitmap_test(&nodemap, pnode))
        LOG(WARN, "Vcpu %d %s affinity and vnuma info mismatch", vcpu, kind);

out:
    libxl_bitmap_dispose(&nodemap);
    return rc;
}
72 
73 /* Check if vNUMA configuration is valid:
74  *  1. all pnodes inside vnode_to_pnode array are valid
75  *  2. each vcpu belongs to one and only one vnode
76  *  3. each vmemrange is valid and doesn't overlap with any other
77  *  4. local distance cannot be larger than remote distance
78  *
79  * Check also, if any hard or soft affinity is specified, whether
80  * they match with the vNUMA related bits (namely vcpus to vnodes
81  * mappings and vnodes to pnodes association). If that does not
82  * hold, however, just print a warning, as that has "only"
83  * performance implications.
84  */
int libxl__vnuma_config_check(libxl__gc *gc,
                              const libxl_domain_build_info *b_info,
                              const libxl__domain_build_state *state)
{
    int nr_nodes = 0, rc = ERROR_VNUMA_CONFIG_INVALID;
    unsigned int i, j;
    libxl_numainfo *ninfo = NULL;
    uint64_t total_memkb = 0;
    libxl_bitmap cpumap;
    libxl_vnode_info *v;

    libxl_bitmap_init(&cpumap);

    /* Check pnode specified is valid */
    ninfo = libxl_get_numainfo(CTX, &nr_nodes);
    if (!ninfo) {
        LOG(ERROR, "libxl_get_numainfo failed");
        goto out;
    }

    for (i = 0; i < b_info->num_vnuma_nodes; i++) {
        uint32_t pnode;

        v = &b_info->vnuma_nodes[i];
        pnode = v->pnode;

        /* The pnode specified is not valid? */
        if (pnode >= nr_nodes) {
            LOG(ERROR, "Invalid pnode %"PRIu32" specified", pnode);
            goto out;
        }

        total_memkb += v->memkb;
    }

    /* The vnode sizes must add up to exactly the domain's memory. */
    if (total_memkb != b_info->max_memkb) {
        LOG(ERROR, "Amount of memory mismatch (0x%"PRIx64" != 0x%"PRIx64")",
            total_memkb, b_info->max_memkb);
        goto out;
    }

    /* Check vcpu mapping: every vcpu must belong to exactly one vnode.
     * The allocation result was previously ignored; a failure would have
     * left cpumap unusable for the tests below. */
    if (libxl_cpu_bitmap_alloc(CTX, &cpumap, b_info->max_vcpus)) {
        LOG(ERROR, "Can't allocate cpumap");
        rc = ERROR_FAIL;
        goto out;
    }
    for (i = 0; i < b_info->num_vnuma_nodes; i++) {
        v = &b_info->vnuma_nodes[i];
        libxl_for_each_set_bit(j, v->vcpus) {
            if (!libxl_bitmap_test(&cpumap, j))
                libxl_bitmap_set(&cpumap, j);
            else {
                LOG(ERROR, "Vcpu %d assigned more than once", j);
                goto out;
            }
        }
    }

    for (i = 0; i < b_info->max_vcpus; i++) {
        if (!libxl_bitmap_test(&cpumap, i)) {
            LOG(ERROR, "Vcpu %d is not assigned to any vnode", i);
            goto out;
        }
    }

    /* Check whether vcpu affinity (if any) matches vnuma configuration.
     * A mismatch is only logged by check_vnuma_affinity(); it does not
     * invalidate the configuration. */
    for (i = 0; i < b_info->num_vnuma_nodes; i++) {
        v = &b_info->vnuma_nodes[i];
        libxl_for_each_set_bit(j, v->vcpus) {
            if (b_info->num_vcpu_hard_affinity > j)
                check_vnuma_affinity(gc, j, v->pnode,
                                     b_info->num_vcpu_hard_affinity,
                                     &b_info->vcpu_hard_affinity[j],
                                     "hard");
            if (b_info->num_vcpu_soft_affinity > j)
                check_vnuma_affinity(gc, j, v->pnode,
                                     b_info->num_vcpu_soft_affinity,
                                     &b_info->vcpu_soft_affinity[j],
                                     "soft");
        }
    }

    /* Check vmemranges: sort by start address so overlap detection only
     * needs to compare adjacent entries. */
    qsort(state->vmemranges, state->num_vmemranges, sizeof(xen_vmemrange_t),
          compare_vmemrange);

    for (i = 0; i < state->num_vmemranges; i++) {
        if (state->vmemranges[i].end < state->vmemranges[i].start) {
                LOG(ERROR, "Vmemrange end < start");
                goto out;
        }
    }

    /* "i + 1 < n" rather than "i < n - 1": n is unsigned, so the latter
     * wraps around and walks off the array when there are no vmemranges. */
    for (i = 0; i + 1 < state->num_vmemranges; i++) {
        if (state->vmemranges[i].end > state->vmemranges[i+1].start) {
            LOG(ERROR,
                "Vmemranges overlapped, 0x%"PRIx64"-0x%"PRIx64", 0x%"PRIx64"-0x%"PRIx64,
                state->vmemranges[i].start, state->vmemranges[i].end,
                state->vmemranges[i+1].start, state->vmemranges[i+1].end);
            goto out;
        }
    }

    /* Check vdistances: a node must never be farther from itself than
     * from any other node. */
    for (i = 0; i < b_info->num_vnuma_nodes; i++) {
        uint32_t local_distance, remote_distance;

        v = &b_info->vnuma_nodes[i];

        /* Guard the distances[i] read below: previously a short (or
         * empty) distance table was read out of bounds. */
        if (i >= v->num_distances) {
            LOG(ERROR, "Missing local distance for vnode %u", i);
            goto out;
        }
        local_distance = v->distances[i];

        for (j = 0; j < v->num_distances; j++) {
            if (i == j) continue;
            remote_distance = v->distances[j];
            if (local_distance > remote_distance) {
                LOG(ERROR,
                    "Distance from %u to %u smaller than %u's local distance",
                    i, j, i);
                goto out;
            }
        }
    }

    rc = 0;
out:
    libxl_numainfo_list_free(ninfo, nr_nodes);
    libxl_bitmap_dispose(&cpumap);
    return rc;
}
210 
int libxl__vnuma_build_vmemrange_pv_generic(libxl__gc *gc,
                                            uint32_t domid,
                                            libxl_domain_build_info *b_info,
                                            libxl__domain_build_state *state)
{
    int i;
    uint64_t base = 0;
    xen_vmemrange_t *ranges = NULL;

    /* Lay the vnodes out back to back: exactly one contiguous
     * vmemrange per virtual node. */
    GCREALLOC_ARRAY(ranges, b_info->num_vnuma_nodes);

    for (i = 0; i < b_info->num_vnuma_nodes; i++) {
        libxl_vnode_info *vnode = &b_info->vnuma_nodes[i];

        ranges[i] = (xen_vmemrange_t) {
            .start = base,
            .end   = base + (vnode->memkb << 10), /* KiB -> bytes */
            .flags = 0,
            .nid   = i,
        };
        base = ranges[i].end;
    }

    state->vmemranges = ranges;
    state->num_vmemranges = i;

    return 0;
}
239 
240 /* Build vmemranges for PV guest */
int libxl__vnuma_build_vmemrange_pv(libxl__gc *gc,
                                    uint32_t domid,
                                    libxl_domain_build_info *b_info,
                                    libxl__domain_build_state *state)
{
    /* The caller must not have populated vmemranges already. */
    assert(!state->vmemranges);

    /* The actual layout is arch-specific, so defer to the arch hook. */
    return libxl__arch_vnuma_build_vmemrange(gc, domid, b_info, state);
}
249 
250 /* Build vmemranges for HVM guest */
int libxl__vnuma_build_vmemrange_hvm(libxl__gc *gc,
                                     uint32_t domid,
                                     libxl_domain_build_info *b_info,
                                     libxl__domain_build_state *state,
                                     struct xc_dom_image *dom)
{
    uint64_t hole_start, hole_end, next;
    int nid, nr_vmemrange;
    xen_vmemrange_t *vmemranges;
    int rc;

    /* Derive vmemranges from vnode size and memory hole.
     *
     * Guest physical address space layout:
     * [0, hole_start) [hole_start, hole_end) [hole_end, highmem_end)
     */
    /* The hole begins at whichever comes first: end of low memory or
     * start of the MMIO region. */
    hole_start = dom->lowmem_end < dom->mmio_start ?
        dom->lowmem_end : dom->mmio_start;
    /* ... and extends to at least 4GiB, or further if the MMIO region
     * runs past the 4GiB boundary. */
    hole_end = (dom->mmio_start + dom->mmio_size) > (1ULL << 32) ?
        (dom->mmio_start + dom->mmio_size) : (1ULL << 32);

    assert(state->vmemranges == NULL);

    next = 0;
    nr_vmemrange = 0;
    vmemranges = NULL;
    for (nid = 0; nid < b_info->num_vnuma_nodes; nid++) {
        libxl_vnode_info *p = &b_info->vnuma_nodes[nid];
        uint64_t remaining_bytes = p->memkb << 10; /* KiB -> bytes */

        /* Consider video ram belongs to vnode 0 */
        if (nid == 0) {
            if (p->memkb < b_info->video_memkb) {
                LOGD(ERROR, domid, "vnode 0 too small to contain video ram");
                rc = ERROR_INVAL;
                goto out;
            }
            remaining_bytes -= (b_info->video_memkb << 10);
        }

        /* Emit one or more vmemranges for this vnode, splitting the
         * vnode's memory around the hole if it straddles it. */
        while (remaining_bytes > 0) {
            uint64_t count = remaining_bytes;

            /* If the cursor sits inside the hole, jump past it. */
            if (next >= hole_start && next < hole_end)
                next = hole_end;
            /* If this range would run into the hole, truncate it at the
             * hole boundary; the remainder is emitted on the next
             * iteration, above the hole. */
            if ((next < hole_start) && (next + remaining_bytes >= hole_start))
                count = hole_start - next;

            GCREALLOC_ARRAY(vmemranges, nr_vmemrange+1);
            vmemranges[nr_vmemrange].start = next;
            vmemranges[nr_vmemrange].end = next + count;
            vmemranges[nr_vmemrange].flags = 0;
            vmemranges[nr_vmemrange].nid = nid;

            nr_vmemrange++;
            remaining_bytes -= count;
            next += count;
        }
    }

    state->vmemranges = vmemranges;
    state->num_vmemranges = nr_vmemrange;

    rc = 0;
out:
    return rc;
}
318 
319 /*
320  * Local variables:
321  * mode: C
322  * c-basic-offset: 4
323  * indent-tabs-mode: nil
324  * End:
325  */
326