1 /*
2 * Copyright (C) 2012 Citrix Ltd.
3 * Author Dario Faggioli <dario.faggioli@citrix.com>
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU Lesser General Public License as published
7 * by the Free Software Foundation; version 2.1 only. with the special
8 * exception on linking described in file LICENSE.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU Lesser General Public License for more details.
14 */
15
16 #include "libxl_osdeps.h" /* must come before any other headers */
17
18 #include <glob.h>
19
20 #include "libxl_internal.h"
21
22 /*
23 * What follows are helpers for generating all the k-combinations
24 * without repetitions of a set S with n elements in it. Formally
25 * speaking, they are subsets of k distinct elements of S and, if
26 * S is n elements big, the number of k-combinations is equal to the
27 * binomial coefficient C(n k)=n!/(k! * (n - k)!).
28 *
29 * The various subset are generated one after the other by calling
30 * comb_init() first, and, after that, comb_next()
31 * C(n k)-1 times. An iterator is used to store the current status
32 * of the whole generation operation (i.e., basically, the last
33 * combination that has been generated). As soon as all combinations
34 * have been generated, comb_next() will start returning 0 instead of
35 * 1. The same instance of the iterator and the same values for
36 * n and k _must_ be used for each call (if that doesn't happen, the
37 * result is unspecified).
38 *
39 * The algorithm is a well known one (see, for example, D. Knuth's "The
40 * Art of Computer Programming - Volume 4, Fascicle 3" and it produces
41 * the combinations in such a way that they (well, more precisely, their
42 * indexes it the array/map representing the set) come with lexicographic
43 * ordering.
44 *
45 * For example, with n = 5 and k = 3, calling comb_init()
46 * will generate { 0, 1, 2 }, while subsequent valid calls to
47 * comb_next() will produce the following:
48 * { { 0, 1, 3 }, { 0, 1, 4 },
49 * { 0, 2, 3 }, { 0, 2, 4 }, { 0, 3, 4 },
50 * { 1, 2, 3 }, { 1, 2, 4 }, { 1, 3, 4 },
51 * { 2, 3, 4 } }
52 *
53 * This is used by the automatic NUMA placement logic below.
54 */
55 typedef int* comb_iter_t;
56
comb_init(libxl__gc * gc,comb_iter_t * it,int n,int k)57 static int comb_init(libxl__gc *gc, comb_iter_t *it, int n, int k)
58 {
59 comb_iter_t new_iter;
60 int i;
61
62 if (n < k)
63 return 0;
64
65 /* First set is always { 0, 1, 2, ..., k-1 } */
66 GCNEW_ARRAY(new_iter, k);
67 for (i = 0; i < k; i++)
68 new_iter[i] = i;
69
70 *it = new_iter;
71 return 1;
72 }
73
comb_next(comb_iter_t it,int n,int k)74 static int comb_next(comb_iter_t it, int n, int k)
75 {
76 int i;
77
78 /*
79 * The idea here is to find the leftmost element from where
80 * we should start incrementing the indexes of the iterator.
81 * This means looking for the highest index that can be increased
82 * while still producing value smaller than n-1. In the example
83 * above, when dealing with { 0, 1, 4 }, such an element is the
84 * second one, as the third is already equal to 4 (which actually
85 * is n-1).
86 * Once we found from where to start, we increment that element
87 * and override the right-hand rest of the iterator with its
88 * successors, thus achieving lexicographic ordering.
89 *
90 * Regarding the termination of the generation process, when we
91 * manage in bringing n-k at the very first position of the iterator,
92 * we know that is the last valid combination ( { 2, 3, 4 }, with
93 * n - k = 5 - 2 = 2, in the example above), and thus we start
94 * returning 0 as soon as we cross that border.
95 */
96 for (i = k - 1; it[i] == n - k + i; i--) {
97 if (i <= 0)
98 return 0;
99 }
100 for (it[i]++, i++; i < k; i++)
101 it[i] = it[i - 1] + 1;
102 return 1;
103 }
104
105 /* NUMA automatic placement (see libxl_internal.h for details) */
106
107 /*
108 * This function turns a k-combination iterator into a node map,
109 * given another map, telling us which nodes should be considered.
110 *
111 * This means the bits that are set in suitable_nodemap and that
112 * corresponds to the indexes of the given combination are the ones
113 * that will be set in nodemap.
114 *
115 * For example, given a fully set suitable_nodemap, if the iterator
116 * represents the combination { 0, 2, 4}, nodmeap will have bits #0,
117 * #2 and #4 set.
118 * On the other hand, if, say, suitable_nodemap=01011011, the same
119 * iterator will cause bits #1, #4 and #7 of nodemap to be set.
120 */
comb_get_nodemap(comb_iter_t it,libxl_bitmap * suitable_nodemap,libxl_bitmap * nodemap,int k)121 static void comb_get_nodemap(comb_iter_t it, libxl_bitmap *suitable_nodemap,
122 libxl_bitmap *nodemap, int k)
123 {
124 int i, m = 0, n = 0;
125
126 libxl_bitmap_set_none(nodemap);
127 libxl_for_each_set_bit(i, *suitable_nodemap) {
128 /* Check wether the n-th set bit of suitable_nodemap
129 * matches with the m-th element of the iterator (and,
130 * only if it does, advance to the next one) */
131 if (m < k && n == it[m]) {
132 libxl_bitmap_set(nodemap, i);
133 m++;
134 }
135 n++;
136 }
137 }
138
139 /* Retrieve the number of cpus that the nodes that are part of the nodemap
140 * span and are also set in suitable_cpumap. */
nodemap_to_nr_cpus(libxl_cputopology * tinfo,int nr_cpus,const libxl_bitmap * suitable_cpumap,const libxl_bitmap * nodemap)141 static int nodemap_to_nr_cpus(libxl_cputopology *tinfo, int nr_cpus,
142 const libxl_bitmap *suitable_cpumap,
143 const libxl_bitmap *nodemap)
144 {
145 int i, nodes_cpus = 0;
146
147 for (i = 0; i < nr_cpus; i++) {
148 if (libxl_bitmap_test(suitable_cpumap, i) &&
149 libxl_bitmap_test(nodemap, tinfo[i].node))
150 nodes_cpus++;
151 }
152 return nodes_cpus;
153 }
154
155 /* Retrieve the amount of free memory within the nodemap */
nodemap_to_free_memkb(libxl_numainfo * ninfo,libxl_bitmap * nodemap)156 static uint32_t nodemap_to_free_memkb(libxl_numainfo *ninfo,
157 libxl_bitmap *nodemap)
158 {
159 uint32_t free_memkb = 0;
160 int i;
161
162 libxl_for_each_set_bit(i, *nodemap)
163 free_memkb += ninfo[i].free / 1024;
164
165 return free_memkb;
166 }
167
168 /* Retrieve the number of vcpus able to run on the nodes in nodemap */
nodemap_to_nr_vcpus(libxl__gc * gc,int vcpus_on_node[],const libxl_bitmap * nodemap)169 static int nodemap_to_nr_vcpus(libxl__gc *gc, int vcpus_on_node[],
170 const libxl_bitmap *nodemap)
171 {
172 int i, nr_vcpus = 0;
173
174 libxl_for_each_set_bit(i, *nodemap)
175 nr_vcpus += vcpus_on_node[i];
176
177 return nr_vcpus;
178 }
179
180 /* Number of vcpus able to run on the cpus of the various nodes
181 * (reported by filling the array vcpus_on_node[]). */
nr_vcpus_on_nodes(libxl__gc * gc,libxl_cputopology * tinfo,size_t tinfo_elements,const libxl_bitmap * suitable_cpumap,int vcpus_on_node[])182 static int nr_vcpus_on_nodes(libxl__gc *gc, libxl_cputopology *tinfo,
183 size_t tinfo_elements,
184 const libxl_bitmap *suitable_cpumap,
185 int vcpus_on_node[])
186 {
187 libxl_dominfo *dinfo = NULL;
188 libxl_bitmap dom_nodemap, nodes_counted;
189 int nr_doms, nr_cpus;
190 int i, j, k;
191
192 dinfo = libxl_list_domain(CTX, &nr_doms);
193 if (dinfo == NULL)
194 return ERROR_FAIL;
195
196 if (libxl_node_bitmap_alloc(CTX, &nodes_counted, 0) < 0) {
197 libxl_dominfo_list_free(dinfo, nr_doms);
198 return ERROR_FAIL;
199 }
200
201 if (libxl_node_bitmap_alloc(CTX, &dom_nodemap, 0) < 0) {
202 libxl_bitmap_dispose(&nodes_counted);
203 libxl_dominfo_list_free(dinfo, nr_doms);
204 return ERROR_FAIL;
205 }
206
207 for (i = 0; i < nr_doms; i++) {
208 libxl_vcpuinfo *vinfo = NULL;
209 int nr_dom_vcpus = 0;
210 libxl_cpupoolinfo cpupool_info;
211 int cpupool;
212
213 libxl_cpupoolinfo_init(&cpupool_info);
214
215 cpupool = libxl__domain_cpupool(gc, dinfo[i].domid);
216 if (cpupool < 0)
217 goto next;
218 if (libxl_cpupool_info(CTX, &cpupool_info, cpupool))
219 goto next;
220
221 vinfo = libxl_list_vcpu(CTX, dinfo[i].domid, &nr_dom_vcpus, &nr_cpus);
222 if (vinfo == NULL)
223 goto next;
224
225 /* Retrieve the domain's node-affinity map */
226 libxl_domain_get_nodeaffinity(CTX, dinfo[i].domid, &dom_nodemap);
227
228 for (j = 0; j < nr_dom_vcpus; j++) {
229 /*
230 * For each vcpu of each domain, it must have both vcpu-affinity
231 * and node-affinity to (a pcpu belonging to) a certain node to
232 * cause an increment in the corresponding element of the array.
233 *
234 * Note that we also need to check whether the cpu actually
235 * belongs to the domain's cpupool (the cpupool of the domain
236 * being checked). In fact, it could be that the vcpu has affinity
237 * with cpus in suitable_cpumask, but that are not in its own
238 * cpupool, and we don't want to consider those!
239 */
240 libxl_bitmap_set_none(&nodes_counted);
241 libxl_for_each_set_bit(k, vinfo[j].cpumap) {
242 if (k >= tinfo_elements)
243 break;
244 int node = tinfo[k].node;
245
246 if (libxl_bitmap_test(suitable_cpumap, k) &&
247 libxl_bitmap_test(&cpupool_info.cpumap, k) &&
248 libxl_bitmap_test(&dom_nodemap, node) &&
249 !libxl_bitmap_test(&nodes_counted, node)) {
250 libxl_bitmap_set(&nodes_counted, node);
251 vcpus_on_node[node]++;
252 }
253 }
254 }
255
256 next:
257 libxl_cpupoolinfo_dispose(&cpupool_info);
258 libxl_vcpuinfo_list_free(vinfo, nr_dom_vcpus);
259 }
260
261 libxl_bitmap_dispose(&dom_nodemap);
262 libxl_bitmap_dispose(&nodes_counted);
263 libxl_dominfo_list_free(dinfo, nr_doms);
264 return 0;
265 }
266
267 /*
268 * This function tries to figure out if the host has a consistent number
269 * of cpus along all its NUMA nodes. In fact, if that is the case, we can
270 * calculate the minimum number of nodes needed for a domain by just
271 * dividing its total number of vcpus by this value computed here.
272 * However, we are not allowed to assume that all the nodes have the
273 * same number of cpus. Therefore, in case discrepancies among different
274 * nodes are found, this function just returns 0, for the caller to know
275 * it shouldn't rely on this 'optimization', and sort out things in some
276 * other way (by doing something basic, like starting trying with
277 * candidates with just one node).
278 */
count_cpus_per_node(libxl_cputopology * tinfo,int nr_cpus,int nr_nodes)279 static int count_cpus_per_node(libxl_cputopology *tinfo, int nr_cpus,
280 int nr_nodes)
281 {
282 int cpus_per_node = 0;
283 int j, i;
284
285 /* This makes sense iff # of PCPUs is the same for all nodes */
286 for (j = 0; j < nr_nodes; j++) {
287 int curr_cpus = 0;
288
289 for (i = 0; i < nr_cpus; i++) {
290 if (tinfo[i].node == j)
291 curr_cpus++;
292 }
293 /* So, if the above does not hold, turn the whole thing off! */
294 cpus_per_node = cpus_per_node == 0 ? curr_cpus : cpus_per_node;
295 if (cpus_per_node != curr_cpus)
296 return 0;
297 }
298 return cpus_per_node;
299 }
300
301 /*
302 * Looks for the placement candidates that satisfyies some specific
303 * conditions and return the best one according to the provided
304 * comparison function.
305 */
libxl__get_numa_candidate(libxl__gc * gc,uint64_t min_free_memkb,int min_cpus,int min_nodes,int max_nodes,const libxl_bitmap * suitable_cpumap,libxl__numa_candidate_cmpf numa_cmpf,libxl__numa_candidate * cndt_out,int * cndt_found)306 int libxl__get_numa_candidate(libxl__gc *gc,
307 uint64_t min_free_memkb, int min_cpus,
308 int min_nodes, int max_nodes,
309 const libxl_bitmap *suitable_cpumap,
310 libxl__numa_candidate_cmpf numa_cmpf,
311 libxl__numa_candidate *cndt_out,
312 int *cndt_found)
313 {
314 libxl__numa_candidate new_cndt;
315 libxl_cputopology *tinfo = NULL;
316 libxl_numainfo *ninfo = NULL;
317 int nr_nodes = 0, nr_suit_nodes, nr_cpus = 0;
318 libxl_bitmap suitable_nodemap, nodemap;
319 int *vcpus_on_node, rc = 0;
320
321 libxl_bitmap_init(&nodemap);
322 libxl_bitmap_init(&suitable_nodemap);
323 libxl__numa_candidate_init(&new_cndt);
324
325 /* Get platform info and prepare the map for testing the combinations */
326 ninfo = libxl_get_numainfo(CTX, &nr_nodes);
327 if (ninfo == NULL)
328 return ERROR_FAIL;
329
330 if (nr_nodes <= 1) {
331 *cndt_found = 0;
332 goto out;
333 }
334
335 GCNEW_ARRAY(vcpus_on_node, nr_nodes);
336
337 /*
338 * The good thing about this solution is that it is based on heuristics
339 * (implemented in numa_cmpf() ), but we at least can evaluate it on
340 * all the possible placement candidates. That can happen because the
341 * number of nodes present in current NUMA systems is quite small.
342 * In fact, even if a sum of binomials is involved, if the system has
343 * up to 16 nodes it "only" takes 65535 steps. This is fine, as the
344 * number of nodes the biggest NUMA systems provide at the time of this
345 * writing is 8 (and it will probably continue to be so for a while).
346 * However, computanional complexity would explode on systems bigger
347 * than that, and it's really important we avoid trying to run this
348 * on monsters with 32, 64 or more nodes (if they ever pop into being).
349 * Therefore, here it comes a safety catch that disables the algorithm
350 * for the cases when it wouldn't work well.
351 */
352 if (nr_nodes > 16) {
353 /* Log we did nothing and return 0, as no real error occurred */
354 LOG(WARN, "System has %d NUMA nodes, which is too big for the "
355 "placement algorithm to work effectively: skipping it. "
356 "Consider manually pinning the vCPUs and/or looking at "
357 "cpupools for manually partitioning the system.",
358 nr_nodes);
359 *cndt_found = 0;
360 goto out;
361 }
362
363 tinfo = libxl_get_cpu_topology(CTX, &nr_cpus);
364 if (tinfo == NULL) {
365 rc = ERROR_FAIL;
366 goto out;
367 }
368
369 rc = libxl_node_bitmap_alloc(CTX, &nodemap, 0);
370 if (rc)
371 goto out;
372 rc = libxl__numa_candidate_alloc(gc, &new_cndt);
373 if (rc)
374 goto out;
375
376 /* Allocate and prepare the map of the node that can be utilized for
377 * placement, basing on the map of suitable cpus. */
378 rc = libxl_node_bitmap_alloc(CTX, &suitable_nodemap, 0);
379 if (rc)
380 goto out;
381 rc = libxl_cpumap_to_nodemap(CTX, suitable_cpumap, &suitable_nodemap);
382 if (rc)
383 goto out;
384
385 /*
386 * Later on, we will try to figure out how many vcpus are runnable on
387 * each candidate (as a part of choosing the best one of them). That
388 * requires going through all the vcpus of all the domains and check
389 * their affinities. So, instead of doing that for each candidate,
390 * let's count here the number of vcpus runnable on each node, so that
391 * all we have to do later is summing up the right elements of the
392 * vcpus_on_node array.
393 */
394 rc = nr_vcpus_on_nodes(gc, tinfo, nr_cpus, suitable_cpumap, vcpus_on_node);
395 if (rc)
396 goto out;
397
398 /*
399 * If the minimum number of NUMA nodes is not explicitly specified
400 * (i.e., min_nodes == 0), we try to figure out a sensible number of nodes
401 * from where to start generating candidates, if possible (or just start
402 * from 1 otherwise). The maximum number of nodes should not exceed the
403 * number of existent NUMA nodes on the host, or the candidate generation
404 * won't work properly.
405 */
406 if (!min_nodes) {
407 int cpus_per_node;
408
409 cpus_per_node = count_cpus_per_node(tinfo, nr_cpus, nr_nodes);
410 if (cpus_per_node == 0)
411 min_nodes = 1;
412 else
413 min_nodes = (min_cpus + cpus_per_node - 1) / cpus_per_node;
414 }
415 /* We also need to be sure we do not exceed the number of
416 * nodes we are allowed to use. */
417 nr_suit_nodes = libxl_bitmap_count_set(&suitable_nodemap);
418
419 if (min_nodes > nr_suit_nodes)
420 min_nodes = nr_suit_nodes;
421 if (!max_nodes || max_nodes > nr_suit_nodes)
422 max_nodes = nr_suit_nodes;
423 if (min_nodes > max_nodes) {
424 LOG(ERROR, "Inconsistent minimum or maximum number of guest nodes");
425 rc = ERROR_INVAL;
426 goto out;
427 }
428
429 /* This is up to the caller to be disposed */
430 rc = libxl__numa_candidate_alloc(gc, cndt_out);
431 if (rc)
432 goto out;
433
434 /*
435 * Consider all the combinations with sizes in [min_nodes, max_nodes]
436 * (see comb_init() and comb_next()). Note that, since the fewer the
437 * number of nodes the better, it is guaranteed that any candidate
438 * found during the i-eth step will be better than any other one we
439 * could find during the (i+1)-eth and all the subsequent steps (they
440 * all will have more nodes). It's thus pointless to keep going if
441 * we already found something.
442 */
443 *cndt_found = 0;
444 while (min_nodes <= max_nodes && *cndt_found == 0) {
445 comb_iter_t comb_iter;
446 int comb_ok;
447
448 /*
449 * And here it is. Each step of this cycle generates a combination of
450 * nodes as big as min_nodes mandates. Each of these combinations is
451 * checked against the constraints provided by the caller (namely,
452 * amount of free memory and number of cpus) and it can concur to
453 * become our best placement iff it passes the check.
454 */
455 for (comb_ok = comb_init(gc, &comb_iter, nr_suit_nodes, min_nodes);
456 comb_ok;
457 comb_ok = comb_next(comb_iter, nr_suit_nodes, min_nodes)) {
458 uint64_t nodes_free_memkb;
459 int nodes_cpus;
460
461 /* Get the nodemap for the combination, only considering
462 * suitable nodes. */
463 comb_get_nodemap(comb_iter, &suitable_nodemap,
464 &nodemap, min_nodes);
465
466 /* If there is not enough memory in this combination, skip it
467 * and go generating the next one... */
468 nodes_free_memkb = nodemap_to_free_memkb(ninfo, &nodemap);
469 if (min_free_memkb && nodes_free_memkb < min_free_memkb)
470 continue;
471
472 /* And the same applies if this combination is short in cpus */
473 nodes_cpus = nodemap_to_nr_cpus(tinfo, nr_cpus, suitable_cpumap,
474 &nodemap);
475 if (min_cpus && nodes_cpus < min_cpus)
476 continue;
477
478 /*
479 * Conditions are met, we can compare this candidate with the
480 * current best one (if any).
481 */
482 libxl__numa_candidate_put_nodemap(gc, &new_cndt, &nodemap);
483 new_cndt.nr_vcpus = nodemap_to_nr_vcpus(gc, vcpus_on_node,
484 &nodemap);
485 new_cndt.free_memkb = nodes_free_memkb;
486 new_cndt.nr_nodes = libxl_bitmap_count_set(&nodemap);
487 new_cndt.nr_cpus = nodes_cpus;
488
489 /*
490 * Check if the new candidate we is better the what we found up
491 * to now by means of the comparison function. If no comparison
492 * function is provided, just return as soon as we find our first
493 * candidate.
494 */
495 if (*cndt_found == 0 || numa_cmpf(&new_cndt, cndt_out) < 0) {
496 *cndt_found = 1;
497
498 LOG(DEBUG, "New best NUMA placement candidate found: "
499 "nr_nodes=%d, nr_cpus=%d, nr_vcpus=%d, "
500 "free_memkb=%"PRIu64"", new_cndt.nr_nodes,
501 new_cndt.nr_cpus, new_cndt.nr_vcpus,
502 new_cndt.free_memkb / 1024);
503
504 libxl__numa_candidate_put_nodemap(gc, cndt_out, &nodemap);
505 cndt_out->nr_vcpus = new_cndt.nr_vcpus;
506 cndt_out->free_memkb = new_cndt.free_memkb;
507 cndt_out->nr_nodes = new_cndt.nr_nodes;
508 cndt_out->nr_cpus = new_cndt.nr_cpus;
509
510 if (numa_cmpf == NULL)
511 break;
512 }
513 }
514 min_nodes++;
515 }
516
517 if (*cndt_found == 0)
518 LOG(NOTICE, "NUMA placement failed, performance might be affected");
519
520 out:
521 libxl_bitmap_dispose(&nodemap);
522 libxl_bitmap_dispose(&suitable_nodemap);
523 libxl__numa_candidate_dispose(&new_cndt);
524 libxl_numainfo_list_free(ninfo, nr_nodes);
525 libxl_cputopology_list_free(tinfo, nr_cpus);
526 return rc;
527 }
528
529 /*
530 * Local variables:
531 * mode: C
532 * c-basic-offset: 4
533 * indent-tabs-mode: nil
534 * End:
535 */
536