// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
 *
 * The io_pagetable is the top of the data structure that maps IOVAs to PFNs.
 * The PFNs can be placed into an iommu_domain, or returned to the caller as a
 * page list for access by an in-kernel user.
 *
 * The data structure uses iopt_pages to optimize the storage of the PFNs
 * between the domains and the xarray.
 */
#include <linux/iommufd.h>
#include <linux/lockdep.h>
#include <linux/iommu.h>
#include <linux/sched/mm.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/errno.h>

#include "io_pagetable.h"
#include "double_span.h"

struct iopt_pages_list {
        struct iopt_pages *pages;
        struct iopt_area *area;
        struct list_head next;
        unsigned long start_byte;
        unsigned long length;
};

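/*
 * Start the contiguous-area iterator: find the area covering iova and prime
 * iter so that iopt_area_contig_next() can walk areas that are contiguous in
 * IOVA space up to last_iova. Returns NULL if the first area is missing or
 * not yet fully initialized (area->pages == NULL).
 */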
struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
                                        struct io_pagetable *iopt,
                                        unsigned long iova,
                                        unsigned long last_iova)
{
        lockdep_assert_held(&iopt->iova_rwsem);

        iter->cur_iova = iova;
        iter->last_iova = last_iova;
        iter->area = iopt_area_iter_first(iopt, iova, iova);
        if (!iter->area)
                return NULL;
        if (!iter->area->pages) {
                iter->area = NULL;
                return NULL;
        }
        return iter->area;
}

struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
{
        unsigned long last_iova;

        if (!iter->area)
                return NULL;
        last_iova = iopt_area_last_iova(iter->area);
        if (iter->last_iova <= last_iova)
                return NULL;

        iter->cur_iova = last_iova + 1;
        iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
                                         iter->last_iova);
        if (!iter->area)
                return NULL;
        if (iter->cur_iova != iopt_area_iova(iter->area) ||
            !iter->area->pages) {
                iter->area = NULL;
                return NULL;
        }
        return iter->area;
}

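/*
 * Helpers for iopt_alloc_iova(): check whether a span reported by the
 * interval tree iterators is still large enough to hold an allocation of the
 * requested length once its start is rounded up to iova_alignment and offset
 * by page_offset.
 */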
static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
                                    unsigned long length,
                                    unsigned long iova_alignment,
                                    unsigned long page_offset)
{
        if (span->is_used || span->last_hole - span->start_hole < length - 1)
                return false;

        span->start_hole = ALIGN(span->start_hole, iova_alignment) |
                           page_offset;
        if (span->start_hole > span->last_hole ||
            span->last_hole - span->start_hole < length - 1)
                return false;
        return true;
}

static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
                                    unsigned long length,
                                    unsigned long iova_alignment,
                                    unsigned long page_offset)
{
        if (span->is_hole || span->last_used - span->start_used < length - 1)
                return false;

        span->start_used = ALIGN(span->start_used, iova_alignment) |
                           page_offset;
        if (span->start_used > span->last_used ||
            span->last_used - span->start_used < length - 1)
                return false;
        return true;
}

/*
 * Automatically find a block of IOVA that is not being used and not reserved.
 * Does not return a 0 IOVA even if it is valid.
 */
static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
                           unsigned long uptr, unsigned long length)
{
        unsigned long page_offset = uptr % PAGE_SIZE;
        struct interval_tree_double_span_iter used_span;
        struct interval_tree_span_iter allowed_span;
        unsigned long iova_alignment;

        lockdep_assert_held(&iopt->iova_rwsem);

        /* Protect roundup_pow_of_two() from overflow */
        if (length == 0 || length >= ULONG_MAX / 2)
                return -EOVERFLOW;

        /*
         * Keep the alignment present in the uptr when building the IOVA; this
         * increases the chance we can map a THP.
         */
        if (!uptr)
                iova_alignment = roundup_pow_of_two(length);
        else
                iova_alignment = min_t(unsigned long,
                                       roundup_pow_of_two(length),
                                       1UL << __ffs64(uptr));

        if (iova_alignment < iopt->iova_alignment)
                return -EINVAL;

        interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
                                    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
                if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
                        allowed_span.start_used = PAGE_SIZE;
                        allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
                        allowed_span.is_hole = false;
                }

                if (!__alloc_iova_check_used(&allowed_span, length,
                                             iova_alignment, page_offset))
                        continue;

                interval_tree_for_each_double_span(
                        &used_span, &iopt->reserved_itree, &iopt->area_itree,
                        allowed_span.start_used, allowed_span.last_used) {
                        if (!__alloc_iova_check_hole(&used_span, length,
                                                     iova_alignment,
                                                     page_offset))
                                continue;

                        *iova = used_span.start_hole;
                        return 0;
                }
        }
        return -ENOSPC;
}

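/*
 * Validate a caller-chosen IOVA range: it must honor the current
 * iova_alignment, must not wrap, must not intersect any reserved IOVA, and
 * must not overlap an existing area.
 */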
static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
                           unsigned long length)
{
        unsigned long last;

        lockdep_assert_held(&iopt->iova_rwsem);

        if ((iova & (iopt->iova_alignment - 1)))
                return -EINVAL;

        if (check_add_overflow(iova, length - 1, &last))
                return -EOVERFLOW;

        /* No reserved IOVA intersects the range */
        if (iopt_reserved_iter_first(iopt, iova, last))
                return -EINVAL;

        /* Check that there is not already a mapping in the range */
        if (iopt_area_iter_first(iopt, iova, last))
                return -EEXIST;
        return 0;
}

/*
 * The area takes a slice of the pages from start_byte to start_byte + length
 */
static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
                            struct iopt_pages *pages, unsigned long iova,
                            unsigned long start_byte, unsigned long length,
                            int iommu_prot)
{
        lockdep_assert_held_write(&iopt->iova_rwsem);

        if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
                return -EPERM;

        area->iommu_prot = iommu_prot;
        area->page_offset = start_byte % PAGE_SIZE;
        if (area->page_offset & (iopt->iova_alignment - 1))
                return -EINVAL;

        area->node.start = iova;
        if (check_add_overflow(iova, length - 1, &area->node.last))
                return -EOVERFLOW;

        area->pages_node.start = start_byte / PAGE_SIZE;
        if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
                return -EOVERFLOW;
        area->pages_node.last = area->pages_node.last / PAGE_SIZE;
        if (WARN_ON(area->pages_node.last >= pages->npages))
                return -EOVERFLOW;

        /*
         * The area is inserted with a NULL pages indicating it is not fully
         * initialized yet.
         */
        area->iopt = iopt;
        interval_tree_insert(&area->node, &iopt->area_itree);
        return 0;
}

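/*
 * Allocate one iopt_area per list entry and insert it into the area_itree so
 * that the IOVA range is reserved. The areas still have a NULL pages pointer;
 * iopt_map_pages() completes them once the domains are filled.
 */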
static int iopt_alloc_area_pages(struct io_pagetable *iopt,
                                 struct list_head *pages_list,
                                 unsigned long length, unsigned long *dst_iova,
                                 int iommu_prot, unsigned int flags)
{
        struct iopt_pages_list *elm;
        unsigned long iova;
        int rc = 0;

        list_for_each_entry(elm, pages_list, next) {
                elm->area = kzalloc(sizeof(*elm->area), GFP_KERNEL_ACCOUNT);
                if (!elm->area)
                        return -ENOMEM;
        }

        down_write(&iopt->iova_rwsem);
        if ((length & (iopt->iova_alignment - 1)) || !length) {
                rc = -EINVAL;
                goto out_unlock;
        }

        if (flags & IOPT_ALLOC_IOVA) {
                /* Use the first entry to guess the ideal IOVA alignment */
                elm = list_first_entry(pages_list, struct iopt_pages_list,
                                       next);
                rc = iopt_alloc_iova(
                        iopt, dst_iova,
                        (uintptr_t)elm->pages->uptr + elm->start_byte, length);
                if (rc)
                        goto out_unlock;
                if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
                    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
                        rc = -EINVAL;
                        goto out_unlock;
                }
        } else {
                rc = iopt_check_iova(iopt, *dst_iova, length);
                if (rc)
                        goto out_unlock;
        }

        /*
         * Areas are created with a NULL pages so that the IOVA space is
         * reserved and we can unlock the iova_rwsem.
         */
        iova = *dst_iova;
        list_for_each_entry(elm, pages_list, next) {
                rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
                                      elm->start_byte, elm->length, iommu_prot);
                if (rc)
                        goto out_unlock;
                iova += elm->length;
        }

out_unlock:
        up_write(&iopt->iova_rwsem);
        return rc;
}

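/* Undo an area that was inserted but never had its pages attached. */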
static void iopt_abort_area(struct iopt_area *area)
{
        if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
                WARN_ON(area->pages);
        if (area->iopt) {
                down_write(&area->iopt->iova_rwsem);
                interval_tree_remove(&area->node, &area->iopt->area_itree);
                up_write(&area->iopt->iova_rwsem);
        }
        kfree(area);
}

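/*
 * Free every element of a pages_list, aborting any half-created areas and
 * dropping the iopt_pages references the list still owns.
 */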
void iopt_free_pages_list(struct list_head *pages_list)
{
        struct iopt_pages_list *elm;

        while ((elm = list_first_entry_or_null(pages_list,
                                               struct iopt_pages_list, next))) {
                if (elm->area)
                        iopt_abort_area(elm->area);
                if (elm->pages)
                        iopt_put_pages(elm->pages);
                list_del(&elm->next);
                kfree(elm);
        }
}

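/*
 * Load the PFNs for every area on the list into all attached iommu_domains.
 * On failure the areas that were already filled are unfilled again so the
 * domains are left unchanged.
 */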
static int iopt_fill_domains_pages(struct list_head *pages_list)
{
        struct iopt_pages_list *undo_elm;
        struct iopt_pages_list *elm;
        int rc;

        list_for_each_entry(elm, pages_list, next) {
                rc = iopt_area_fill_domains(elm->area, elm->pages);
                if (rc)
                        goto err_undo;
        }
        return 0;

err_undo:
        list_for_each_entry(undo_elm, pages_list, next) {
                if (undo_elm == elm)
                        break;
                iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
        }
        return rc;
}

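/*
 * Map a list of iopt_pages into the io_pagetable: reserve the IOVA, load the
 * PFNs into all attached domains, and only then publish area->pages. The
 * pages and area references move from the list into the areas.
 */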
int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
                   unsigned long length, unsigned long *dst_iova,
                   int iommu_prot, unsigned int flags)
{
        struct iopt_pages_list *elm;
        int rc;

        rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
                                   iommu_prot, flags);
        if (rc)
                return rc;

        down_read(&iopt->domains_rwsem);
        rc = iopt_fill_domains_pages(pages_list);
        if (rc)
                goto out_unlock_domains;

        down_write(&iopt->iova_rwsem);
        list_for_each_entry(elm, pages_list, next) {
                /*
                 * area->pages must be set inside the domains_rwsem to ensure
                 * any newly added domains will get filled. Moves the reference
                 * in from the list.
                 */
                elm->area->pages = elm->pages;
                elm->pages = NULL;
                elm->area = NULL;
        }
        up_write(&iopt->iova_rwsem);
out_unlock_domains:
        up_read(&iopt->domains_rwsem);
        return rc;
}

/**
 * iopt_map_user_pages() - Map a user VA to an iova in the io page table
 * @ictx: iommufd_ctx the iopt is part of
 * @iopt: io_pagetable to act on
 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
 *        the chosen iova on output. Otherwise it is the iova to map to on input
 * @uptr: User VA to map
 * @length: Number of bytes to map
 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
 * @flags: IOPT_ALLOC_IOVA or zero
 *
 * iova, uptr, and length must be aligned to iova_alignment. For domain backed
 * page tables this will pin the pages and load them into the domain at iova.
 * For non-domain page tables this will only set up a lazy reference and the
 * caller must use iopt_access_pages() to touch them.
 *
 * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
 * destroyed.
 */
int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
                        unsigned long *iova, void __user *uptr,
                        unsigned long length, int iommu_prot,
                        unsigned int flags)
{
        struct iopt_pages_list elm = {};
        LIST_HEAD(pages_list);
        int rc;

        elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
        if (IS_ERR(elm.pages))
                return PTR_ERR(elm.pages);
        if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
            elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
                elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
        elm.start_byte = uptr - elm.pages->uptr;
        elm.length = length;
        list_add(&elm.next, &pages_list);

        rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
        if (rc) {
                if (elm.area)
                        iopt_abort_area(elm.area);
                if (elm.pages)
                        iopt_put_pages(elm.pages);
                return rc;
        }
        return 0;
}

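/*
 * Build a pages_list describing the iopt_pages slices that back the given
 * IOVA range. The range must be fully covered by contiguous areas; each list
 * element takes a reference on its iopt_pages, which the caller releases with
 * iopt_free_pages_list().
 */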
int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
                   unsigned long length, struct list_head *pages_list)
{
        struct iopt_area_contig_iter iter;
        unsigned long last_iova;
        struct iopt_area *area;
        int rc;

        if (!length)
                return -EINVAL;
        if (check_add_overflow(iova, length - 1, &last_iova))
                return -EOVERFLOW;

        down_read(&iopt->iova_rwsem);
        iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
                struct iopt_pages_list *elm;
                unsigned long last = min(last_iova, iopt_area_last_iova(area));

                elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
                if (!elm) {
                        rc = -ENOMEM;
                        goto err_free;
                }
                elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
                elm->pages = area->pages;
                elm->length = (last - iter.cur_iova) + 1;
                kref_get(&elm->pages->kref);
                list_add_tail(&elm->next, pages_list);
        }
        if (!iopt_area_contig_done(&iter)) {
                rc = -ENOENT;
                goto err_free;
        }
        up_read(&iopt->iova_rwsem);
        return 0;
err_free:
        up_read(&iopt->iova_rwsem);
        iopt_free_pages_list(pages_list);
        return rc;
}

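/*
 * Unmap every area fully contained in [start, last]. Areas still referenced
 * by an access are flushed through iommufd_access_notify_unmap() and the walk
 * is restarted, since the locks must be dropped to deliver the notification.
 */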
static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
                                 unsigned long last, unsigned long *unmapped)
{
        struct iopt_area *area;
        unsigned long unmapped_bytes = 0;
        int rc = -ENOENT;

        /*
         * The domains_rwsem must be held in read mode any time any area->pages
         * is NULL. This prevents domain attach/detach from running
         * concurrently with cleaning up the area.
         */
again:
        down_read(&iopt->domains_rwsem);
        down_write(&iopt->iova_rwsem);
        while ((area = iopt_area_iter_first(iopt, start, last))) {
                unsigned long area_last = iopt_area_last_iova(area);
                unsigned long area_first = iopt_area_iova(area);
                struct iopt_pages *pages;

                /* Userspace should not race map/unmaps of the same area */
                if (!area->pages) {
                        rc = -EBUSY;
                        goto out_unlock_iova;
                }

                if (area_first < start || area_last > last) {
                        rc = -ENOENT;
                        goto out_unlock_iova;
                }

                /*
                 * num_accesses writers must hold the iova_rwsem too, so we can
                 * safely read it under the write side of the iova_rwsem
                 * without the pages->mutex.
                 */
                if (area->num_accesses) {
                        start = area_first;
                        area->prevent_access = true;
                        up_write(&iopt->iova_rwsem);
                        up_read(&iopt->domains_rwsem);
                        iommufd_access_notify_unmap(iopt, area_first,
                                                    iopt_area_length(area));
                        if (WARN_ON(READ_ONCE(area->num_accesses)))
                                return -EDEADLOCK;
                        goto again;
                }

                pages = area->pages;
                area->pages = NULL;
                up_write(&iopt->iova_rwsem);

                iopt_area_unfill_domains(area, pages);
                iopt_abort_area(area);
                iopt_put_pages(pages);

                unmapped_bytes += area_last - area_first + 1;

                down_write(&iopt->iova_rwsem);
        }
        if (unmapped_bytes)
                rc = 0;

out_unlock_iova:
        up_write(&iopt->iova_rwsem);
        up_read(&iopt->domains_rwsem);
        if (unmapped)
                *unmapped = unmapped_bytes;
        return rc;
}

/**
 * iopt_unmap_iova() - Remove a range of iova
 * @iopt: io_pagetable to act on
 * @iova: Starting iova to unmap
 * @length: Number of bytes to unmap
 * @unmapped: Return number of bytes unmapped
 *
 * The requested range must be a superset of existing ranges.
 * Splitting/truncating IOVA mappings is not allowed.
 */
int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
                    unsigned long length, unsigned long *unmapped)
{
        unsigned long iova_last;

        if (!length)
                return -EINVAL;

        if (check_add_overflow(iova, length - 1, &iova_last))
                return -EOVERFLOW;

        return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
}

int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
{
        int rc;

        rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
        /* If the IOVAs are empty then unmap all succeeds */
        if (rc == -ENOENT)
                return 0;
        return rc;
}

/* The caller must always free all the nodes in the allowed_iova rb_root. */
int iopt_set_allow_iova(struct io_pagetable *iopt,
                        struct rb_root_cached *allowed_iova)
{
        struct iopt_allowed *allowed;

        down_write(&iopt->iova_rwsem);
        swap(*allowed_iova, iopt->allowed_itree);

        for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
             allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
                if (iopt_reserved_iter_first(iopt, allowed->node.start,
                                             allowed->node.last)) {
                        swap(*allowed_iova, iopt->allowed_itree);
                        up_write(&iopt->iova_rwsem);
                        return -EADDRINUSE;
                }
        }
        up_write(&iopt->iova_rwsem);
        return 0;
}

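/*
 * Mark [start, last] as reserved on behalf of @owner so the allocator and
 * mapping paths will refuse it. Fails if the range already contains an area
 * or an allowed range.
 */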
int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
                      unsigned long last, void *owner)
{
        struct iopt_reserved *reserved;

        lockdep_assert_held_write(&iopt->iova_rwsem);

        if (iopt_area_iter_first(iopt, start, last) ||
            iopt_allowed_iter_first(iopt, start, last))
                return -EADDRINUSE;

        reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
        if (!reserved)
                return -ENOMEM;
        reserved->node.start = start;
        reserved->node.last = last;
        reserved->owner = owner;
        interval_tree_insert(&reserved->node, &iopt->reserved_itree);
        return 0;
}

static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
        struct iopt_reserved *reserved, *next;

        lockdep_assert_held_write(&iopt->iova_rwsem);

        for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
             reserved = next) {
                next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);

                if (reserved->owner == owner) {
                        interval_tree_remove(&reserved->node,
                                             &iopt->reserved_itree);
                        kfree(reserved);
                }
        }
}

void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
        down_write(&iopt->iova_rwsem);
        __iopt_remove_reserved_iova(iopt, owner);
        up_write(&iopt->iova_rwsem);
}

void iopt_init_table(struct io_pagetable *iopt)
{
        init_rwsem(&iopt->iova_rwsem);
        init_rwsem(&iopt->domains_rwsem);
        iopt->area_itree = RB_ROOT_CACHED;
        iopt->allowed_itree = RB_ROOT_CACHED;
        iopt->reserved_itree = RB_ROOT_CACHED;
        xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
        xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);

        /*
         * iopts start as SW tables that can use the entire size_t IOVA space
         * due to the use of size_t in the APIs. They have no alignment
         * restriction.
         */
        iopt->iova_alignment = 1;
}

void iopt_destroy_table(struct io_pagetable *iopt)
{
        struct interval_tree_node *node;

        if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
                iopt_remove_reserved_iova(iopt, NULL);

        while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
                                                ULONG_MAX))) {
                interval_tree_remove(node, &iopt->allowed_itree);
                kfree(container_of(node, struct iopt_allowed, node));
        }

        WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
        WARN_ON(!xa_empty(&iopt->domains));
        WARN_ON(!xa_empty(&iopt->access_list));
        WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
}

/**
 * iopt_unfill_domain() - Unfill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to unfill
 *
 * This is used when removing a domain from the iopt. Every area in the iopt
 * will be unmapped from the domain. The domain must already be removed from
 * the domains xarray.
 */
static void iopt_unfill_domain(struct io_pagetable *iopt,
                               struct iommu_domain *domain)
{
        struct iopt_area *area;

        lockdep_assert_held(&iopt->iova_rwsem);
        lockdep_assert_held_write(&iopt->domains_rwsem);

        /*
         * Some other domain is still holding all the PFNs, so rapidly unmap
         * this domain.
         */
        if (iopt->next_domain_id != 0) {
                /* Pick an arbitrary remaining domain to act as storage */
                struct iommu_domain *storage_domain =
                        xa_load(&iopt->domains, 0);

                for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
                     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
                        struct iopt_pages *pages = area->pages;

                        if (!pages)
                                continue;

                        mutex_lock(&pages->mutex);
                        if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
                                WARN_ON(!area->storage_domain);
                        if (area->storage_domain == domain)
                                area->storage_domain = storage_domain;
                        mutex_unlock(&pages->mutex);

                        iopt_area_unmap_domain(area, domain);
                }
                return;
        }

        for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
             area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
                struct iopt_pages *pages = area->pages;

                if (!pages)
                        continue;

                mutex_lock(&pages->mutex);
                interval_tree_remove(&area->pages_node, &pages->domains_itree);
                WARN_ON(area->storage_domain != domain);
                area->storage_domain = NULL;
                iopt_area_unfill_domain(area, pages, domain);
                mutex_unlock(&pages->mutex);
        }
}

/**
 * iopt_fill_domain() - Fill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to fill
 *
 * Fill the domain with PFNs from every area in the iopt. On failure the domain
 * is left unchanged.
 */
static int iopt_fill_domain(struct io_pagetable *iopt,
                            struct iommu_domain *domain)
{
        struct iopt_area *end_area;
        struct iopt_area *area;
        int rc;

        lockdep_assert_held(&iopt->iova_rwsem);
        lockdep_assert_held_write(&iopt->domains_rwsem);

        for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
             area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
                struct iopt_pages *pages = area->pages;

                if (!pages)
                        continue;

                mutex_lock(&pages->mutex);
                rc = iopt_area_fill_domain(area, domain);
                if (rc) {
                        mutex_unlock(&pages->mutex);
                        goto out_unfill;
                }
                if (!area->storage_domain) {
                        WARN_ON(iopt->next_domain_id != 0);
                        area->storage_domain = domain;
                        interval_tree_insert(&area->pages_node,
                                             &pages->domains_itree);
                }
                mutex_unlock(&pages->mutex);
        }
        return 0;

out_unfill:
        end_area = area;
        for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
             area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
                struct iopt_pages *pages = area->pages;

                if (area == end_area)
                        break;
                if (!pages)
                        continue;
                mutex_lock(&pages->mutex);
                if (iopt->next_domain_id == 0) {
                        interval_tree_remove(&area->pages_node,
                                             &pages->domains_itree);
                        area->storage_domain = NULL;
                }
                iopt_area_unfill_domain(area, pages, domain);
                mutex_unlock(&pages->mutex);
        }
        return rc;
}

/* All existing areas must conform to an increased page size */
static int iopt_check_iova_alignment(struct io_pagetable *iopt,
                                     unsigned long new_iova_alignment)
{
        unsigned long align_mask = new_iova_alignment - 1;
        struct iopt_area *area;

        lockdep_assert_held(&iopt->iova_rwsem);
        lockdep_assert_held(&iopt->domains_rwsem);

        for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
             area = iopt_area_iter_next(area, 0, ULONG_MAX))
                if ((iopt_area_iova(area) & align_mask) ||
                    (iopt_area_length(area) & align_mask) ||
                    (area->page_offset & align_mask))
                        return -EADDRINUSE;

        if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
                struct iommufd_access *access;
                unsigned long index;

                xa_for_each(&iopt->access_list, index, access)
                        if (WARN_ON(access->iova_alignment >
                                    new_iova_alignment))
                                return -EADDRINUSE;
        }
        return 0;
}

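/*
 * Attach an iommu_domain to the io_pagetable: reserve the IOVA outside the
 * domain's aperture, map every existing area into the domain, and then record
 * it in the domains xarray so future maps include it.
 */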
int iopt_table_add_domain(struct io_pagetable *iopt,
                          struct iommu_domain *domain)
{
        const struct iommu_domain_geometry *geometry = &domain->geometry;
        struct iommu_domain *iter_domain;
        unsigned int new_iova_alignment;
        unsigned long index;
        int rc;

        down_write(&iopt->domains_rwsem);
        down_write(&iopt->iova_rwsem);

        xa_for_each(&iopt->domains, index, iter_domain) {
                if (WARN_ON(iter_domain == domain)) {
                        rc = -EEXIST;
                        goto out_unlock;
                }
        }

        /*
         * The io page size drives the iova_alignment. Internally the iopt_pages
         * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
         * objects into the iommu_domain.
         *
         * An iommu_domain must always be able to accept PAGE_SIZE to be
         * compatible as we can't guarantee higher contiguity.
         */
        new_iova_alignment = max_t(unsigned long,
                                   1UL << __ffs(domain->pgsize_bitmap),
                                   iopt->iova_alignment);
        if (new_iova_alignment > PAGE_SIZE) {
                rc = -EINVAL;
                goto out_unlock;
        }
        if (new_iova_alignment != iopt->iova_alignment) {
                rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
                if (rc)
                        goto out_unlock;
        }

        /* No area exists that is outside the allowed domain aperture */
        if (geometry->aperture_start != 0) {
                rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
                                       domain);
                if (rc)
                        goto out_reserved;
        }
        if (geometry->aperture_end != ULONG_MAX) {
                rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
                                       ULONG_MAX, domain);
                if (rc)
                        goto out_reserved;
        }

        rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
        if (rc)
                goto out_reserved;

        rc = iopt_fill_domain(iopt, domain);
        if (rc)
                goto out_release;

        iopt->iova_alignment = new_iova_alignment;
        xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
        iopt->next_domain_id++;
        up_write(&iopt->iova_rwsem);
        up_write(&iopt->domains_rwsem);
        return 0;
out_release:
        xa_release(&iopt->domains, iopt->next_domain_id);
out_reserved:
        __iopt_remove_reserved_iova(iopt, domain);
out_unlock:
        up_write(&iopt->iova_rwsem);
        up_write(&iopt->domains_rwsem);
        return rc;
}

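/*
 * Recompute iova_alignment as the strictest requirement of all attached
 * domains and accesses, or PAGE_SIZE when large pages are disabled. The
 * alignment can only be raised if every existing area still conforms.
 */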
static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
{
        unsigned long new_iova_alignment;
        struct iommufd_access *access;
        struct iommu_domain *domain;
        unsigned long index;

        lockdep_assert_held_write(&iopt->iova_rwsem);
        lockdep_assert_held(&iopt->domains_rwsem);

        /* See batch_iommu_map_small() */
        if (iopt->disable_large_pages)
                new_iova_alignment = PAGE_SIZE;
        else
                new_iova_alignment = 1;

        xa_for_each(&iopt->domains, index, domain)
                new_iova_alignment = max_t(unsigned long,
                                           1UL << __ffs(domain->pgsize_bitmap),
                                           new_iova_alignment);
        xa_for_each(&iopt->access_list, index, access)
                new_iova_alignment = max_t(unsigned long,
                                           access->iova_alignment,
                                           new_iova_alignment);

        if (new_iova_alignment > iopt->iova_alignment) {
                int rc;

                rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
                if (rc)
                        return rc;
        }
        iopt->iova_alignment = new_iova_alignment;
        return 0;
}

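/*
 * Remove a domain from the io_pagetable: compress the domains xarray, unmap
 * every area from the domain, and drop any aperture reservations it added.
 */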
void iopt_table_remove_domain(struct io_pagetable *iopt,
                              struct iommu_domain *domain)
{
        struct iommu_domain *iter_domain = NULL;
        unsigned long index;

        down_write(&iopt->domains_rwsem);
        down_write(&iopt->iova_rwsem);

        xa_for_each(&iopt->domains, index, iter_domain)
                if (iter_domain == domain)
                        break;
        if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
                goto out_unlock;

        /*
         * Compress the xarray to keep it linear by swapping the entry to erase
         * with the tail entry and shrinking the tail.
         */
        iopt->next_domain_id--;
        iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
        if (index != iopt->next_domain_id)
                xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);

        iopt_unfill_domain(iopt, domain);
        __iopt_remove_reserved_iova(iopt, domain);

        WARN_ON(iopt_calculate_iova_alignment(iopt));
out_unlock:
        up_write(&iopt->iova_rwsem);
        up_write(&iopt->domains_rwsem);
}

/**
 * iopt_area_split - Split an area into two parts at iova
 * @area: The area to split
 * @iova: Becomes the last of a new area
 *
 * This splits an area into two. It is part of the VFIO compatibility to allow
 * poking a hole in the mapping. The two areas continue to point at the same
 * iopt_pages, just with different starting bytes.
 */
static int iopt_area_split(struct iopt_area *area, unsigned long iova)
{
        unsigned long alignment = area->iopt->iova_alignment;
        unsigned long last_iova = iopt_area_last_iova(area);
        unsigned long start_iova = iopt_area_iova(area);
        unsigned long new_start = iova + 1;
        struct io_pagetable *iopt = area->iopt;
        struct iopt_pages *pages = area->pages;
        struct iopt_area *lhs;
        struct iopt_area *rhs;
        int rc;

        lockdep_assert_held_write(&iopt->iova_rwsem);

        if (iova == start_iova || iova == last_iova)
                return 0;

        if (!pages || area->prevent_access)
                return -EBUSY;

        if (new_start & (alignment - 1) ||
            iopt_area_start_byte(area, new_start) & (alignment - 1))
                return -EINVAL;

        lhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
        if (!lhs)
                return -ENOMEM;

        rhs = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
        if (!rhs) {
                rc = -ENOMEM;
                goto err_free_lhs;
        }

        mutex_lock(&pages->mutex);
        /*
         * Splitting is not permitted if an access exists; we don't track
         * enough information to split existing accesses.
         */
        if (area->num_accesses) {
                rc = -EINVAL;
                goto err_unlock;
        }

        /*
         * Splitting is not permitted if a domain could have been mapped with
         * huge pages.
         */
        if (area->storage_domain && !iopt->disable_large_pages) {
                rc = -EINVAL;
                goto err_unlock;
        }

        interval_tree_remove(&area->node, &iopt->area_itree);
        rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
                              iopt_area_start_byte(area, start_iova),
                              (new_start - 1) - start_iova + 1,
                              area->iommu_prot);
        if (WARN_ON(rc))
                goto err_insert;

        rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
                              iopt_area_start_byte(area, new_start),
                              last_iova - new_start + 1, area->iommu_prot);
        if (WARN_ON(rc))
                goto err_remove_lhs;

        lhs->storage_domain = area->storage_domain;
        lhs->pages = area->pages;
        rhs->storage_domain = area->storage_domain;
        rhs->pages = area->pages;
        kref_get(&rhs->pages->kref);
        kfree(area);
        mutex_unlock(&pages->mutex);

        /*
         * No change to domains or accesses because the iopt_pages backing the
         * area has not changed.
         */
        return 0;

err_remove_lhs:
        interval_tree_remove(&lhs->node, &iopt->area_itree);
err_insert:
        interval_tree_insert(&area->node, &iopt->area_itree);
err_unlock:
        mutex_unlock(&pages->mutex);
        kfree(rhs);
err_free_lhs:
        kfree(lhs);
        return rc;
}

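/*
 * Split areas at each of the given IOVAs so an unmap that begins or ends
 * there does not have to truncate an area. Part of the VFIO compatibility
 * support for poking holes in existing mappings.
 */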
int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
                  size_t num_iovas)
{
        int rc = 0;
        int i;

        down_write(&iopt->iova_rwsem);
        for (i = 0; i < num_iovas; i++) {
                struct iopt_area *area;

                area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
                if (!area)
                        continue;
                rc = iopt_area_split(area, iovas[i]);
                if (rc)
                        break;
        }
        up_write(&iopt->iova_rwsem);
        return rc;
}

void iopt_enable_large_pages(struct io_pagetable *iopt)
{
        int rc;

        down_write(&iopt->domains_rwsem);
        down_write(&iopt->iova_rwsem);
        WRITE_ONCE(iopt->disable_large_pages, false);
        rc = iopt_calculate_iova_alignment(iopt);
        WARN_ON(rc);
        up_write(&iopt->iova_rwsem);
        up_write(&iopt->domains_rwsem);
}

int iopt_disable_large_pages(struct io_pagetable *iopt)
{
        int rc = 0;

        down_write(&iopt->domains_rwsem);
        down_write(&iopt->iova_rwsem);
        if (iopt->disable_large_pages)
                goto out_unlock;

        /* Won't do it if domains already have pages mapped in them */
        if (!xa_empty(&iopt->domains) &&
            !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
                rc = -EINVAL;
                goto out_unlock;
        }

        WRITE_ONCE(iopt->disable_large_pages, true);
        rc = iopt_calculate_iova_alignment(iopt);
        if (rc)
                WRITE_ONCE(iopt->disable_large_pages, false);
out_unlock:
        up_write(&iopt->iova_rwsem);
        up_write(&iopt->domains_rwsem);
        return rc;
}

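/*
 * Register an in-kernel access with the io_pagetable so its alignment
 * requirement is enforced on future maps. The allocated ID is stored in
 * access->iopt_access_list_id and released by iopt_remove_access().
 */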
int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
{
        int rc;

        down_write(&iopt->domains_rwsem);
        down_write(&iopt->iova_rwsem);
        rc = xa_alloc(&iopt->access_list, &access->iopt_access_list_id, access,
                      xa_limit_16b, GFP_KERNEL_ACCOUNT);
        if (rc)
                goto out_unlock;

        rc = iopt_calculate_iova_alignment(iopt);
        if (rc) {
                xa_erase(&iopt->access_list, access->iopt_access_list_id);
                goto out_unlock;
        }

out_unlock:
        up_write(&iopt->iova_rwsem);
        up_write(&iopt->domains_rwsem);
        return rc;
}

void iopt_remove_access(struct io_pagetable *iopt,
                        struct iommufd_access *access)
{
        down_write(&iopt->domains_rwsem);
        down_write(&iopt->iova_rwsem);
        WARN_ON(xa_erase(&iopt->access_list, access->iopt_access_list_id) !=
                access);
        WARN_ON(iopt_calculate_iova_alignment(iopt));
        up_write(&iopt->iova_rwsem);
        up_write(&iopt->domains_rwsem);
}

/* Narrow the valid_iova_itree to include reserved ranges from a group. */
int iopt_table_enforce_group_resv_regions(struct io_pagetable *iopt,
                                          struct device *device,
                                          struct iommu_group *group,
                                          phys_addr_t *sw_msi_start)
{
        struct iommu_resv_region *resv;
        struct iommu_resv_region *tmp;
        LIST_HEAD(group_resv_regions);
        unsigned int num_hw_msi = 0;
        unsigned int num_sw_msi = 0;
        int rc;

        down_write(&iopt->iova_rwsem);
        rc = iommu_get_group_resv_regions(group, &group_resv_regions);
        if (rc)
                goto out_unlock;

        list_for_each_entry(resv, &group_resv_regions, list) {
                if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
                        continue;

                if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
                        num_hw_msi++;
                if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
                        *sw_msi_start = resv->start;
                        num_sw_msi++;
                }

                rc = iopt_reserve_iova(iopt, resv->start,
                                       resv->length - 1 + resv->start, device);
                if (rc)
                        goto out_reserved;
        }

        /* Drivers must offer sane combinations of regions */
        if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
                rc = -EINVAL;
                goto out_reserved;
        }

        rc = 0;
        goto out_free_resv;

out_reserved:
        __iopt_remove_reserved_iova(iopt, device);
out_free_resv:
        list_for_each_entry_safe(resv, tmp, &group_resv_regions, list)
                kfree(resv);
out_unlock:
        up_write(&iopt->iova_rwsem);
        return rc;
}