1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * VFIO core
4 *
5 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
6 * Author: Alex Williamson <alex.williamson@redhat.com>
7 *
8 * Derived from original vfio:
9 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
10 * Author: Tom Lyon, pugs@cisco.com
11 */
12
13 #include <linux/cdev.h>
14 #include <linux/compat.h>
15 #include <linux/device.h>
16 #include <linux/fs.h>
17 #include <linux/idr.h>
18 #include <linux/iommu.h>
19 #ifdef CONFIG_HAVE_KVM
20 #include <linux/kvm_host.h>
21 #endif
22 #include <linux/list.h>
23 #include <linux/miscdevice.h>
24 #include <linux/module.h>
25 #include <linux/mutex.h>
26 #include <linux/pci.h>
27 #include <linux/rwsem.h>
28 #include <linux/sched.h>
29 #include <linux/slab.h>
30 #include <linux/stat.h>
31 #include <linux/string.h>
32 #include <linux/uaccess.h>
33 #include <linux/vfio.h>
34 #include <linux/wait.h>
35 #include <linux/sched/signal.h>
36 #include <linux/pm_runtime.h>
37 #include <linux/interval_tree.h>
38 #include <linux/iova_bitmap.h>
39 #include <linux/iommufd.h>
40 #include "vfio.h"
41
42 #define DRIVER_VERSION "0.3"
43 #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
44 #define DRIVER_DESC "VFIO - User Level meta-driver"
45
46 static struct vfio {
47 struct class *device_class;
48 struct ida device_ida;
49 } vfio;
50
51 #ifdef CONFIG_VFIO_NOIOMMU
52 bool vfio_noiommu __read_mostly;
53 module_param_named(enable_unsafe_noiommu_mode,
54 vfio_noiommu, bool, S_IRUGO | S_IWUSR);
55 MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
56 #endif
57
58 static DEFINE_XARRAY(vfio_device_set_xa);
59
60 int vfio_assign_device_set(struct vfio_device *device, void *set_id)
61 {
62 unsigned long idx = (unsigned long)set_id;
63 struct vfio_device_set *new_dev_set;
64 struct vfio_device_set *dev_set;
65
66 if (WARN_ON(!set_id))
67 return -EINVAL;
68
69 /*
70 * Atomically acquire a singleton object in the xarray for this set_id
71 */
72 xa_lock(&vfio_device_set_xa);
73 dev_set = xa_load(&vfio_device_set_xa, idx);
74 if (dev_set)
75 goto found_get_ref;
76 xa_unlock(&vfio_device_set_xa);
77
78 new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
79 if (!new_dev_set)
80 return -ENOMEM;
81 mutex_init(&new_dev_set->lock);
82 INIT_LIST_HEAD(&new_dev_set->device_list);
83 new_dev_set->set_id = set_id;
84
85 xa_lock(&vfio_device_set_xa);
86 dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
87 GFP_KERNEL);
88 if (!dev_set) {
89 dev_set = new_dev_set;
90 goto found_get_ref;
91 }
92
93 kfree(new_dev_set);
94 if (xa_is_err(dev_set)) {
95 xa_unlock(&vfio_device_set_xa);
96 return xa_err(dev_set);
97 }
98
99 found_get_ref:
100 dev_set->device_count++;
101 xa_unlock(&vfio_device_set_xa);
102 mutex_lock(&dev_set->lock);
103 device->dev_set = dev_set;
104 list_add_tail(&device->dev_set_list, &dev_set->device_list);
105 mutex_unlock(&dev_set->lock);
106 return 0;
107 }
108 EXPORT_SYMBOL_GPL(vfio_assign_device_set);
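/*
 * Choosing a set_id is driver policy.  A sketch, loosely modeled on how
 * vfio-pci groups devices by their reset scope (not verbatim from that
 * driver):
 *
 *	if (pci_is_root_bus(pdev->bus))
 *		ret = vfio_assign_device_set(&vdev->vdev, vdev);
 *	else if (!pci_probe_reset_slot(pdev->slot))
 *		ret = vfio_assign_device_set(&vdev->vdev, pdev->slot);
 *	else
 *		ret = vfio_assign_device_set(&vdev->vdev, pdev->bus);
 */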
109
110 static void vfio_release_device_set(struct vfio_device *device)
111 {
112 struct vfio_device_set *dev_set = device->dev_set;
113
114 if (!dev_set)
115 return;
116
117 mutex_lock(&dev_set->lock);
118 list_del(&device->dev_set_list);
119 mutex_unlock(&dev_set->lock);
120
121 xa_lock(&vfio_device_set_xa);
122 if (!--dev_set->device_count) {
123 __xa_erase(&vfio_device_set_xa,
124 (unsigned long)dev_set->set_id);
125 mutex_destroy(&dev_set->lock);
126 kfree(dev_set);
127 }
128 xa_unlock(&vfio_device_set_xa);
129 }
130
131 unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
132 {
133 struct vfio_device *cur;
134 unsigned int open_count = 0;
135
136 lockdep_assert_held(&dev_set->lock);
137
138 list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
139 open_count += cur->open_count;
140 return open_count;
141 }
142 EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
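/*
 * Usage sketch (hypothetical caller): a driver about to perform a
 * set-wide reset can verify, under the dev_set lock, that no other
 * device in the set is currently open:
 *
 *	lockdep_assert_held(&vdev->dev_set->lock);
 *	if (vfio_device_set_open_count(vdev->dev_set) > 1)
 *		return -EBUSY;	// another device in the set is in use
 */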
143
144 /*
145 * Device objects - create, release, get, put, search
146 */
147 /* Device reference always implies a group reference */
148 void vfio_device_put_registration(struct vfio_device *device)
149 {
150 if (refcount_dec_and_test(&device->refcount))
151 complete(&device->comp);
152 }
153
154 bool vfio_device_try_get_registration(struct vfio_device *device)
155 {
156 return refcount_inc_not_zero(&device->refcount);
157 }
158
159 /*
160 * VFIO driver API
161 */
162 /* Release helper called by vfio_put_device() */
163 static void vfio_device_release(struct device *dev)
164 {
165 struct vfio_device *device =
166 container_of(dev, struct vfio_device, device);
167
168 vfio_release_device_set(device);
169 ida_free(&vfio.device_ida, device->index);
170
171 if (device->ops->release)
172 device->ops->release(device);
173
174 kvfree(device);
175 }
176
177 static int vfio_init_device(struct vfio_device *device, struct device *dev,
178 const struct vfio_device_ops *ops);
179
180 /*
181 * Allocate and initialize vfio_device so it can be registered to vfio
182 * core.
183 *
184 * Drivers should use the wrapper vfio_alloc_device() for allocation.
185 * @size is the size of the structure to be allocated, including any
186 * private data used by the driver.
187 *
188 * Drivers may provide an @init callback to initialize device private data.
189 *
190 * Use vfio_put_device() to release the structure after a successful return.
191 */
192 struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
193 const struct vfio_device_ops *ops)
194 {
195 struct vfio_device *device;
196 int ret;
197
198 if (WARN_ON(size < sizeof(struct vfio_device)))
199 return ERR_PTR(-EINVAL);
200
201 device = kvzalloc(size, GFP_KERNEL);
202 if (!device)
203 return ERR_PTR(-ENOMEM);
204
205 ret = vfio_init_device(device, dev, ops);
206 if (ret)
207 goto out_free;
208 return device;
209
210 out_free:
211 kvfree(device);
212 return ERR_PTR(ret);
213 }
214 EXPORT_SYMBOL_GPL(_vfio_alloc_device);
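/*
 * Allocation sketch for a hypothetical driver "foo" (names are
 * illustrative): embed struct vfio_device in the driver structure and
 * allocate through the vfio_alloc_device() wrapper, which lands here in
 * _vfio_alloc_device().
 *
 *	struct foo_device {
 *		struct vfio_device vdev;
 *		void __iomem *regs;
 *	};
 *
 *	struct foo_device *foo;
 *
 *	foo = vfio_alloc_device(foo_device, vdev, &pdev->dev, &foo_ops);
 *	if (IS_ERR(foo))
 *		return PTR_ERR(foo);
 */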
215
216 /*
217 * Initialize a vfio_device so it can be registered to vfio core.
218 */
219 static int vfio_init_device(struct vfio_device *device, struct device *dev,
220 const struct vfio_device_ops *ops)
221 {
222 int ret;
223
224 ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
225 if (ret < 0) {
226 dev_dbg(dev, "Failed to allocate device index\n");
227 return ret;
228 }
229
230 device->index = ret;
231 init_completion(&device->comp);
232 device->dev = dev;
233 device->ops = ops;
234
235 if (ops->init) {
236 ret = ops->init(device);
237 if (ret)
238 goto out_uninit;
239 }
240
241 device_initialize(&device->device);
242 device->device.release = vfio_device_release;
243 device->device.class = vfio.device_class;
244 device->device.parent = device->dev;
245 return 0;
246
247 out_uninit:
248 vfio_release_device_set(device);
249 ida_free(&vfio.device_ida, device->index);
250 return ret;
251 }
252
253 static int __vfio_register_dev(struct vfio_device *device,
254 enum vfio_group_type type)
255 {
256 int ret;
257
258 if (WARN_ON(device->ops->bind_iommufd &&
259 (!device->ops->unbind_iommufd ||
260 !device->ops->attach_ioas)))
261 return -EINVAL;
262
263 /*
264 * If the driver doesn't specify a set then the device is added to a
265 * singleton set just for itself.
266 */
267 if (!device->dev_set)
268 vfio_assign_device_set(device, device);
269
270 ret = dev_set_name(&device->device, "vfio%d", device->index);
271 if (ret)
272 return ret;
273
274 ret = vfio_device_set_group(device, type);
275 if (ret)
276 return ret;
277
278 ret = device_add(&device->device);
279 if (ret)
280 goto err_out;
281
282 /* Refcounting can't start until the driver calls register */
283 refcount_set(&device->refcount, 1);
284
285 vfio_device_group_register(device);
286
287 return 0;
288 err_out:
289 vfio_device_remove_group(device);
290 return ret;
291 }
292
293 int vfio_register_group_dev(struct vfio_device *device)
294 {
295 return __vfio_register_dev(device, VFIO_IOMMU);
296 }
297 EXPORT_SYMBOL_GPL(vfio_register_group_dev);
298
299 /*
300 * Register a virtual device without IOMMU backing. The user of this
301 * device must not be able to directly trigger unmediated DMA.
302 */
303 int vfio_register_emulated_iommu_dev(struct vfio_device *device)
304 {
305 return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
306 }
307 EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);
308
309 /*
310 * Decrement the device reference count and wait for the device to be
311 * removed. Open file descriptors hold references, so this call blocks, invoking the driver's ->request() op, until they are all closed. */
312 void vfio_unregister_group_dev(struct vfio_device *device)
313 {
314 unsigned int i = 0;
315 bool interrupted = false;
316 long rc;
317
318 vfio_device_put_registration(device);
319 rc = try_wait_for_completion(&device->comp);
320 while (rc <= 0) {
321 if (device->ops->request)
322 device->ops->request(device, i++);
323
324 if (interrupted) {
325 rc = wait_for_completion_timeout(&device->comp,
326 HZ * 10);
327 } else {
328 rc = wait_for_completion_interruptible_timeout(
329 &device->comp, HZ * 10);
330 if (rc < 0) {
331 interrupted = true;
332 dev_warn(device->dev,
333 "Device is currently in use, task"
334 " \"%s\" (%d) "
335 "blocked until device is released",
336 current->comm, task_pid_nr(current));
337 }
338 }
339 }
340
341 vfio_device_group_unregister(device);
342
343 /* Balances device_add in register path */
344 device_del(&device->device);
345
346 /* Balances vfio_device_set_group in register path */
347 vfio_device_remove_group(device);
348 }
349 EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
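/*
 * Registration lifecycle sketch for a hypothetical driver "foo"
 * (illustrative names only): register the device at the end of probe,
 * then unregister and drop the allocation reference in remove.
 *
 *	// probe
 *	ret = vfio_register_group_dev(&foo->vdev);
 *	if (ret)
 *		goto err_put_device;
 *
 *	// remove
 *	vfio_unregister_group_dev(&foo->vdev);
 *	vfio_put_device(&foo->vdev);
 */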
350
351 #ifdef CONFIG_HAVE_KVM
352 void _vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
353 {
354 void (*pfn)(struct kvm *kvm);
355 bool (*fn)(struct kvm *kvm);
356 bool ret;
357
358 lockdep_assert_held(&device->dev_set->lock);
359
360 pfn = symbol_get(kvm_put_kvm);
361 if (WARN_ON(!pfn))
362 return;
363
364 fn = symbol_get(kvm_get_kvm_safe);
365 if (WARN_ON(!fn)) {
366 symbol_put(kvm_put_kvm);
367 return;
368 }
369
370 ret = fn(kvm);
371 symbol_put(kvm_get_kvm_safe);
372 if (!ret) {
373 symbol_put(kvm_put_kvm);
374 return;
375 }
376
377 device->put_kvm = pfn;
378 device->kvm = kvm;
379 }
380
381 void vfio_device_put_kvm(struct vfio_device *device)
382 {
383 lockdep_assert_held(&device->dev_set->lock);
384
385 if (!device->kvm)
386 return;
387
388 if (WARN_ON(!device->put_kvm))
389 goto clear;
390
391 device->put_kvm(device->kvm);
392 device->put_kvm = NULL;
393 symbol_put(kvm_put_kvm);
394
395 clear:
396 device->kvm = NULL;
397 }
398 #endif
399
400 /* true if the vfio_device has open_device() called but not close_device() */
401 static bool vfio_assert_device_open(struct vfio_device *device)
402 {
403 return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
404 }
405
406 static int vfio_device_first_open(struct vfio_device *device,
407 struct iommufd_ctx *iommufd)
408 {
409 int ret;
410
411 lockdep_assert_held(&device->dev_set->lock);
412
413 if (!try_module_get(device->dev->driver->owner))
414 return -ENODEV;
415
416 if (iommufd)
417 ret = vfio_iommufd_bind(device, iommufd);
418 else
419 ret = vfio_device_group_use_iommu(device);
420 if (ret)
421 goto err_module_put;
422
423 if (device->ops->open_device) {
424 ret = device->ops->open_device(device);
425 if (ret)
426 goto err_unuse_iommu;
427 }
428 return 0;
429
430 err_unuse_iommu:
431 if (iommufd)
432 vfio_iommufd_unbind(device);
433 else
434 vfio_device_group_unuse_iommu(device);
435 err_module_put:
436 module_put(device->dev->driver->owner);
437 return ret;
438 }
439
440 static void vfio_device_last_close(struct vfio_device *device,
441 struct iommufd_ctx *iommufd)
442 {
443 lockdep_assert_held(&device->dev_set->lock);
444
445 if (device->ops->close_device)
446 device->ops->close_device(device);
447 if (iommufd)
448 vfio_iommufd_unbind(device);
449 else
450 vfio_device_group_unuse_iommu(device);
451 module_put(device->dev->driver->owner);
452 }
453
454 int vfio_device_open(struct vfio_device *device, struct iommufd_ctx *iommufd)
455 {
456 int ret = 0;
457
458 lockdep_assert_held(&device->dev_set->lock);
459
460 device->open_count++;
461 if (device->open_count == 1) {
462 ret = vfio_device_first_open(device, iommufd);
463 if (ret)
464 device->open_count--;
465 }
466
467 return ret;
468 }
469
470 void vfio_device_close(struct vfio_device *device,
471 struct iommufd_ctx *iommufd)
472 {
473 lockdep_assert_held(&device->dev_set->lock);
474
475 vfio_assert_device_open(device);
476 if (device->open_count == 1)
477 vfio_device_last_close(device, iommufd);
478 device->open_count--;
479 }
480
481 /*
482 * Wrapper around pm_runtime_resume_and_get().
483 * Return error code on failure or 0 on success.
484 */
485 static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
486 {
487 struct device *dev = device->dev;
488
489 if (dev->driver && dev->driver->pm) {
490 int ret;
491
492 ret = pm_runtime_resume_and_get(dev);
493 if (ret) {
494 dev_info_ratelimited(dev,
495 "vfio: runtime resume failed %d\n", ret);
496 return -EIO;
497 }
498 }
499
500 return 0;
501 }
502
503 /*
504 * Wrapper around pm_runtime_put().
505 */
506 static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
507 {
508 struct device *dev = device->dev;
509
510 if (dev->driver && dev->driver->pm)
511 pm_runtime_put(dev);
512 }
513
514 /*
515 * VFIO Device fd
516 */
517 static int vfio_device_fops_release(struct inode *inode, struct file *filep)
518 {
519 struct vfio_device *device = filep->private_data;
520
521 vfio_device_group_close(device);
522
523 vfio_device_put_registration(device);
524
525 return 0;
526 }
527
528 /*
529 * vfio_mig_get_next_state - Compute the next step in the FSM
530 * @cur_fsm - The current state the device is in
531 * @new_fsm - The target state to reach
532 * @next_fsm - Pointer to the next step to get to new_fsm
533 *
534 * Return 0 upon success, otherwise -errno
535 * Upon success the next step in the state progression between cur_fsm and
536 * new_fsm will be set in next_fsm.
537 *
538 * This breaks down requests for combination transitions into smaller steps and
539 * returns the next step to get to new_fsm. The function may need to be called
540 * multiple times before reaching new_fsm.
541 *
542 */
543 int vfio_mig_get_next_state(struct vfio_device *device,
544 enum vfio_device_mig_state cur_fsm,
545 enum vfio_device_mig_state new_fsm,
546 enum vfio_device_mig_state *next_fsm)
547 {
548 enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
549 /*
550 * The coding in this table requires the driver to implement the
551 * following FSM arcs:
552 * RESUMING -> STOP
553 * STOP -> RESUMING
554 * STOP -> STOP_COPY
555 * STOP_COPY -> STOP
556 *
557 * If P2P is supported then the driver must also implement these FSM
558 * arcs:
559 * RUNNING -> RUNNING_P2P
560 * RUNNING_P2P -> RUNNING
561 * RUNNING_P2P -> STOP
562 * STOP -> RUNNING_P2P
563 *
564 * If precopy is supported then the driver must support these additional
565 * FSM arcs:
566 * RUNNING -> PRE_COPY
567 * PRE_COPY -> RUNNING
568 * PRE_COPY -> STOP_COPY
569 * However, if precopy and P2P are supported together then the driver
570 * must support these additional arcs beyond the P2P arcs above:
571 * PRE_COPY -> RUNNING
572 * PRE_COPY -> PRE_COPY_P2P
573 * PRE_COPY_P2P -> PRE_COPY
574 * PRE_COPY_P2P -> RUNNING_P2P
575 * PRE_COPY_P2P -> STOP_COPY
576 * RUNNING -> PRE_COPY
577 * RUNNING_P2P -> PRE_COPY_P2P
578 *
579 * Without P2P and precopy the driver must implement:
580 * RUNNING -> STOP
581 * STOP -> RUNNING
582 *
583 * The coding will step through multiple states for some combination
584 * transitions; if all optional features are supported, this means the
585 * following ones:
586 * PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
587 * PRE_COPY -> RUNNING -> RUNNING_P2P
588 * PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
589 * PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
590 * PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
591 * PRE_COPY_P2P -> RUNNING_P2P -> STOP
592 * PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
593 * RESUMING -> STOP -> RUNNING_P2P
594 * RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
595 * RESUMING -> STOP -> RUNNING_P2P -> RUNNING
596 * RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
597 * RESUMING -> STOP -> STOP_COPY
598 * RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
599 * RUNNING -> RUNNING_P2P -> STOP
600 * RUNNING -> RUNNING_P2P -> STOP -> RESUMING
601 * RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
602 * RUNNING_P2P -> RUNNING -> PRE_COPY
603 * RUNNING_P2P -> STOP -> RESUMING
604 * RUNNING_P2P -> STOP -> STOP_COPY
605 * STOP -> RUNNING_P2P -> PRE_COPY_P2P
606 * STOP -> RUNNING_P2P -> RUNNING
607 * STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
608 * STOP_COPY -> STOP -> RESUMING
609 * STOP_COPY -> STOP -> RUNNING_P2P
610 * STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
611 *
612 * The following transitions are blocked:
613 * STOP_COPY -> PRE_COPY
614 * STOP_COPY -> PRE_COPY_P2P
615 */
616 static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
617 [VFIO_DEVICE_STATE_STOP] = {
618 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
619 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
620 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
621 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
622 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
623 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
624 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
625 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
626 },
627 [VFIO_DEVICE_STATE_RUNNING] = {
628 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
629 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
630 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
631 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
632 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
633 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
634 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
635 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
636 },
637 [VFIO_DEVICE_STATE_PRE_COPY] = {
638 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
639 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
640 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
641 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
642 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
643 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
644 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
645 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
646 },
647 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
648 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
649 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
650 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
651 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
652 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
653 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
654 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
655 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
656 },
657 [VFIO_DEVICE_STATE_STOP_COPY] = {
658 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
659 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
660 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
661 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
662 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
663 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
664 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
665 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
666 },
667 [VFIO_DEVICE_STATE_RESUMING] = {
668 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
669 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
670 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
671 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
672 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
673 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
674 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
675 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
676 },
677 [VFIO_DEVICE_STATE_RUNNING_P2P] = {
678 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
679 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
680 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
681 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
682 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
683 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
684 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
685 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
686 },
687 [VFIO_DEVICE_STATE_ERROR] = {
688 [VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
689 [VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
690 [VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
691 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
692 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
693 [VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
694 [VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
695 [VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
696 },
697 };
698
699 static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
700 [VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
701 [VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
702 [VFIO_DEVICE_STATE_PRE_COPY] =
703 VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
704 [VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
705 VFIO_MIGRATION_P2P |
706 VFIO_MIGRATION_PRE_COPY,
707 [VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
708 [VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
709 [VFIO_DEVICE_STATE_RUNNING_P2P] =
710 VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
711 [VFIO_DEVICE_STATE_ERROR] = ~0U,
712 };
713
714 if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
715 (state_flags_table[cur_fsm] & device->migration_flags) !=
716 state_flags_table[cur_fsm]))
717 return -EINVAL;
718
719 if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
720 (state_flags_table[new_fsm] & device->migration_flags) !=
721 state_flags_table[new_fsm])
722 return -EINVAL;
723
724 /*
725 * Arcs touching optional and unsupported states are skipped over. The
726 * driver will instead see an arc from the original state to the next
727 * logical state, as per the above comment.
728 */
729 *next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
730 while ((state_flags_table[*next_fsm] & device->migration_flags) !=
731 state_flags_table[*next_fsm])
732 *next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];
733
734 return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
735 }
736 EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
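/*
 * Typical driver-side usage (sketch; foo_step_state() and the locals are
 * hypothetical): a migration_set_state() implementation walks the FSM one
 * arc at a time until the requested state is reached.
 *
 *	while (cur != new_state) {
 *		enum vfio_device_mig_state next;
 *
 *		ret = vfio_mig_get_next_state(vdev, cur, new_state, &next);
 *		if (ret)
 *			return ERR_PTR(ret);
 *
 *		filp = foo_step_state(foo, next);	// implement one arc
 *		if (IS_ERR(filp))
 *			return filp;
 *		cur = next;
 *	}
 */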
737
738 /*
739 * Convert the driver's struct file into an FD number and return it to userspace.
740 */
741 static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
742 struct vfio_device_feature_mig_state *mig)
743 {
744 int ret;
745 int fd;
746
747 fd = get_unused_fd_flags(O_CLOEXEC);
748 if (fd < 0) {
749 ret = fd;
750 goto out_fput;
751 }
752
753 mig->data_fd = fd;
754 if (copy_to_user(arg, mig, sizeof(*mig))) {
755 ret = -EFAULT;
756 goto out_put_unused;
757 }
758 fd_install(fd, filp);
759 return 0;
760
761 out_put_unused:
762 put_unused_fd(fd);
763 out_fput:
764 fput(filp);
765 return ret;
766 }
767
768 static int
769 vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
770 u32 flags, void __user *arg,
771 size_t argsz)
772 {
773 size_t minsz =
774 offsetofend(struct vfio_device_feature_mig_state, data_fd);
775 struct vfio_device_feature_mig_state mig;
776 struct file *filp = NULL;
777 int ret;
778
779 if (!device->mig_ops)
780 return -ENOTTY;
781
782 ret = vfio_check_feature(flags, argsz,
783 VFIO_DEVICE_FEATURE_SET |
784 VFIO_DEVICE_FEATURE_GET,
785 sizeof(mig));
786 if (ret != 1)
787 return ret;
788
789 if (copy_from_user(&mig, arg, minsz))
790 return -EFAULT;
791
792 if (flags & VFIO_DEVICE_FEATURE_GET) {
793 enum vfio_device_mig_state curr_state;
794
795 ret = device->mig_ops->migration_get_state(device,
796 &curr_state);
797 if (ret)
798 return ret;
799 mig.device_state = curr_state;
800 goto out_copy;
801 }
802
803 /* Handle the VFIO_DEVICE_FEATURE_SET */
804 filp = device->mig_ops->migration_set_state(device, mig.device_state);
805 if (IS_ERR(filp) || !filp)
806 goto out_copy;
807
808 return vfio_ioct_mig_return_fd(filp, arg, &mig);
809 out_copy:
810 mig.data_fd = -1;
811 if (copy_to_user(arg, &mig, sizeof(mig)))
812 return -EFAULT;
813 if (IS_ERR(filp))
814 return PTR_ERR(filp);
815 return 0;
816 }
817
818 static int
819 vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
820 u32 flags, void __user *arg,
821 size_t argsz)
822 {
823 struct vfio_device_feature_mig_data_size data_size = {};
824 unsigned long stop_copy_length;
825 int ret;
826
827 if (!device->mig_ops)
828 return -ENOTTY;
829
830 ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
831 sizeof(data_size));
832 if (ret != 1)
833 return ret;
834
835 ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
836 if (ret)
837 return ret;
838
839 data_size.stop_copy_length = stop_copy_length;
840 if (copy_to_user(arg, &data_size, sizeof(data_size)))
841 return -EFAULT;
842
843 return 0;
844 }
845
846 static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
847 u32 flags, void __user *arg,
848 size_t argsz)
849 {
850 struct vfio_device_feature_migration mig = {
851 .flags = device->migration_flags,
852 };
853 int ret;
854
855 if (!device->mig_ops)
856 return -ENOTTY;
857
858 ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
859 sizeof(mig));
860 if (ret != 1)
861 return ret;
862 if (copy_to_user(arg, &mig, sizeof(mig)))
863 return -EFAULT;
864 return 0;
865 }
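/*
 * Userspace view (sketch, assuming the uAPI layout in
 * include/uapi/linux/vfio.h): the capability is queried through the
 * VFIO_DEVICE_FEATURE ioctl with the migration payload following the
 * generic feature header.
 *
 *	__u8 buf[sizeof(struct vfio_device_feature) +
 *		 sizeof(struct vfio_device_feature_migration)] = {};
 *	struct vfio_device_feature *hdr = (void *)buf;
 *	struct vfio_device_feature_migration *mig = (void *)hdr->data;
 *
 *	hdr->argsz = sizeof(buf);
 *	hdr->flags = VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_MIGRATION;
 *	if (!ioctl(device_fd, VFIO_DEVICE_FEATURE, hdr))
 *		supports_p2p = mig->flags & VFIO_MIGRATION_P2P;
 */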
866
867 /* Ranges should fit into a single kernel page */
868 #define LOG_MAX_RANGES \
869 (PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))
870
871 static int
872 vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
873 u32 flags, void __user *arg,
874 size_t argsz)
875 {
876 size_t minsz =
877 offsetofend(struct vfio_device_feature_dma_logging_control,
878 ranges);
879 struct vfio_device_feature_dma_logging_range __user *ranges;
880 struct vfio_device_feature_dma_logging_control control;
881 struct vfio_device_feature_dma_logging_range range;
882 struct rb_root_cached root = RB_ROOT_CACHED;
883 struct interval_tree_node *nodes;
884 u64 iova_end;
885 u32 nnodes;
886 int i, ret;
887
888 if (!device->log_ops)
889 return -ENOTTY;
890
891 ret = vfio_check_feature(flags, argsz,
892 VFIO_DEVICE_FEATURE_SET,
893 sizeof(control));
894 if (ret != 1)
895 return ret;
896
897 if (copy_from_user(&control, arg, minsz))
898 return -EFAULT;
899
900 nnodes = control.num_ranges;
901 if (!nnodes)
902 return -EINVAL;
903
904 if (nnodes > LOG_MAX_RANGES)
905 return -E2BIG;
906
907 ranges = u64_to_user_ptr(control.ranges);
908 nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
909 GFP_KERNEL);
910 if (!nodes)
911 return -ENOMEM;
912
913 for (i = 0; i < nnodes; i++) {
914 if (copy_from_user(&range, &ranges[i], sizeof(range))) {
915 ret = -EFAULT;
916 goto end;
917 }
918 if (!IS_ALIGNED(range.iova, control.page_size) ||
919 !IS_ALIGNED(range.length, control.page_size)) {
920 ret = -EINVAL;
921 goto end;
922 }
923
924 if (check_add_overflow(range.iova, range.length, &iova_end) ||
925 iova_end > ULONG_MAX) {
926 ret = -EOVERFLOW;
927 goto end;
928 }
929
930 nodes[i].start = range.iova;
931 nodes[i].last = range.iova + range.length - 1;
932 if (interval_tree_iter_first(&root, nodes[i].start,
933 nodes[i].last)) {
934 /* Range overlapping */
935 ret = -EINVAL;
936 goto end;
937 }
938 interval_tree_insert(nodes + i, &root);
939 }
940
941 ret = device->log_ops->log_start(device, &root, nnodes,
942 &control.page_size);
943 if (ret)
944 goto end;
945
946 if (copy_to_user(arg, &control, sizeof(control))) {
947 ret = -EFAULT;
948 device->log_ops->log_stop(device);
949 }
950
951 end:
952 kfree(nodes);
953 return ret;
954 }
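/*
 * Userspace view (sketch, assuming the uAPI structures in
 * include/uapi/linux/vfio.h): logging is started with a dma_logging_control
 * payload whose ranges field points at an array of dma_logging_range
 * entries.
 *
 *	struct vfio_device_feature_dma_logging_range range = {
 *		.iova = ram_iova,	// hypothetical tracked region
 *		.length = ram_size,
 *	};
 *	__u8 buf[sizeof(struct vfio_device_feature) +
 *		 sizeof(struct vfio_device_feature_dma_logging_control)] = {};
 *	struct vfio_device_feature *hdr = (void *)buf;
 *	struct vfio_device_feature_dma_logging_control *ctl =
 *		(void *)hdr->data;
 *
 *	hdr->argsz = sizeof(buf);
 *	hdr->flags = VFIO_DEVICE_FEATURE_SET |
 *		     VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
 *	ctl->page_size = 4096;
 *	ctl->num_ranges = 1;
 *	ctl->ranges = (uintptr_t)&range;
 *	ret = ioctl(device_fd, VFIO_DEVICE_FEATURE, hdr);
 */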
955
956 static int
957 vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
958 u32 flags, void __user *arg,
959 size_t argsz)
960 {
961 int ret;
962
963 if (!device->log_ops)
964 return -ENOTTY;
965
966 ret = vfio_check_feature(flags, argsz,
967 VFIO_DEVICE_FEATURE_SET, 0);
968 if (ret != 1)
969 return ret;
970
971 return device->log_ops->log_stop(device);
972 }
973
974 static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
975 unsigned long iova, size_t length,
976 void *opaque)
977 {
978 struct vfio_device *device = opaque;
979
980 return device->log_ops->log_read_and_clear(device, iova, length, iter);
981 }
982
983 static int
984 vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
985 u32 flags, void __user *arg,
986 size_t argsz)
987 {
988 size_t minsz =
989 offsetofend(struct vfio_device_feature_dma_logging_report,
990 bitmap);
991 struct vfio_device_feature_dma_logging_report report;
992 struct iova_bitmap *iter;
993 u64 iova_end;
994 int ret;
995
996 if (!device->log_ops)
997 return -ENOTTY;
998
999 ret = vfio_check_feature(flags, argsz,
1000 VFIO_DEVICE_FEATURE_GET,
1001 sizeof(report));
1002 if (ret != 1)
1003 return ret;
1004
1005 if (copy_from_user(&report, arg, minsz))
1006 return -EFAULT;
1007
1008 if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
1009 return -EINVAL;
1010
1011 if (check_add_overflow(report.iova, report.length, &iova_end) ||
1012 iova_end > ULONG_MAX)
1013 return -EOVERFLOW;
1014
1015 iter = iova_bitmap_alloc(report.iova, report.length,
1016 report.page_size,
1017 u64_to_user_ptr(report.bitmap));
1018 if (IS_ERR(iter))
1019 return PTR_ERR(iter);
1020
1021 ret = iova_bitmap_for_each(iter, device,
1022 vfio_device_log_read_and_clear);
1023
1024 iova_bitmap_free(iter);
1025 return ret;
1026 }
1027
1028 static int vfio_ioctl_device_feature(struct vfio_device *device,
1029 struct vfio_device_feature __user *arg)
1030 {
1031 size_t minsz = offsetofend(struct vfio_device_feature, flags);
1032 struct vfio_device_feature feature;
1033
1034 if (copy_from_user(&feature, arg, minsz))
1035 return -EFAULT;
1036
1037 if (feature.argsz < minsz)
1038 return -EINVAL;
1039
1040 /* Check unknown flags */
1041 if (feature.flags &
1042 ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
1043 VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
1044 return -EINVAL;
1045
1046 /* GET & SET are mutually exclusive except with PROBE */
1047 if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
1048 (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
1049 (feature.flags & VFIO_DEVICE_FEATURE_GET))
1050 return -EINVAL;
1051
1052 switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
1053 case VFIO_DEVICE_FEATURE_MIGRATION:
1054 return vfio_ioctl_device_feature_migration(
1055 device, feature.flags, arg->data,
1056 feature.argsz - minsz);
1057 case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
1058 return vfio_ioctl_device_feature_mig_device_state(
1059 device, feature.flags, arg->data,
1060 feature.argsz - minsz);
1061 case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
1062 return vfio_ioctl_device_feature_logging_start(
1063 device, feature.flags, arg->data,
1064 feature.argsz - minsz);
1065 case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
1066 return vfio_ioctl_device_feature_logging_stop(
1067 device, feature.flags, arg->data,
1068 feature.argsz - minsz);
1069 case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
1070 return vfio_ioctl_device_feature_logging_report(
1071 device, feature.flags, arg->data,
1072 feature.argsz - minsz);
1073 case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
1074 return vfio_ioctl_device_feature_migration_data_size(
1075 device, feature.flags, arg->data,
1076 feature.argsz - minsz);
1077 default:
1078 if (unlikely(!device->ops->device_feature))
1079 return -EINVAL;
1080 return device->ops->device_feature(device, feature.flags,
1081 arg->data,
1082 feature.argsz - minsz);
1083 }
1084 }
1085
1086 static long vfio_device_fops_unl_ioctl(struct file *filep,
1087 unsigned int cmd, unsigned long arg)
1088 {
1089 struct vfio_device *device = filep->private_data;
1090 int ret;
1091
1092 ret = vfio_device_pm_runtime_get(device);
1093 if (ret)
1094 return ret;
1095
1096 switch (cmd) {
1097 case VFIO_DEVICE_FEATURE:
1098 ret = vfio_ioctl_device_feature(device, (void __user *)arg);
1099 break;
1100
1101 default:
1102 if (unlikely(!device->ops->ioctl))
1103 ret = -EINVAL;
1104 else
1105 ret = device->ops->ioctl(device, cmd, arg);
1106 break;
1107 }
1108
1109 vfio_device_pm_runtime_put(device);
1110 return ret;
1111 }
1112
1113 static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
1114 size_t count, loff_t *ppos)
1115 {
1116 struct vfio_device *device = filep->private_data;
1117
1118 if (unlikely(!device->ops->read))
1119 return -EINVAL;
1120
1121 return device->ops->read(device, buf, count, ppos);
1122 }
1123
1124 static ssize_t vfio_device_fops_write(struct file *filep,
1125 const char __user *buf,
1126 size_t count, loff_t *ppos)
1127 {
1128 struct vfio_device *device = filep->private_data;
1129
1130 if (unlikely(!device->ops->write))
1131 return -EINVAL;
1132
1133 return device->ops->write(device, buf, count, ppos);
1134 }
1135
1136 static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
1137 {
1138 struct vfio_device *device = filep->private_data;
1139
1140 if (unlikely(!device->ops->mmap))
1141 return -EINVAL;
1142
1143 return device->ops->mmap(device, vma);
1144 }
1145
1146 const struct file_operations vfio_device_fops = {
1147 .owner = THIS_MODULE,
1148 .release = vfio_device_fops_release,
1149 .read = vfio_device_fops_read,
1150 .write = vfio_device_fops_write,
1151 .unlocked_ioctl = vfio_device_fops_unl_ioctl,
1152 .compat_ioctl = compat_ptr_ioctl,
1153 .mmap = vfio_device_fops_mmap,
1154 };
1155
1156 /*
1157 * Sub-module support
1158 */
1159 /*
1160 * Helper for managing a buffer of info chain capabilities: allocate or
1161 * reallocate a buffer with additional @size, filling in @id and @version
1162 * of the capability. A pointer to the new capability is returned.
1163 *
1164 * NB. The chain is based at the head of the buffer, so new entries are
1165 * added to the tail; vfio_info_cap_shift() should be called to fix up the
1166 * next offsets prior to copying to the user buffer.
1167 */
1168 struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
1169 size_t size, u16 id, u16 version)
1170 {
1171 void *buf;
1172 struct vfio_info_cap_header *header, *tmp;
1173
1174 buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
1175 if (!buf) {
1176 kfree(caps->buf);
1177 caps->buf = NULL;
1178 caps->size = 0;
1179 return ERR_PTR(-ENOMEM);
1180 }
1181
1182 caps->buf = buf;
1183 header = buf + caps->size;
1184
1185 /* Eventually copied to user buffer, zero */
1186 memset(header, 0, size);
1187
1188 header->id = id;
1189 header->version = version;
1190
1191 /* Add to the end of the capability chain */
1192 for (tmp = buf; tmp->next; tmp = buf + tmp->next)
1193 ; /* nothing */
1194
1195 tmp->next = caps->size;
1196 caps->size += size;
1197
1198 return header;
1199 }
1200 EXPORT_SYMBOL_GPL(vfio_info_cap_add);
1201
1202 void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
1203 {
1204 struct vfio_info_cap_header *tmp;
1205 void *buf = (void *)caps->buf;
1206
1207 for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
1208 tmp->next += offset;
1209 }
1210 EXPORT_SYMBOL(vfio_info_cap_shift);
1211
1212 int vfio_info_add_capability(struct vfio_info_cap *caps,
1213 struct vfio_info_cap_header *cap, size_t size)
1214 {
1215 struct vfio_info_cap_header *header;
1216
1217 header = vfio_info_cap_add(caps, size, cap->id, cap->version);
1218 if (IS_ERR(header))
1219 return PTR_ERR(header);
1220
1221 memcpy(header + 1, cap + 1, size - sizeof(*header));
1222
1223 return 0;
1224 }
1225 EXPORT_SYMBOL(vfio_info_add_capability);
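/*
 * Usage sketch (hypothetical capability "foo_cap" added to a region-info
 * reply): build the chain while assembling the reply, then shift the
 * chain offsets past the fixed-size info structure before copying both to
 * userspace.
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *
 *	ret = vfio_info_add_capability(&caps, &foo_cap.header,
 *				       sizeof(foo_cap));
 *	...
 *	if (caps.size) {
 *		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *		info.cap_offset = sizeof(info);
 *		vfio_info_cap_shift(&caps, sizeof(info));
 *		// copy info, then caps.buf at info.cap_offset, to userspace
 *		kfree(caps.buf);
 *	}
 */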
1226
1227 int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
1228 int max_irq_type, size_t *data_size)
1229 {
1230 unsigned long minsz;
1231 size_t size;
1232
1233 minsz = offsetofend(struct vfio_irq_set, count);
1234
1235 if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
1236 (hdr->count >= (U32_MAX - hdr->start)) ||
1237 (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
1238 VFIO_IRQ_SET_ACTION_TYPE_MASK)))
1239 return -EINVAL;
1240
1241 if (data_size)
1242 *data_size = 0;
1243
1244 if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
1245 return -EINVAL;
1246
1247 switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
1248 case VFIO_IRQ_SET_DATA_NONE:
1249 size = 0;
1250 break;
1251 case VFIO_IRQ_SET_DATA_BOOL:
1252 size = sizeof(uint8_t);
1253 break;
1254 case VFIO_IRQ_SET_DATA_EVENTFD:
1255 size = sizeof(int32_t);
1256 break;
1257 default:
1258 return -EINVAL;
1259 }
1260
1261 if (size) {
1262 if (hdr->argsz - minsz < hdr->count * size)
1263 return -EINVAL;
1264
1265 if (!data_size)
1266 return -EINVAL;
1267
1268 *data_size = hdr->count * size;
1269 }
1270
1271 return 0;
1272 }
1273 EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
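/*
 * Usage sketch from a driver's VFIO_DEVICE_SET_IRQS handler (the FOO_*
 * names are hypothetical): validate the header, then pull in the
 * variable-length data only when the data type carries a payload.
 *
 *	minsz = offsetofend(struct vfio_irq_set, count);
 *	if (copy_from_user(&hdr, (void __user *)arg, minsz))
 *		return -EFAULT;
 *
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, FOO_NUM_IRQS,
 *						 FOO_NUM_IRQ_TYPES, &data_size);
 *	if (ret)
 *		return ret;
 *
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + minsz), data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */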
1274
1275 /*
1276 * Pin contiguous user pages and return their associated host pages for local
1277 * domain only.
1278 * @device [in] : device
1279 * @iova [in] : starting IOVA of user pages to be pinned.
1280 * @npage [in] : count of pages to be pinned. This count should not
1281 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1282 * @prot [in] : protection flags
1283 * @pages[out] : array of host pages
1284 * Return error or number of pages pinned.
1285 *
1286 * A driver may only call this function if the vfio_device was created
1287 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
1288 */
1289 int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
1290 int npage, int prot, struct page **pages)
1291 {
1292 /* group->container cannot change while a vfio device is open */
1293 if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
1294 return -EINVAL;
1295 if (vfio_device_has_container(device))
1296 return vfio_device_container_pin_pages(device, iova,
1297 npage, prot, pages);
1298 if (device->iommufd_access) {
1299 int ret;
1300
1301 if (iova > ULONG_MAX)
1302 return -EINVAL;
1303 /*
1304 * VFIO ignores the sub page offset, npages is from the start of
1305 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
1306 * the sub page offset by doing:
1307 * pages[0] + (iova % PAGE_SIZE)
1308 */
1309 ret = iommufd_access_pin_pages(
1310 device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
1311 npage * PAGE_SIZE, pages,
1312 (prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
1313 if (ret)
1314 return ret;
1315 return npage;
1316 }
1317 return -EINVAL;
1318 }
1319 EXPORT_SYMBOL(vfio_pin_pages);
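/*
 * Usage sketch (hypothetical emulated-IOMMU driver "foo"; buf, len and
 * iova are illustrative): pin the page backing a guest IOVA, access it
 * through a temporary mapping, then unpin.  The sub-page offset must be
 * re-applied by the caller, as noted in the comment above.
 *
 *	struct page *page;
 *	void *va;
 *
 *	ret = vfio_pin_pages(&foo->vdev, iova, 1,
 *			     IOMMU_READ | IOMMU_WRITE, &page);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *
 *	va = kmap_local_page(page);
 *	memcpy(buf, va + offset_in_page(iova), len);	// len within the page
 *	kunmap_local(va);
 *
 *	vfio_unpin_pages(&foo->vdev, iova, 1);
 */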
1320
1321 /*
1322 * Unpin contiguous host pages for local domain only.
1323 * @device [in] : device
1324 * @iova [in] : starting address of user pages to be unpinned.
1325 * @npage [in] : count of pages to be unpinned. This count should not
1326 * be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
1327 */
1328 void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
1329 {
1330 if (WARN_ON(!vfio_assert_device_open(device)))
1331 return;
1332
1333 if (vfio_device_has_container(device)) {
1334 vfio_device_container_unpin_pages(device, iova, npage);
1335 return;
1336 }
1337 if (device->iommufd_access) {
1338 if (WARN_ON(iova > ULONG_MAX))
1339 return;
1340 iommufd_access_unpin_pages(device->iommufd_access,
1341 ALIGN_DOWN(iova, PAGE_SIZE),
1342 npage * PAGE_SIZE);
1343 return;
1344 }
1345 }
1346 EXPORT_SYMBOL(vfio_unpin_pages);
1347
1348 /*
1349 * This interface allows the CPUs to perform a form of virtual DMA on
1350 * behalf of the device.
1351 *
1352 * The CPU copies data between a kernel buffer and a range of IOVAs that
1353 * map user space memory.
1354 *
1355 * As the read/write of user space memory is conducted via the CPUs and is
1356 * not a real device DMA, it is not necessary to pin the user space memory.
1357 *
1358 * @device [in] : VFIO device
1359 * @iova [in] : base IOVA of a user space buffer
1360 * @data [in] : pointer to kernel buffer
1361 * @len [in] : kernel buffer length
1362 * @write : indicate read or write
1363 * Return error code on failure or 0 on success.
1364 */
1365 int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
1366 size_t len, bool write)
1367 {
1368 if (!data || len <= 0 || !vfio_assert_device_open(device))
1369 return -EINVAL;
1370
1371 if (vfio_device_has_container(device))
1372 return vfio_device_container_dma_rw(device, iova,
1373 data, len, write);
1374
1375 if (device->iommufd_access) {
1376 unsigned int flags = 0;
1377
1378 if (iova > ULONG_MAX)
1379 return -EINVAL;
1380
1381 /* VFIO historically tries to auto-detect a kthread */
1382 if (!current->mm)
1383 flags |= IOMMUFD_ACCESS_RW_KTHREAD;
1384 if (write)
1385 flags |= IOMMUFD_ACCESS_RW_WRITE;
1386 return iommufd_access_rw(device->iommufd_access, iova, data,
1387 len, flags);
1388 }
1389 return -EINVAL;
1390 }
1391 EXPORT_SYMBOL(vfio_dma_rw);
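/*
 * Usage sketch (hypothetical descriptor and IOVA): read a small structure
 * out of the user/guest address space by IOVA.
 *
 *	struct foo_desc desc;
 *
 *	ret = vfio_dma_rw(&foo->vdev, desc_iova, &desc, sizeof(desc),
 *			  false);	// false == read from the IOVA range
 *	if (ret)
 *		return ret;
 */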
1392
1393 /*
1394 * Module/class support
1395 */
1396 static int __init vfio_init(void)
1397 {
1398 int ret;
1399
1400 ida_init(&vfio.device_ida);
1401
1402 ret = vfio_group_init();
1403 if (ret)
1404 return ret;
1405
1406 ret = vfio_virqfd_init();
1407 if (ret)
1408 goto err_virqfd;
1409
1410 /* /sys/class/vfio-dev/vfioX */
1411 vfio.device_class = class_create(THIS_MODULE, "vfio-dev");
1412 if (IS_ERR(vfio.device_class)) {
1413 ret = PTR_ERR(vfio.device_class);
1414 goto err_dev_class;
1415 }
1416
1417 pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
1418 return 0;
1419
1420 err_dev_class:
1421 vfio_virqfd_exit();
1422 err_virqfd:
1423 vfio_group_cleanup();
1424 return ret;
1425 }
1426
1427 static void __exit vfio_cleanup(void)
1428 {
1429 ida_destroy(&vfio.device_ida);
1430 class_destroy(vfio.device_class);
1431 vfio.device_class = NULL;
1432 vfio_virqfd_exit();
1433 vfio_group_cleanup();
1434 xa_destroy(&vfio_device_set_xa);
1435 }
1436
1437 module_init(vfio_init);
1438 module_exit(vfio_cleanup);
1439
1440 MODULE_VERSION(DRIVER_VERSION);
1441 MODULE_LICENSE("GPL v2");
1442 MODULE_AUTHOR(DRIVER_AUTHOR);
1443 MODULE_DESCRIPTION(DRIVER_DESC);
1444 MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
1445