// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"
#include "zcrx.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

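/*
 * IORING_REGISTER_PROBE: fill in a struct io_uring_probe describing which
 * opcodes this kernel supports and copy it back to userspace.
 */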
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}

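/*
 * IORING_REGISTER_PERSONALITY: stash a reference to the current task's
 * credentials and hand back an id that can later be set in sqe->personality
 * to issue requests with those credentials.
 */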
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

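/*
 * Parse an array of struct io_uring_restriction from userspace into the
 * opcode bitmaps and sqe flag masks of @restrictions.
 */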
static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
					struct io_restriction *restrictions)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = -EINVAL;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST)
				goto err;
			__set_bit(res[i].register_op, restrictions->register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST)
				goto err;
			__set_bit(res[i].sqe_op, restrictions->sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			restrictions->sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			restrictions->sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			goto err;
		}
	}

	ret = 0;

err:
	kfree(res);
	return ret;
}

static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	int ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;
	return ret;
}

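/*
 * IORING_REGISTER_ENABLE_RINGS: enable a ring that was created with
 * IORING_SETUP_R_DISABLED, applying any registered restrictions and waking
 * up a waiting SQPOLL thread.
 */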
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if it was polled before
		 * submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

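/*
 * Apply @new_mask as the CPU affinity for the current task's io-wq workers,
 * or for the SQPOLL thread if the ring uses one. A NULL mask, as passed by
 * io_unregister_iowq_aff(), clears a previously registered affinity.
 */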
static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

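/*
 * IORING_REGISTER_IOWQ_MAX_WORKERS: set the maximum number of bounded and
 * unbounded io-wq workers. A zero entry leaves that limit unchanged, and the
 * previous values are copied back to userspace.
 */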
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			struct task_struct *tsk;

			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			tsk = sqpoll_task_locked(sqd);
			if (tsk)
				tctx = tsk->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

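/*
 * IORING_REGISTER_CLOCK: select which clock source (CLOCK_MONOTONIC or
 * CLOCK_BOOTTIME) the ring uses when timing waits for completions.
 */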
static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}

/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
	struct io_rings *rings;
	struct io_uring_sqe *sq_sqes;

	struct io_mapped_region sq_region;
	struct io_mapped_region ring_region;
};

static void io_register_free_rings(struct io_ring_ctx *ctx,
				   struct io_uring_params *p,
				   struct io_ring_ctx_rings *r)
{
	io_free_region(ctx, &r->sq_region);
	io_free_region(ctx, &r->ring_region);
}

#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)

#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)

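/*
 * IORING_REGISTER_RESIZE_RINGS: allocate new SQ/CQ rings with the requested
 * sizes, copy over any pending SQEs/CQEs, and swap the new rings in under
 * ctx->mmap_lock and the completion lock. The old rings are freed once the
 * swap has completed.
 */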
static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_region_desc rd;
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	size_t size, sq_array_offset;
	unsigned i, tail, old_head;
	struct io_uring_params p;
	int ret;

	/* for single issuer, must be owner resizing */
	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER &&
	    current != ctx->submitter_task)
		return -EEXIST;
	/* limited to DEFER_TASKRUN for now */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (copy_from_user(&p, arg, sizeof(p)))
		return -EFAULT;
	if (p.flags & ~RESIZE_FLAGS)
		return -EINVAL;

	/* properties that are always inherited */
	p.flags |= (ctx->flags & COPY_FLAGS);

	ret = io_uring_fill_params(p.sq_entries, &p);
	if (unlikely(ret))
		return ret;

	/* nothing to do, but copy params back */
	if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
		if (copy_to_user(arg, &p, sizeof(p)))
			return -EFAULT;
		return 0;
	}

	size = rings_size(p.flags, p.sq_entries, p.cq_entries,
			  &sq_array_offset);
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(size);
	if (p.flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p.cq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
	if (ret) {
		io_register_free_rings(ctx, &p, &n);
		return ret;
	}
	n.rings = io_region_get_ptr(&n.ring_region);

	/*
	 * At this point n.rings is shared with userspace, just like o.rings
	 * is as well. While we don't expect userspace to modify it while
	 * a resize is in progress, and it's most likely that userspace will
	 * shoot itself in the foot if it does, we can't always assume good
	 * intent... Use read/write once helpers from here on to indicate the
	 * shared nature of it.
	 */
	WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1);
	WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1);
	WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries);
	WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries);

	if (copy_to_user(arg, &p, sizeof(p))) {
		io_register_free_rings(ctx, &p, &n);
		return -EFAULT;
	}

	if (p.flags & IORING_SETUP_SQE128)
		size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
	else
		size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
	if (size == SIZE_MAX) {
		io_register_free_rings(ctx, &p, &n);
		return -EOVERFLOW;
	}

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(size);
	if (p.flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p.sq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
	if (ret) {
		io_register_free_rings(ctx, &p, &n);
		return ret;
	}
	n.sq_sqes = io_region_get_ptr(&n.sq_region);

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}

	/*
	 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
	 * any new mmap's on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs to grab the
	 * ctx->mmap_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->mmap_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	ctx->rings = NULL;
	o.sq_sqes = ctx->sq_sqes;
	ctx->sq_sqes = NULL;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
	 */
	tail = READ_ONCE(o.rings->sq.tail);
	old_head = READ_ONCE(o.rings->sq.head);
	if (tail - old_head > p.sq_entries)
		goto overflow;
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->sq_entries - 1);
		unsigned dst_head = i & (p.sq_entries - 1);

		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
	}
	WRITE_ONCE(n.rings->sq.head, old_head);
	WRITE_ONCE(n.rings->sq.tail, tail);

	tail = READ_ONCE(o.rings->cq.tail);
	old_head = READ_ONCE(o.rings->cq.head);
	if (tail - old_head > p.cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->cq_entries - 1);
		unsigned dst_head = i & (p.cq_entries - 1);

		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
	}
	WRITE_ONCE(n.rings->cq.head, old_head);
	WRITE_ONCE(n.rings->cq.tail, tail);
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
	atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);

	ctx->sq_entries = p.sq_entries;
	ctx->cq_entries = p.cq_entries;

	ctx->rings = n.rings;
	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, ring_region);
	swap_old(ctx, o, n, sq_region);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->mmap_lock);
	io_register_free_rings(ctx, &p, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);

	return ret;
}

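/*
 * IORING_REGISTER_MEM_REGION: register a user-provided memory region with
 * the ring. If IORING_MEM_REGION_REG_WAIT_ARG is set, the region is used
 * for extended CQ wait arguments, which is only allowed while the ring is
 * still disabled.
 */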
static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd,
					 IORING_MAP_OFF_PARAM_REGION);
	if (ret)
		return ret;
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		io_free_region(ctx, &ctx->param_region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
		ctx->cq_wait_size = rd.size;
	}
	return 0;
}

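/*
 * Dispatch a single register opcode. Called with ctx->uring_lock held;
 * individual handlers may temporarily drop and re-acquire it.
 */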
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_ZCRX_IFQ:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_zcrx_ifq(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Given an 'fd' value, return the file associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (file)
			get_file(file);
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}

/*
 * "blind" registration opcodes are ones where there's no ring given, and
 * hence the source fd must be -1.
 */
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
				   unsigned int nr_args)
{
	switch (opcode) {
	case IORING_REGISTER_SEND_MSG_RING: {
		struct io_uring_sqe sqe;

		if (!arg || nr_args != 1)
			return -EINVAL;
		if (copy_from_user(&sqe, arg, sizeof(sqe)))
			return -EFAULT;
		/* no flags supported */
		if (sqe.flags)
			return -EINVAL;
		if (sqe.opcode == IORING_OP_MSG_RING)
			return io_uring_sync_msg_ring(&sqe);
		}
	}

	return -EINVAL;
}

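/*
 * io_uring_register() syscall entry point: resolve the ring file (or handle
 * "blind" opcodes that take fd == -1), then dispatch the opcode with
 * ctx->uring_lock held.
 */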
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);

	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	mutex_unlock(&ctx->uring_lock);

	fput(file);
	return ret;
}