// SPDX-License-Identifier: GPL-2.0
/*
 * Code related to the io_uring_register() syscall
 *
 * Copyright (C) 2023 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/syscalls.h>
#include <linux/refcount.h>
#include <linux/bits.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/nospec.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/io_uring_types.h>

#include "io_uring.h"
#include "opdef.h"
#include "tctx.h"
#include "rsrc.h"
#include "sqpoll.h"
#include "register.h"
#include "cancel.h"
#include "kbuf.h"
#include "napi.h"
#include "eventfd.h"
#include "msg_ring.h"
#include "memmap.h"
#include "zcrx.h"

#define IORING_MAX_RESTRICTIONS	(IORING_RESTRICTION_LAST + \
				 IORING_REGISTER_LAST + IORING_OP_LAST)

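/*
 * Usage sketch (not part of this file's code): querying opcode support from
 * userspace, assuming an io_uring_register() wrapper around
 * syscall(__NR_io_uring_register, ...) and the UAPI definitions from
 * <linux/io_uring.h>. The probe buffer must be zero-filled on entry.
 *
 *	struct io_uring_probe *p;
 *
 *	p = calloc(1, sizeof(*p) + IORING_OP_LAST * sizeof(p->ops[0]));
 *	io_uring_register(ring_fd, IORING_REGISTER_PROBE, p, IORING_OP_LAST);
 *	for (int i = 0; i < p->ops_len; i++)
 *		if (p->ops[i].flags & IO_URING_OP_SUPPORTED)
 *			printf("op %u supported\n", p->ops[i].op);
 */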
static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg,
			   unsigned nr_args)
{
	struct io_uring_probe *p;
	size_t size;
	int i, ret;

	if (nr_args > IORING_OP_LAST)
		nr_args = IORING_OP_LAST;

	size = struct_size(p, ops, nr_args);
	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	ret = -EFAULT;
	if (copy_from_user(p, arg, size))
		goto out;
	ret = -EINVAL;
	if (memchr_inv(p, 0, size))
		goto out;

	p->last_op = IORING_OP_LAST - 1;

	for (i = 0; i < nr_args; i++) {
		p->ops[i].op = i;
		if (io_uring_op_supported(i))
			p->ops[i].flags = IO_URING_OP_SUPPORTED;
	}
	p->ops_len = i;

	ret = 0;
	if (copy_to_user(arg, p, size))
		ret = -EFAULT;
out:
	kfree(p);
	return ret;
}

int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id)
{
	const struct cred *creds;

	creds = xa_erase(&ctx->personalities, id);
	if (creds) {
		put_cred(creds);
		return 0;
	}

	return -EINVAL;
}


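/*
 * Usage sketch (same io_uring_register() wrapper assumed as above):
 * IORING_REGISTER_PERSONALITY takes no argument and returns an id that can
 * later be placed in sqe->personality so the request runs with the
 * credentials captured here; IORING_UNREGISTER_PERSONALITY passes that id
 * back via nr_args.
 *
 *	int id = io_uring_register(ring_fd, IORING_REGISTER_PERSONALITY, NULL, 0);
 *
 *	sqe->personality = id;
 *	...
 *	io_uring_register(ring_fd, IORING_UNREGISTER_PERSONALITY, NULL, id);
 */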
static int io_register_personality(struct io_ring_ctx *ctx)
{
	const struct cred *creds;
	u32 id;
	int ret;

	creds = get_current_cred();

	ret = xa_alloc_cyclic(&ctx->personalities, &id, (void *)creds,
			XA_LIMIT(0, USHRT_MAX), &ctx->pers_next, GFP_KERNEL);
	if (ret < 0) {
		put_cred(creds);
		return ret;
	}
	return id;
}

static __cold int io_parse_restrictions(void __user *arg, unsigned int nr_args,
					struct io_restriction *restrictions)
{
	struct io_uring_restriction *res;
	size_t size;
	int i, ret;

	if (!arg || nr_args > IORING_MAX_RESTRICTIONS)
		return -EINVAL;

	size = array_size(nr_args, sizeof(*res));
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	res = memdup_user(arg, size);
	if (IS_ERR(res))
		return PTR_ERR(res);

	ret = -EINVAL;

	for (i = 0; i < nr_args; i++) {
		switch (res[i].opcode) {
		case IORING_RESTRICTION_REGISTER_OP:
			if (res[i].register_op >= IORING_REGISTER_LAST)
				goto err;
			__set_bit(res[i].register_op, restrictions->register_op);
			break;
		case IORING_RESTRICTION_SQE_OP:
			if (res[i].sqe_op >= IORING_OP_LAST)
				goto err;
			__set_bit(res[i].sqe_op, restrictions->sqe_op);
			break;
		case IORING_RESTRICTION_SQE_FLAGS_ALLOWED:
			restrictions->sqe_flags_allowed = res[i].sqe_flags;
			break;
		case IORING_RESTRICTION_SQE_FLAGS_REQUIRED:
			restrictions->sqe_flags_required = res[i].sqe_flags;
			break;
		default:
			goto err;
		}
	}

	ret = 0;

err:
	kfree(res);
	return ret;
}

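/*
 * Usage sketch (same io_uring_register() wrapper assumed): restrictions can
 * only be installed while the ring is still disabled, i.e. created with
 * IORING_SETUP_R_DISABLED. For example, allowing just IORING_OP_READ plus
 * the enable-rings register op:
 *
 *	struct io_uring_restriction res[2] = {
 *		{ .opcode = IORING_RESTRICTION_SQE_OP,
 *		  .sqe_op = IORING_OP_READ },
 *		{ .opcode = IORING_RESTRICTION_REGISTER_OP,
 *		  .register_op = IORING_REGISTER_ENABLE_RINGS },
 *	};
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_RESTRICTIONS, res, 2);
 */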
static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
					   void __user *arg, unsigned int nr_args)
{
	int ret;

	/* Restrictions allowed only if rings started disabled */
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	/* We allow only a single restrictions registration */
	if (ctx->restrictions.registered)
		return -EBUSY;

	ret = io_parse_restrictions(arg, nr_args, &ctx->restrictions);
	/* Reset all restrictions if an error happened */
	if (ret != 0)
		memset(&ctx->restrictions, 0, sizeof(ctx->restrictions));
	else
		ctx->restrictions.registered = true;
	return ret;
}

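/*
 * Usage sketch (same io_uring_register() wrapper assumed): once any
 * restrictions are in place, the application enables a ring that was created
 * with IORING_SETUP_R_DISABLED via:
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_ENABLE_RINGS, NULL, 0);
 */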
static int io_register_enable_rings(struct io_ring_ctx *ctx)
{
	if (!(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EBADFD;

	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER && !ctx->submitter_task) {
		WRITE_ONCE(ctx->submitter_task, get_task_struct(current));
		/*
		 * Lazy activation attempts would fail if the ring was polled
		 * before submitter_task is set.
		 */
		if (wq_has_sleeper(&ctx->poll_wq))
			io_activate_pollwq(ctx);
	}

	if (ctx->restrictions.registered)
		ctx->restricted = 1;

	ctx->flags &= ~IORING_SETUP_R_DISABLED;
	if (ctx->sq_data && wq_has_sleeper(&ctx->sq_data->wait))
		wake_up(&ctx->sq_data->wait);
	return 0;
}

static __cold int __io_register_iowq_aff(struct io_ring_ctx *ctx,
					 cpumask_var_t new_mask)
{
	int ret;

	if (!(ctx->flags & IORING_SETUP_SQPOLL)) {
		ret = io_wq_cpu_affinity(current->io_uring, new_mask);
	} else {
		mutex_unlock(&ctx->uring_lock);
		ret = io_sqpoll_wq_cpu_affinity(ctx, new_mask);
		mutex_lock(&ctx->uring_lock);
	}

	return ret;
}

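/*
 * Usage sketch (same io_uring_register() wrapper assumed): the argument is a
 * raw CPU bitmap and nr_args is its size in bytes, so pinning io-wq (or the
 * SQPOLL thread's workers) to CPUs 0-1 could look like:
 *
 *	cpu_set_t mask;
 *
 *	CPU_ZERO(&mask);
 *	CPU_SET(0, &mask);
 *	CPU_SET(1, &mask);
 *	io_uring_register(ring_fd, IORING_REGISTER_IOWQ_AFF, &mask, sizeof(mask));
 */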
static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx,
				       void __user *arg, unsigned len)
{
	cpumask_var_t new_mask;
	int ret;

	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(new_mask);
	if (len > cpumask_size())
		len = cpumask_size();

#ifdef CONFIG_COMPAT
	if (in_compat_syscall())
		ret = compat_get_bitmap(cpumask_bits(new_mask),
					(const compat_ulong_t __user *)arg,
					len * 8 /* CHAR_BIT */);
	else
#endif
		ret = copy_from_user(new_mask, arg, len);

	if (ret) {
		free_cpumask_var(new_mask);
		return -EFAULT;
	}

	ret = __io_register_iowq_aff(ctx, new_mask);
	free_cpumask_var(new_mask);
	return ret;
}

static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx)
{
	return __io_register_iowq_aff(ctx, NULL);
}

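/*
 * Usage sketch (same io_uring_register() wrapper assumed): the argument is a
 * pair of __u32 counts, [0] for bounded and [1] for unbounded io-wq workers.
 * A zero entry leaves that limit untouched, and the values in effect before
 * the call are copied back:
 *
 *	unsigned int counts[2] = { 8, 0 };	// cap bounded workers at 8
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_IOWQ_MAX_WORKERS, counts, 2);
 *	// counts[] now holds the previous limits
 */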
static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
					       void __user *arg)
	__must_hold(&ctx->uring_lock)
{
	struct io_tctx_node *node;
	struct io_uring_task *tctx = NULL;
	struct io_sq_data *sqd = NULL;
	__u32 new_count[2];
	int i, ret;

	if (copy_from_user(new_count, arg, sizeof(new_count)))
		return -EFAULT;
	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i] > INT_MAX)
			return -EINVAL;

	if (ctx->flags & IORING_SETUP_SQPOLL) {
		sqd = ctx->sq_data;
		if (sqd) {
			struct task_struct *tsk;

			/*
			 * Observe the correct sqd->lock -> ctx->uring_lock
			 * ordering. Fine to drop uring_lock here, we hold
			 * a ref to the ctx.
			 */
			refcount_inc(&sqd->refs);
			mutex_unlock(&ctx->uring_lock);
			mutex_lock(&sqd->lock);
			mutex_lock(&ctx->uring_lock);
			tsk = sqpoll_task_locked(sqd);
			if (tsk)
				tctx = tsk->io_uring;
		}
	} else {
		tctx = current->io_uring;
	}

	BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits));

	for (i = 0; i < ARRAY_SIZE(new_count); i++)
		if (new_count[i])
			ctx->iowq_limits[i] = new_count[i];
	ctx->iowq_limits_set = true;

	if (tctx && tctx->io_wq) {
		ret = io_wq_max_workers(tctx->io_wq, new_count);
		if (ret)
			goto err;
	} else {
		memset(new_count, 0, sizeof(new_count));
	}

	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}

	if (copy_to_user(arg, new_count, sizeof(new_count)))
		return -EFAULT;

	/* that's it for SQPOLL, only the SQPOLL task creates requests */
	if (sqd)
		return 0;

	/* now propagate the restriction to all registered users */
	list_for_each_entry(node, &ctx->tctx_list, ctx_node) {
		tctx = node->task->io_uring;
		if (WARN_ON_ONCE(!tctx->io_wq))
			continue;

		for (i = 0; i < ARRAY_SIZE(new_count); i++)
			new_count[i] = ctx->iowq_limits[i];
		/* ignore errors, it always returns zero anyway */
		(void)io_wq_max_workers(tctx->io_wq, new_count);
	}
	return 0;
err:
	if (sqd) {
		mutex_unlock(&ctx->uring_lock);
		mutex_unlock(&sqd->lock);
		io_put_sq_data(sqd);
		mutex_lock(&ctx->uring_lock);
	}
	return ret;
}

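/*
 * Usage sketch (same io_uring_register() wrapper assumed): selecting the
 * clock used for wait timeouts; only CLOCK_MONOTONIC and CLOCK_BOOTTIME are
 * accepted, the reserved fields must stay zero, and nr_args must be 0:
 *
 *	struct io_uring_clock_register reg = {
 *		.clockid = CLOCK_BOOTTIME,
 *	};
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_CLOCK, &reg, 0);
 */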
static int io_register_clock(struct io_ring_ctx *ctx,
			     struct io_uring_clock_register __user *arg)
{
	struct io_uring_clock_register reg;

	if (copy_from_user(&reg, arg, sizeof(reg)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;

	switch (reg.clockid) {
	case CLOCK_MONOTONIC:
		ctx->clock_offset = 0;
		break;
	case CLOCK_BOOTTIME:
		ctx->clock_offset = TK_OFFS_BOOT;
		break;
	default:
		return -EINVAL;
	}

	ctx->clockid = reg.clockid;
	return 0;
}

/*
 * State to maintain until we can swap. Both new and old state, used for
 * either mapping or freeing.
 */
struct io_ring_ctx_rings {
	struct io_rings *rings;
	struct io_uring_sqe *sq_sqes;

	struct io_mapped_region sq_region;
	struct io_mapped_region ring_region;
};

static void io_register_free_rings(struct io_ring_ctx *ctx,
				   struct io_uring_params *p,
				   struct io_ring_ctx_rings *r)
{
	io_free_region(ctx, &r->sq_region);
	io_free_region(ctx, &r->ring_region);
}

#define swap_old(ctx, o, n, field)		\
	do {					\
		(o).field = (ctx)->field;	\
		(ctx)->field = (n).field;	\
	} while (0)

#define RESIZE_FLAGS	(IORING_SETUP_CQSIZE | IORING_SETUP_CLAMP)
#define COPY_FLAGS	(IORING_SETUP_NO_SQARRAY | IORING_SETUP_SQE128 | \
			 IORING_SETUP_CQE32 | IORING_SETUP_NO_MMAP)

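/*
 * Usage sketch (same io_uring_register() wrapper assumed): resizing passes a
 * struct io_uring_params with the desired sq_entries/cq_entries. The ring
 * must use IORING_SETUP_DEFER_TASKRUN, only CQSIZE/CLAMP may be set in
 * flags, and the effective sizes are copied back on success:
 *
 *	struct io_uring_params p = {
 *		.sq_entries = 256,
 *		.cq_entries = 1024,
 *		.flags = IORING_SETUP_CQSIZE,
 *	};
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_RESIZE_RINGS, &p, 1);
 */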
static int io_register_resize_rings(struct io_ring_ctx *ctx, void __user *arg)
{
	struct io_uring_region_desc rd;
	struct io_ring_ctx_rings o = { }, n = { }, *to_free = NULL;
	size_t size, sq_array_offset;
	unsigned i, tail, old_head;
	struct io_uring_params p;
	int ret;

	/* for single issuer, must be owner resizing */
	if (ctx->flags & IORING_SETUP_SINGLE_ISSUER &&
	    current != ctx->submitter_task)
		return -EEXIST;
	/* limited to DEFER_TASKRUN for now */
	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
		return -EINVAL;
	if (copy_from_user(&p, arg, sizeof(p)))
		return -EFAULT;
	if (p.flags & ~RESIZE_FLAGS)
		return -EINVAL;

	/* properties that are always inherited */
	p.flags |= (ctx->flags & COPY_FLAGS);

	ret = io_uring_fill_params(p.sq_entries, &p);
	if (unlikely(ret))
		return ret;

	/* nothing to do, but copy params back */
	if (p.sq_entries == ctx->sq_entries && p.cq_entries == ctx->cq_entries) {
		if (copy_to_user(arg, &p, sizeof(p)))
			return -EFAULT;
		return 0;
	}

	size = rings_size(p.flags, p.sq_entries, p.cq_entries,
				&sq_array_offset);
	if (size == SIZE_MAX)
		return -EOVERFLOW;

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(size);
	if (p.flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p.cq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &n.ring_region, &rd, IORING_OFF_CQ_RING);
	if (ret) {
		io_register_free_rings(ctx, &p, &n);
		return ret;
	}
	n.rings = io_region_get_ptr(&n.ring_region);

	/*
	 * At this point n.rings is shared with userspace, just like o.rings
	 * is as well. While we don't expect userspace to modify it while
	 * a resize is in progress, and it's most likely that userspace will
	 * shoot itself in the foot if it does, we can't always assume good
	 * intent... Use read/write once helpers from here on to indicate the
	 * shared nature of it.
	 */
	WRITE_ONCE(n.rings->sq_ring_mask, p.sq_entries - 1);
	WRITE_ONCE(n.rings->cq_ring_mask, p.cq_entries - 1);
	WRITE_ONCE(n.rings->sq_ring_entries, p.sq_entries);
	WRITE_ONCE(n.rings->cq_ring_entries, p.cq_entries);

	if (copy_to_user(arg, &p, sizeof(p))) {
		io_register_free_rings(ctx, &p, &n);
		return -EFAULT;
	}

	if (p.flags & IORING_SETUP_SQE128)
		size = array_size(2 * sizeof(struct io_uring_sqe), p.sq_entries);
	else
		size = array_size(sizeof(struct io_uring_sqe), p.sq_entries);
	if (size == SIZE_MAX) {
		io_register_free_rings(ctx, &p, &n);
		return -EOVERFLOW;
	}

	memset(&rd, 0, sizeof(rd));
	rd.size = PAGE_ALIGN(size);
	if (p.flags & IORING_SETUP_NO_MMAP) {
		rd.user_addr = p.sq_off.user_addr;
		rd.flags |= IORING_MEM_REGION_TYPE_USER;
	}
	ret = io_create_region_mmap_safe(ctx, &n.sq_region, &rd, IORING_OFF_SQES);
	if (ret) {
		io_register_free_rings(ctx, &p, &n);
		return ret;
	}
	n.sq_sqes = io_region_get_ptr(&n.sq_region);

	/*
	 * If using SQPOLL, park the thread
	 */
	if (ctx->sq_data) {
		mutex_unlock(&ctx->uring_lock);
		io_sq_thread_park(ctx->sq_data);
		mutex_lock(&ctx->uring_lock);
	}

	/*
	 * We'll do the swap. Grab the ctx->mmap_lock, which will exclude
	 * any new mmaps on the ring fd. Clear out existing mappings to prevent
	 * mmap from seeing them, as we'll unmap them. Any attempt to mmap
	 * existing rings beyond this point will fail. Not that it could proceed
	 * at this point anyway, as the io_uring mmap side needs to grab the
	 * ctx->mmap_lock as well. Likewise, hold the completion lock over the
	 * duration of the actual swap.
	 */
	mutex_lock(&ctx->mmap_lock);
	spin_lock(&ctx->completion_lock);
	o.rings = ctx->rings;
	ctx->rings = NULL;
	o.sq_sqes = ctx->sq_sqes;
	ctx->sq_sqes = NULL;

	/*
	 * Now copy SQ and CQ entries, if any. If either of the destination
	 * rings can't hold what is already there, then fail the operation.
	 */
	tail = READ_ONCE(o.rings->sq.tail);
	old_head = READ_ONCE(o.rings->sq.head);
	if (tail - old_head > p.sq_entries)
		goto overflow;
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->sq_entries - 1);
		unsigned dst_head = i & (p.sq_entries - 1);

		n.sq_sqes[dst_head] = o.sq_sqes[src_head];
	}
	WRITE_ONCE(n.rings->sq.head, old_head);
	WRITE_ONCE(n.rings->sq.tail, tail);

	tail = READ_ONCE(o.rings->cq.tail);
	old_head = READ_ONCE(o.rings->cq.head);
	if (tail - old_head > p.cq_entries) {
overflow:
		/* restore old rings, and return -EOVERFLOW via cleanup path */
		ctx->rings = o.rings;
		ctx->sq_sqes = o.sq_sqes;
		to_free = &n;
		ret = -EOVERFLOW;
		goto out;
	}
	for (i = old_head; i < tail; i++) {
		unsigned src_head = i & (ctx->cq_entries - 1);
		unsigned dst_head = i & (p.cq_entries - 1);

		n.rings->cqes[dst_head] = o.rings->cqes[src_head];
	}
	WRITE_ONCE(n.rings->cq.head, old_head);
	WRITE_ONCE(n.rings->cq.tail, tail);
	/* invalidate cached cqe refill */
	ctx->cqe_cached = ctx->cqe_sentinel = NULL;

	WRITE_ONCE(n.rings->sq_dropped, READ_ONCE(o.rings->sq_dropped));
	atomic_set(&n.rings->sq_flags, atomic_read(&o.rings->sq_flags));
	WRITE_ONCE(n.rings->cq_flags, READ_ONCE(o.rings->cq_flags));
	WRITE_ONCE(n.rings->cq_overflow, READ_ONCE(o.rings->cq_overflow));

	/* all done, store old pointers and assign new ones */
	if (!(ctx->flags & IORING_SETUP_NO_SQARRAY))
		ctx->sq_array = (u32 *)((char *)n.rings + sq_array_offset);

	ctx->sq_entries = p.sq_entries;
	ctx->cq_entries = p.cq_entries;

	ctx->rings = n.rings;
	ctx->sq_sqes = n.sq_sqes;
	swap_old(ctx, o, n, ring_region);
	swap_old(ctx, o, n, sq_region);
	to_free = &o;
	ret = 0;
out:
	spin_unlock(&ctx->completion_lock);
	mutex_unlock(&ctx->mmap_lock);
	io_register_free_rings(ctx, &p, to_free);

	if (ctx->sq_data)
		io_sq_thread_unpark(ctx->sq_data);

	return ret;
}

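/*
 * Usage sketch (same io_uring_register() wrapper assumed): registering a
 * parameter memory region, here letting the kernel allocate the backing
 * memory and flagging it so it can carry wait arguments. Note that the
 * WAIT_ARG flag is only accepted while the ring is still disabled.
 *
 *	struct io_uring_region_desc rd = { .size = 4096 };
 *	struct io_uring_mem_region_reg mr = {
 *		.region_uptr = (__u64)(uintptr_t)&rd,
 *		.flags = IORING_MEM_REGION_REG_WAIT_ARG,
 *	};
 *
 *	io_uring_register(ring_fd, IORING_REGISTER_MEM_REGION, &mr, 1);
 */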
static int io_register_mem_region(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct io_uring_mem_region_reg __user *reg_uptr = uarg;
	struct io_uring_mem_region_reg reg;
	struct io_uring_region_desc __user *rd_uptr;
	struct io_uring_region_desc rd;
	int ret;

	if (io_region_is_set(&ctx->param_region))
		return -EBUSY;
	if (copy_from_user(&reg, reg_uptr, sizeof(reg)))
		return -EFAULT;
	rd_uptr = u64_to_user_ptr(reg.region_uptr);
	if (copy_from_user(&rd, rd_uptr, sizeof(rd)))
		return -EFAULT;
	if (memchr_inv(&reg.__resv, 0, sizeof(reg.__resv)))
		return -EINVAL;
	if (reg.flags & ~IORING_MEM_REGION_REG_WAIT_ARG)
		return -EINVAL;

	/*
	 * This ensures there are no waiters. Waiters are unlocked and it's
	 * hard to synchronise with them, especially if we need to initialise
	 * the region.
	 */
	if ((reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) &&
	    !(ctx->flags & IORING_SETUP_R_DISABLED))
		return -EINVAL;

	ret = io_create_region_mmap_safe(ctx, &ctx->param_region, &rd,
					 IORING_MAP_OFF_PARAM_REGION);
	if (ret)
		return ret;
	if (copy_to_user(rd_uptr, &rd, sizeof(rd))) {
		io_free_region(ctx, &ctx->param_region);
		return -EFAULT;
	}

	if (reg.flags & IORING_MEM_REGION_REG_WAIT_ARG) {
		ctx->cq_wait_arg = io_region_get_ptr(&ctx->param_region);
		ctx->cq_wait_size = rd.size;
	}
	return 0;
}

static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
			       void __user *arg, unsigned nr_args)
	__releases(ctx->uring_lock)
	__acquires(ctx->uring_lock)
{
	int ret;

	/*
	 * We don't quiesce the refs for register anymore and so it can't be
	 * dying as we're holding a file ref here.
	 */
	if (WARN_ON_ONCE(percpu_ref_is_dying(&ctx->refs)))
		return -ENXIO;

	if (ctx->submitter_task && ctx->submitter_task != current)
		return -EEXIST;

	if (ctx->restricted) {
		opcode = array_index_nospec(opcode, IORING_REGISTER_LAST);
		if (!test_bit(opcode, ctx->restrictions.register_op))
			return -EACCES;
	}

	switch (opcode) {
	case IORING_REGISTER_BUFFERS:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_buffers_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_BUFFERS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_buffers_unregister(ctx);
		break;
	case IORING_REGISTER_FILES:
		ret = -EFAULT;
		if (!arg)
			break;
		ret = io_sqe_files_register(ctx, arg, nr_args, NULL);
		break;
	case IORING_UNREGISTER_FILES:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_sqe_files_unregister(ctx);
		break;
	case IORING_REGISTER_FILES_UPDATE:
		ret = io_register_files_update(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_EVENTFD:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 0);
		break;
	case IORING_REGISTER_EVENTFD_ASYNC:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_eventfd_register(ctx, arg, 1);
		break;
	case IORING_UNREGISTER_EVENTFD:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_eventfd_unregister(ctx);
		break;
	case IORING_REGISTER_PROBE:
		ret = -EINVAL;
		if (!arg || nr_args > 256)
			break;
		ret = io_probe(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_personality(ctx);
		break;
	case IORING_UNREGISTER_PERSONALITY:
		ret = -EINVAL;
		if (arg)
			break;
		ret = io_unregister_personality(ctx, nr_args);
		break;
	case IORING_REGISTER_ENABLE_RINGS:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_register_enable_rings(ctx);
		break;
	case IORING_REGISTER_RESTRICTIONS:
		ret = io_register_restrictions(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_FILES2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_FILES_UPDATE2:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_FILE);
		break;
	case IORING_REGISTER_BUFFERS2:
		ret = io_register_rsrc(ctx, arg, nr_args, IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_BUFFERS_UPDATE:
		ret = io_register_rsrc_update(ctx, arg, nr_args,
					      IORING_RSRC_BUFFER);
		break;
	case IORING_REGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (!arg || !nr_args)
			break;
		ret = io_register_iowq_aff(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_IOWQ_AFF:
		ret = -EINVAL;
		if (arg || nr_args)
			break;
		ret = io_unregister_iowq_aff(ctx);
		break;
	case IORING_REGISTER_IOWQ_MAX_WORKERS:
		ret = -EINVAL;
		if (!arg || nr_args != 2)
			break;
		ret = io_register_iowq_max_workers(ctx, arg);
		break;
	case IORING_REGISTER_RING_FDS:
		ret = io_ringfd_register(ctx, arg, nr_args);
		break;
	case IORING_UNREGISTER_RING_FDS:
		ret = io_ringfd_unregister(ctx, arg, nr_args);
		break;
	case IORING_REGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_ring(ctx, arg);
		break;
	case IORING_UNREGISTER_PBUF_RING:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_unregister_pbuf_ring(ctx, arg);
		break;
	case IORING_REGISTER_SYNC_CANCEL:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_sync_cancel(ctx, arg);
		break;
	case IORING_REGISTER_FILE_ALLOC_RANGE:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_file_alloc_range(ctx, arg);
		break;
	case IORING_REGISTER_PBUF_STATUS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_pbuf_status(ctx, arg);
		break;
	case IORING_REGISTER_NAPI:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_napi(ctx, arg);
		break;
	case IORING_UNREGISTER_NAPI:
		ret = -EINVAL;
		if (nr_args != 1)
			break;
		ret = io_unregister_napi(ctx, arg);
		break;
	case IORING_REGISTER_CLOCK:
		ret = -EINVAL;
		if (!arg || nr_args)
			break;
		ret = io_register_clock(ctx, arg);
		break;
	case IORING_REGISTER_CLONE_BUFFERS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_clone_buffers(ctx, arg);
		break;
	case IORING_REGISTER_ZCRX_IFQ:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_zcrx_ifq(ctx, arg);
		break;
	case IORING_REGISTER_RESIZE_RINGS:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_resize_rings(ctx, arg);
		break;
	case IORING_REGISTER_MEM_REGION:
		ret = -EINVAL;
		if (!arg || nr_args != 1)
			break;
		ret = io_register_mem_region(ctx, arg);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * Given an 'fd' value, return the ctx associated with it. If 'registered' is
 * true, then the registered index is used. Otherwise, the normal fd table.
 * Caller must call fput() on the returned file, unless it's an ERR_PTR.
 */
struct file *io_uring_register_get_file(unsigned int fd, bool registered)
{
	struct file *file;

	if (registered) {
		/*
		 * Ring fd has been registered via IORING_REGISTER_RING_FDS, we
		 * need only dereference our task private array to find it.
		 */
		struct io_uring_task *tctx = current->io_uring;

		if (unlikely(!tctx || fd >= IO_RINGFD_REG_MAX))
			return ERR_PTR(-EINVAL);
		fd = array_index_nospec(fd, IO_RINGFD_REG_MAX);
		file = tctx->registered_rings[fd];
		if (file)
			get_file(file);
	} else {
		file = fget(fd);
	}

	if (unlikely(!file))
		return ERR_PTR(-EBADF);
	if (io_is_uring_fops(file))
		return file;
	fput(file);
	return ERR_PTR(-EOPNOTSUPP);
}

/*
 * "blind" registration opcodes are ones where there's no ring given, and
 * hence the source fd must be -1.
 */
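/*
 * Usage sketch (same io_uring_register() wrapper assumed): the only blind
 * opcode at the moment is IORING_REGISTER_SEND_MSG_RING, which takes a
 * single IORING_OP_MSG_RING SQE and is issued with fd == -1, i.e. without
 * any ring of its own:
 *
 *	struct io_uring_sqe sqe = { };
 *
 *	sqe.opcode = IORING_OP_MSG_RING;
 *	sqe.fd = target_ring_fd;
 *	sqe.off = 0x1234;	// user_data to post on the target ring
 *	io_uring_register(-1, IORING_REGISTER_SEND_MSG_RING, &sqe, 1);
 */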
static int io_uring_register_blind(unsigned int opcode, void __user *arg,
				   unsigned int nr_args)
{
	switch (opcode) {
	case IORING_REGISTER_SEND_MSG_RING: {
		struct io_uring_sqe sqe;

		if (!arg || nr_args != 1)
			return -EINVAL;
		if (copy_from_user(&sqe, arg, sizeof(sqe)))
			return -EFAULT;
		/* no flags supported */
		if (sqe.flags)
			return -EINVAL;
		if (sqe.opcode == IORING_OP_MSG_RING)
			return io_uring_sync_msg_ring(&sqe);
		}
	}

	return -EINVAL;
}

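/*
 * Usage sketch (same io_uring_register() wrapper assumed): any register
 * opcode can be issued through a ring fd previously installed with
 * IORING_REGISTER_RING_FDS by OR-ing in IORING_REGISTER_USE_REGISTERED_RING
 * and passing the registered index instead of a real fd:
 *
 *	io_uring_register(reg_index,
 *			  IORING_REGISTER_ENABLE_RINGS |
 *			  IORING_REGISTER_USE_REGISTERED_RING, NULL, 0);
 */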
SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode,
		void __user *, arg, unsigned int, nr_args)
{
	struct io_ring_ctx *ctx;
	long ret = -EBADF;
	struct file *file;
	bool use_registered_ring;

	use_registered_ring = !!(opcode & IORING_REGISTER_USE_REGISTERED_RING);
	opcode &= ~IORING_REGISTER_USE_REGISTERED_RING;

	if (opcode >= IORING_REGISTER_LAST)
		return -EINVAL;

	if (fd == -1)
		return io_uring_register_blind(opcode, arg, nr_args);

	file = io_uring_register_get_file(fd, use_registered_ring);
	if (IS_ERR(file))
		return PTR_ERR(file);
	ctx = file->private_data;

	mutex_lock(&ctx->uring_lock);
	ret = __io_uring_register(ctx, opcode, arg, nr_args);

	trace_io_uring_register(ctx, opcode, ctx->file_table.data.nr,
				ctx->buf_table.nr, ret);
	mutex_unlock(&ctx->uring_lock);

	fput(file);
	return ret;
}